├── .env.example
├── .gitignore
├── .python-version
├── .vscode
    ├── extensions.json
    └── settings.json
├── AgentQ.txt
├── LICENSE
├── README.md
├── agentq
    ├── __init__.py
    ├── __main__.py
    ├── config
    │   ├── __init__.py
    │   └── config.py
    ├── core
    │   ├── agent
    │   │   ├── __init__.py
    │   │   ├── agentq.py
    │   │   ├── agentq_actor.py
    │   │   ├── agentq_critic.py
    │   │   ├── base.py
    │   │   ├── browser_nav_agent.py
    │   │   ├── captcha_agent.py
    │   │   ├── eval_agent.py
    │   │   ├── planner_agent.py
    │   │   └── vision_agent.py
    │   ├── mcts
    │   │   ├── __init__.py
    │   │   ├── browser_mcts.py
    │   │   ├── core
    │   │   │   ├── base.py
    │   │   │   └── mcts.py
    │   │   ├── example
    │   │   │   └── grid.py
    │   │   └── visualization
    │   │   │   ├── __init__.py
    │   │   │   ├── __main__.py
    │   │   │   ├── tree_log.py
    │   │   │   ├── tree_snapshot.py
    │   │   │   └── visualizer_client.py
    │   ├── memory
    │   │   ├── __init__.py
    │   │   └── ltm.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   └── models.py
    │   ├── orchestrator
    │   │   └── orchestrator.py
    │   ├── prompts
    │   │   ├── __init__.py
    │   │   └── prompts.py
    │   ├── skills
    │   │   ├── __init__.py
    │   │   ├── click_using_selector.py
    │   │   ├── enter_text_and_click.py
    │   │   ├── enter_text_using_selector.py
    │   │   ├── get_dom_with_content_type.py
    │   │   ├── get_screenshot.py
    │   │   ├── get_url.py
    │   │   ├── get_user_input.py
    │   │   ├── open_url.py
    │   │   ├── pdf_text_extractor.py
    │   │   ├── press_key_combination.py
    │   │   ├── solve_captcha.py
    │   │   └── upload_file.py
    │   └── web_driver
    │   │   ├── __init__.py
    │   │   └── playwright.py
    ├── user_preferences
    │   └── user_preferences.txt
    └── utils
    │   ├── __init__.py
    │   ├── _pydantic.py
    │   ├── cli_helper.py
    │   ├── dom_helper.py
    │   ├── dom_mutation_observer.py
    │   ├── extract_json.py
    │   ├── function_utils.py
    │   ├── get_detailed_accessibility_tree.py
    │   ├── logger.py
    │   ├── message_type.py
    │   └── ui_messagetype.py
├── dpo_pairs.jsonl
├── logs.txt
├── output.txt
├── poetry.lock
├── pyproject.toml
├── requirements.txt
├── server.py
└── test
    ├── __init__.py
    ├── evaluators.py
    ├── run_tests.py
    ├── tasks
        ├── annotator_dry_run_webvoyager_tasks_30.json
        ├── test.json
        ├── two_tasks.json
        ├── webvoyager_sampled_data.json
        └── webvoyager_test.json
    ├── test_config_auditor.py
    ├── test_tasks_formatter.py
    ├── test_utils.py
    └── tests_processor.py


/.env.example:
--------------------------------------------------------------------------------
 1 | # the model name must be gpt-4o-2024-08-06 as it is dependent on structured output from open ai
 2 | MODEL_NAME="gpt-4o-2024-08-06"
 3 | 
 4 | LITELLM_LOG="ERROR"
 5 | 
 6 | OPENAI_API_KEY=""
 7 | 
 8 | # you can skip adding langfuse api keys. refer to the readme on how to disable tracing with langfuse. 
 9 | LANGFUSE_SECRET_KEY="sk-lf-"
10 | LANGFUSE_PUBLIC_KEY="pk-lf-"
11 | LANGFUSE_HOST="https://cloud.langfuse.com"


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | .venv/
3 | __pycache__
4 | log_files/
5 | logs/
6 | .DS_STORE
7 | results/
8 | output.txt


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.13


--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": ["charliermarsh.ruff"]
3 | }
4 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "[python]": {
 3 |     "editor.formatOnSave": true,
 4 |     "editor.defaultFormatter": "charliermarsh.ruff",
 5 |     "editor.codeActionsOnSave": {
 6 |       "source.fixAll": "explicit",
 7 |       "source.organizeImports": "explicit"
 8 |     }
 9 |   },
10 |   "notebook.formatOnSave.enabled": true,
11 |   "notebook.codeActionsOnSave": {
12 |     "notebook.source.fixAll": "explicit",
13 |     "notebook.source.organizeImports": "explicit"
14 |   },
15 |   "ruff.nativeServer": "on"
16 | }
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Sentient Engineering
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # agentq - advanced reasoning and learning for autonomous AI agents
  2 | 
  3 | agentq utilises various kinds of agentic architectures to complete a task on the web reliably.
  4 | it has
  5 | 
  6 | ```
  7 | 1. a planner <> navigator multi-agent architecture
  8 | 2. a solo planner-actor agent
  9 | 3. an actor <> critic multi-agent architecture
 10 | 4. actor <> critic architecture + monte carlo tree search based reinforcement learning + dpo finetuning
 11 | ```
 12 | 
 13 | this repo also contains an oss implementation of the research paper [agent q](https://arxiv.org/abs/2408.07199) - thus the name.
 14 | 
 15 | ### setup
 16 | 
 17 | 1. we recommend installing poetry before proceeding with the next steps. you can install poetry using these [instructions](https://python-poetry.org/docs/#installation)
 18 | 
 19 | 2. install dependencies
 20 | 
 21 | ```bash
 22 | poetry install
 23 | ```
 24 | 
 25 | 3. start chrome in dev mode - in a separate terminal, use the command to start a chrome instance and do necessary logins to job websites like linkedin/ wellfound, etc.
 26 | 
 27 | for mac, use command -
 28 | 
 29 | ```bash
 30 | sudo /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
 31 | ```
 32 | 
 33 | for linux -
 34 | 
 35 | ```bash
 36 | google-chrome --remote-debugging-port=9222
 37 | ```
 38 | 
 39 | for windows -
 40 | 
 41 | ```bash
 42 | "C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222
 43 | ```
 44 | 
 45 | 4. set up env - add openai and [langfuse](https://langfuse.com) keys to .env file. you can refer to .env.example. currently adding langfuse is required. If you do not want tracing, you can make the following changes
 46 | 
 47 |    - directly import open ai client via `import openai` rather than `from langfuse.openai import openai` in the `./agentq/core/agent/base.py` file.
 48 |    - you would also have to comment out the @observe decorator and the below piece of code from the `run` function in the same file
 49 | 
 50 |    ```python
 51 |    langfuse_context.update_current_trace(
 52 |                name=self.agnet_name,
 53 |                session_id=session_id
 54 |          )
 55 |    ```
 56 | 
 57 | 5. run the agent
 58 | 
 59 | ```bash
 60 | python -u -m agentq
 61 | ```
 62 | 
 63 | ### run evals
 64 | 
 65 | ```bash
 66 |  python -m test.tests_processor --orchestrator_type fsm
 67 | ```
 68 | 
 69 | ### generate dpo pairs for RL
 70 | 
 71 | ```bash
 72 | python -m agentq.core.mcts.browser_mcts
 73 | ```
 74 | 
 75 | #### citations
 76 | 
 77 | a bunch of amazing work in the space has inspired this.
 78 | 
 79 | ```
 80 | @misc{putta2024agentqadvancedreasoning,
 81 | title={Agent Q: Advanced Reasoning and Learning for Autonomous AI Agents},
 82 | author={Pranav Putta and Edmund Mills and Naman Garg and Sumeet Motwani and Chelsea Finn and Divyansh Garg and Rafael Rafailov},
 83 | year={2024},
 84 | eprint={2408.07199},
 85 | archivePrefix={arXiv},
 86 | primaryClass={cs.AI},
 87 | url={https://arxiv.org/abs/2408.07199},
 88 | }
 89 | ```
 90 | 
 91 | ```
 92 | @inproceedings{yao2022webshop,
 93 |   bibtex_show = {true},
 94 |   title = {WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents},
 95 |   author = {Yao, Shunyu and Chen, Howard and Yang, John and Narasimhan, Karthik},
 96 |   booktitle = {ArXiv},
 97 |   year = {preprint},
 98 |   html = {https://arxiv.org/abs/2207.01206},
 99 |   tag = {NLP}
100 | }
101 | ```
102 | 
103 | ```
104 | @article{he2024webvoyager,
105 | title={WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models},
106 | author={He, Hongliang and Yao, Wenlin and Ma, Kaixin and Yu, Wenhao and Dai, Yong and Zhang, Hongming and Lan, Zhenzhong and Yu, Dong},
107 | journal={arXiv preprint arXiv:2401.13919},
108 | year={2024}
109 | }
110 | ```
111 | 
112 | ```
113 | @misc{abuelsaad2024-agente,
114 | title={Agent-E: From Autonomous Web Navigation to Foundational Design Principles in Agentic Systems},
115 | author={Tamer Abuelsaad and Deepak Akkil and Prasenjit Dey and Ashish Jagmohan and Aditya Vempaty and Ravi Kokku},
116 | year={2024},
117 | eprint={2407.13032},
118 | archivePrefix={arXiv},
119 | primaryClass={cs.AI},
120 | url={https://arxiv.org/abs/2407.13032},
121 | }
122 | ```
123 | 


--------------------------------------------------------------------------------
/agentq/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/__init__.py


--------------------------------------------------------------------------------
/agentq/__main__.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | from playwright.async_api import Page
 4 | 
 5 | from agentq.core.agent.agentq import AgentQ
 6 | from agentq.core.agent.agentq_actor import AgentQActor
 7 | from agentq.core.agent.agentq_critic import AgentQCritic
 8 | from agentq.core.agent.browser_nav_agent import BrowserNavAgent
 9 | from agentq.core.agent.planner_agent import PlannerAgent
10 | from agentq.core.models.models import State
11 | from agentq.core.orchestrator.orchestrator import Orchestrator
12 | 
# One agent instance per orchestrator state. The agents are constructed once,
# at import time, and the same mapping is handed to every Orchestrator that
# this module creates.
_AGENTS_BY_STATE = (
    (State.PLAN, PlannerAgent()),
    (State.BROWSE, BrowserNavAgent()),
    (State.AGENTQ_BASE, AgentQ()),
    (State.AGENTQ_ACTOR, AgentQActor()),
    (State.AGENTQ_CRITIC, AgentQCritic()),
)
state_to_agent_map = dict(_AGENTS_BY_STATE)
20 | 
21 | 
async def run_agent(command):
    """Execute ``command`` on a freshly started eval-mode orchestrator.

    Before the command runs, the current browser page gets an ``AgentQ-Bot``
    User-Agent header and is navigated to ``http://localhost:3000/abc``
    (waiting for network idle, 30 s timeout).

    Returns:
        Whatever ``Orchestrator.execute_command`` returns for ``command``.
    """
    runner = Orchestrator(state_to_agent_map=state_to_agent_map, eval_mode=True)
    await runner.start()

    current_page: Page = await runner.playwright_manager.get_current_page()
    await current_page.set_extra_http_headers({"User-Agent": "AgentQ-Bot"})
    await current_page.goto(
        "http://localhost:3000/abc",
        wait_until="networkidle",
        timeout=30000,
    )

    return await runner.execute_command(command)
32 | 
33 | 
def run_agent_sync(command):
    """Blocking wrapper that runs :func:`run_agent` to completion.

    Reuses the thread's current event loop when one exists and is still open;
    otherwise a new loop is created and installed as the current loop.

    Args:
        command: The command forwarded unchanged to ``run_agent``.

    Returns:
        The result of ``run_agent(command)``.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # Raised when no event loop is set for this thread (for example when
        # called from a worker thread); fall through and create one.
        loop = None

    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    return loop.run_until_complete(run_agent(command))
42 | 
43 | 
async def main():
    """Create an orchestrator (``eval_mode`` not passed) and start it."""
    await Orchestrator(state_to_agent_map=state_to_agent_map).start()


# Script entry point, used by ``python -m agentq``.
if __name__ == "__main__":
    asyncio.run(main())
51 | 


--------------------------------------------------------------------------------
/agentq/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/config/__init__.py


--------------------------------------------------------------------------------
/agentq/config/config.py:
--------------------------------------------------------------------------------
 1 | # config.py at the project source code root
 2 | import os
 3 | 
 4 | # Get the absolute path of the current file (config.py)
 5 | CURRENT_FILE_PATH = os.path.abspath(__file__)
 6 | 
 7 | # Get the project root directory (two levels up from config.py)
 8 | PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(CURRENT_FILE_PATH)))
 9 | 
10 | # Define other paths relative to the project root
11 | PROJECT_SOURCE_ROOT = os.path.join(PROJECT_ROOT, "agentq")
12 | SOURCE_LOG_FOLDER_PATH = os.path.join(PROJECT_SOURCE_ROOT, "log_files")
13 | PROJECT_TEMP_PATH = os.path.join(PROJECT_SOURCE_ROOT, "temp")
14 | USER_PREFERENCES_PATH = os.path.join(PROJECT_SOURCE_ROOT, "user_preferences")
15 | PROJECT_TEST_ROOT = os.path.join(PROJECT_ROOT, "test")
16 | 
17 | # Check if the log folder exists, and if not, create it
18 | if not os.path.exists(SOURCE_LOG_FOLDER_PATH):
19 |     os.makedirs(SOURCE_LOG_FOLDER_PATH)
20 |     print(f"Created log folder at: {SOURCE_LOG_FOLDER_PATH}")
21 | 
22 | # create user prefernces folder if it does not exist
23 | if not os.path.exists(USER_PREFERENCES_PATH):
24 |     os.makedirs(USER_PREFERENCES_PATH)
25 |     print(f"Created user preferences folder at: {USER_PREFERENCES_PATH}")
26 | 
27 | if not os.path.exists(PROJECT_TEMP_PATH):
28 |     os.makedirs(PROJECT_TEMP_PATH)
29 |     print(f"Created temp folder at: {PROJECT_TEMP_PATH}")
30 | 


--------------------------------------------------------------------------------
/agentq/core/agent/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/agent/__init__.py


--------------------------------------------------------------------------------
/agentq/core/agent/agentq.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from string import Template
 3 | 
 4 | from agentq.core.agent.base import BaseAgent
 5 | from agentq.core.memory import ltm
 6 | from agentq.core.models.models import AgentQBaseInput, AgentQBaseOutput
 7 | from agentq.core.prompts.prompts import LLM_PROMPTS
 8 | 
 9 | 
class AgentQ(BaseAgent):
    """Agent built on ``AGENTQ_BASE_PROMPT`` with ``AgentQBaseInput`` /
    ``AgentQBaseOutput`` as its input and output formats.

    The system prompt is assembled at construction time from the prompt
    template, the user's long-term memory and the current date.
    """

    def __init__(self):
        self.name = "agentq"
        self.ltm = self.__get_ltm()
        self.system_prompt = self.__modify_system_prompt(self.ltm)
        super().__init__(
            name=self.name,
            system_prompt=self.system_prompt,
            input_format=AgentQBaseInput,
            output_format=AgentQBaseOutput,
            keep_message_history=False,
        )

    @staticmethod
    def __get_ltm():
        """Return the user's long-term memory as provided by ``ltm.get_user_ltm``."""
        return ltm.get_user_ltm()

    def __modify_system_prompt(self, user_ltm):
        """Fill the base prompt template and append today's date and weekday.

        ``user_ltm`` is substituted for the ``task_information`` placeholder;
        ``None`` is replaced by an empty string.
        """
        template = Template(LLM_PROMPTS["AGENTQ_BASE_PROMPT"])

        # safe_substitute leaves unknown placeholders in place instead of
        # raising KeyError.
        prompt = template.safe_substitute(
            task_information="" if user_ltm is None else user_ltm
        )

        now = datetime.now()
        return (
            f"{prompt}"
            f"\nToday's date is: {now:%d/%m/%Y}"
            f"\nCurrent weekday is: {now:%A}"
        )
46 | 


--------------------------------------------------------------------------------
/agentq/core/agent/agentq_actor.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from string import Template
 3 | 
 4 | from agentq.core.agent.base import BaseAgent
 5 | from agentq.core.memory import ltm
 6 | from agentq.core.models.models import AgentQActorInput, AgentQActorOutput
 7 | from agentq.core.prompts.prompts import LLM_PROMPTS
 8 | 
 9 | 
class AgentQActor(BaseAgent):
    """Agent built on ``AGENTQ_ACTOR_PROMPT`` with ``AgentQActorInput`` /
    ``AgentQActorOutput`` as its input and output formats.

    The system prompt is assembled at construction time from the prompt
    template, the user's long-term memory and the current date.
    """

    def __init__(self):
        self.name = "actor"
        self.ltm = self.__get_ltm()
        self.system_prompt = self.__modify_system_prompt(self.ltm)
        super().__init__(
            name=self.name,
            system_prompt=self.system_prompt,
            input_format=AgentQActorInput,
            output_format=AgentQActorOutput,
            keep_message_history=False,
        )

    @staticmethod
    def __get_ltm():
        """Return the user's long-term memory as provided by ``ltm.get_user_ltm``."""
        return ltm.get_user_ltm()

    def __modify_system_prompt(self, user_ltm):
        """Fill the actor prompt template and append today's date and weekday.

        ``user_ltm`` is substituted for the ``basic_user_information``
        placeholder; ``None`` is replaced by an empty string.
        """
        template = Template(LLM_PROMPTS["AGENTQ_ACTOR_PROMPT"])

        # safe_substitute leaves unknown placeholders in place instead of
        # raising KeyError.
        prompt = template.safe_substitute(
            basic_user_information="" if user_ltm is None else user_ltm
        )

        now = datetime.now()
        return (
            f"{prompt}"
            f"\nToday's date is: {now:%d/%m/%Y}"
            f"\nCurrent weekday is: {now:%A}"
        )
46 | 


--------------------------------------------------------------------------------
/agentq/core/agent/agentq_critic.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from string import Template
 3 | 
 4 | from agentq.core.agent.base import BaseAgent
 5 | from agentq.core.memory import ltm
 6 | from agentq.core.models.models import AgentQCriticInput, AgentQCriticOutput
 7 | from agentq.core.prompts.prompts import LLM_PROMPTS
 8 | 
 9 | 
class AgentQCritic(BaseAgent):
    """Agent built on ``AGENTQ_CRITIC_PROMPT`` with ``AgentQCriticInput`` /
    ``AgentQCriticOutput`` as its input and output formats.

    The system prompt is assembled at construction time from the prompt
    template, the user's long-term memory and the current date.
    """

    def __init__(self):
        self.name = "critic"
        self.ltm = self.__get_ltm()
        self.system_prompt = self.__modify_system_prompt(self.ltm)
        super().__init__(
            name=self.name,
            system_prompt=self.system_prompt,
            input_format=AgentQCriticInput,
            output_format=AgentQCriticOutput,
            keep_message_history=False,
        )

    @staticmethod
    def __get_ltm():
        """Return the user's long-term memory as provided by ``ltm.get_user_ltm``."""
        return ltm.get_user_ltm()

    def __modify_system_prompt(self, user_ltm):
        """Fill the critic prompt template and append today's date and weekday.

        ``user_ltm`` is substituted for the ``basic_user_information``
        placeholder; ``None`` is replaced by an empty string.
        """
        template = Template(LLM_PROMPTS["AGENTQ_CRITIC_PROMPT"])

        # safe_substitute leaves unknown placeholders in place instead of
        # raising KeyError.
        prompt = template.safe_substitute(
            basic_user_information="" if user_ltm is None else user_ltm
        )

        now = datetime.now()
        return (
            f"{prompt}"
            f"\nToday's date is: {now:%d/%m/%Y}"
            f"\nCurrent weekday is: {now:%A}"
        )
46 | 


--------------------------------------------------------------------------------
/agentq/core/agent/base.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | from typing import Callable, List, Optional, Tuple, Type
  4 | 
  5 | import instructor
  6 | import instructor.patch
  7 | import litellm
  8 | import openai
  9 | from instructor import Mode
 10 | from langsmith import traceable
 11 | from pydantic import BaseModel
 12 | 
 13 | from agentq.utils.function_utils import get_function_schema
 14 | from agentq.utils.logger import logger
 15 | 
 16 | 
 17 | class BaseAgent:
 18 |     def __init__(
 19 |         self,
 20 |         name: str,
 21 |         system_prompt: str,
 22 |         input_format: Type[BaseModel],
 23 |         output_format: Type[BaseModel],
 24 |         tools: Optional[List[Tuple[Callable, str]]] = None,
 25 |         keep_message_history: bool = True,
 26 |         client: str = "openai",
 27 |     ):
 28 |         # Metdata
 29 |         self.agent_name = name
 30 | 
 31 |         # Messages
 32 |         self.system_prompt = system_prompt
 33 |         # handling the case where agent has to do async intialisation as system prompt depends on some async functions.
 34 |         # in those cases, we do init with empty system prompt string and then handle adding system prompt to messages array in the agent itself
 35 |         if self.system_prompt:
 36 |             self._initialize_messages()
 37 |         self.keep_message_history = keep_message_history
 38 | 
 39 |         # Input-output format
 40 |         self.input_format = input_format
 41 |         self.output_format = output_format
 42 | 
 43 |         # Set global configurations for litellm
 44 |         litellm.logging = True
 45 |         litellm.set_verbose = True
 46 | 
 47 |         # Llm client
 48 |         if client == "openai":
 49 |             self.client = openai.Client()
 50 |         elif client == "together":
 51 |             self.client = openai.OpenAI(
 52 |                 base_url="https://api.together.xyz/v1",
 53 |                 api_key=os.environ["TOGETHER_API_KEY"],
 54 |             )
 55 | 
 56 |         self.client = instructor.from_openai(self.client, mode=Mode.JSON)
 57 | 
 58 |         # Tools
 59 |         self.tools_list = []
 60 |         self.executable_functions_list = {}
 61 |         if tools:
 62 |             self._initialize_tools(tools)
 63 | 
 64 |     def _initialize_tools(self, tools: List[Tuple[Callable, str]]):
 65 |         for func, func_desc in tools:
 66 |             self.tools_list.append(get_function_schema(func, description=func_desc))
 67 |             self.executable_functions_list[func.__name__] = func
 68 | 
 69 |     def _initialize_messages(self):
 70 |         self.messages = [{"role": "system", "content": self.system_prompt}]
 71 | 
 72 |     @traceable(run_type="chain", name="agent_run")
 73 |     async def run(
 74 |         self,
 75 |         input_data: BaseModel,
 76 |         screenshot: str = None,
 77 |         session_id: str = None,
 78 |         # model: str = "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
 79 |         model: str = "gpt-4o-2024-08-06",
 80 |     ) -> BaseModel:
 81 |         if not isinstance(input_data, self.input_format):
 82 |             raise ValueError(f"Input data must be of type {self.input_format.__name__}")
 83 | 
 84 |         # Handle message history.
 85 |         if not self.keep_message_history:
 86 |             self._initialize_messages()
 87 | 
 88 |         if screenshot:
 89 |             self.messages.append(
 90 |                 {
 91 |                     "role": "user",
 92 |                     "content": [
 93 |                         {
 94 |                             "type": "text",
 95 |                             "text": input_data.model_dump_json(
 96 |                                 exclude={"current_page_dom", "current_page_url"}
 97 |                             ),
 98 |                         },
 99 |                         {"type": "image_url", "image_url": {"url": screenshot}},
100 |                     ],
101 |                 }
102 |             )
103 |         else:
104 |             self.messages.append(
105 |                 {
106 |                     "role": "user",
107 |                     "content": input_data.model_dump_json(
108 |                         exclude={"current_page_dom", "current_page_url"}
109 |                     ),
110 |                 }
111 |             )
112 | 
113 |         # input dom and current page url in a separate message so that the LLM can pay attention to completed tasks better. *based on personal vibe check*
114 |         if hasattr(input_data, "current_page_dom") and hasattr(
115 |             input_data, "current_page_url"
116 |         ):
117 |             self.messages.append(
118 |                 {
119 |                     "role": "user",
120 |                     "content": f"Current page URL:\n{input_data.current_page_url}\n\n Current page DOM:\n{input_data.current_page_dom}",
121 |                 }
122 |             )
123 | 
124 |         # logger.info(self.messages)
125 | 
126 |         # TODO: add a max_turn here to prevent a inifinite fallout
127 |         while True:
128 |             # TODO:
129 |             # 1. exeception handling while calling the client
130 |             # 2. remove the else block as JSON mode in instrutor won't allow us to pass in tools.
131 |             if len(self.tools_list) == 0:
132 |                 response = self.client.chat.completions.create(
133 |                     model=model,
134 |                     # model="gpt-4o-2024-08-06",
135 |                     # model="gpt-4o-mini",
136 |                     # model="groq/llama3-groq-70b-8192-tool-use-preview",
137 |                     # model="xlam-1b-fc-r",
138 |                     messages=self.messages,
139 |                     response_model=self.output_format,
140 |                     max_retries=4,
141 |                 )
142 |             else:
143 |                 response = self.client.chat.completions.create(
144 |                     model=model,
145 |                     messages=self.messages,
146 |                     response_model=self.output_format,
147 |                     tool_choice="auto",
148 |                     tools=self.tools_list,
149 |                 )
150 | 
151 |             # instructor directly outputs response.choices[0].message. so we will do response_message = response
152 |             # response_message = response.choices[0].message
153 | 
154 |             # instructor does not support funciton in JSON mode
155 |             # if response_message.tool_calls:
156 |             #     tool_calls = response_message.tool_calls
157 | 
158 |             # if tool_calls:
159 |             #     self.messages.append(response_message)
160 |             #     for tool_call in tool_calls:
161 |             #         await self._append_tool_response(tool_call)
162 |             #     continue
163 | 
164 |             # parsed_response_content: self.output_format = response_message.parsed
165 | 
166 |             try:
167 |                 assert isinstance(response, self.output_format)
168 |             except AssertionError:
169 |                 raise TypeError(
170 |                     f"Expected response_message to be of type {self.output_format.__name__}, but got {type(response).__name__}"
171 |                 )
172 |             return response
173 | 
174 |     async def _append_tool_response(self, tool_call):
175 |         function_name = tool_call.function.name
176 |         function_to_call = self.executable_functions_list[function_name]
177 |         function_args = json.loads(tool_call.function.arguments)
178 |         try:
179 |             function_response = await function_to_call(**function_args)
180 |             # print(function_response)
181 |             self.messages.append(
182 |                 {
183 |                     "tool_call_id": tool_call.id,
184 |                     "role": "tool",
185 |                     "name": function_name,
186 |                     "content": str(function_response),
187 |                 }
188 |             )
189 |         except Exception as e:
190 |             logger.error(f"Error occurred calling the tool {function_name}: {str(e)}")
191 |             self.messages.append(
192 |                 {
193 |                     "tool_call_id": tool_call.id,
194 |                     "role": "tool",
195 |                     "name": function_name,
196 |                     "content": str(
197 |                         "The tool responded with an error, please try again with a different tool or modify the parameters of the tool",
198 |                         function_response,
199 |                     ),
200 |                 }
201 |             )
202 | 


--------------------------------------------------------------------------------
/agentq/core/agent/browser_nav_agent.py:
--------------------------------------------------------------------------------
 1 | from agentq.core.agent.base import BaseAgent
 2 | from agentq.core.models.models import BrowserNavInput, BrowserNavOutput
 3 | from agentq.core.prompts.prompts import LLM_PROMPTS
 4 | from agentq.core.skills.click_using_selector import click as click_element
 5 | from agentq.core.skills.enter_text_and_click import enter_text_and_click
 6 | from agentq.core.skills.enter_text_using_selector import bulk_enter_text, entertext
 7 | from agentq.core.skills.get_dom_with_content_type import get_dom_with_content_type
 8 | from agentq.core.skills.get_url import geturl
 9 | from agentq.core.skills.open_url import openurl
10 | from agentq.core.skills.pdf_text_extractor import extract_text_from_pdf
11 | from agentq.core.skills.press_key_combination import press_key_combination
12 | from agentq.core.skills.solve_captcha import solve_captcha
13 | from agentq.core.skills.upload_file import upload_file
14 | 
15 | 
class BrowserNavAgent(BaseAgent):
    """Agent built on ``BROWSER_AGENT_PROMPT`` with ``BrowserNavInput`` /
    ``BrowserNavOutput`` formats and the browser skills registered as tools.
    """

    def __init__(self):
        self.name = "executor"

        browser_tools = self._get_tools()
        super().__init__(
            name=self.name,
            system_prompt=LLM_PROMPTS["BROWSER_AGENT_PROMPT"],
            input_format=BrowserNavInput,
            output_format=BrowserNavOutput,
            keep_message_history=False,
            tools=browser_tools,
        )

    def _get_tools(self):
        """Return the (skill function, description prompt) pairs for this agent."""
        skill_prompt_keys = (
            (openurl, "OPEN_URL_PROMPT"),
            (enter_text_and_click, "ENTER_TEXT_AND_CLICK_PROMPT"),
            (get_dom_with_content_type, "GET_DOM_WITH_CONTENT_TYPE_PROMPT"),
            (click_element, "CLICK_PROMPT"),
            (geturl, "GET_URL_PROMPT"),
            (bulk_enter_text, "BULK_ENTER_TEXT_PROMPT"),
            (entertext, "ENTER_TEXT_PROMPT"),
            (press_key_combination, "PRESS_KEY_COMBINATION_PROMPT"),
            (extract_text_from_pdf, "EXTRACT_TEXT_FROM_PDF_PROMPT"),
            (upload_file, "UPLOAD_FILE_PROMPT"),
            (solve_captcha, "SOLVE_CAPTCHA_PROMPT"),
        )
        return [(skill, LLM_PROMPTS[key]) for skill, key in skill_prompt_keys]
46 | 


--------------------------------------------------------------------------------
/agentq/core/agent/captcha_agent.py:
--------------------------------------------------------------------------------
 1 | from agentq.core.agent.base import BaseAgent
 2 | from agentq.core.models.models import CaptchaAgentInput, CaptchaAgentOutput
 3 | from agentq.core.prompts.prompts import LLM_PROMPTS
 4 | 
 5 | 
 6 | class CaptchaAgent(BaseAgent):
 7 |     def __init__(self):
 8 |         self.name = "captcha_solver"
 9 |         super().__init__(
10 |             name=self.name,
11 |             system_prompt=LLM_PROMPTS["CAPTCHA_AGENT_PROMPT"],
12 |             input_format=CaptchaAgentInput,
13 |             output_format=CaptchaAgentOutput,
14 |             keep_message_history=False,
15 |         )
16 | 


--------------------------------------------------------------------------------
/agentq/core/agent/eval_agent.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from string import Template
 3 | 
 4 | from agentq.core.agent.base import BaseAgent
 5 | from agentq.core.memory import ltm
 6 | from agentq.core.models.models import EvalAgentInput, EvalAgentOutput
 7 | from agentq.core.prompts.prompts import LLM_PROMPTS
 8 | 
 9 | 
class EvalAgent(BaseAgent):
    """Agent registered under the name "eval".

    Its system prompt is ``EVAL_AGENT_PROMPT`` with the user-information
    placeholder filled in and today's date and weekday appended.
    """

    def __init__(self):
        self.name = "eval"
        # Long-term memory is not passed to the eval agent, so the
        # placeholder in the prompt is filled with an empty string.
        # (To enable it: self.ltm = self.__get_ltm())
        self.ltm = None
        self.system_prompt = self.__modify_system_prompt(self.ltm)
        super().__init__(
            name=self.name,
            system_prompt=self.system_prompt,
            input_format=EvalAgentInput,
            output_format=EvalAgentOutput,
            keep_message_history=False,
        )

    @staticmethod
    def __get_ltm():
        """Return the user's long-term memory text via ``ltm.get_user_ltm()``."""
        return ltm.get_user_ltm()

    def __modify_system_prompt(self, ltm):
        """Build the final system prompt.

        :param ltm: user information text, or ``None`` to substitute an empty string
        :return: the prompt with ``basic_user_information`` substituted and
            the current date and weekday appended
        """
        user_info = "" if ltm is None else ltm

        # safe_substitute leaves unknown placeholders alone instead of raising.
        template = Template(LLM_PROMPTS["EVAL_AGENT_PROMPT"])
        prompt: str = template.safe_substitute({"basic_user_information": user_info})

        now = datetime.now()
        today_date = now.strftime("%d/%m/%Y")
        weekday = now.strftime("%A")
        date_line = f"\nToday's date is: {today_date}"
        weekday_line = f"\nCurrent weekday is: {weekday}"

        return prompt + date_line + weekday_line
47 | 


--------------------------------------------------------------------------------
/agentq/core/agent/planner_agent.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from string import Template
 3 | from typing import Optional
 4 | 
 5 | from agentq.core.agent.base import BaseAgent
 6 | from agentq.core.memory import ltm
 7 | from agentq.core.models.models import PlannerInput, PlannerOutput
 8 | from agentq.core.prompts.prompts import LLM_PROMPTS
 9 | 
10 | 
class PlannerAgent(BaseAgent):
    """Agent registered under the name "planner".

    Its system prompt is ``PLANNER_AGENT_PROMPT`` with the user's long-term
    memory substituted for ``$basic_user_information`` and today's date and
    weekday appended.
    """

    def __init__(self):
        self.name = "planner"
        system_prompt = self.__modify_system_prompt(self.__get_ltm())

        super().__init__(
            name=self.name,
            system_prompt=system_prompt,
            input_format=PlannerInput,
            output_format=PlannerOutput,
            keep_message_history=False,
        )

    def __get_ltm(self):
        """Return the user's long-term memory text, or ``None`` if unavailable."""
        return ltm.get_user_ltm()

    def __modify_system_prompt(self, ltm):
        """Build the final system prompt.

        :param ltm: user information text, or ``None`` when there is none
        :return: the prompt with ``basic_user_information`` substituted and
            the current date and weekday appended
        """
        system_prompt: str = LLM_PROMPTS["PLANNER_AGENT_PROMPT"]

        # Always substitute, even without user information: otherwise the raw
        # "$basic_user_information" placeholder would stay in the prompt.
        user_info = "" if ltm is None else "\n" + ltm

        # safe_substitute (as in EvalAgent) tolerates any other "$" in the
        # prompt text instead of raising KeyError/ValueError.
        system_prompt = Template(system_prompt).safe_substitute(
            basic_user_information=user_info
        )

        # Add today's day & date to the system prompt
        today = datetime.now()
        today_date = today.strftime("%d/%m/%Y")
        weekday = today.strftime("%A")
        system_prompt += f"\nToday's date is: {today_date}"
        system_prompt += f"\nCurrent weekday is: {weekday}"

        return system_prompt
46 | 


--------------------------------------------------------------------------------
/agentq/core/agent/vision_agent.py:
--------------------------------------------------------------------------------
 1 | from agentq.core.agent.base import BaseAgent
 2 | from agentq.core.models.models import VisionInput, VisionOutput
 3 | from agentq.core.prompts.prompts import LLM_PROMPTS
 4 | 
 5 | 
 6 | class VisionAgent(BaseAgent):
 7 |     def __init__(self, client: str = "openai"):
 8 |         system_prompt: str = LLM_PROMPTS["VISION_AGENT_PROMPT"]
 9 |         self.name = "vision"
10 | 
11 |         super().__init__(
12 |             name=self.name,
13 |             system_prompt=system_prompt,
14 |             input_format=VisionInput,
15 |             output_format=VisionOutput,
16 |             keep_message_history=False,
17 |             client=client,
18 |         )
19 | 


--------------------------------------------------------------------------------
/agentq/core/mcts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/mcts/__init__.py


--------------------------------------------------------------------------------
/agentq/core/mcts/core/base.py:
--------------------------------------------------------------------------------
  1 | from abc import ABC, abstractmethod
  2 | from typing import Generic, Protocol, Tuple, TypeVar, Union, runtime_checkable
  3 | 
  4 | State = TypeVar("State")
  5 | Action = TypeVar("Action")
  6 | Example = TypeVar("Example")
  7 | Trace = tuple[list[State], list[Action]]
  8 | 
  9 | 
 10 | class WorldModel(ABC, Generic[State, Action, Example]):
 11 |     def __init__(self) -> None:
 12 |         self.example = None
 13 |         self.prompt = None
 14 | 
 15 |     @abstractmethod
 16 |     async def init_state(self) -> State: ...
 17 | 
 18 |     @abstractmethod
 19 |     async def step(
 20 |         self, state: State, action: Action
 21 |     ) -> Union[State, Tuple[State, dict]]:
 22 |         """Returns the next state and optionally an auxiliary data dict
 23 | 
 24 |         :param state: The current state
 25 |         :param action: The action to take
 26 |         :return: The next state and optionally an auxiliary data dict
 27 |         """
 28 |         ...
 29 | 
 30 |     @abstractmethod
 31 |     async def is_terminal(self, state: State) -> bool: ...
 32 | 
 33 |     def update_example(self, example: Example, prompt=None) -> None:
 34 |         if prompt is not None:
 35 |             self.prompt = prompt
 36 |         self.example = example
 37 | 
 38 | 
 39 | class DefaultWorldModel(WorldModel):
 40 |     # A default implementation of WorldModel that only
 41 |     # saves the action sequence as the state
 42 | 
 43 |     def __init__(self, base_model) -> None:
 44 |         super().__init__()
 45 |         self.base_model = base_model
 46 | 
 47 |     async def init_state(self):
 48 |         return []
 49 | 
 50 |     async def step(self, state, action):
 51 |         return state + [action], {}
 52 | 
 53 |     async def is_terminal(self, state):
 54 |         # By default the state is never terminal
 55 |         return False
 56 | 
 57 | 
 58 | class SearchConfig(ABC, Generic[State, Action, Example]):
 59 |     def __init__(self) -> None:
 60 |         self.example = None
 61 |         self.prompt = None
 62 | 
 63 |     @abstractmethod
 64 |     async def get_actions(self, state: State) -> list[Action]: ...
 65 | 
 66 |     def fast_reward(self, state: State, action: Action) -> tuple[float, dict]:
 67 |         return 0, {}
 68 | 
 69 |     @abstractmethod
 70 |     async def reward(self, state, action, **kwargs) -> tuple[float, dict]: ...
 71 | 
 72 |     def update_example(self, example: Example, prompt=None) -> None:
 73 |         if prompt is not None:
 74 |             self.prompt = prompt
 75 |         self.example = example
 76 | 
 77 | 
@runtime_checkable
class AlgorithmOutput(Protocol[State]):
    """Structural type for search results: any object exposing the attributes below."""

    terminal_state: State  # declared with the generic State type
    trace: Trace  # (list of states, list of actions), per the Trace alias
 82 | 
 83 | 
 84 | class SearchAlgorithm(ABC):
 85 |     def __init__(self, **kwargs): ...
 86 | 
 87 |     @abstractmethod
 88 |     async def __call__(
 89 |         self, world_model: WorldModel, search_config: SearchConfig, **kwargs
 90 |     ) -> AlgorithmOutput: ...
 91 | 
 92 | 
 93 | class Reasoner(ABC, Generic[State, Action, Example]):
 94 |     def __init__(
 95 |         self,
 96 |         world_model: WorldModel[State, Action, Example],
 97 |         search_config: SearchConfig[State, Action, Example],
 98 |         search_algo: SearchAlgorithm,
 99 |     ) -> None:
100 |         self.world_model = world_model
101 |         self.search_config = search_config
102 |         self.search_algo = search_algo
103 | 
104 |     async def __call__(
105 |         self, example: Example, prompt=None, **kwargs
106 |     ) -> AlgorithmOutput[State]:
107 |         self.world_model.update_example(example, prompt=prompt)
108 |         self.search_config.update_example(example, prompt=prompt)
109 |         return await self.search_algo(self.world_model, self.search_config, **kwargs)
110 | 


--------------------------------------------------------------------------------
/agentq/core/mcts/example/grid.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | from typing import List, NamedTuple, Tuple
  3 | 
  4 | from agentq.core.mcts.core.base import Reasoner, SearchConfig, WorldModel
  5 | from agentq.core.mcts.core.mcts import MCTS, MCTSResult
  6 | 
  7 | 
class GridState(NamedTuple):
    """Search state of the grid example: a cell position plus the grid itself."""

    position: Tuple[int, int]  # (row, column) index into ``grid``
    grid: List[List[int]]  # cell codes: 0 empty, 1 blocked, 2 start, 3 exit
 11 | 
 12 | 
class GridAction(NamedTuple):
    """A single move on the grid."""

    direction: str  # up, down, left, right
 15 | 
 16 | 
 17 | class GridWorldModel(WorldModel[GridState, GridAction, None]):
 18 |     def __init__(self, grid: List[List[int]]):
 19 |         self.grid = grid
 20 |         self.height = len(grid)
 21 |         self.width = len(grid[0])
 22 | 
 23 |     async def init_state(self) -> GridState:
 24 |         for i in range(self.height):
 25 |             for j in range(self.width):
 26 |                 if self.grid[i][j] == 2:
 27 |                     return GridState((i, j), self.grid)
 28 |         raise ValueError("No initial position (2) found in the grid")
 29 | 
 30 |     async def step(
 31 |         self, state: GridState, action: GridAction
 32 |     ) -> Tuple[GridState, dict]:
 33 |         x, y = state.position
 34 |         if action.direction == "up":
 35 |             new_x, new_y = x - 1, y
 36 |         elif action.direction == "down":
 37 |             new_x, new_y = x + 1, y
 38 |         elif action.direction == "left":
 39 |             new_x, new_y = x, y - 1
 40 |         elif action.direction == "right":
 41 |             new_x, new_y = x, y + 1
 42 |         else:
 43 |             raise ValueError(f"Invalid action: {action}")
 44 | 
 45 |         # Check for valid position
 46 |         if (
 47 |             0 <= new_x < self.height
 48 |             and 0 <= new_y < self.width
 49 |             and state.grid[new_x][new_y] != 1
 50 |         ):
 51 |             new_position = (new_x, new_y)
 52 |         else:
 53 |             new_position = state.position
 54 | 
 55 |         new_state = GridState(new_position, state.grid)
 56 |         return new_state, {}
 57 | 
 58 |     async def is_terminal(self, state: GridState) -> bool:
 59 |         # x, y = state.position
 60 |         # return state.grid[x][y] == 3
 61 |         return is_terminal(state)
 62 | 
 63 | 
 64 | class GridSearchConfig(SearchConfig[GridState, GridAction, None]):
 65 |     def __init__(self):
 66 |         super().__init__()
 67 | 
 68 |     async def get_actions(self, state: GridState) -> List[GridAction]:
 69 |         return [
 70 |             GridAction("up"),
 71 |             GridAction("down"),
 72 |             GridAction("left"),
 73 |             GridAction("right"),
 74 |         ]
 75 | 
 76 |     async def reward(
 77 |         self, state: GridState, action: GridAction, **kwargs
 78 |     ) -> Tuple[float, dict]:
 79 |         if is_terminal(state):
 80 |             return 1.0, {}  # good move
 81 |         # else:
 82 |         #     return 0.0, {}
 83 |         else:
 84 |             return -0.01, {}  # small penalty for each step to encourage shorter path
 85 | 
 86 | 
 87 | def is_terminal(state: GridState) -> bool:
 88 |     x, y = state.position
 89 |     return state.grid[x][y] == 3
 90 | 
 91 | 
 92 | class MCTSGridWrapper(Reasoner[GridState, GridAction, None]):
 93 |     def __init__(
 94 |         self,
 95 |         grid: List[List[int]],
 96 |         n_iterations: int = 1000,
 97 |         exploration_weight: float = 1.0,
 98 |     ) -> None:
 99 |         self.grid = grid
100 |         world_model = GridWorldModel(grid)
101 |         search_config = GridSearchConfig()
102 |         search_algo = MCTS(
103 |             n_iters=n_iterations,
104 |             w_exp=exploration_weight,
105 |             cum_reward=sum,
106 |             # calc_q=np.mean,
107 |             simulate_strategy="random",
108 |             output_strategy="max_reward",
109 |             depth_limit=len(grid) * len(grid[0]),
110 |         )
111 |         super().__init__(world_model, search_config, search_algo)
112 | 
113 |     async def __call__(self) -> MCTSResult:
114 |         # return self.search_algo(self.world_model, self.search_config)
115 |         return await super().__call__(example=None)
116 | 
117 |     @staticmethod
118 |     def print_path(result: MCTSResult):
119 |         if result.trace is None or len(result.trace) == 0:
120 |             print("No valid path found")
121 |             return
122 | 
123 |         states, actions = result.trace
124 |         print("Path found: ")
125 |         for i, (state, action) in enumerate(zip(states, actions)):
126 |             print(f"Step{i}: Position {state.position}, Action: {action.direction}")
127 | 
128 |         print(f"Final position: {states[-1].position}")
129 |         print(f"Cumulative reward: {result.cum_reward}")
130 | 
131 | 
async def main():
    """Run MCTS on a fixed 5x5 demo grid and print the resulting path."""
    # Cell legend:
    #   0 - empty cell
    #   1 - blocked cell
    #   2 - initial position
    #   3 - exit (terminal state)
    demo_grid = [
        [0, 0, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 3, 1, 2],
    ]

    wrapper = MCTSGridWrapper(
        demo_grid,
        n_iterations=10000,
        exploration_weight=1.0,
    )
    outcome = await wrapper()
    MCTSGridWrapper.print_path(outcome)
149 | 
150 | 
# Script entry point: run the grid demo on a new event loop, with debug
# markers printed before and after the run.
if __name__ == "__main__":
    print("[DEBUG] Script started")
    asyncio.run(main())
    print("[DEBUG] Script finished")
155 | 


--------------------------------------------------------------------------------
/agentq/core/mcts/visualization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/mcts/visualization/__init__.py


--------------------------------------------------------------------------------
/agentq/core/mcts/visualization/__main__.py:
--------------------------------------------------------------------------------
 1 | def main():
 2 |     import argparse
 3 | 
 4 |     from agentq.core.mcts.visualization import VisualizerClient
 5 | 
 6 |     parser = argparse.ArgumentParser()
 7 |     parser.add_argument("tree_log", type=str)
 8 |     parser.add_argument("--base_url", type=str)
 9 |     args = parser.parse_args()
10 | 
11 |     if args.base_url is None:
12 |         client = VisualizerClient()
13 |     else:
14 |         client = VisualizerClient(args.base_url)
15 | 
16 |     with open(args.tree_log) as f:
17 |         data = f.read()
18 |     result = client.post_log(data)
19 |     print(result.access_url)
20 | 


--------------------------------------------------------------------------------
/agentq/core/mcts/visualization/tree_log.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from enum import Enum
  3 | from typing import Sequence, Union
  4 | 
  5 | from agentq.core.mcts.core.mcts import MCTSNode, MCTSResult
  6 | from agentq.core.mcts.visualization.tree_snapshot import (
  7 |     EdgeData,
  8 |     EdgeId,
  9 |     NodeData,
 10 |     NodeId,
 11 |     TreeSnapshot,
 12 | )
 13 | 
 14 | 
 15 | class TreeLogEncoder(json.JSONEncoder):
 16 |     def default(self, o):
 17 |         from numpy import float32
 18 | 
 19 |         if isinstance(o, TreeSnapshot.Node):
 20 |             return o.__dict__
 21 |         elif isinstance(o, TreeSnapshot.Edge):
 22 |             return o.__dict__
 23 |         elif isinstance(o, TreeSnapshot):
 24 |             return o.__dict__()
 25 |         elif isinstance(o, float32):
 26 |             return float(o)
 27 |         elif isinstance(o, TreeLog):
 28 |             return {"logs": list(o)}
 29 |         elif hasattr(o, "__dict__"):
 30 |             return o.__dict__
 31 |         elif isinstance(o, Enum):
 32 |             return o.value
 33 |         else:
 34 |             return str(o)
 35 | 
 36 | 
 37 | class TreeLog:
 38 |     def __init__(self, tree_snapshots: Sequence[TreeSnapshot]) -> None:
 39 |         self._tree_snapshots = tree_snapshots
 40 | 
 41 |     def __getitem__(self, item):
 42 |         return self._tree_snapshots[item]
 43 | 
 44 |     def __iter__(self):
 45 |         return iter(self._tree_snapshots)
 46 | 
 47 |     def __len__(self):
 48 |         return len(self._tree_snapshots)
 49 | 
 50 |     def __str__(self):
 51 |         return json.dumps(self, cls=TreeLogEncoder, indent=2)
 52 | 
 53 |     @classmethod
 54 |     def from_mcts_results(
 55 |         cls,
 56 |         mcts_results: MCTSResult,
 57 |         node_data_factory: callable = None,
 58 |         edge_data_factory: callable = None,
 59 |     ) -> "TreeLog":
 60 |         def get_reward_details(n: MCTSNode) -> Union[dict, None]:
 61 |             if hasattr(n, "reward_details"):
 62 |                 return n.reward_details
 63 |             return n.fast_reward_details if hasattr(n, "fast_reward_details") else None
 64 | 
 65 |         def default_node_data_factory(n: MCTSNode) -> NodeData:
 66 |             if not n.state:
 67 |                 return NodeData({})
 68 |             if hasattr(n.state, "_asdict"):
 69 |                 state_dict = n.state._asdict()
 70 |             elif isinstance(n.state, list):
 71 |                 state_dict = {idx: value for idx, value in enumerate(n.state)}
 72 |             else:
 73 |                 try:
 74 |                     state_dict = json.loads(json.dumps(n.state, cls=TreeLogEncoder))
 75 |                 except TypeError:
 76 |                     state_dict = str(n.state)
 77 | 
 78 |             # Add color information to the node data
 79 |             state_dict["color"] = "green"
 80 |             return NodeData(state_dict)
 81 | 
 82 |         def default_edge_data_factory(n: MCTSNode) -> EdgeData:
 83 |             edge_data = {"Q": n.Q, "reward": n.reward, **get_reward_details(n)}
 84 | 
 85 |             # Add color information to the edge data
 86 |             edge_data["color"] = "brown"
 87 |             return EdgeData(edge_data)
 88 | 
 89 |         node_data_factory = node_data_factory or default_node_data_factory
 90 |         edge_data_factory = edge_data_factory or default_edge_data_factory
 91 | 
 92 |         snapshots = []
 93 | 
 94 |         def all_nodes(node: MCTSNode):
 95 |             node_id = NodeId(node.id)
 96 | 
 97 |             nodes[node_id] = TreeSnapshot.Node(node_id, node_data_factory(node))
 98 |             if node.children is None:
 99 |                 return
100 |             for child in node.children:
101 |                 edge_id = EdgeId(len(edges))
102 |                 edges.append(
103 |                     TreeSnapshot.Edge(
104 |                         edge_id, node.id, child.id, edge_data_factory(child)
105 |                     )
106 |                 )
107 |                 all_nodes(child)
108 | 
109 |         if mcts_results.tree_state_after_each_iter is None:
110 |             tree_states = [mcts_results.tree_state]
111 |         else:
112 |             tree_states = mcts_results.tree_state_after_each_iter
113 |         for step in range(len(tree_states)):
114 |             edges = []
115 |             nodes = {}
116 | 
117 |             root = tree_states[step]
118 |             all_nodes(root)
119 |             tree = TreeSnapshot(list(nodes.values()), edges)
120 | 
121 |             if mcts_results.trace_in_each_iter:
122 |                 trace = mcts_results.trace_in_each_iter[step]
123 |                 for step_idx in range(len(trace) - 1):
124 |                     in_node_id = trace[step_idx].id
125 |                     out_node_id = trace[step_idx + 1].id
126 |                     for edges in tree.out_edges(in_node_id):
127 |                         if edges.target == out_node_id:
128 |                             nodes[in_node_id].selected_edge = edges.id
129 |                             break
130 | 
131 |             for node in tree.nodes.values():
132 |                 if node.selected_edge is None and tree.children(node.id):
133 |                     node.selected_edge = max(
134 |                         tree.out_edges(node.id),
135 |                         key=lambda edge: edge.data.get("Q", -float("inf")),
136 |                     ).id
137 | 
138 |             snapshots.append(tree)
139 | 
140 |         return cls(snapshots)
141 | 


--------------------------------------------------------------------------------
/agentq/core/mcts/visualization/tree_snapshot.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | from dataclasses import dataclass
 3 | from typing import NewType, Optional, Collection
 4 | 
 5 | NodeId = NewType("NodeId", int)
 6 | EdgeId = NewType("EdgeId", int)
 7 | NodeData = NewType("NodeData", dict)
 8 | EdgeData = NewType("EdgeData", dict)
 9 | 
10 | 
11 | class TreeSnapshot:
12 |     @dataclass
13 |     class Node:
14 |         id: NodeId
15 |         data: NodeData
16 |         selected_edge: Optional[EdgeId] = None
17 | 
18 |     @dataclass
19 |     class Edge:
20 |         id: EdgeId
21 |         source: NodeId
22 |         target: NodeId
23 |         data: EdgeData
24 | 
25 |     def __init__(self, nodes: Collection[Node], edges: Collection[Edge]) -> None:
26 |         self.nodes: dict[NodeId, TreeSnapshot.Node] = {node.id: node for node in nodes}
27 |         self.edges: dict[EdgeId, TreeSnapshot.Edge] = {edge.id: edge for edge in edges}
28 |         self._parent = {}
29 |         self._children: dict[NodeId, set[NodeId]] = defaultdict(set)
30 | 
31 |         for edge in edges:
32 |             self._parent[edge.target] = edge.source
33 |             self._children[edge.source].add(edge.target)
34 | 
35 |         assert len(self._parent) == len(self.nodes) - 1
36 |         assert self._connected()
37 | 
38 |     def _connected(self) -> bool:
39 |         visited = set()
40 |         queue = [next(iter(self.nodes))]
41 |         while queue:
42 |             node = queue.pop()
43 |             visited.add(node)
44 |             queue.extend(self._children[node] - visited)
45 |         return len(visited) == len(self.nodes)
46 | 
47 |     def node(self, node_id: NodeId) -> Node:
48 |         return self.nodes[node_id]
49 | 
50 |     def edge(self, edge_id: EdgeId) -> Edge:
51 |         return self.edges[edge_id]
52 | 
53 |     def out_edges(self, node_id: NodeId) -> Collection[Edge]:
54 |         return [self.edge(edge_id) for edge_id in self.edges if self.edge(edge_id).source == node_id]
55 | 
56 |     def in_edges(self, node_id: NodeId) -> Collection[Edge]:
57 |         return [self.edge(edge_id) for edge_id in self.edges if self.edge(edge_id).target == node_id]
58 | 
59 |     def parent(self, node_id: NodeId) -> NodeId:
60 |         return self._parent[node_id]
61 | 
62 |     def children(self, node_id: NodeId) -> Collection[NodeId]:
63 |         return self._children[node_id]
64 | 
65 |     def __dict__(self):
66 |         return {
67 |             "nodes": self.nodes,
68 |             "edges": self.edges,
69 |         }
70 | 


--------------------------------------------------------------------------------
/agentq/core/mcts/visualization/visualizer_client.py:
--------------------------------------------------------------------------------
 1 | import dataclasses
 2 | import json
 3 | import webbrowser
 4 | from typing import Optional, Union
 5 | 
 6 | import requests
 7 | 
 8 | from agentq.core.mcts.core.mcts import MCTSResult
 9 | from agentq.core.mcts.visualization.tree_log import TreeLog, TreeLogEncoder
10 | 
# Default API base URL for VisualizerClient; logs are POSTed to "<base>/logs".
_API_DEFAULT_BASE_URL = "https://2wz3t0av30.execute-api.us-west-1.amazonaws.com/staging"
# Base URL of the visualizer site, used to build a receipt's access URL.
_VISUALIZER_DEFAULT_BASE_URL = "https://www.llm-reasoners.net"
13 | 
14 | 
class VisualizerClient:
    """Client that uploads tree logs to the visualizer API."""

    def __init__(
        self, base_url: str = _API_DEFAULT_BASE_URL, timeout: Optional[float] = 30.0
    ) -> None:
        """
        :param base_url: API base URL; logs are POSTed to ``{base_url}/logs``
        :param timeout: seconds to wait for the server before ``requests``
            raises a timeout error; ``None`` waits indefinitely
        """
        self.base_url = base_url
        self.timeout = timeout

    @dataclasses.dataclass
    class TreeLogReceipt:
        id: str
        access_key: str

        @property
        def access_url(self) -> str:
            """URL of the uploaded log on the visualizer site."""
            return f"{_VISUALIZER_DEFAULT_BASE_URL}/visualizer/{self.id}?accessKey={self.access_key}"

    def post_log(self, data: Union[TreeLog, str, dict]) -> Optional[TreeLogReceipt]:
        """Upload a log and return its receipt.

        :param data: a ``TreeLog`` or dict (serialised with ``TreeLogEncoder``)
            or an already-serialised JSON string (sent as is)
        :return: the receipt built from the JSON response, or ``None`` when
            the server answers with a status other than 200
        """
        if isinstance(data, (TreeLog, dict)):
            data = json.dumps(data, cls=TreeLogEncoder)

        url = f"{self.base_url}/logs"
        headers = {"Content-Type": "application/json"}
        # Bound the wait: without a timeout an unresponsive server would
        # block the caller forever.
        response = requests.post(
            url, headers=headers, data=data, timeout=self.timeout
        )

        if response.status_code != 200:
            print(
                f"POST Log failed with status code: {response.status_code}, message: {response.text}"
            )
            return None

        return self.TreeLogReceipt(**response.json())
45 | 
46 | 
def present_visualizer(receipt: VisualizerClient.TreeLogReceipt):
    """Print the receipt's visualizer URL and open it in the web browser."""
    url = receipt.access_url
    print(f"Visualizer URL: {url}")
    webbrowser.open(receipt.access_url)
50 | 
51 | 
def visualize(result: Union[TreeLog, MCTSResult], **kwargs):
    """Upload ``result`` with a default client and open it in the visualizer.

    :param result: a ready ``TreeLog``, or an ``MCTSResult`` to convert first
    :param kwargs: forwarded to ``TreeLog.from_mcts_results`` when converting
    :raises TypeError: if ``result`` is neither of the supported types
    """
    if isinstance(result, TreeLog):
        tree_log = result
    else:
        if not isinstance(result, MCTSResult):
            raise TypeError(f"Unsupported result type: {type(result)}")
        tree_log = TreeLog.from_mcts_results(result, **kwargs)

    receipt = VisualizerClient().post_log(tree_log)
    if receipt is None:
        # Upload failed; post_log has already reported it.
        return
    present_visualizer(receipt)
66 | 


--------------------------------------------------------------------------------
/agentq/core/memory/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/memory/__init__.py


--------------------------------------------------------------------------------
/agentq/core/memory/ltm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from agentq.config.config import USER_PREFERENCES_PATH
 4 | from agentq.utils.logger import logger
 5 | 
 6 | 
 7 | def get_user_ltm():
 8 |     user_preference_file_name = "user_preferences.txt"
 9 |     user_preference_file = os.path.join(
10 |         USER_PREFERENCES_PATH, user_preference_file_name
11 |     )
12 |     try:
13 |         with open(user_preference_file) as file:
14 |             user_pref = file.read()
15 |         return user_pref
16 |     except FileNotFoundError:
17 |         logger.warning(f"User preference file not found: {user_preference_file}")
18 | 
19 |     return None
20 | 


--------------------------------------------------------------------------------
/agentq/core/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/models/__init__.py


--------------------------------------------------------------------------------
/agentq/core/models/models.py:
--------------------------------------------------------------------------------
  1 | from enum import Enum, IntEnum
  2 | from typing import List, Literal, Optional, Union
  3 | 
  4 | from pydantic import BaseModel
  5 | from pydantic.fields import Field
  6 | 
  7 | 
  8 | # Global
# String-valued enum of state names; because it mixes in ``str``, each member
# compares equal to its value (e.g. State.PLAN == "plan").
# NOTE(review): presumably the stages/agent modes of a run - confirm in the orchestrator.
class State(str, Enum):
    PLAN = "plan"
    BROWSE = "browse"
    COMPLETED = "completed"
    AGENTQ_BASE = "agentq_base"
    AGENTQ_ACTOR = "agentq_actor"
    AGENTQ_CRITIC = "agentq_critic"
 16 | 
 17 | 
# String-valued enum naming the action kinds; each member is used as the
# ``Literal`` discriminator of the matching *Action model below.
class ActionType(str, Enum):
    CLICK = "CLICK"
    TYPE = "TYPE"
    GOTO_URL = "GOTO_URL"
    ENTER_TEXT_AND_CLICK = "ENTER_TEXT_AND_CLICK"
    SOLVE_CAPTCHA = "SOLVE_CAPTCHA"
    # Currently disabled action kinds:
    # GET_DOM_TEXT_CONTENT = "GET_DOM_TEXT_CONTENT"
    # GET_DOM_INPUT_FILEDS = "GET_DOM_INPUT_FILEDS"
    # GET_DOM_ALL_CONTENTS = "GET_DOM_ALL_CONTENTS"
    # GET_CURRENT_URL = "GET_CURRENT_URL"
 28 | 
 29 | 
# Action model for clicking the element identified by ``mmid``.
# Comments are used instead of a class docstring so the generated schema
# description is not affected.
class ClickAction(BaseModel):
    # Discriminator: always ActionType.CLICK.
    type: Literal[ActionType.CLICK] = Field(
        description="""Executes a click action on the element matching the given mmid attribute value. MMID is always a number. Returns Success if click was successful or appropriate error message if the element could not be clicked."""
    )
    mmid: int = Field(
        description="The mmid number of the element that needs to be clicked e.g. 114. mmid will always be a number"
    )
    # NOTE(review): no default is given; whether this Optional field is
    # required depends on the pydantic version in use - confirm.
    wait_before_execution: Optional[float] = Field(
        description="Optional wait time in seconds before executing the click event logic"
    )
 40 | 
 41 | 
 42 | class TypeAction(BaseModel):
 43 |     type: Literal[ActionType.TYPE] = Field(
 44 |         description="""Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else.
 45 |    Returns Success if text entry was successful or appropriate error message if text could not be entered."""
 46 |     )
 47 |     mmid: int = Field(
 48 |         description="The mmid number of the element that needs to be clicked e.g. 114. mmid will always be a number"
 49 |     )
 50 |     content: str = Field(
 51 |         description="The text to enter in the element identified by the query_selector."
 52 |     )
 53 | 
 54 | 
# Schema of a navigation action: open `website` in the browser.
class GotoAction(BaseModel):
    # Pinned to ActionType.GOTO_URL; distinguishes this model inside the Action union.
    type: Literal[ActionType.GOTO_URL] = Field(
        description="Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened."
    )
    website: str = Field(
        description="The URL to navigate to. Value must include the protocol (http:// or https://)."
    )
    # Nullable; no default value is declared on the field.
    timeout: Optional[float] = Field(
        description="Additional wait time in seconds after initial load."
    )
 65 | 
 66 | 
# Schema of a combined action: enter text into one element, then click another
# (both addressed by mmid).
class EnterTextAndClickAction(BaseModel):
    # Pinned to ActionType.ENTER_TEXT_AND_CLICK; distinguishes this model inside the Action union.
    type: Literal[ActionType.ENTER_TEXT_AND_CLICK] = Field(
        description="""Enters text into a specified element and clicks another element, both identified by their mmid. Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands. Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered."""
    )
    text_element_mmid: int = Field(
        description="The mmid number of the element where the text will be entered"
    )
    text_to_enter: str = Field(
        description="The text that will be entered into the element specified by text_element_mmid"
    )
    click_element_mmid: int = Field(
        description="The mmid number of the element that will be clicked after text entry."
    )
    # Nullable; no default value is declared on the field.
    wait_before_click_execution: Optional[float] = Field(
        description="Optional wait time in seconds before executing the click event logic"
    )
 83 | 
 84 | 
# Schema of a captcha action: the solved captcha is entered into one element and
# another element is clicked to submit (both addressed by mmid).
class SolveCaptcha(BaseModel):
    # Pinned to ActionType.SOLVE_CAPTCHA; distinguishes this model inside the Action union.
    type: Literal[ActionType.SOLVE_CAPTCHA] = Field(
        description="""Solve captcha, enters the solve captcha into a specified element and clicks another element, both identified by their mmid. Ideal for captcha solving ,entering captcha and clicking submit.Successfully completes when all three actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered."""
    )
    text_element_mmid: int = Field(
        description="The mmid number of the element where the captcha will be entered"
    )

    click_element_mmid: int = Field(
        description="The mmid number of the element that will be clicked after the catcha entry to submit"
    )

    # Nullable; no default value is declared on the field.
    wait_before_click_execution: Optional[float] = Field(
        description="Optional wait time in seconds before executing the click event logic"
    )
100 | 
101 | 
# Binary evaluation outcome (0 = fail, 1 = pass); used as EvalAgentOutput.score.
class Score(IntEnum):
    FAIL = 0
    PASS = 1
105 | 
106 | 
107 | # class GetDomTextAction(BaseModel):
108 | #     type: Literal[ActionType.GET_DOM_TEXT_CONTENT]
109 | 
110 | 
111 | # class GetDomInputsAction(BaseModel):
112 | #     type: Literal[ActionType.GET_DOM_INPUT_FILEDS]
113 | 
114 | 
115 | # class GetDomAllAction(BaseModel):
116 | #     type: Literal[ActionType.GET_DOM_ALL_CONTENTS]
117 | 
118 | 
119 | # class GetCurrentUrlAction(BaseModel):
120 | #     type: Literal[ActionType.GET_CURRENT_URL]
121 | 
122 | 
# Union of every enabled action model. Each member carries a distinct
# `type: Literal[ActionType.X]` field, which tells the variants apart.
Action = Union[
    ClickAction,
    TypeAction,
    GotoAction,
    EnterTextAndClickAction,
    SolveCaptcha,
    # Disabled variants (their classes are commented out above):
    # GetDomTextAction,
    # GetDomInputsAction,
    # GetDomAllAction,
    # GetCurrentUrlAction,
]
134 | 
135 | 
# A single unit of work: an id, a description, and a nullable url and result.
class Task(BaseModel):
    id: int
    description: str
    url: Optional[str]  # nullable
    result: Optional[str]  # nullable
141 | 
142 | 
# Variant of Task that carries a list of concrete Action objects instead of a url.
class TaskWithActions(BaseModel):
    id: int
    description: str
    actions_to_be_performed: Optional[List[Action]]  # nullable list of Action models
    result: Optional[str]  # nullable
148 | 
149 | 
# Aggregate record holding the objective, a state value, plan/task bookkeeping
# and the final response. Task-typed fields accept either Task or
# TaskWithActions objects.
class Memory(BaseModel):
    objective: str
    current_state: State  # `State` is declared above this excerpt
    plan: Optional[Union[List[Task], List[TaskWithActions]]]
    thought: str
    completed_tasks: Optional[Union[List[Task], List[TaskWithActions]]]
    current_task: Optional[Union[Task, TaskWithActions]]
    final_response: Optional[str]
    current_tasks_for_eval: Optional[List[TaskWithActions]]
    sorted_tasks: Optional[List[TaskWithActions]]

    class Config:
        # pydantic option: populate enum-typed fields with the enum's value
        # rather than the enum member.
        use_enum_values = True
163 | 
164 | 
165 | # Planner
# Planner input payload: the objective, plus nullable completed tasks and a task for review.
class PlannerInput(BaseModel):
    objective: str
    completed_tasks: Optional[List[Task]]
    task_for_review: Optional[Task]
170 | 
171 | 
# Planner output payload: plan, thought, next task, completion flag and final response.
class PlannerOutput(BaseModel):
    plan: Optional[List[Task]]
    thought: str
    next_task: Optional[Task]
    is_complete: bool
    final_response: Optional[str]
178 | 
179 | 
180 | # Executor
# Browser-navigation input payload: wraps a single Task.
class BrowserNavInput(BaseModel):
    task: Task
183 | 
184 | 
# Browser-navigation output payload: wraps a single Task in `completed_task`.
class BrowserNavOutput(BaseModel):
    completed_task: Task
187 | 
188 | 
189 | # AgentQ
# AgentQ base input payload: objective, nullable completed Task list, and the
# current page URL and DOM as strings.
class AgentQBaseInput(BaseModel):
    objective: str
    completed_tasks: Optional[List[Task]]
    current_page_url: str
    current_page_dom: str
195 | 
196 | 
# AgentQ base output payload. Unlike PlannerOutput, `plan` is not nullable here
# and the next task's Action list is carried in `next_task_actions`.
class AgentQBaseOutput(BaseModel):
    thought: str
    plan: List[Task]
    next_task: Optional[Task]
    next_task_actions: Optional[List[Action]]
    is_complete: bool
    final_response: Optional[str]
204 | 
205 | 
206 | # Actor
# Actor input payload: same fields as AgentQBaseInput, but completed tasks are
# TaskWithActions objects.
class AgentQActorInput(BaseModel):
    objective: str
    completed_tasks: Optional[List[TaskWithActions]]
    current_page_url: str
    current_page_dom: str
212 | 
213 | 
# Actor output payload: a thought, a nullable list of proposed TaskWithActions,
# a completion flag and a nullable final response.
class AgentQActorOutput(BaseModel):
    thought: str
    proposed_tasks: Optional[List[TaskWithActions]]
    is_complete: bool
    final_response: Optional[str]
219 | 
220 | 
221 | # Critic
# Critic input payload: the actor-style context plus the non-nullable
# `tasks_for_eval` list of TaskWithActions.
class AgentQCriticInput(BaseModel):
    objective: str
    completed_tasks: Optional[List[TaskWithActions]]
    tasks_for_eval: List[TaskWithActions]
    current_page_url: str
    current_page_dom: str
228 | 
229 | 
# Critic output payload: a thought and a single TaskWithActions in `top_task`.
class AgentQCriticOutput(BaseModel):
    thought: str
    top_task: TaskWithActions
233 | 
234 | 
235 | # Vision
# Vision input payload: only the objective string.
class VisionInput(BaseModel):
    objective: str
238 | 
239 | 
# Vision output payload: a single boolean flag, `is_terminal`.
class VisionOutput(BaseModel):
    is_terminal: bool
242 | 
243 | 
# Eval-agent input payload: objective, the agent's output string, and the
# current page URL and DOM as strings.
class EvalAgentInput(BaseModel):
    objective: str
    agent_output: str
    current_page_url: str
    current_page_dom: str
249 | 
250 | 
# Eval-agent output payload: a Score (FAIL = 0 / PASS = 1).
class EvalAgentOutput(BaseModel):
    score: Score
253 | 
254 | 
# Captcha-agent input payload: only the objective string.
class CaptchaAgentInput(BaseModel):
    objective: str
257 | 
258 | 
# Captcha-agent output payload: a captcha string and a success flag.
class CaptchaAgentOutput(BaseModel):
    captcha: str
    success: bool
262 | 
263 | 
264 | # Monte-Carlo
# Browser state record: DOM and URL strings, the objective, and a nullable list
# of completed TaskWithActions.
class BrowserState(BaseModel):
    dom: str
    url: str
    objective: str
    completed_tasks: Optional[List[TaskWithActions]]
270 | 
271 | 
# A TaskWithActions paired with a numeric rank (higher is better, per the field description).
class BrowserAction(BaseModel):
    task_with_action: TaskWithActions
    rank: float = Field(description="The rank of this action, higher is better")
275 | 
276 | 
# State half of a DPO pair: the objective and a DOM string.
class DPOState(BaseModel):
    objective: str
    dom: str
280 | 
281 | 
# Action half of a DPO pair: a description plus one Action model.
class DPOAction(BaseModel):
    description: str
    action: Action
285 | 
286 | 
# One preference sample: a state with a winning and a losing DPOAction.
class DPOPair(BaseModel):
    state: DPOState
    winning_action: DPOAction
    losing_action: DPOAction
291 | 


--------------------------------------------------------------------------------
/agentq/core/prompts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/prompts/__init__.py


--------------------------------------------------------------------------------
/agentq/core/skills/__init__.py:
--------------------------------------------------------------------------------
 1 | from agentq.core.skills.click_using_selector import (
 2 |     click,
 3 |     do_click,
 4 |     is_element_present,
 5 |     perform_javascript_click,
 6 |     perform_playwright_click,
 7 | )
 8 | from agentq.core.skills.enter_text_and_click import enter_text_and_click
 9 | from agentq.core.skills.enter_text_using_selector import (
10 |     bulk_enter_text,
11 |     custom_fill_element,
12 |     do_entertext,
13 | )
14 | from agentq.core.skills.get_dom_with_content_type import get_dom_with_content_type
15 | from agentq.core.skills.get_url import geturl
16 | from agentq.core.skills.get_user_input import get_user_input
17 | from agentq.core.skills.open_url import openurl
18 | from agentq.core.skills.press_key_combination import press_key_combination
19 | from agentq.core.skills.solve_captcha import solve_captcha
20 | 
# Public API of the skills package.
# `__all__` must be a sequence of *strings* naming the exported attributes.
# Listing the function objects themselves makes
# `from agentq.core.skills import *` fail with a TypeError.
__all__ = (
    "click",
    "do_click",
    "is_element_present",
    "perform_javascript_click",
    "perform_playwright_click",
    "enter_text_and_click",
    "bulk_enter_text",
    "custom_fill_element",
    "do_entertext",
    "get_dom_with_content_type",
    "geturl",
    "get_user_input",
    "openurl",
    "press_key_combination",
    "solve_captcha",
)
38 | 


--------------------------------------------------------------------------------
/agentq/core/skills/click_using_selector.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | import traceback
  4 | from typing import Dict
  5 | 
  6 | from playwright.async_api import ElementHandle, Page
  7 | from playwright.async_api import TimeoutError as PlaywrightTimeoutError
  8 | from typing_extensions import Annotated
  9 | 
 10 | from agentq.core.web_driver.playwright import PlaywrightManager
 11 | from agentq.utils.dom_mutation_observer import (
 12 |     subscribe,  # type: ignore
 13 |     unsubscribe,  # type: ignore
 14 | )
 15 | from agentq.utils.logger import logger
 16 | 
 17 | 
 18 | async def click(
 19 |     selector: Annotated[
 20 |         str,
 21 |         "The properly formed query selector string to identify the element for the click action (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector. selector mmid will always be a number",
 22 |     ],
 23 |     wait_before_execution: Annotated[
 24 |         float,
 25 |         "Optional wait time in seconds before executing the click event logic.",
 26 |         float,
 27 |     ],
 28 | ) -> Annotated[str, "A message indicating success or failure of the click."]:
 29 |     """
 30 |     Executes a click action on the element matching the given query selector string within the currently open web page.
 31 |     If there is no page open, it will raise a ValueError. An optional wait time can be specified before executing the click logic. Use this to wait for the page to load especially when the last action caused the DOM/Page to load.
 32 | 
 33 |     Parameters:
 34 |     - selector: The query selector string to identify the element for the click action.
 35 |     - wait_before_execution: Optional wait time in seconds before executing the click event logic. Defaults to 0.0 seconds.
 36 | 
 37 |     Returns:
 38 |     - Success if the click was successful, Appropriate error message otherwise.
 39 |     """
 40 |     logger.info(f'Executing ClickElement with "{selector}" as the selector')
 41 | 
 42 |     # Initialize PlaywrightManager and get the active browser page
 43 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 44 |     page = await browser_manager.get_current_page()
 45 | 
 46 |     if page is None:
 47 |         raise ValueError("No active page found. OpenURL command opens a new page.")
 48 | 
 49 |     function_name = inspect.currentframe().f_code.co_name
 50 | 
 51 |     await browser_manager.take_screenshots(f"{function_name}_start", page)
 52 | 
 53 |     await browser_manager.highlight_element(selector, True)
 54 | 
 55 |     dom_changes_detected = None
 56 | 
 57 |     def detect_dom_changes(changes: str):
 58 |         nonlocal dom_changes_detected
 59 |         dom_changes_detected = changes
 60 | 
 61 |     subscribe(detect_dom_changes)
 62 | 
 63 |     # Wrap the click action and subsequent operations in a try-except block
 64 |     try:
 65 |         # Set up navigation expectation with a shorter timeout
 66 |         async with page.expect_navigation(wait_until="domcontentloaded", timeout=10000):
 67 |             result = await do_click(page, selector, wait_before_execution)
 68 | 
 69 |         # Wait for a short time to ensure the page has settled
 70 |         await asyncio.sleep(1)
 71 |     except PlaywrightTimeoutError:
 72 |         # If navigation times out, it might be a single-page app or a slow-loading page
 73 |         logger.warning(
 74 |             "Navigation timeout occurred, but the click might have been successful."
 75 |         )
 76 |         result = {
 77 |             "summary_message": "Click executed, but no full page navigation detected",
 78 |             "detailed_message": "Click executed successfully, but no full page navigation was detected. This might be normal for single-page applications or slow-loading pages.",
 79 |         }
 80 |     except Exception as e:
 81 |         logger.error(f"Error during click operation: {e}")
 82 |         result = {
 83 |             "summary_message": "Click executed, but encountered an error",
 84 |             "detailed_message": f"Click executed, but encountered an error: {str(e)}",
 85 |         }
 86 | 
 87 |     await asyncio.sleep(
 88 |         0.1
 89 |     )  # sleep for 100ms to allow the mutation observer to detect changes
 90 |     unsubscribe(detect_dom_changes)
 91 |     await browser_manager.take_screenshots(f"{function_name}_end", page)
 92 | 
 93 |     if dom_changes_detected:
 94 |         return f"Success: {result['summary_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action to click {selector} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
 95 |     return result["detailed_message"]
 96 | 
 97 | 
 98 | async def do_click(
 99 |     page: Page, selector: str, wait_before_execution: float
100 | ) -> Dict[str, str]:
101 |     """
102 |     Executes the click action on the element with the given selector within the provided page.
103 | 
104 |     Parameters:
105 |     - page: The Playwright page instance.
106 |     - selector: The query selector string to identify the element for the click action.
107 |     - wait_before_execution: Optional wait time in seconds before executing the click event logic.
108 | 
109 |     Returns:
110 |     Dict[str,str] - Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'.
111 |     """
112 |     logger.info(
113 |         f'Executing ClickElement with "{selector}" as the selector. Wait time before execution: {wait_before_execution} seconds.'
114 |     )
115 | 
116 |     # Wait before execution if specified
117 |     if wait_before_execution > 0:
118 |         await asyncio.sleep(wait_before_execution)
119 | 
120 |     # Wait for the selector to be present and ensure it's attached and visible. If timeout, try javascript click
121 |     try:
122 |         logger.info(
123 |             f'Executing ClickElement with "{selector}" as the selector. Waiting for the element to be attached and visible.'
124 |         )
125 | 
126 |         element = await asyncio.wait_for(
127 |             page.wait_for_selector(selector, state="attached", timeout=2000),
128 |             timeout=2000,
129 |         )
130 |         if element is None:
131 |             raise ValueError(f'Element with selector: "{selector}" not found')
132 | 
133 |         logger.info(
134 |             f'Element with selector: "{selector}" is attached. scrolling it into view if needed.'
135 |         )
136 |         try:
137 |             await element.scroll_into_view_if_needed(timeout=200)
138 |             logger.info(
139 |                 f'Element with selector: "{selector}" is attached and scrolled into view. Waiting for the element to be visible.'
140 |             )
141 |         except Exception:
142 |             # If scrollIntoView fails, just move on, not a big deal
143 |             pass
144 | 
145 |         try:
146 |             await element.wait_for_element_state("visible", timeout=200)
147 |             logger.info(
148 |                 f'Executing ClickElement with "{selector}" as the selector. Element is attached and visible. Clicking the element.'
149 |             )
150 |         except Exception:
151 |             # If the element is not visible, try to click it anyway
152 |             pass
153 | 
154 |         element_tag_name = await element.evaluate(
155 |             "element => element.tagName.toLowerCase()"
156 |         )
157 | 
158 |         if element_tag_name == "option":
159 |             element_value = await element.get_attribute(
160 |                 "value"
161 |             )  # get the text that is in the value of the option
162 |             parent_element = await element.evaluate_handle(
163 |                 "element => element.parentNode"
164 |             )
165 |             await parent_element.select_option(value=element_value)  # type: ignore
166 | 
167 |             logger.info(f'Select menu option "{element_value}" selected')
168 | 
169 |             return {
170 |                 "summary_message": f'Select menu option "{element_value}" selected',
171 |                 "detailed_message": f'Select menu option "{element_value}" selected.',
172 |             }
173 | 
174 |         msg = await perform_javascript_click(page, selector)
175 |         return {
176 |             "summary_message": msg,
177 |             "detailed_message": f"{msg} Click action completed, page may have navigated.",
178 |         }
179 |     except Exception as e:
180 |         logger.error(f'Unable to click element with selector: "{selector}". Error: {e}')
181 |         traceback.print_exc()
182 |         msg = f'Unable to click element with selector: "{selector}" since the selector is invalid.'
183 |         return {"summary_message": msg, "detailed_message": f"{msg}. Error: {e}"}
184 | 
185 | 
async def is_element_present(page: Page, selector: str) -> bool:
    """
    Reports whether the page currently contains an element matching the selector.

    Parameters:
    - page: The Playwright page instance to query.
    - selector: The query selector string identifying the element.

    Returns:
    - True when `page.query_selector` yields an element, False when it yields None.
    """
    return (await page.query_selector(selector)) is not None
199 | 
200 | 
async def perform_playwright_click(element: ElementHandle, selector: str):
    """
    Clicks the given element through Playwright's own click implementation.

    Parameters:
    - element: The Playwright ElementHandle to click.
    - selector: The query selector string of the element (used for logging only).

    Returns:
    - None
    """
    log_line = (
        f"Performing first Step: Playwright Click on element with selector: {selector}"
    )
    logger.info(log_line)
    # Non-forced click with a 200 ms timeout.
    await element.click(timeout=200, force=False)
216 | 
217 | 
async def perform_javascript_click(page: Page, selector: str):
    """
    Clicks the element matching the selector by evaluating JavaScript in the page.

    The script returns a "not found" message when nothing matches. For an
    <option> it sets the parent's value and dispatches a bubbling 'change'
    event. For any other element it calls `click()`, after stripping the
    `target` and `rel` attributes when the element is a link. If
    `aria-expanded` flips from 'false' to 'true', the returned message says
    that a menu appeared.

    Parameters:
    - page: The Playwright page instance.
    - selector: The query selector string of the element.

    Returns:
    - The message produced by the script, or an error description if evaluation raised.
    """
    click_script = """(selector) => {
        let element = document.querySelector(selector);

        if (!element) {
            console.log(`perform_javascript_click: Element with selector ${selector} not found`);
            return `perform_javascript_click: Element with selector ${selector} not found`;
        }

        if (element.tagName.toLowerCase() === "option") {
            let value = element.text;
            let parent = element.parentElement;

            parent.value = element.value; // Directly set the value if possible
            // Trigger change event if necessary
            let event = new Event('change', { bubbles: true });
            parent.dispatchEvent(event);

            console.log("Select menu option", value, "selected");
            return "Select menu option: "+ value+ " selected";
        }
        else {
            console.log("About to click selector", selector);
            // If the element is a link, make it open in the same tab
            if (element.tagName.toLowerCase() === "a") {
                element.target = "_self";
                // #TODO: Consider removing this in the future if it causes issues with intended new tab behavior
                element.removeAttribute('target');
                element.removeAttribute('rel');
            }
            let ariaExpandedBeforeClick = element.getAttribute('aria-expanded');
            element.click();
            let ariaExpandedAfterClick = element.getAttribute('aria-expanded');
            if (ariaExpandedBeforeClick === 'false' && ariaExpandedAfterClick === 'true') {
                return "Executed JavaScript Click on element with selector: "+selector +". Very important: As a consequence a menu has appeared where you may need to make further selection. Very important: Get all_fields DOM to complete the action.";
            }
            return "Executed JavaScript Click on element with selector: "+selector;
        }
    }"""
    try:
        logger.info(f"Executing JavaScript click on element with selector: {selector}")
        outcome: str = await page.evaluate(click_script, selector)
        logger.debug(f"Executed JavaScript Click on element with selector: {selector}")
    except Exception as exc:
        # Failures are reported through the return value, not raised.
        logger.error(
            f"Error executing JavaScript click on element with selector: {selector}. Error: {exc}"
        )
        traceback.print_exc()
        return f"Error executing JavaScript click: {str(exc)}"
    return outcome
278 | 


--------------------------------------------------------------------------------
/agentq/core/skills/enter_text_and_click.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | 
  4 | from typing_extensions import Annotated
  5 | 
  6 | from agentq.core.web_driver.playwright import PlaywrightManager
  7 | from agentq.core.skills.click_using_selector import do_click
  8 | from agentq.core.skills.enter_text_using_selector import do_entertext
  9 | from agentq.core.skills.press_key_combination import do_press_key_combination
 10 | from agentq.utils.logger import logger
 11 | 
 12 | 
 13 | async def enter_text_and_click(
 14 |     text_selector: Annotated[
 15 |         str,
 16 |         "The properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use mmid attribute. mmid will always be a number",
 17 |     ],
 18 |     text_to_enter: Annotated[
 19 |         str,
 20 |         "The text that will be entered into the element specified by text_selector.",
 21 |     ],
 22 |     click_selector: Annotated[
 23 |         str,
 24 |         "The properly formatted DOM selector query, for example [mmid='1234'], for the element that will be clicked after text entry. mmid will always be a number",
 25 |     ],
 26 |     wait_before_click_execution: Annotated[
 27 |         float, "Optional wait time in seconds before executing the click.", float
 28 |     ],
 29 | ) -> Annotated[
 30 |     str, "A message indicating success or failure of the text entry and click."
 31 | ]:
 32 |     """
 33 |     Enters text into an element and then clicks on another element.
 34 | 
 35 |     Parameters:
 36 |     - text_selector: The selector for the element to enter text into. It should be a properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use the mmid attribute.
 37 |     - text_to_enter: The text to enter into the element specified by text_selector.
 38 |     - click_selector: The selector for the element to click. It should be a properly formatted DOM selector query, for example [mmid='1234'].
 39 |     - wait_before_click_execution: Optional wait time in seconds before executing the click action. Default is 0.0.
 40 | 
 41 |     Returns:
 42 |     - A message indicating the success or failure of the text entry and click.
 43 | 
 44 |     Raises:
 45 |     - ValueError: If no active page is found. The OpenURL command opens a new page.
 46 | 
 47 |     Example usage:
 48 |     ```
 49 |     await enter_text_and_click("[mmid='1234']", "Hello, World!", "[mmid='5678']", wait_before_click_execution=1.5)
 50 |     ```
 51 |     """
 52 |     logger.info(
 53 |         f"Entering text '{text_to_enter}' into element with selector '{text_selector}' and then clicking element with selector '{click_selector}'."
 54 |     )
 55 | 
 56 |     # Initialize PlaywrightManager and get the active browser page
 57 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 58 |     page = await browser_manager.get_current_page()
 59 |     if page is None:  # type: ignore
 60 |         logger.error("No active page found")
 61 |         raise ValueError("No active page found. OpenURL command opens a new page.")
 62 | 
 63 |     await browser_manager.highlight_element(text_selector, True)
 64 | 
 65 |     function_name = inspect.currentframe().f_code.co_name  # type: ignore
 66 |     await browser_manager.take_screenshots(f"{function_name}_start", page)
 67 | 
 68 |     text_entry_result = await do_entertext(
 69 |         page, text_selector, text_to_enter, use_keyboard_fill=True
 70 |     )
 71 | 
 72 |     # await browser_manager.notify_user(text_entry_result["summary_message"])
 73 |     if not text_entry_result["summary_message"].startswith("Success"):
 74 |         await browser_manager.take_screenshots(f"{function_name}_end", page)
 75 |         return f"Failed to enter text '{text_to_enter}' into element with selector '{text_selector}'. Check that the selctor is valid."
 76 | 
 77 |     result = text_entry_result
 78 | 
 79 |     # if the text_selector is the same as the click_selector, press the Enter key instead of clicking
 80 |     if text_selector == click_selector:
 81 |         do_press_key_combination_result = await do_press_key_combination(
 82 |             browser_manager, page, "Enter"
 83 |         )
 84 |         if do_press_key_combination_result:
 85 |             result["detailed_message"] += (
 86 |                 f' Instead of click, pressed the Enter key successfully on element: "{click_selector}".'
 87 |             )
 88 |             # await browser_manager.notify_user(
 89 |             #     f'Pressed the Enter key successfully on element: "{click_selector}".',
 90 |             #     message_type=MessageType.ACTION,
 91 |             # )
 92 |         else:
 93 |             result["detailed_message"] += (
 94 |                 f' Clicking the same element after entering text in it, is of no value. Tried pressing the Enter key on element "{click_selector}" instead of click and failed.'
 95 |             )
 96 |             # await browser_manager.notify_user(
 97 |             #     'Failed to press the Enter key on element "{click_selector}".',
 98 |             #     message_type=MessageType.ACTION,
 99 |             # )
100 |     else:
101 |         await browser_manager.highlight_element(click_selector, True)
102 | 
103 |         do_click_result = await do_click(
104 |             page, click_selector, wait_before_click_execution
105 |         )
106 |         result["detailed_message"] += f' {do_click_result["detailed_message"]}'
107 |         # await browser_manager.notify_user(do_click_result["summary_message"])
108 | 
109 |     await asyncio.sleep(
110 |         0.1
111 |     )  # sleep for 100ms to allow the mutation observer to detect changes
112 | 
113 |     await browser_manager.take_screenshots(f"{function_name}_end", page)
114 | 
115 |     return result["detailed_message"]
116 | 


--------------------------------------------------------------------------------
/agentq/core/skills/enter_text_using_selector.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | import traceback
  4 | from dataclasses import dataclass
  5 | from typing import (
  6 |     Dict,
  7 |     List,  # noqa: UP035
  8 | )
  9 | 
 10 | from playwright.async_api import Page
 11 | from typing_extensions import Annotated
 12 | 
 13 | from agentq.core.web_driver.playwright import PlaywrightManager
 14 | from agentq.core.skills.press_key_combination import press_key_combination
 15 | from agentq.utils.dom_helper import get_element_outer_html
 16 | from agentq.utils.dom_mutation_observer import subscribe, unsubscribe
 17 | from agentq.utils.logger import logger
 18 | 
 19 | 
 20 | @dataclass
 21 | class EnterTextEntry:
 22 |     """
 23 |     Represents an entry for text input.
 24 | 
 25 |     Attributes:
 26 |         query_selector (str): A valid DOM selector query. Use the mmid attribute.
 27 |         text (str): The text to enter in the element identified by the query_selector.
 28 |     """
 29 | 
 30 |     query_selector: str
 31 |     text: str
 32 | 
 33 |     def __getitem__(self, key: str) -> str:
 34 |         if key == "query_selector":
 35 |             return self.query_selector
 36 |         elif key == "text":
 37 |             return self.text
 38 |         else:
 39 |             raise KeyError(f"{key} is not a valid key")
 40 | 
 41 | 
 42 | async def custom_fill_element(page: Page, selector: str, text_to_enter: str):
 43 |     """
 44 |     Sets the value of a DOM element to a specified text without triggering keyboard input events.
 45 | 
 46 |     This function directly sets the 'value' property of a DOM element identified by the given CSS selector,
 47 |     effectively changing its current value to the specified text. This approach bypasses the need for
 48 |     simulating keyboard typing, providing a more efficient and reliable way to fill in text fields,
 49 |     especially in automated testing scenarios where speed and accuracy are paramount.
 50 | 
 51 |     Args:
 52 |         page (Page): The Playwright Page object representing the browser tab in which the operation will be performed.
 53 |         selector (str): The CSS selector string used to locate the target DOM element. The function will apply the
 54 |                         text change to the first element that matches this selector.
 55 |         text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten.
 56 | 
 57 |     Example:
 58 |         await custom_fill_element(page, '#username', 'test_user')
 59 | 
 60 |     Note:
 61 |         This function does not trigger input-related events (like 'input' or 'change'). If application logic
 62 |         relies on these events being fired, additional steps may be needed to simulate them.
 63 |     """
 64 |     selector = f"{selector}"  # Ensures the selector is treated as a string
 65 |     try:
 66 |         result = await page.evaluate(
 67 |             """(inputParams) => {
 68 |             const selector = inputParams.selector;
 69 |             let text_to_enter = inputParams.text_to_enter;
 70 |             text_to_enter = text_to_enter.trim();
 71 |             const element = document.querySelector(selector);
 72 |             if (!element) {
 73 |                 throw new Error(`Element not found: ${selector}`);
 74 |             }
 75 |             element.value = text_to_enter;
 76 |             return `Value set for ${selector}`;
 77 |         }""",
 78 |             {"selector": selector, "text_to_enter": text_to_enter},
 79 |         )
 80 |         logger.debug(f"custom_fill_element result: {result}")
 81 |     except Exception as e:
 82 |         logger.error(f"Error in custom_fill_element: {str(e)}")
 83 |         logger.error(f"Selector: {selector}, Text: {text_to_enter}")
 84 |         raise
 85 | 
 86 | 
 87 | async def entertext(
 88 |     entry: Annotated[
 89 |         EnterTextEntry,
 90 |         "An object containing 'query_selector' (DOM selector query using mmid attribute e.g. [mmid='114']) and 'text' (text to enter on the element). mmid will always be a number",
 91 |     ],
 92 | ) -> Annotated[str, "Explanation of the outcome of this operation."]:
 93 |     """
 94 |     Enters text into a DOM element identified by a CSS selector.
 95 | 
 96 |     This function enters the specified text into a DOM element identified by the given CSS selector.
 97 |     It uses the Playwright library to interact with the browser and perform the text entry operation.
 98 |     The function supports both direct setting of the 'value' property and simulating keyboard typing.
 99 | 
100 |     Args:
101 |         entry (EnterTextEntry): An object containing 'query_selector' (DOM selector query using mmid attribute)
102 |                                 and 'text' (text to enter on the element).
103 | 
104 |     Returns:
105 |         str: Explanation of the outcome of this operation.
106 | 
107 |     Example:
108 |         entry = EnterTextEntry(query_selector='#username', text='test_user')
109 |         result = await entertext(entry)
110 | 
111 |     Note:
112 |         - The 'query_selector' should be a valid CSS selector that uniquely identifies the target element.
113 |         - The 'text' parameter specifies the text to be entered into the element.
114 |         - The function uses the PlaywrightManager to manage the browser instance.
115 |         - If no active page is found, an error message is returned.
116 |         - The function internally calls the 'do_entertext' function to perform the text entry operation.
117 |         - The 'do_entertext' function applies a pulsating border effect to the target element during the operation.
118 |         - The function first clears any existing text in the input field before entering the new text.
119 |         - The 'use_keyboard_fill' parameter in 'do_entertext' determines whether to simulate keyboard typing or not.
120 |         - If 'use_keyboard_fill' is set to True, the function uses the 'page.keyboard.type' method to enter the text.
121 |         - If 'use_keyboard_fill' is set to False, the function uses the 'custom_fill_element' method to enter the text.
122 |     """
123 |     logger.info(f"Entering text: {entry}")
124 | 
125 |     if isinstance(entry, Dict):
126 |         query_selector: str = entry["query_selector"]
127 |         text_to_enter: str = entry["text"]
128 |     elif isinstance(entry, EnterTextEntry):
129 |         query_selector: str = entry.query_selector
130 |         text_to_enter: str = entry.text
131 |     else:
132 |         raise ValueError(
133 |             "Invalid input type for 'entry'. Expected EnterTextEntry or dict."
134 |         )
135 | 
136 |     if not isinstance(query_selector, str) or not isinstance(text_to_enter, str):
137 |         raise ValueError("query_selector and text must be strings")
138 | 
139 |     # logger.info(
140 |     #     f"######### Debug: query_selector={query_selector}, text_to_enter={text_to_enter}"
141 |     # )
142 | 
143 |     # Create and use the PlaywrightManager
144 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
145 |     page = await browser_manager.get_current_page()
146 |     if page is None:  # type: ignore
147 |         return "Error: No active page found. OpenURL command opens a new page."
148 | 
149 |     function_name = inspect.currentframe().f_code.co_name  # type: ignore
150 | 
151 |     await browser_manager.take_screenshots(f"{function_name}_start", page)
152 | 
153 |     await browser_manager.highlight_element(query_selector, True)
154 | 
155 |     dom_changes_detected = None
156 | 
157 |     def detect_dom_changes(changes: str):  # type: ignore
158 |         nonlocal dom_changes_detected
159 |         dom_changes_detected = changes  # type: ignore
160 | 
161 |     subscribe(detect_dom_changes)
162 | 
163 |     # Clear existing text before entering new text
164 |     # await page.evaluate(f"document.querySelector('{query_selector}').value = '';")
165 |     # logger.info(
166 |     #     f"######### About to page.evaluate: selector={query_selector}, text={text_to_enter}"
167 |     # )
168 |     await page.evaluate(
169 |         """
170 |         (selector) => {
171 |             const element = document.querySelector(selector);
172 |             if (element) {
173 |                 element.value = '';
174 |             } else {
175 |                 console.error('Element not found:', selector);
176 |             }
177 |         }
178 |         """,
179 |         query_selector,
180 |     )
181 |     # logger.info(
182 |     #     f"######### About to call do_entertext with: selector={query_selector}, text={text_to_enter}"
183 |     # )
184 |     result = await do_entertext(page, query_selector, text_to_enter)
185 |     # logger.info(f"#########do_entertext returned: {result}")
186 |     await asyncio.sleep(
187 |         0.1
188 |     )  # sleep for 100ms to allow the mutation observer to detect changes
189 |     unsubscribe(detect_dom_changes)
190 | 
191 |     await browser_manager.take_screenshots(f"{function_name}_end", page)
192 | 
193 |     if dom_changes_detected:
194 |         return f"{result['detailed_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action of entering text {text_to_enter} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
195 |     return result["detailed_message"]
196 | 
197 | 
async def do_entertext(
    page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool = True
):
    """
    Performs the text entry operation on a DOM element.

    The element is looked up with ``page.query_selector``. Depending on
    'use_keyboard_fill', the text is either typed through the keyboard or assigned
    directly with 'custom_fill_element'. Any exception is caught, its traceback printed,
    and reported through the returned dictionary instead of being raised.

    Args:
        page (Page): The Playwright Page object representing the browser tab in which the operation will be performed.
        selector (str): The CSS selector string used to locate the target DOM element.
        text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten.
        use_keyboard_fill (bool, optional): Determines whether to simulate keyboard typing or not.
                                            Defaults to True.

    Returns:
        Dict[str, str]: Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'.

    Example:
        result = await do_entertext(page, '#username', 'test_user')

    Note:
        - With 'use_keyboard_fill' set to True the element is focused, its content is selected
          with Control+A and removed with Backspace, and the text is typed with
          'page.keyboard.type' using a 1 ms delay between key presses.
        - With 'use_keyboard_fill' set to False the 'custom_fill_element' function is used.
    """
    try:
        target = await page.query_selector(selector)
        if target is None:
            not_found = f"Error: Selector {selector} not found. Unable to continue."
            return {"summary_message": not_found, "detailed_message": not_found}

        # Captured before typing, so it reflects the element as it was found.
        outer_html = await get_element_outer_html(target, page)

        if not use_keyboard_fill:
            await custom_fill_element(page, selector, text_to_enter)
        else:
            await target.focus()
            await asyncio.sleep(0.1)
            # Select everything, then delete it, pausing briefly after each key press.
            for key_combo in ("Control+A", "Backspace"):
                await press_key_combination(key_combo)
                await asyncio.sleep(0.1)
            logger.debug(f"Focused element with selector {selector} to enter text")
            await page.keyboard.type(text_to_enter, delay=1)

        await target.focus()
        success_msg = f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}'
        logger.info(success_msg)
        return {
            "summary_message": success_msg,
            "detailed_message": f"{success_msg} and outer HTML: {outer_html}.",
        }

    except Exception as exc:
        traceback.print_exc()
        failure = f"Error entering text in selector {selector}."
        return {
            "summary_message": failure,
            "detailed_message": f"{failure} Error: {exc}",
        }
263 | 
264 | 
async def bulk_enter_text(
    entries: Annotated[
        List[Dict[str, str]],
        "List of objects, each containing 'query_selector' and 'text'.",
    ],  # noqa: UP006
) -> Annotated[
    List[Dict[str, str]],
    "List of dictionaries, each containing 'query_selector' and the result of the operation.",
]:  # noqa: UP006
    """
    Enters text into several DOM elements, one after another.

    Each entry is wrapped in an EnterTextEntry and passed to 'entertext'. The entries are
    processed sequentially, in the order given.

    Args:
        entries: List of objects, each containing 'query_selector' and 'text'.

    Returns:
        List of dictionaries, one per entry and in the same order, each containing the
        'query_selector' and the 'result' string returned by 'entertext'.

    Example:
        entries = [
            {"query_selector": "#username", "text": "test_user"},
            {"query_selector": "#password", "text": "test_password"}
        ]
        results = await bulk_enter_text(entries)

    Note:
        - An entry missing the 'query_selector' or 'text' key raises KeyError.
    """

    logger.info("Executing bulk Enter Text Command")
    outcomes: List[Dict[str, str]] = []  # noqa: UP006
    for item in entries:
        selector, text = item["query_selector"], item["text"]
        logger.info(f"Entering text: {text} in element with selector: {selector}")
        outcome = await entertext(EnterTextEntry(query_selector=selector, text=text))
        outcomes.append({"query_selector": selector, "result": outcome})

    return outcomes
314 | 


--------------------------------------------------------------------------------
/agentq/core/skills/get_dom_with_content_type.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | from typing import Any, Dict, Optional, Union
  4 | 
  5 | from playwright.async_api import Page
  6 | from typing_extensions import Annotated
  7 | 
  8 | from agentq.config.config import SOURCE_LOG_FOLDER_PATH
  9 | from agentq.core.web_driver.playwright import PlaywrightManager
 10 | from agentq.utils.dom_helper import wait_for_non_loading_dom_state
 11 | from agentq.utils.get_detailed_accessibility_tree import do_get_accessibility_info
 12 | from agentq.utils.logger import logger
 13 | 
 14 | 
 15 | async def get_dom_with_content_type(
 16 |     content_type: Annotated[
 17 |         str,
 18 |         "The type of content to extract: 'text_only': Extracts the innerText of the highest element in the document and responds with text, or 'input_fields': Extracts the text input and button elements in the dom.",
 19 |     ],
 20 |     webpage: Optional[Page] = None,
 21 | ) -> Annotated[
 22 |     Union[Dict[str, Any], str, None],
 23 |     "The output based on the specified content type.",
 24 | ]:
 25 |     """
 26 |     Retrieves and processes the DOM of the active page in a browser instance based on the specified content type.
 27 | 
 28 |     Parameters
 29 |     ----------
 30 |     content_type : str
 31 |         The type of content to extract. Possible values are:
 32 |         - 'text_only': Extracts the innerText of the highest element in the document and responds with text.
 33 |         - 'input_fields': Extracts the text input and button elements in the DOM and responds with a JSON object.
 34 |         - 'all_fields': Extracts all the fields in the DOM and responds with a JSON object.
 35 | 
 36 |     Returns
 37 |     -------
 38 |     Dict[str, Any] | str | None
 39 |         The processed content based on the specified content type. This could be:
 40 |         - A JSON object for 'input_fields' with just inputs.
 41 |         - Plain text for 'text_only'.
 42 |         - A minified DOM represented as a JSON object for 'all_fields'.
 43 | 
 44 |     Raises
 45 |     ------
 46 |     ValueError
 47 |         If an unsupported content_type is provided.
 48 |     """
 49 | 
 50 |     logger.info(f"Executing Get DOM Command based on content_type: {content_type}")
 51 |     start_time = time.time()
 52 |     # Create and use the PlaywrightManager
 53 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 54 | 
 55 |     if webpage is not None:
 56 |         page = webpage
 57 |     else:
 58 |         page = await browser_manager.get_current_page()
 59 | 
 60 |     if page is None:  # type: ignore
 61 |         raise ValueError("No active page found. OpenURL command opens a new page.")
 62 | 
 63 |     extracted_data = None
 64 |     await wait_for_non_loading_dom_state(
 65 |         page, 5000
 66 |     )  # wait for the DOM to be ready, non loading means external resources do not need to be loaded
 67 |     user_success_message = ""
 68 |     if content_type == "all_fields":
 69 |         user_success_message = "Fetched all the fields in the DOM"
 70 |         extracted_data = await do_get_accessibility_info(page, only_input_fields=False)
 71 |     elif content_type == "input_fields":
 72 |         logger.debug("Fetching DOM for input_fields")
 73 |         extracted_data = await do_get_accessibility_info(page, only_input_fields=True)
 74 |         if extracted_data is None:
 75 |             return "Could not fetch input fields. Please consider trying with content_type all_fields."
 76 |         user_success_message = "Fetched only input fields in the DOM"
 77 |     elif content_type == "text_only":
 78 |         # Extract text from the body or the highest-level element
 79 |         logger.debug("Fetching DOM for text_only")
 80 |         text_content = await get_filtered_text_content(page)
 81 |         with open(
 82 |             os.path.join(SOURCE_LOG_FOLDER_PATH, "text_only_dom.txt"),
 83 |             "w",
 84 |             encoding="utf-8",
 85 |         ) as f:
 86 |             f.write(text_content)
 87 |         extracted_data = text_content
 88 |         user_success_message = "Fetched the text content of the DOM"
 89 |     else:
 90 |         raise ValueError(f"Unsupported content_type: {content_type}")
 91 | 
 92 |     elapsed_time = time.time() - start_time
 93 |     logger.info(f"Get DOM Command executed in {elapsed_time} seconds")
 94 |     # await browser_manager.notify_user(
 95 |     #     user_success_message, message_type=MessageType.ACTION
 96 |     # )
 97 |     return extracted_data  # type: ignore
 98 | 
 99 | 
async def get_filtered_text_content(page: Page) -> str:
    """
    Returns the visible text of the page followed by the alt texts of its images.

    The in-page script temporarily hides the elements matching '#agente-overlay' so their
    text is left out of ``innerText``, restores their original visibility afterwards, and
    appends the alt texts of all <img> elements to the collected text.

    Args:
        page (Page): The Playwright Page object in which the script is evaluated.

    Returns:
        str: The page text, a space, and the "Other Alt Texts in the page: ..." suffix.
    """
    extraction_script = """
        () => {
            // Array of query selectors to filter out
            const selectorsToFilter = ['#agente-overlay'];

            // Store the original visibility values to revert later
            const originalStyles = [];

            // Hide the elements matching the query selectors
            selectorsToFilter.forEach(selector => {
                const elements = document.querySelectorAll(selector);
                elements.forEach(element => {
                    originalStyles.push({ element: element, originalStyle: element.style.visibility });
                    element.style.visibility = 'hidden';
                });
            });

            // Get the text content of the page
            let textContent = document?.body?.innerText || document?.documentElement?.innerText || "";

            // Get all the alt text from images on the page
            let altTexts = Array.from(document.querySelectorAll('img')).map(img => img.alt);
            altTexts="Other Alt Texts in the page: " + altTexts.join(' ');

            // Revert the visibility changes
            originalStyles.forEach(entry => {
                entry.element.style.visibility = entry.originalStyle;
            });
            textContent=textContent+" "+altTexts;
            return textContent;
        }
    """
    return await page.evaluate(extraction_script)
134 | 


--------------------------------------------------------------------------------
/agentq/core/skills/get_screenshot.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | 
 3 | from typing_extensions import Annotated, Optional
 4 | 
 5 | from agentq.core.web_driver.playwright import PlaywrightManager
 6 | from agentq.utils.logger import logger
 7 | from playwright.async_api import Page
 8 | 
 9 | 
async def get_screenshot(
    webpage: Optional[Page] = None,
) -> Annotated[
    str, "Returns a base64 encoded screenshot of the current active web page."
]:
    """
    Captures and returns a base64 encoded screenshot of the current page (only the visible viewport and not the full page)

    Parameters:
    - webpage: Page to capture. When omitted, the current page of the PlaywrightManager is used.

    Returns:
    - A data URL ("data:image/png;base64,...") containing the base64 encoded screenshot image.

    Raises:
    - ValueError: If no page is available or the capture fails; the underlying error is
      attached as the exception's cause.
    """

    try:
        # Create and use the PlaywrightManager
        browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
        if webpage is not None:
            page = webpage
        else:
            page = await browser_manager.get_current_page()
        # f-string so the actual page object is logged rather than the literal "{page}"
        logger.info(f"page {page}")

        if not page:
            logger.info("No active page found. OpenURL command opens a new page.")
            raise ValueError("No active page found. OpenURL command opens a new page.")

        await page.wait_for_load_state("domcontentloaded")

        # Capture the screenshot
        logger.info("about to capture")
        screenshot_bytes = await page.screenshot(full_page=False)

        # Encode the screenshot as base64
        base64_screenshot = base64.b64encode(screenshot_bytes).decode("utf-8")

        return f"data:image/png;base64,{base64_screenshot}"

    except Exception as e:
        # Every failure (including the "no active page" error above) is reported uniformly.
        raise ValueError(
            "Failed to capture screenshot. Make sure a page is open and accessible."
        ) from e
52 | 


--------------------------------------------------------------------------------
/agentq/core/skills/get_url.py:
--------------------------------------------------------------------------------
 1 | from playwright.async_api import Page
 2 | from typing_extensions import Annotated, Optional
 3 | 
 4 | from agentq.core.web_driver.playwright import PlaywrightManager
 5 | 
 6 | 
 7 | async def geturl(
 8 |     webpage: Optional[Page] = None,
 9 | ) -> Annotated[str, "Returns the full URL of the current active web site/page."]:
10 |     """
11 |     Returns the full URL of the current page
12 | 
13 |     Parameters:
14 | 
15 |     Returns:
16 |     - Full URL the browser's active page.
17 |     """
18 | 
19 |     try:
20 |         # Create and use the PlaywrightManager
21 |         browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
22 |         if webpage is not None:
23 |             page = webpage
24 |         else:
25 |             page = await browser_manager.get_current_page()
26 | 
27 |         if not page:
28 |             raise ValueError("No active page found. OpenURL command opens a new page.")
29 | 
30 |         await page.wait_for_load_state("domcontentloaded")
31 | 
32 |         # Get the URL of the current page
33 |         try:
34 |             title = await page.title()
35 |             current_url = page.url
36 |             if len(current_url) > 250:
37 |                 current_url = current_url[:250] + "..."
38 |             return f"Current Page: {current_url}, Title: {title}"  # type: ignore
39 |         except:  # noqa: E722
40 |             current_url = page.url
41 |             return f"Current Page: {current_url}"
42 | 
43 |     except Exception as e:
44 |         raise ValueError(
45 |             "No active page found. OpenURL command opens a new page."
46 |         ) from e
47 | 


--------------------------------------------------------------------------------
/agentq/core/skills/get_user_input.py:
--------------------------------------------------------------------------------
 1 | from typing import (
 2 |     Dict,
 3 |     List,  # noqa: UP035,
 4 | )
 5 | 
 6 | from typing_extensions import Annotated
 7 | 
 8 | from agentq.core.web_driver.playwright import PlaywrightManager
 9 | from agentq.utils.cli_helper import answer_questions_over_cli
10 | 
11 | 
async def get_user_input(
    questions: Annotated[
        List[str], "List of questions to ask the user each one represented as a string"
    ],
) -> Dict[str, str]:  # noqa: UP006
    """
    Asks the user a list of questions and returns the answers in a dictionary.

    When the PlaywrightManager has a UI manager, each question is asked through
    'prompt_user', one at a time; otherwise all questions are delegated to
    'answer_questions_over_cli'.

    Parameters:
    - questions: A list of questions to ask the user ["What is Username?", "What is your password?"].

    Returns:
    - The answers. On the UI path this is a dictionary mapping each question to its answer;
      on the CLI path it is whatever 'answer_questions_over_cli' returns.
    """

    browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
    if not browser_manager.ui_manager:
        return await answer_questions_over_cli(questions)

    replies: Dict[str, str] = {}
    for prompt in questions:
        replies[prompt] = await browser_manager.prompt_user(f"Question: {prompt}")
    return replies
37 | 


--------------------------------------------------------------------------------
/agentq/core/skills/open_url.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | 
  4 | from playwright.async_api import TimeoutError as PlaywrightTimeoutError
  5 | from typing_extensions import Annotated
  6 | 
  7 | from agentq.core.web_driver.playwright import PlaywrightManager
  8 | from agentq.utils.logger import logger
  9 | 
 10 | 
 11 | async def openurl(
 12 |     url: Annotated[
 13 |         str,
 14 |         "The URL to navigate to. Value must include the protocol (http:// or https://).",
 15 |     ],
 16 |     timeout: Annotated[int, "Additional wait time in seconds after initial load."],
 17 |     max_retries: Annotated[int, "Maximum number of retry attempts"] = 3,
 18 | ) -> Annotated[str, "Returns the result of this request in text form"]:
 19 |     """
 20 |     Opens a specified URL in the active browser instance. Waits for an initial load event, then waits for either
 21 |     the 'domcontentloaded' event or a configurable timeout, whichever comes first.
 22 | 
 23 |     Parameters:
 24 |     - url: The URL to navigate to.
 25 |     - timeout: Additional time in seconds to wait after the initial load before considering the navigation successful.
 26 |     - max_retries: Maximum number of retry attempts (default: 3).
 27 | 
 28 |     Returns:
 29 |     - URL of the new page.
 30 |     """
 31 |     logger.info(f"Opening URL: {url}")
 32 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 33 |     await browser_manager.get_browser_context()
 34 |     page = await browser_manager.get_current_page()
 35 |     # Navigate to the URL with a short timeout to ensure the initial load starts
 36 |     function_name = inspect.currentframe().f_code.co_name  # type: ignore
 37 |     url = ensure_protocol(url)
 38 | 
 39 |     for attempt in range(max_retries):
 40 |         try:
 41 |             await browser_manager.take_screenshots(f"{function_name}_start", page)
 42 | 
 43 |             # set extra headers for bypassing ngrok
 44 |             await page.set_extra_http_headers({"User-Agent": "AgentQ-Sentient"})
 45 | 
 46 |             # Use a longer timeout for navigation
 47 |             await page.goto(
 48 |                 url, timeout=max(30000, timeout * 1000), wait_until="domcontentloaded"
 49 |             )
 50 | 
 51 |             # Wait for network idle to ensure page is fully loaded
 52 |             await page.wait_for_load_state(
 53 |                 "networkidle", timeout=max(30000, timeout * 1000)
 54 |             )
 55 | 
 56 |             await browser_manager.take_screenshots(f"{function_name}_end", page)
 57 | 
 58 |             title = await page.title()
 59 |             final_url = page.url
 60 |             logger.info(f"Successfully loaded page: {final_url}")
 61 |             return f"Page loaded: {final_url}, Title: {title}"
 62 | 
 63 |         except PlaywrightTimeoutError as e:
 64 |             logger.warning(f"Timeout error on attempt {attempt + 1}: {e}")
 65 |             if attempt == max_retries - 1:
 66 |                 logger.error(f"Failed to load {url} after {max_retries} attempts")
 67 |                 return f"Failed to load page: {url}. Error: Timeout after {max_retries} attempts"
 68 |             await asyncio.sleep(2)  # Wait before retrying
 69 | 
 70 |         except Exception as e:
 71 |             logger.error(f"Error navigating to {url}: {e}")
 72 |             return f"Failed to load page: {url}. Error: {str(e)}"
 73 | 
 74 |     await browser_manager.take_screenshots(f"{function_name}_end", page)
 75 | 
 76 |     # await browser_manager.notify_user(
 77 |     #     f"Opened URL: {url}", message_type=MessageType.ACTION
 78 |     # )
 79 |     # Get the page title
 80 |     title = await page.title()
 81 |     url = page.url
 82 |     return f"Page loaded: {url}, Title: {title}"  # type: ignore
 83 | 
 84 | 
 85 | def ensure_protocol(url: str) -> str:
 86 |     """
 87 |     Ensures that a URL has a protocol (http:// or https://). If it doesn't have one,
 88 |     https:// is added by default.
 89 | 
 90 |     Parameters:
 91 |     - url: The URL to check and modify if necessary.
 92 | 
 93 |     Returns:
 94 |     - A URL string with a protocol.
 95 |     """
 96 |     if not url.startswith(("http://", "https://")):
 97 |         url = "https://" + url  # Default to http if no protocol is specified
 98 |         logger.info(
 99 |             f"Added 'https://' protocol to URL because it was missing. New URL is: {url}"
100 |         )
101 |     return url
102 | 


--------------------------------------------------------------------------------
/agentq/core/skills/pdf_text_extractor.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import httpx
  4 | import pdfplumber
  5 | from typing_extensions import Annotated
  6 | 
  7 | from agentq.config.config import PROJECT_TEMP_PATH
  8 | from agentq.core.web_driver.playwright import PlaywrightManager
  9 | from agentq.utils.logger import logger
 10 | from agentq.utils.message_type import MessageType
 11 | 
 12 | 
 13 | async def extract_text_from_pdf(
 14 |     pdf_url: Annotated[str, "The URL of the PDF file to extract text from."],
 15 | ) -> Annotated[str, "All the text found in the PDF file."]:
 16 |     """
 17 |     Extract text from a PDF file.
 18 |     pdf_url: str - The URL of the PDF file to extract text from.
 19 |     returns: str - All the text found in the PDF.
 20 |     """
 21 |     file_path = os.path.join(
 22 |         PROJECT_TEMP_PATH, "downloaded_file.pdf"
 23 |     )  # fixed file path for downloading the PDF
 24 | 
 25 |     try:
 26 |         # Create and use the PlaywrightManager
 27 |         browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 28 | 
 29 |         # Download the PDF
 30 |         download_result = await download_pdf(pdf_url, file_path)
 31 |         if not os.path.exists(download_result):
 32 |             return download_result  # Return error message if download failed
 33 | 
 34 |         # Open the PDF using pdfplumber and extract text
 35 |         text = ""
 36 |         with pdfplumber.open(download_result) as pdf:
 37 |             for page in pdf.pages:
 38 |                 page_text = page.extract_text()
 39 |                 if page_text:
 40 |                     text += page_text + "\n"
 41 |         extracted_text = text.strip()
 42 |         word_count = len(extracted_text.split())
 43 |         await browser_manager.notify_user(
 44 |             f"Extracted text from the PDF successfully. Found {word_count} words.",
 45 |             message_type=MessageType.ACTION,
 46 |         )
 47 |         return "Text found in the PDF:\n" + extracted_text
 48 |     except httpx.HTTPStatusError as e:
 49 |         logger.error(
 50 |             f"An error occurred while downloading the PDF from {pdf_url}: {str(e)}"
 51 |         )
 52 |         return f"An error occurred while downloading the PDF: {str(e)}"
 53 |     except Exception as e:
 54 |         logger.error(
 55 |             f"An error occurred while extracting text from the PDF that was downloaded from {pdf_url}: {str(e)}"
 56 |         )
 57 |         return f"An error occurred while extracting text: {str(e)}"
 58 |     finally:
 59 |         # Cleanup: Ensure the downloaded file is removed
 60 |         cleanup_temp_files(file_path)
 61 | 
 62 | 
 63 | def cleanup_temp_files(*file_paths: str) -> None:
 64 |     """
 65 |     Remove the specified temporary files.
 66 | 
 67 |     *file_paths: str - One or more file paths to be removed.
 68 |     """
 69 |     for file_path in file_paths:
 70 |         if os.path.exists(file_path):
 71 |             try:
 72 |                 os.remove(file_path)
 73 |                 logger.debug(f"Cleaned file from the filesystem: {file_path}")
 74 |             except Exception as e:
 75 |                 logger.error(f"Failed to remove {file_path}: {str(e)}")
 76 |         else:
 77 |             logger.debug(
 78 |                 f"File not found. Unable to clean it from the filesystem: {file_path}"
 79 |             )
 80 | 
 81 | 
 82 | async def download_pdf(pdf_url: str, file_path: str) -> str:
 83 |     """
 84 |     Download the PDF file from the given URL and save it to the specified path.
 85 | 
 86 |     pdf_url: str - The URL of the PDF file to download.
 87 |     file_path: str - The local path to save the downloaded PDF.
 88 | 
 89 |     returns: str - The file path of the downloaded PDF if successful, otherwise an error message.
 90 |     raises: Exception - If an error occurs during the download process.
 91 |     """
 92 |     try:
 93 |         logger.info(f"Downloading PDF from: {pdf_url} to: {file_path}")
 94 |         async with httpx.AsyncClient() as client:
 95 |             response = await client.get(pdf_url)
 96 |             response.raise_for_status()  # Ensure the request was successful
 97 |         with open(file_path, "wb") as pdf_file:
 98 |             pdf_file.write(response.content)
 99 |         return file_path
100 |     # except httpx.HTTPStatusError as e:
101 |     #     raise e
102 |     except Exception as e:
103 |         raise e
104 | 


--------------------------------------------------------------------------------
/agentq/core/skills/press_key_combination.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | 
  4 | from playwright.async_api import Page  # type: ignore
  5 | from typing_extensions import Annotated
  6 | 
  7 | from agentq.core.web_driver.playwright import PlaywrightManager
  8 | from agentq.utils.dom_mutation_observer import (
  9 |     subscribe,  # type: ignore
 10 |     unsubscribe,  # type: ignore
 11 | )
 12 | from agentq.utils.logger import logger
 13 | 
 14 | 
 15 | async def press_key_combination(
 16 |     key_combination: Annotated[str, "The key to press, e.g., Enter, PageDown etc"],
 17 | ) -> str:
 18 |     """
 19 |     Presses a key combination on the current active page managed by PlaywrightManager.
 20 | 
 21 |     This function simulates the pressing of a key or a combination of keys on the current active web page.
 22 |     The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination.
 23 |     For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows.
 24 | 
 25 |     Parameters:
 26 |     - key_combination (Annotated[str, "The key combination to press, e.g., 'Control+C'."]): The key combination to press, represented as a string. For combinations, use '+' as a separator.
 27 | 
 28 |     Raises:
 29 |     - ValueError: If no active page is found.
 30 | 
 31 |     Returns:
 32 |     str: status of the operation expressed as a string
 33 |     """
 34 | 
 35 |     logger.info(f"Executing press_key_combination with key combo: {key_combination}")
 36 |     # Create and use the PlaywrightManager
 37 |     browser_manager = PlaywrightManager()
 38 |     page = await browser_manager.get_current_page()
 39 | 
 40 |     if page is None:  # type: ignore
 41 |         raise ValueError("No active page found. OpenURL command opens a new page.")
 42 | 
 43 |     # Split the key combination if it's a combination of keys
 44 |     keys = key_combination.split("+")
 45 | 
 46 |     dom_changes_detected = None
 47 | 
 48 |     def detect_dom_changes(changes: str):  # type: ignore
 49 |         nonlocal dom_changes_detected
 50 |         dom_changes_detected = changes  # type: ignore
 51 | 
 52 |     subscribe(detect_dom_changes)
 53 |     # If it's a combination, hold down the modifier keys
 54 |     for key in keys[:-1]:  # All keys except the last one are considered modifier keys
 55 |         await page.keyboard.down(key)
 56 | 
 57 |     # Press the last key in the combination
 58 |     await page.keyboard.press(keys[-1])
 59 | 
 60 |     # Release the modifier keys
 61 |     for key in keys[:-1]:
 62 |         await page.keyboard.up(key)
 63 |     await asyncio.sleep(
 64 |         0.1
 65 |     )  # sleep for 100ms to allow the mutation observer to detect changes
 66 |     unsubscribe(detect_dom_changes)
 67 | 
 68 |     if dom_changes_detected:
 69 |         return f"Key {key_combination} executed successfully.\n As a consequence of this action, new elements have appeared in view:{dom_changes_detected}. This means that the action is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
 70 | 
 71 |     # await browser_manager.notify_user(
 72 |     #     f"Key {key_combination} executed successfully", message_type=MessageType.ACTION
 73 |     # )
 74 |     return f"Key {key_combination} executed successfully"
 75 | 
 76 | 
 77 | async def do_press_key_combination(
 78 |     browser_manager: PlaywrightManager, page: Page, key_combination: str
 79 | ) -> bool:
 80 |     """
 81 |     Presses a key combination on the provided page.
 82 | 
 83 |     This function simulates the pressing of a key or a combination of keys on a web page.
 84 |     The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination.
 85 |     For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows.
 86 | 
 87 |     Parameters:
 88 |     - browser_manager (PlaywrightManager): The PlaywrightManager instance.
 89 |     - page (Page): The Playwright page instance.
 90 |     - key_combination (str): The key combination to press, represented as a string. For combinations, use '+' as a separator.
 91 | 
 92 |     Returns:
 93 |     bool: True if success and False if failed
 94 |     """
 95 | 
 96 |     logger.info(f"Executing press_key_combination with key combo: {key_combination}")
 97 |     try:
 98 |         function_name = inspect.currentframe().f_code.co_name  # type: ignore
 99 |         await browser_manager.take_screenshots(f"{function_name}_start", page)
100 |         # Split the key combination if it's a combination of keys
101 |         keys = key_combination.split("+")
102 | 
103 |         # If it's a combination, hold down the modifier keys
104 |         for key in keys[
105 |             :-1
106 |         ]:  # All keys except the last one are considered modifier keys
107 |             await page.keyboard.down(key)
108 | 
109 |         # Press the last key in the combination
110 |         await page.keyboard.press(keys[-1])
111 | 
112 |         # Release the modifier keys
113 |         for key in keys[:-1]:
114 |             await page.keyboard.up(key)
115 | 
116 |     except Exception as e:
117 |         logger.error(f'Error executing press_key_combination "{key_combination}": {e}')
118 |         return False
119 | 
120 |     await browser_manager.take_screenshots(f"{function_name}_end", page)
121 | 
122 |     return True
123 | 


--------------------------------------------------------------------------------
/agentq/core/skills/solve_captcha.py:
--------------------------------------------------------------------------------
  1 | import inspect
  2 | from typing import Annotated
  3 | 
  4 | from agentq.core.agent.captcha_agent import CaptchaAgent
  5 | from agentq.core.models.models import CaptchaAgentInput, CaptchaAgentOutput
  6 | from agentq.core.skills.enter_text_and_click import enter_text_and_click
  7 | from agentq.core.skills.get_screenshot import get_screenshot
  8 | from agentq.core.web_driver.playwright import PlaywrightManager
  9 | from agentq.utils.logger import logger
 10 | 
 11 | 
 12 | async def solve_captcha(
 13 |     text_selector: Annotated[
 14 |         str,
 15 |         "The properly formatted DOM selector query, for example [mmid='1234'], where the captcha text will be entered. Use mmid attribute. mmid will always be a number",
 16 |     ],
 17 |     click_selector: Annotated[
 18 |         str,
 19 |         "The properly formatted DOM selector query, for example [mmid='1234'], for the element that will be clicked after captch text entry. mmmid will be alwayes be a number",
 20 |     ],
 21 |     wait_before_click_execution: Annotated[
 22 |         float, "Optional wait time in seconds before executing the click."
 23 |     ],
 24 | ) -> Annotated[
 25 |     str, "A message indicating success of failure of the captcha solving and submitting"
 26 | ]:
 27 |     """
 28 |     Solves a captcha, enters into the text element and submits it by clicking another element.
 29 | 
 30 |     Parameters:
 31 |     - text_selector: The selector for the element to enter the captcha into. It should be a properly formatted DOM selector query, for example [mmid='1234'], where the captcha text will be entered. Use the mmid attribute.
 32 |     - click_selector: The selector for the element to click post captcha is entered. It should be a properly formatted DOM selector query, for example [mmid='1234'].
 33 |     - wait_before_click_execution: Optional wait time in seconds before executing the click action. Default is 0.0.
 34 | 
 35 |     Returns:
 36 |     - A message indicating the success or failure of the cathcha entry and click.
 37 | 
 38 |     Raises:
 39 |     - ValueError: If no active page is found. The OpenURL command opens a new page.
 40 | 
 41 |     Example usage:
 42 |     ```
 43 |     await solve_captcha("[mmid='1234']", "[mmid='5678']", wait_before_click_execution=1.5)
 44 |     ```
 45 |     -
 46 |     """
 47 |     logger.info("Solving captcha")
 48 | 
 49 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 50 | 
 51 |     page = await browser_manager.get_current_page()
 52 | 
 53 |     if page is None:
 54 |         logger.error("No active page found")
 55 |         raise ValueError("No active page found. OpenURL command opens a new page")
 56 | 
 57 |     # Take ss for logging
 58 |     function_name = inspect.currentframe().f_code.co_name
 59 |     await browser_manager.highlight_element(text_selector, True)
 60 |     await browser_manager.take_screenshots(f"{function_name}_start", page=page)
 61 | 
 62 |     screenshot = await get_screenshot()
 63 |     captcha_agent = CaptchaAgent()
 64 |     input: CaptchaAgentInput = CaptchaAgentInput(objective="Solve this captcha")
 65 | 
 66 |     try:
 67 |         captcha_output: CaptchaAgentOutput = await captcha_agent.run(input, screenshot)
 68 |     except Exception as e:
 69 |         await browser_manager.take_screenshots(f"{function_name}_end", page=page)
 70 |         logger.error(f"Error in captcha_agent.run: {str(e)}")
 71 |         return "Failed to solve the captcha. Error in running the Captcha Agent"
 72 | 
 73 |     if not captcha_output.success:
 74 |         await browser_manager.take_screenshots(f"{function_name}_end", page=page)
 75 |         return "Failed to solve the captcha. Captcha agent did not succeed."
 76 | 
 77 |     success_msg = (
 78 |         f"Success. Successfully solved the captcha {captcha_output.captcha}.\n"
 79 |     )
 80 |     result = {
 81 |         "summary_message": success_msg,
 82 |         "detailed_message": f"{success_msg}",
 83 |     }
 84 | 
 85 |     # enter text and click
 86 |     enter_text_and_click_result = await enter_text_and_click(
 87 |         text_selector=text_selector,
 88 |         text_to_enter=captcha_output.captcha,
 89 |         click_selector=click_selector,
 90 |         wait_before_click_execution=wait_before_click_execution,
 91 |     )
 92 | 
 93 |     if not enter_text_and_click_result.startswith("Success"):
 94 |         await browser_manager.take_screenshots(f"{function_name}_end", page)
 95 |         return f"Solved the captcha but failed to enter it & click '{enter_text_and_click_result}' into element with text selector '{text_selector} & click selector {click_selector}'. Check that the selctor is valid."
 96 | 
 97 |     result["detailed_message"] += f"{enter_text_and_click_result}"
 98 | 
 99 |     return result["detailed_message"]
100 | 


--------------------------------------------------------------------------------
/agentq/core/skills/upload_file.py:
--------------------------------------------------------------------------------
 1 | from typing_extensions import Annotated
 2 | 
 3 | from agentq.core.web_driver.playwright import PlaywrightManager
 4 | from agentq.utils.logger import logger
 5 | 
 6 | 
 7 | async def upload_file(
 8 |     # label: Annotated[str, "Label for the element on which upload should happen"],
 9 |     selector: Annotated[
10 |         str,
11 |         "The properly formed query selector string to identify the file input element (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector. mmid will always be a number",
12 |     ],
13 |     file_path: Annotated[str, "Path on the local system for the file to be uploaded"],
14 | ) -> Annotated[str, "A meesage indicating if the file uplaod was successful"]:
15 |     """
16 |     Uploads a file.
17 | 
18 |     Parameters:
19 |     - file_path: Path of the file that needs to be uploaded.
20 | 
21 |     Returns:
22 |     - A message indicating the success or failure of the file upload
23 |     """
24 |     logger.info(
25 |         f"Uploading file onto the page from {file_path} using selector {selector}"
26 |     )
27 |     print("naman-selector")
28 |     # print(label)
29 |     # label = "Add File"
30 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
31 |     page = await browser_manager.get_current_page()
32 | 
33 |     if not page:
34 |         raise ValueError("No active page found. OpenURL command opens a new page")
35 | 
36 |     await page.wait_for_load_state("domcontentloaded")
37 | 
38 |     try:
39 |         await page.locator(selector).set_input_files(file_path)
40 |         # await page.get_by_label(label).set_input_files(file_path)
41 |         logger.info(
42 |             "File upload was successful. I can confirm it. Please proceed ahead with next step."
43 |         )
44 |     except Exception as e:
45 |         logger.error(f"Failed to upload file: {e}")
46 |         return f"File upload failed {e}"
47 | 


--------------------------------------------------------------------------------
/agentq/core/web_driver/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/web_driver/__init__.py


--------------------------------------------------------------------------------
/agentq/user_preferences/user_preferences.txt:
--------------------------------------------------------------------------------
1 | 1. Your job is to find the relevant product asked by the user on a locally hosted e-commerce website: http://localhost:3000/abc
2 | 2. Your search query should be short and should contain only the important keywords. In general, you should try to check out the first couple of results shown in the search results.
3 | 3. Remember, a lot of custom filters like size, color, flavor, quantity, etc. are only available on the product page, and the product title on the search page may not exactly fit the requirements but can loosely relate to the search query, so check out the first couple of search results that match most closely.
4 |     E.g., you are asked to buy strawberry biscuits from Britannia - but when you search for Britannia strawberry biscuits, you get search results with chocolate-flavored Britannia biscuits. You should still click on it, as there could be an option on the product page to change the flavor from chocolate to strawberry, because the brand is the same and the product is the same: "Britannia" and "biscuits."
5 | 4. If you do not find the relevant product in the first couple of search results pages, you should go back to the search page and try to search with a different query.
6 | 5. Make sure to pay attention to all the attributes like size, color, quantity, etc. mentioned in the user's query and select appropriate attributes on the product details page before buying it. 
7 | 6. Ultimately, your task will only end when you click on the "Buy Now" button of the right product.


--------------------------------------------------------------------------------
/agentq/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/utils/__init__.py


--------------------------------------------------------------------------------
/agentq/utils/_pydantic.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict, Tuple, Union, get_args
  2 | 
  3 | from pydantic import BaseModel
  4 | from pydantic.version import VERSION as PYDANTIC_VERSION
  5 | from typing_extensions import get_origin
  6 | 
  7 | __all__ = (
  8 |     "JsonSchemaValue",
  9 |     "model_dump",
 10 |     "model_dump_json",
 11 |     "type2schema",
 12 |     "evaluate_forwardref",
 13 | )
 14 | 
 15 | PYDANTIC_V1 = PYDANTIC_VERSION.startswith("1.")
 16 | 
 17 | if not PYDANTIC_V1:
 18 |     from pydantic import TypeAdapter
 19 |     from pydantic._internal._typing_extra import (
 20 |         eval_type_lenient as evaluate_forwardref,
 21 |     )
 22 |     from pydantic.json_schema import JsonSchemaValue
 23 | 
 24 |     def type2schema(t: Any) -> JsonSchemaValue:
 25 |         """Convert a type to a JSON schema
 26 | 
 27 |         Args:
 28 |             t (Type): The type to convert
 29 | 
 30 |         Returns:
 31 |             JsonSchemaValue: The JSON schema
 32 |         """
 33 |         return TypeAdapter(t).json_schema()
 34 | 
 35 |     def model_dump(model: BaseModel) -> Dict[str, Any]:
 36 |         """Convert a pydantic model to a dict
 37 | 
 38 |         Args:
 39 |             model (BaseModel): The model to convert
 40 | 
 41 |         Returns:
 42 |             Dict[str, Any]: The dict representation of the model
 43 | 
 44 |         """
 45 |         return model.model_dump()
 46 | 
 47 |     def model_dump_json(model: BaseModel) -> str:
 48 |         """Convert a pydantic model to a JSON string
 49 | 
 50 |         Args:
 51 |             model (BaseModel): The model to convert
 52 | 
 53 |         Returns:
 54 |             str: The JSON string representation of the model
 55 |         """
 56 |         return model.model_dump_json()
 57 | 
 58 | 
 59 | # Remove this once we drop support for pydantic 1.x
 60 | else:  # pragma: no cover
 61 |     from pydantic import TypeAdapter
 62 |     from pydantic.typing import (
 63 |         evaluate_forwardref as evaluate_forwardref,  # type: ignore[no-redef]
 64 |     )
 65 | 
 66 |     JsonSchemaValue = Dict[str, Any]  # type: ignore[misc]
 67 | 
 68 |     def type2schema(t: Any) -> JsonSchemaValue:
 69 |         """Convert a type to a JSON schema
 70 | 
 71 |         Args:
 72 |             t (Type): The type to convert
 73 | 
 74 |         Returns:
 75 |             JsonSchemaValue: The JSON schema
 76 |         """
 77 |         if PYDANTIC_V1:
 78 |             if t is None:
 79 |                 return {"type": "null"}
 80 |             elif get_origin(t) is Union:
 81 |                 return {"anyOf": [type2schema(tt) for tt in get_args(t)]}
 82 |             elif get_origin(t) in [Tuple, tuple]:
 83 |                 prefixItems = [type2schema(tt) for tt in get_args(t)]
 84 |                 return {
 85 |                     "maxItems": len(prefixItems),
 86 |                     "minItems": len(prefixItems),
 87 |                     "prefixItems": prefixItems,
 88 |                     "type": "array",
 89 |                 }
 90 | 
 91 |         d = TypeAdapter.json_schema(t)
 92 |         if "title" in d:
 93 |             d.pop("title")
 94 |         if "description" in d:
 95 |             d.pop("description")
 96 | 
 97 |         return d
 98 | 
 99 |     def model_dump(model: BaseModel) -> Dict[str, Any]:
100 |         """Convert a pydantic model to a dict
101 | 
102 |         Args:
103 |             model (BaseModel): The model to convert
104 | 
105 |         Returns:
106 |             Dict[str, Any]: The dict representation of the model
107 | 
108 |         """
109 |         return model.dict()
110 | 
111 |     def model_dump_json(model: BaseModel) -> str:
112 |         """Convert a pydantic model to a JSON string
113 | 
114 |         Args:
115 |             model (BaseModel): The model to convert
116 | 
117 |         Returns:
118 |             str: The JSON string representation of the model
119 |         """
120 |         return model.json()
121 | 


--------------------------------------------------------------------------------
/agentq/utils/cli_helper.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from asyncio import Future
 3 | from typing import Dict, List
 4 | 
 5 | 
 6 | def async_input(prompt: str) -> Future:  # type: ignore
 7 |     """
 8 |     Display a prompt to the user and wait for input in an asynchronous manner.
 9 | 
10 |     Parameters:
11 |     - prompt: The message to display to the user.
12 | 
13 |     Returns:
14 |     - A Future object that will be fulfilled with the user's input.
15 |     """
16 |     loop = asyncio.get_event_loop()
17 |     return loop.run_in_executor(None, input, prompt)
18 | 
19 | 
async def answer_questions_over_cli(questions: List[str]) -> Dict[str, str]:
    """
    Ask each question on the command line, one at a time, and collect the replies.

    A line of asterisks is printed before the first question and after the last.

    Parameters:
    - questions: A list of questions to ask the user, e.g., ["What is your favorite site?", "What do you want to search for?"].

    Returns:
    - A dictionary where each key is a question and each value is the user's response.
    """
    separator = "*********************************"
    print(separator)
    # Each prompt is awaited before the next one is shown
    answers: Dict[str, str] = {
        question: await async_input(f"Question: {question} : ")
        for question in questions
    }
    print(separator)
    return answers
36 | 


--------------------------------------------------------------------------------
/agentq/utils/dom_helper.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from typing import List, Optional
 3 | 
 4 | from playwright.async_api import ElementHandle, Page
 5 | 
 6 | from agentq.utils.logger import logger
 7 | 
 8 | 
 9 | async def wait_for_non_loading_dom_state(page: Page, max_wait_millis: int):
10 |     max_wait_seconds = max_wait_millis / 1000
11 |     end_time = asyncio.get_event_loop().time() + max_wait_seconds
12 |     while asyncio.get_event_loop().time() < end_time:
13 |         dom_state = await page.evaluate("document.readyState")
14 |         if dom_state != "loading":
15 |             logger.debug(f"DOM state is not 'loading': {dom_state}")
16 |             break  # Exit the loop if the DOM state is not 'loading'
17 | 
18 |         await asyncio.sleep(0.05)
19 | 
20 | 
async def get_element_outer_html(
    element: ElementHandle, page: Page, element_tag_name: Optional[str] = None
) -> str:
    """
    Build the opening tag of an element, keeping only a fixed set of attributes.

    Args:
        element (ElementHandle): The element to describe.
        page (Page): The page the element belongs to; only used to look up the tag name when it is not supplied.
        element_tag_name (str, optional): The tag name of the element. Defaults to None, in which case it is read from the element.

    Returns:
        str: The opening tag, e.g. ``<input id="q" type="text">``. Attributes that are missing or empty are left out.
    """
    if element_tag_name:
        tag_name: str = element_tag_name
    else:
        tag_name = await page.evaluate(
            "element => element.tagName.toLowerCase()", element
        )

    attributes_of_interest: List[str] = [
        "id",
        "name",
        "aria-label",
        "placeholder",
        "href",
        "src",
        "aria-autocomplete",
        "role",
        "type",
        "data-testid",
        "value",
        "selected",
        "aria-labelledby",
        "aria-describedby",
        "aria-haspopup",
    ]

    # Collect the pieces of the tag and join them once at the end
    pieces: List[str] = [f"<{tag_name}"]
    for attr_name in attributes_of_interest:
        attr_value = await element.get_attribute(attr_name)  # type: ignore
        if not attr_value:
            continue
        pieces.append(f' {attr_name}="{attr_value}"')
    pieces.append(">")

    return "".join(pieces)
67 | 


--------------------------------------------------------------------------------
/agentq/utils/dom_mutation_observer.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import json
 3 | from typing import Callable, List  # noqa: UP035
 4 | 
 5 | from playwright.async_api import Page
 6 | 
 7 | # Get the current asyncio event loop once, when this module is imported
 8 | loop = asyncio.get_event_loop()
 9 | 
# Module-level list of callbacks to invoke when DOM changes are reported.
DOM_change_callback: List[Callable[[str], None]] = []


def subscribe(callback: Callable[[str], None]) -> None:
    """Register ``callback`` to be called when DOM changes are reported."""
    DOM_change_callback.append(callback)


def unsubscribe(callback: Callable[[str], None]) -> None:
    """
    Remove a previously subscribed ``callback``.

    Raises:
        ValueError: If ``callback`` is not currently subscribed (raised by ``list.remove``).
    """
    DOM_change_callback.remove(callback)
19 | 
20 | 
async def add_mutation_observer(page: Page):
    """
    Adds a mutation observer to the page to detect changes in the DOM.
    When changes are detected, the observer calls window.dom_mutation_change_detected in the browser context
    with a JSON string describing them.
    NOTE(review): that function is presumably exposed to the page elsewhere (e.g. by the web driver) - confirm.

    The observer reports two kinds of change:
    - element nodes added to the DOM that have non-empty text (SCRIPT, NOSCRIPT and STYLE nodes and anything
      inside #agentDriveAutoOverlay are ignored);
    - text (characterData) changes whose parent element is not display:none.

    Attribute changes are not observed, so a change in the style or class of an existing node
    (e.g. toggling visibility of a hidden node) is not detected.
    """

    await page.evaluate("""
        console.log('Adding a mutation observer for DOM changes');
        new MutationObserver((mutationsList, observer) => {
            let changes_detected = [];
            for(let mutation of mutationsList) {
                if (mutation.type === 'childList') {
                    let allAddedNodes=mutation.addedNodes;
                    for(let node of allAddedNodes) {
                        if(node.tagName && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.tagName) && !node.closest('#agentDriveAutoOverlay')) {
                            let visibility=true;
                            // innerText is undefined on elements that are not HTML elements (e.g. SVG nodes)
                            let content = (node.innerText || '').trim();
                            if(visibility && content){
                                changes_detected.push({tag: node.tagName, content: content});
                            }
                        }
                    }
                } else if (mutation.type === 'characterData') {
                    let node = mutation.target;
                    if(node.parentNode && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.parentNode.tagName) && !node.parentNode.closest('#agentDriveAutoOverlay')) {
                        let visibility=true;
                        let content = node.data.trim();
                        if(visibility && content && window.getComputedStyle(node.parentNode).display !== 'none'){
                            if(content && !changes_detected.some(change => change.content.includes(content))) {
                                changes_detected.push({tag: node.parentNode.tagName, content: content});
                            }
                        }
                    }
                }
            }
            if(changes_detected.length > 0) {
                window.dom_mutation_change_detected(JSON.stringify(changes_detected));
            }
        }).observe(document, {subtree: true, childList: true, characterData: true});
        """)
67 | 
68 | 
async def handle_navigation_for_mutation_observer(page: Page):
    """
    Install the DOM mutation observer on ``page`` by delegating to ``add_mutation_observer``.

    NOTE(review): the name suggests this is registered as a navigation hook so the
    observer is re-added after a page load - confirm against the caller.
    """
    await add_mutation_observer(page)
71 | 
72 | 
async def dom_mutation_change_detected(changes_detected: str):
    """
    Receives DOM changes reported by the page and emits them to all subscribed callbacks.
    The changes_detected is a string in JSON format containing the tag and content of the changed nodes.
    It is parsed here, and the callbacks receive the parsed list rather than the raw string.

    e.g.  The following will be detected when autocomplete recommendations show up when one types Nelson Mandela on google search
    [{'tag': 'SPAN', 'content': 'nelson mandela wikipedia'}, {'tag': 'SPAN', 'content': 'nelson mandela movies'}]
    """
    # Tabs and newlines are removed before parsing, as in the raw payload they
    # carry no meaning for the JSON structure
    parsed_changes = json.loads(changes_detected.replace("\t", "").replace("\n", ""))
    if len(parsed_changes) > 0:
        # Emit the event to a snapshot of the subscribers: a callback may
        # subscribe/unsubscribe while an awaited callback is running, and
        # mutating the list during iteration would skip entries.
        for callback in list(DOM_change_callback):
            # If the callback is a coroutine function
            if asyncio.iscoroutinefunction(callback):
                await callback(parsed_changes)
            # If the callback is a regular function
            else:
                callback(parsed_changes)
91 | 


--------------------------------------------------------------------------------
/agentq/utils/extract_json.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from typing import Any, Dict
 3 | 
 4 | from agentq.utils.logger import logger
 5 | 
 6 | 
 7 | def extract_json(message: str) -> Dict[str, Any]:
 8 |     """
 9 |     Parse the response from the browser agent and return the response as a dictionary.
10 |     """
11 |     json_response = {}
12 |     # Remove Markdown code block delimiters if present
13 |     message = message.strip()
14 |     if message.startswith("```"):
15 |         message = message.split("\n", 1)[1]  # Remove the first line
16 |     if message.endswith("```"):
17 |         message = message.rsplit("\n", 1)[0]  # Remove the last line
18 | 
19 |     # Remove any leading "json" tag
20 |     if message.lstrip().startswith("json"):
21 |         message = message.lstrip()[4:].lstrip()
22 | 
23 |     try:
24 |         return json.loads(message)
25 |     except json.JSONDecodeError as e:
26 |         logger.warn(
27 |             f"LLM response was not properly formed JSON. Error: {e}. "
28 |             f'LLM response: "{message}"'
29 |         )
30 |         message = message.replace("\\n", "\n")
31 |         message = message.replace("\n", " ")  # type: ignore
32 |         if "plan" in message and "next_step" in message:
33 |             start = message.index("plan") + len("plan")
34 |             end = message.index("next_step")
35 |             json_response["plan"] = message[start:end].replace('"', "").strip()
36 |         if "next_step" in message and "terminate" in message:
37 |             start = message.index("next_step") + len("next_step")
38 |             end = message.index("terminate")
39 |             json_response["next_step"] = message[start:end].replace('"', "").strip()
40 |         if "terminate" in message and "final_response" in message:
41 |             start = message.index("terminate") + len("terminate")
42 |             end = message.index("final_response")
43 |             matched_string = message[start:end].replace('"', "").strip()
44 |             if "yes" in matched_string:
45 |                 json_response["terminate"] = "yes"
46 |             else:
47 |                 json_response["terminate"] = "no"
48 | 
49 |             start = message.index("final_response") + len("final_response")
50 |             end = len(message) - 1
51 |             json_response["final_response"] = (
52 |                 message[start:end].replace('"', "").strip()
53 |             )
54 | 
55 |         elif "terminate" in message:
56 |             start = message.index("terminate") + len("terminate")
57 |             end = len(message) - 1
58 |             matched_string = message[start:end].replace('"', "").strip()
59 |             if "yes" in matched_string:
60 |                 json_response["terminate"] = "yes"
61 |             else:
62 |                 json_response["terminate"] = "no"
63 | 
64 |     return json_response
65 | 


--------------------------------------------------------------------------------
/agentq/utils/function_utils.py:
--------------------------------------------------------------------------------
  1 | # import inspect
  2 | # from typing import Any, Callable, Dict, List, Union
  3 | 
  4 | # from typing_extensions import Annotated, get_args, get_origin
  5 | 
  6 | 
  7 | # def get_type_name(type_hint: Any) -> str:
  8 | #     if hasattr(type_hint, "__name__"):
  9 | #         return type_hint.__name__
 10 | #     if hasattr(type_hint, "_name"):
 11 | #         return type_hint._name
 12 | #     return str(type_hint).replace("typing.", "")
 13 | 
 14 | 
 15 | # def get_parameter_schema(
 16 | #     name: str, param: inspect.Parameter, type_hint: Any
 17 | # ) -> Dict[str, Any]:
 18 | #     schema = {"type": get_type_name(type_hint)}
 19 | 
 20 | #     if get_origin(type_hint) is Annotated:
 21 | #         type_hint, description = get_args(type_hint)
 22 | #         schema["description"] = description
 23 | #     else:
 24 | #         schema["description"] = name
 25 | 
 26 | #     if get_origin(type_hint) is Union:
 27 | #         schema["type"] = [get_type_name(arg) for arg in get_args(type_hint)]
 28 | #     elif get_origin(type_hint) is List:
 29 | #         item_type = get_args(type_hint)[0]
 30 | #         if get_origin(item_type) is Dict:
 31 | #             key_type, value_type = get_args(item_type)
 32 | #             schema["type"] = "array"
 33 | #             schema["items"] = {
 34 | #                 "type": "object",
 35 | #                 "additionalProperties": {"type": get_type_name(value_type)},
 36 | #             }
 37 | #         else:
 38 | #             schema["type"] = "array"
 39 | #             schema["items"] = {"type": get_type_name(item_type)}
 40 | 
 41 | #     if param.default != inspect.Parameter.empty:
 42 | #         schema["default"] = param.default
 43 | #     return schema
 44 | 
 45 | 
 46 | # def generate_tool_from_function(
 47 | #     func: Callable[..., Any], tool_description: str
 48 | # ) -> Dict[str, Any]:
 49 | #     signature = inspect.signature(func)
 50 | #     type_hints = func.__annotations__
 51 | 
 52 | #     parameters = {}
 53 | #     for name, param in signature.parameters.items():
 54 | #         type_hint = type_hints.get(name, Any)
 55 | #         parameters[name] = get_parameter_schema(name, param, type_hint)
 56 | 
 57 | #     return {
 58 | #         "type": "function",
 59 | #         "function": {
 60 | #             "name": func.__name__,
 61 | #             "description": tool_description,
 62 | #             "parameters": {
 63 | #                 "type": "object",
 64 | #                 "properties": parameters,
 65 | #                 "required": [
 66 | #                     name
 67 | #                     for name, param in signature.parameters.items()
 68 | #                     if param.default == inspect.Parameter.empty
 69 | #                 ],
 70 | #             },
 71 | #         },
 72 | #     }
 73 | 
 74 | 
 75 | import functools
 76 | import inspect
 77 | import json
 78 | from logging import getLogger
 79 | from typing import (
 80 |     Any,
 81 |     Callable,
 82 |     Dict,
 83 |     ForwardRef,
 84 |     List,
 85 |     Optional,
 86 |     Set,
 87 |     Tuple,
 88 |     Type,
 89 |     TypeVar,
 90 |     Union,
 91 | )
 92 | 
 93 | from pydantic import BaseModel, Field
 94 | from typing_extensions import Annotated, Literal, get_args, get_origin
 95 | from playwright.async_api import Page
 96 | 
 97 | from ._pydantic import (
 98 |     JsonSchemaValue,
 99 |     evaluate_forwardref,
100 |     model_dump,
101 |     model_dump_json,
102 |     type2schema,
103 | )
104 | 
105 | logger = getLogger(__name__)
106 | 
107 | T = TypeVar("T")
108 | 
109 | 
def get_typed_annotation(annotation: Any, globalns: Dict[str, Any]) -> Any:
    """Resolve a parameter annotation, evaluating it when given as a string.

    Args:
        annotation: The annotation of the parameter
        globalns: The global namespace of the function

    Returns:
        The annotation unchanged when it is not a string; otherwise the
        result of evaluating it as a forward reference against ``globalns``
    """
    if not isinstance(annotation, str):
        return annotation

    # String annotations are wrapped in a ForwardRef and evaluated with the
    # same mapping used for both the global and the local namespace.
    forward_ref = ForwardRef(annotation)
    return evaluate_forwardref(forward_ref, globalns, globalns)
124 | 
125 | 
def get_typed_signature(call: Callable[..., Any]) -> inspect.Signature:
    """Build a signature for ``call`` whose parameter annotations are resolved.

    Args:
        call: The function to get the signature for

    Returns:
        A new signature with the same parameter names, kinds and defaults,
        where each annotation has been passed through get_typed_annotation.
        The return annotation is not carried over.
    """
    namespace = getattr(call, "__globals__", {})
    resolved_params = []
    for original in inspect.signature(call).parameters.values():
        resolved_params.append(
            inspect.Parameter(
                name=original.name,
                kind=original.kind,
                default=original.default,
                annotation=get_typed_annotation(original.annotation, namespace),
            )
        )
    return inspect.Signature(resolved_params)
148 | 
149 | 
def get_typed_return_annotation(call: Callable[..., Any]) -> Any:
    """Resolve the return annotation of a function.

    Args:
        call: The function to get the return annotation for

    Returns:
        None when the function has no return annotation; otherwise the
        annotation resolved against the function's global namespace
    """
    return_hint = inspect.signature(call).return_annotation
    if return_hint is not inspect.Signature.empty:
        namespace = getattr(call, "__globals__", {})
        return get_typed_annotation(return_hint, namespace)
    return None
167 | 
168 | 
def get_param_annotations(
    typed_signature: inspect.Signature,
) -> Dict[str, Union[Annotated[Type[Any], str], Type[Any]]]:
    """Collect the annotations of every annotated parameter.

    Args:
        typed_signature: The signature of the function with type annotations

    Returns:
        A dictionary mapping parameter names to their annotations;
        parameters without an annotation are left out
    """
    annotated = {}
    for name, param in typed_signature.parameters.items():
        if param.annotation is inspect.Signature.empty:
            continue
        annotated[name] = param.annotation
    return annotated
185 | 
186 | 
class Parameters(BaseModel):
    """Parameters of a function as defined by the OpenAI API"""

    # JSON Schema type of the parameter container; always "object".
    type: Literal["object"] = "object"
    # Maps each parameter name to its JSON schema.
    properties: Dict[str, JsonSchemaValue]
    # Names of the parameters listed as required.
    required: List[str]
    # Whether keys not listed in `properties` are allowed.
    additionalProperties: bool
195 | 
196 | 
class Function(BaseModel):
    """A function as defined by the OpenAI API"""

    # Free-text description of what the function does.
    description: Annotated[str, Field(description="Description of the function")]
    # Name under which the function is exposed.
    name: Annotated[str, Field(description="Name of the function")]
    # Schema of the function's parameters (see Parameters).
    parameters: Annotated[Parameters, Field(description="Parameters of the function")]
    # Strict flag; get_function_schema in this file always passes True.
    strict: bool
204 | 
205 | 
class ToolFunction(BaseModel):
    """A function under tool as defined by the OpenAI API."""

    # Discriminator for the tool kind; always "function".
    type: Literal["function"] = "function"
    # The wrapped function definition.
    function: Annotated[Function, Field(description="Function under tool")]
211 | 
212 | 
def get_parameter_json_schema(
    k: str, v: Any, default_values: Dict[str, Any]
) -> JsonSchemaValue:
    """Get a JSON schema for a single parameter.

    Args:
        k: The name of the parameter
        v: The type annotation of the parameter
        default_values: The default values of the function's parameters
            (currently not used by this function)

    Returns:
        A JSON schema for the parameter. Object schemas, and object items of
        array schemas, get ``additionalProperties: False`` and a
        ``properties`` mapping (empty when none was generated).
    """
    if isinstance(v, type) and issubclass(v, Page):
        # Skip schema generation for Page objects - some tools take page as an optional input (this is only utilised during evals when page object is passed to functions like get_dom_content)
        return {
            "type": "object",
            "description": "Playwright Page object",
        }

    # Handle Optional types
    if get_origin(v) is Union and type(None) in get_args(v):
        non_none_type = next(arg for arg in get_args(v) if arg is not type(None))
        if isinstance(non_none_type, type) and issubclass(non_none_type, Page):
            # Skip schema generation for Optional[Page]
            return {
                "type": "object",
                "description": "Optional Playwright Page object",
            }

    def type2description(k: str, v: Union[Annotated[Type[Any], str], Type[Any]]) -> str:
        # Use the string metadata of Annotated[T, "text"] when present,
        # otherwise fall back to the parameter name.
        if get_origin(v) is Annotated:
            args = get_args(v)
            if len(args) > 1 and isinstance(args[1], str):
                return args[1]
        return k

    schema = type2schema(v)
    schema["description"] = type2description(k, v)

    # NOTE(review): type2schema is defined in ._pydantic and is not visible
    # here; presumably it can return a schema without a top-level "type"
    # (e.g. "anyOf" for unions or "$ref" for models). Use .get() so such
    # schemas pass through instead of raising KeyError.
    schema_type = schema.get("type")

    if schema_type == "object":
        schema["additionalProperties"] = False
        if "properties" not in schema:
            schema["properties"] = {}

    if schema_type == "array":
        if "items" not in schema:
            schema["items"] = {
                "type": "object",
                "properties": {},
                "additionalProperties": False,
            }
        elif schema["items"].get("type") == "object":
            if "properties" not in schema["items"]:
                schema["items"]["properties"] = {}
            schema["items"]["additionalProperties"] = False

    return schema
262 | 
263 | 
def get_required_params(typed_signature: inspect.Signature) -> List[str]:
    """List the parameters that have no default value.

    Args:
        typed_signature: The signature of the function as returned by inspect.signature

    Returns:
        The names of the parameters without a default, in declaration order
    """
    params = typed_signature.parameters
    return [name for name in params if params[name].default == inspect.Parameter.empty]
278 | 
279 | 
def get_default_values(typed_signature: inspect.Signature) -> Dict[str, Any]:
    """Collect the default values of the parameters that declare one.

    Args:
        typed_signature: The signature of the function as returned by inspect.signature

    Returns:
        A dictionary mapping parameter names to their default values;
        parameters without a default are left out
    """
    defaults = {}
    for name, param in typed_signature.parameters.items():
        if param.default != inspect.Parameter.empty:
            defaults[name] = param.default
    return defaults
294 | 
295 | 
def get_parameters(
    required: List[str],
    param_annotations: Dict[str, Union[Annotated[Type[Any], str], Type[Any]]],
    default_values: Dict[str, Any],
) -> Parameters:
    """Build the ``Parameters`` schema for a function's annotated parameters.

    Parameters typed as a Playwright ``Page`` (directly or inside a
    ``Union``) are left out of the schema.

    Args:
        required: The parameters without default values. Not used for the
            result: every generated property is listed as required.
        param_annotations: Mapping of parameter names to their annotations
        default_values: Default values, forwarded to get_parameter_json_schema

    Returns:
        A ``Parameters`` model with ``additionalProperties`` set to False
    """
    properties = {}
    for k, v in param_annotations.items():
        if v is inspect.Signature.empty:
            continue

        # Split Annotated[T, "description"] into its type and description.
        if get_origin(v) is Annotated:
            annotated_args = get_args(v)
            v_type = annotated_args[0]
            v_desc = annotated_args[1] if len(annotated_args) > 1 else k
        else:
            v_type = v
            v_desc = k

        if (isinstance(v_type, type) and issubclass(v_type, Page)) or (
            get_origin(v_type) is Union
            and any(
                isinstance(arg, type) and issubclass(arg, Page)
                for arg in get_args(v_type)
            )
        ):
            continue

        # get_origin() returns the builtin `list` for List[X] and list[X];
        # comparing against typing.List never matches. Bare `List` has no
        # type arguments, so it falls through to the generic branch.
        item_args = get_args(v_type)
        if get_origin(v_type) is list and item_args:
            properties[k] = {
                "type": "array",
                "items": get_parameter_json_schema(k, item_args[0], default_values),
                "description": v_desc,
            }
        else:
            properties[k] = get_parameter_json_schema(k, v_type, default_values)
            properties[k]["description"] = v_desc

    return Parameters(
        properties=properties,
        required=list(properties.keys()),  # All properties are required
        additionalProperties=False,
    )
337 | 
338 | 
def get_missing_annotations(
    typed_signature: inspect.Signature, required: List[str]
) -> Tuple[Set[str], Set[str]]:
    """Split the unannotated parameters into required and defaulted ones.

    Args:
        typed_signature: The signature of the function with type annotations
        required: The required parameters of the function

    Returns:
        A pair of sets: the unannotated parameters that appear in
        ``required``, and the remaining unannotated parameters
    """
    unannotated = set()
    for name, param in typed_signature.parameters.items():
        if param.annotation is inspect.Signature.empty:
            unannotated.add(name)

    missing_required = unannotated & set(required)
    missing_optional = unannotated - missing_required
    return missing_required, missing_optional
360 | 
361 | 
def get_function_schema(
    f: Callable[..., Any], *, name: Optional[str] = None, description: str
) -> Dict[str, Any]:
    """Build the OpenAI-style tool schema describing a function.

    Args:
        f: The function to get the JSON schema for
        name: The name to expose; falls back to ``f.__name__`` when empty
        description: The description of the function

    Returns:
        The dumped ``ToolFunction`` model: a dictionary with ``"type":
        "function"`` and a ``"function"`` entry holding the description,
        name, parameters and ``strict`` set to True.

    Raises:
        TypeError: If a parameter without a default value is not annotated

    Notes:
        A warning is logged when the return type is not annotated, and when
        parameters with default values are not annotated.
    """
    signature = get_typed_signature(f)
    required_names = get_required_params(signature)
    defaults = get_default_values(signature)
    annotations = get_param_annotations(signature)
    return_hint = get_typed_return_annotation(f)
    missing, unannotated_with_default = get_missing_annotations(
        signature, required_names
    )

    if return_hint is None:
        logger.warning(
            f"The return type of the function '{f.__name__}' is not annotated. Although annotating it is "
            "optional, the function should return either a string, a subclass of 'pydantic.BaseModel'."
        )

    if unannotated_with_default:
        quoted_defaults = ", ".join(
            f"'{k}'" for k in sorted(unannotated_with_default)
        )
        logger.warning(
            f"The following parameters of the function '{f.__name__}' with default values are not annotated: "
            f"{quoted_defaults}."
        )

    if missing:
        quoted_missing = ", ".join(f"'{k}'" for k in sorted(missing))
        raise TypeError(
            f"All parameters of the function '{f.__name__}' without default values must be annotated. "
            f"The annotations are missing for the following parameters: {quoted_missing}"
        )

    tool = ToolFunction(
        function=Function(
            description=description,
            name=name or f.__name__,
            parameters=get_parameters(
                required_names, annotations, default_values=defaults
            ),
            strict=True,
        )
    )
    return model_dump(tool)
446 | 
447 | 
def get_load_param_if_needed_function(
    t: Any,
) -> Optional[Callable[[Dict[str, Any], Type[BaseModel]], BaseModel]]:
    """Return a loader for parameters annotated with a Pydantic model.

    Args:
        t: The type annotation of the parameter; ``Annotated`` wrappers are
            unwrapped to their underlying type first

    Returns:
        A function building the model from a dictionary of field values when
        the underlying type is a ``BaseModel`` subclass, otherwise None
    """
    # Peel Annotated[...] layers until the underlying type is reached.
    while get_origin(t) is Annotated:
        t = get_args(t)[0]

    if not (isinstance(t, type) and issubclass(t, BaseModel)):
        return None

    def load_base_model(v: Dict[str, Any], t: Type[BaseModel]) -> BaseModel:
        return t(**v)

    return load_base_model
467 | 
468 | 
def load_basemodels_if_needed(func: Callable[..., Any]) -> Callable[..., Any]:
    """A decorator to load the parameters of a function if they are Pydantic models

    Keyword arguments whose annotation is a Pydantic model are converted
    from dictionaries to model instances before ``func`` is called.
    Arguments that are omitted or passed positionally are left untouched.

    Args:
        func: The function with annotated parameters

    Returns:
        A function that loads the parameters before calling the original
        function; it is a coroutine function when ``func`` is one
    """
    # get the type annotations of the parameters
    typed_signature = get_typed_signature(func)
    param_annotations = get_param_annotations(typed_signature)

    # keep only the parameters that need a BaseModel loader
    kwargs_mapping = {}
    for k, t in param_annotations.items():
        loader = get_load_param_if_needed_function(t)
        if loader is not None:
            kwargs_mapping[k] = loader

    def _load_kwargs(kwargs: Dict[str, Any]) -> None:
        # Convert in place. Skip names that were not supplied by keyword so
        # that optional or positional arguments do not raise KeyError.
        for k, f in kwargs_mapping.items():
            if k in kwargs:
                kwargs[k] = f(kwargs[k], param_annotations[k])

    # a function that loads the parameters before calling the original function
    @functools.wraps(func)
    def _load_parameters_if_needed(*args: Any, **kwargs: Any) -> Any:
        _load_kwargs(kwargs)
        return func(*args, **kwargs)

    @functools.wraps(func)
    async def _a_load_parameters_if_needed(*args: Any, **kwargs: Any) -> Any:
        _load_kwargs(kwargs)
        return await func(*args, **kwargs)

    if inspect.iscoroutinefunction(func):
        return _a_load_parameters_if_needed
    else:
        return _load_parameters_if_needed
516 | 
517 | 
def serialize_to_str(x: Any) -> str:
    """Convert a value to a string.

    Strings are returned unchanged, Pydantic models are dumped with
    ``model_dump_json`` and any other value goes through ``json.dumps``.
    """
    if isinstance(x, str):
        return x
    if isinstance(x, BaseModel):
        return model_dump_json(x)
    return json.dumps(x)
525 | 


--------------------------------------------------------------------------------
/agentq/utils/logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | from typing import Union
 4 | 
 5 | # Create a logs directory if it doesn't exist
 6 | log_directory = "logs"
 7 | os.makedirs(log_directory, exist_ok=True)
 8 | 
 9 | # Configure the root logger
10 | logging.basicConfig(
11 |     level=logging.DEBUG,
12 |     format="[%(asctime)s] %(levelname)s {%(filename)s:%(lineno)d} - %(message)s",
13 | )
14 | 
15 | # Remove all handlers from the root logger
16 | for handler in logging.root.handlers[:]:
17 |     logging.root.removeHandler(handler)
18 | 
19 | logger = logging.getLogger(__name__)
20 | logger.addHandler(logging.FileHandler(os.path.join(log_directory, "app.log")))
21 | logger.setLevel(logging.INFO)
22 | 
23 | # logging.getLogger("httpcore").setLevel(logging.WARNING)
24 | # logging.getLogger("httpx").setLevel(logging.WARNING)
25 | # logging.getLogger("matplotlib.pyplot").setLevel(logging.WARNING)
26 | # logging.getLogger("PIL.PngImagePlugin").setLevel(logging.WARNING)
27 | # logging.getLogger("PIL.Image").setLevel(logging.WARNING)
28 | 
29 | 
def set_log_level(level: Union[str, int]) -> None:
    """
    Set the log level for the logger.

    Parameters:
    - level (Union[str, int]): A level name such as 'debug', 'info', 'warning', 'error', or 'critical' (case-insensitive), or a logging constant like logging.DEBUG, logging.INFO, etc.

    Raises:
    - ValueError: If a string is given that does not name an integer attribute of the logging module.
    """
    if not isinstance(level, str):
        logger.setLevel(level)
        return

    level_name = level.upper()
    numeric_level = getattr(logging, level_name, None)
    if not isinstance(numeric_level, int):
        raise ValueError(f"Invalid log level: {level_name}")
    logger.setLevel(numeric_level)
45 | 


--------------------------------------------------------------------------------
/agentq/utils/message_type.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
class MessageType(Enum):
    # Message categories; each member's value is a lowercase string tag.
    # NOTE(review): an identical enum is defined in
    # agentq/utils/ui_messagetype.py - confirm whether both are needed.
    PLAN = "plan"
    STEP = "step"
    ACTION = "action"
    ANSWER = "answer"
    QUESTION = "question"
    INFO = "info"
    FINAL = "final"
    # Unlike the other members, the value differs from the lowercased name.
    DONE = "transaction_done"
    ERROR = "error"
14 | 


--------------------------------------------------------------------------------
/agentq/utils/ui_messagetype.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
class MessageType(Enum):
    # Message categories; each member's value is a lowercase string tag.
    # NOTE(review): an identical enum is defined in
    # agentq/utils/message_type.py - confirm whether both are needed.
    PLAN = "plan"
    STEP = "step"
    ACTION = "action"
    ANSWER = "answer"
    QUESTION = "question"
    INFO = "info"
    FINAL = "final"
    # Unlike the other members, the value differs from the lowercased name.
    DONE = "transaction_done"
    ERROR = "error"
14 | 


--------------------------------------------------------------------------------
/dpo_pairs.jsonl:
--------------------------------------------------------------------------------
1 | {"prompt": "Objective: go to football page on bbc\nCurrent DOM: {'role': 'WebArea', 'name': 'Google', 'children': [{'name': 'About', 'mmid': '42', 'tag': 'a'}, {'name': 'Store', 'mmid': '43', 'tag': 'a'}, {'name': 'Gmail ', 'mmid': '52', 'tag': 'a'}, {'name': 'Search for Images ', 'mmid': '54', 'tag': 'a'}, {'role': 'button', 'name': 'Search Labs', 'mmid': '59', 'tag': 'a'}, {'role': 'button', 'name': 'Google apps', 'mmid': '64', 'tag': 'a'}, {'role': 'button', 'name': 'Google Account: Naman Jain (namanbhulawat@gmail.com)', 'mmid': '70', 'tag': 'a', 'aria-label': 'Google Account: Naman Jain  \\n(namanbhulawat@gmail.com)'}, {'role': 'region', 'name': 'Celebrating popcorn', 'children': [{'name': 'Celebrating popcorn', 'mmid': '109', 'tag': 'button'}], 'mmid': '101', 'tag': 'div'}, {'role': 'combobox', 'name': 'q', 'description': 'Search', 'focused': True, 'autocomplete': 'both', 'mmid': '170', 'tag': 'textarea', 'aria-label': 'Search'}, {'role': 'button', 'name': 'Search by voice', 'mmid': '186', 'tag': 'div'}, {'role': 'button', 'name': 'Search by im...", "chosen": "Action: {\"type\":\"ENTER_TEXT_AND_CLICK\",\"text_element_mmid\":170,\"text_to_enter\":\"BBC Sport football page\",\"click_element_mmid\":383,\"wait_before_click_execution\":null}\nDescription: Search for 'BBC Sport football page' in Google search bar and click the search button", "rejected": "Action: {\"type\":\"GOTO_URL\",\"website\":\"https://www.bbc.com/sport\",\"timeout\":2.0}\nDescription: Directly go to BBC Sport page by visiting 'https://www.bbc.com/sport'"}
2 | {"prompt": "Objective: go to football page on bbc\nCurrent DOM: {'role': 'WebArea', 'name': 'Google', 'children': [{'name': 'About', 'mmid': '42', 'tag': 'a'}, {'name': 'Store', 'mmid': '43', 'tag': 'a'}, {'name': 'Gmail ', 'mmid': '52', 'tag': 'a'}, {'name': 'Search for Images ', 'mmid': '54', 'tag': 'a'}, {'role': 'button', 'name': 'Search Labs', 'mmid': '59', 'tag': 'a'}, {'role': 'button', 'name': 'Google apps', 'mmid': '64', 'tag': 'a'}, {'role': 'button', 'name': 'Google Account: Naman Jain (namanbhulawat@gmail.com)', 'mmid': '70', 'tag': 'a', 'aria-label': 'Google Account: Naman Jain  \\n(namanbhulawat@gmail.com)'}, {'role': 'region', 'name': 'Celebrating popcorn', 'children': [{'name': 'Celebrating popcorn', 'mmid': '109', 'tag': 'button'}], 'mmid': '101', 'tag': 'div'}, {'role': 'combobox', 'name': 'q', 'description': 'Search', 'focused': True, 'autocomplete': 'both', 'mmid': '170', 'tag': 'textarea', 'aria-label': 'Search'}, {'role': 'button', 'name': 'Search by voice', 'mmid': '186', 'tag': 'div'}, {'role': 'button', 'name': 'Search by im...", "chosen": "Action: {\"type\":\"ENTER_TEXT_AND_CLICK\",\"text_element_mmid\":170,\"text_to_enter\":\"BBC Sport football page\",\"click_element_mmid\":383,\"wait_before_click_execution\":null}\nDescription: Search for 'BBC Sport football page' in Google search bar and click the search button", "rejected": "Action: {\"type\":\"ENTER_TEXT_AND_CLICK\",\"text_element_mmid\":170,\"text_to_enter\":\"BBC football\",\"click_element_mmid\":383,\"wait_before_click_execution\":null}\nDescription: Search for 'BBC football' on Google and click the search button"}
3 | {"prompt": "Objective: go to football page on bbc\nCurrent DOM: Error: Unable to retrieve DOM...", "chosen": "Action: {\"type\":\"GOTO_URL\",\"website\":\"https://www.bbc.co.uk/sport/football\",\"timeout\":3.0}\nDescription: Navigate directly to the BBC Sport Football page via URL.", "rejected": "Action: {\"type\":\"GOTO_URL\",\"website\":\"https://www.bbc.co.uk\",\"timeout\":3.0}\nDescription: Navigate to the BBC homepage to find the sports section."}
4 | {"prompt": "Objective: go to football page on bbc\nCurrent DOM: Error: Unable to retrieve DOM...", "chosen": "Action: {\"type\":\"GOTO_URL\",\"website\":\"https://www.bbc.co.uk/sport/football\",\"timeout\":3.0}\nDescription: Navigate directly to the BBC Sport Football page via URL.", "rejected": "Action: {\"type\":\"ENTER_TEXT_AND_CLICK\",\"text_element_mmid\":170,\"text_to_enter\":\"BBC Sport football\",\"click_element_mmid\":383,\"wait_before_click_execution\":null}\nDescription: Refine search on Google for the specific BBC Sport football page."}
5 | 


--------------------------------------------------------------------------------
/logs.txt:
--------------------------------------------------------------------------------
1 | Starting System Orchestrator...
2 | Browser profile /Users/namanjain/Library/Application Support/Google/Chrome
3 | Browser started and ready.
4 | Enter your command (or type 'exit' to quit): 


--------------------------------------------------------------------------------
/output.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/output.txt


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "agentq"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["nischalj10 <nischalj10@gmail.com>"]
 6 | readme = "README.md"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = ">=3.10,<4.0"
10 | litellm = "^1.42.9"
11 | pydantic = "^2.8.2"
12 | pytest-playwright = "^0.5.1"
13 | pdfplumber = "0.11.2"
14 | typing-extensions = "^4.12.2"
15 | ruff = "^0.5.6"
16 | playwright-stealth = "^1.0.6"
17 | setuptools = "^72.1.0"
18 | openai = "^1.40.1"
19 | boto3 = "^1.34.157"
20 | python-json-logger = "^2.0.7"
21 | aiohttp = "^3.10.2"
22 | colorama = "^0.4.6"
23 | tiktoken = "^0.7.0"
24 | agentops = "^0.3.10"
25 | termcolor = "^2.4.0"
26 | tabulate = "^0.9.0"
27 | nltk = "^3.9.1"
28 | langsmith = "^0.1.104"
29 | instructor = "^1.4.0"
30 | flask = "^3.0.3"
31 | numpy = "^2.1.0"
32 | 
33 | 
34 | [build-system]
35 | requires = ["poetry-core"]
36 | build-backend = "poetry.core.masonry.api"
37 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | agentops==0.3.10 ; python_version >= "3.10" and python_version < "4.0"
 2 | aiohappyeyeballs==2.4.0 ; python_version >= "3.10" and python_version < "4.0"
 3 | aiohttp==3.10.5 ; python_version >= "3.10" and python_version < "4.0"
 4 | aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "4.0"
 5 | annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "4.0"
 6 | anyio==4.4.0 ; python_version >= "3.10" and python_version < "4.0"
 7 | async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.11"
 8 | attrs==24.2.0 ; python_version >= "3.10" and python_version < "4.0"
 9 | boto3==1.35.1 ; python_version >= "3.10" and python_version < "4.0"
10 | botocore==1.35.1 ; python_version >= "3.10" and python_version < "4.0"
11 | certifi==2024.7.4 ; python_version >= "3.10" and python_version < "4.0"
12 | cffi==1.17.0 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy"
13 | charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "4.0"
14 | click==8.1.7 ; python_version >= "3.10" and python_version < "4.0"
15 | colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0"
16 | cryptography==43.0.0 ; python_version >= "3.10" and python_version < "4.0"
17 | distro==1.9.0 ; python_version >= "3.10" and python_version < "4.0"
18 | docstring-parser==0.16 ; python_version >= "3.10" and python_version < "4.0"
19 | exceptiongroup==1.2.2 ; python_version >= "3.10" and python_version < "3.11"
20 | filelock==3.15.4 ; python_version >= "3.10" and python_version < "4.0"
21 | frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0"
22 | fsspec==2024.6.1 ; python_version >= "3.10" and python_version < "4.0"
23 | greenlet==3.0.3 ; python_version >= "3.10" and python_version < "4.0"
24 | h11==0.14.0 ; python_version >= "3.10" and python_version < "4.0"
25 | httpcore==1.0.5 ; python_version >= "3.10" and python_version < "4.0"
26 | httpx==0.27.0 ; python_version >= "3.10" and python_version < "4.0"
27 | huggingface-hub==0.24.6 ; python_version >= "3.10" and python_version < "4.0"
28 | idna==3.7 ; python_version >= "3.10" and python_version < "4.0"
29 | importlib-metadata==8.3.0 ; python_version >= "3.10" and python_version < "4.0"
30 | iniconfig==2.0.0 ; python_version >= "3.10" and python_version < "4.0"
31 | instructor==1.4.0 ; python_version >= "3.10" and python_version < "4.0"
32 | jinja2==3.1.4 ; python_version >= "3.10" and python_version < "4.0"
33 | jiter==0.4.2 ; python_version >= "3.10" and python_version < "4.0"
34 | jmespath==1.0.1 ; python_version >= "3.10" and python_version < "4.0"
35 | joblib==1.4.2 ; python_version >= "3.10" and python_version < "4.0"
36 | jsonschema-specifications==2023.12.1 ; python_version >= "3.10" and python_version < "4.0"
37 | jsonschema==4.23.0 ; python_version >= "3.10" and python_version < "4.0"
38 | langsmith==0.1.104 ; python_version >= "3.10" and python_version < "4.0"
39 | litellm==1.43.18 ; python_version >= "3.10" and python_version < "4.0"
40 | markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0"
41 | markupsafe==2.1.5 ; python_version >= "3.10" and python_version < "4.0"
42 | mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
43 | multidict==6.0.5 ; python_version >= "3.10" and python_version < "4.0"
44 | nltk==3.9.1 ; python_version >= "3.10" and python_version < "4.0"
45 | openai==1.41.1 ; python_version >= "3.10" and python_version < "4.0"
46 | orjson==3.10.7 ; python_version >= "3.10" and python_version < "4.0"
47 | packaging==23.2 ; python_version >= "3.10" and python_version < "4.0"
48 | pdfminer-six==20231228 ; python_version >= "3.10" and python_version < "4.0"
49 | pdfplumber==0.11.2 ; python_version >= "3.10" and python_version < "4.0"
50 | pillow==10.4.0 ; python_version >= "3.10" and python_version < "4.0"
51 | playwright-stealth==1.0.6 ; python_version >= "3.10" and python_version < "4.0"
52 | playwright==1.46.0 ; python_version >= "3.10" and python_version < "4.0"
53 | pluggy==1.5.0 ; python_version >= "3.10" and python_version < "4.0"
54 | psutil==5.9.8 ; python_version >= "3.10" and python_version < "4.0"
55 | pycparser==2.22 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy"
56 | pydantic-core==2.20.1 ; python_version >= "3.10" and python_version < "4.0"
57 | pydantic==2.8.2 ; python_version >= "3.10" and python_version < "4.0"
58 | pyee==11.1.0 ; python_version >= "3.10" and python_version < "4.0"
59 | pygments==2.18.0 ; python_version >= "3.10" and python_version < "4.0"
60 | pypdfium2==4.30.0 ; python_version >= "3.10" and python_version < "4.0"
61 | pytest-base-url==2.1.0 ; python_version >= "3.10" and python_version < "4.0"
62 | pytest-playwright==0.5.1 ; python_version >= "3.10" and python_version < "4.0"
63 | pytest==8.3.2 ; python_version >= "3.10" and python_version < "4.0"
64 | python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "4.0"
65 | python-dotenv==1.0.1 ; python_version >= "3.10" and python_version < "4.0"
66 | python-json-logger==2.0.7 ; python_version >= "3.10" and python_version < "4.0"
67 | python-slugify==8.0.4 ; python_version >= "3.10" and python_version < "4.0"
68 | pyyaml==6.0.1 ; python_version >= "3.10" and python_version < "4.0"
69 | referencing==0.35.1 ; python_version >= "3.10" and python_version < "4.0"
70 | regex==2024.7.24 ; python_version >= "3.10" and python_version < "4.0"
71 | requests==2.31.0 ; python_version >= "3.10" and python_version < "4.0"
72 | rich==13.8.0 ; python_version >= "3.10" and python_version < "4.0"
73 | rpds-py==0.20.0 ; python_version >= "3.10" and python_version < "4.0"
74 | ruff==0.5.7 ; python_version >= "3.10" and python_version < "4.0"
75 | s3transfer==0.10.2 ; python_version >= "3.10" and python_version < "4.0"
76 | setuptools==72.2.0 ; python_version >= "3.10" and python_version < "4.0"
77 | shellingham==1.5.4 ; python_version >= "3.10" and python_version < "4.0"
78 | six==1.16.0 ; python_version >= "3.10" and python_version < "4.0"
79 | sniffio==1.3.1 ; python_version >= "3.10" and python_version < "4.0"
80 | tabulate==0.9.0 ; python_version >= "3.10" and python_version < "4.0"
81 | tenacity==8.5.0 ; python_version >= "3.10" and python_version < "4.0"
82 | termcolor==2.4.0 ; python_version >= "3.10" and python_version < "4.0"
83 | text-unidecode==1.3 ; python_version >= "3.10" and python_version < "4.0"
84 | tiktoken==0.7.0 ; python_version >= "3.10" and python_version < "4.0"
85 | tokenizers==0.20.0 ; python_version >= "3.10" and python_version < "4.0"
86 | tomli==2.0.1 ; python_version >= "3.10" and python_version < "3.11"
87 | tqdm==4.66.5 ; python_version >= "3.10" and python_version < "4.0"
88 | typer==0.12.5 ; python_version >= "3.10" and python_version < "4.0"
89 | typing-extensions==4.12.2 ; python_version >= "3.10" and python_version < "4.0"
90 | urllib3==2.2.2 ; python_version >= "3.10" and python_version < "4.0"
91 | yarl==1.9.4 ; python_version >= "3.10" and python_version < "4.0"
92 | zipp==3.20.0 ; python_version >= "3.10" and python_version < "4.0"
93 | 


--------------------------------------------------------------------------------
/server.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | from flask import Flask, jsonify, request
 4 | 
 5 | from agentq.__main__ import run_agent_sync
 6 | from agentq.core.mcts.browser_mcts import main as run_browser_mcts
 7 | 
# Flask application object; the route decorators below register their views on it.
app = Flask(__name__)
 9 | 
10 | 
@app.route("/execute", methods=["GET"])
def execute_command():
    """Handle ``GET /execute`` by running the agent on the ``goal`` query parameter.

    Returns:
        A JSON body ``{"result": ...}`` wrapping the value returned by
        ``run_agent_sync``, or a JSON error paired with HTTP status 400 when
        ``goal`` is missing or empty.
    """
    goal = request.args.get("goal")
    if goal:
        # Make sure a current event loop is set before the agent is started.
        try:
            asyncio.get_event_loop()
        except RuntimeError:
            asyncio.set_event_loop(asyncio.new_event_loop())

        # Run the agent and wrap whatever it returns.
        return jsonify({"result": run_agent_sync(command=goal)})

    return jsonify({"error": "No command provided"}), 400
27 | 
28 | 
@app.route("/execute_mcts", methods=["GET"])
def run_mcts():
    """Handle ``GET /execute_mcts`` by running browser MCTS on the ``goal`` query parameter.

    Returns:
        The value produced by ``run_browser_mcts(objective, eval_mode=True)``,
        returned to Flask unchanged, or a JSON error paired with HTTP status 400
        when ``goal`` is missing or empty.
    """
    objective = request.args.get("goal")
    if objective:
        # Reuse the current event loop, creating and installing one if none exists.
        try:
            event_loop = asyncio.get_event_loop()
        except RuntimeError:
            event_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(event_loop)

        # Drive the MCTS coroutine to completion on that loop.
        mcts_coroutine = run_browser_mcts(objective, eval_mode=True)
        return event_loop.run_until_complete(mcts_coroutine)

    return jsonify({"error": "No objective provided"}), 400
45 | 
46 | 
if __name__ == "__main__":
    # Script entry point: serve on all interfaces (0.0.0.0), port 8000, with
    # Flask's threaded mode switched off.
    # NOTE(review): the routes above have no authentication; confirm that binding
    # to 0.0.0.0 is intended for the deployment environment.
    app.run(host="0.0.0.0", port=8000, threaded=False)
49 | 


--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/test/__init__.py


--------------------------------------------------------------------------------
/test/run_tests.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import asyncio
 3 | 
 4 | from test.tests_processor import run_tests
 5 | 
 6 | if __name__ == "__main__":
 7 |     # Create the parser
 8 |     parser = argparse.ArgumentParser(
 9 |         description="Run test suite for specified range of test tasks."
10 |     )
11 | 
12 |     # Add arguments
13 |     parser.add_argument(
14 |         "-s",
15 |         "--take_screenshots",
16 |         type=bool,
17 |         default=False,
18 |         help="Take screenshots after every operation performed (default: False)",
19 |     )
20 |     parser.add_argument(
21 |         "-wait",
22 |         "--wait_time_non_headless",
23 |         type=int,
24 |         default=5,
25 |         help="Time to wait between test tasks when running in non-headless mode (default: 10 seconds)",
26 |     )
27 |     parser.add_argument(
28 |         "-min",
29 |         "--min_task_index",
30 |         type=int,
31 |         default=0,
32 |         help="Minimum task index to start tests from (default: 0)",
33 |     )
34 |     parser.add_argument(
35 |         "-max",
36 |         "--max_task_index",
37 |         type=int,
38 |         help="Maximum task index to end tests with, non-inclusive (default is all the tests in the file).",
39 |     )
40 |     parser.add_argument(
41 |         "-id",
42 |         "--test_results_id",
43 |         type=str,
44 |         default="",
45 |         help="A unique identifier for the test results. If not provided, a timestamp is used.",
46 |     )
47 |     parser.add_argument(
48 |         "-config",
49 |         "--test_config_file",
50 |         type=str,
51 |         help='Path to the test configuration file. Default is "test/tasks/test.json" in the project root.',
52 |     )
53 | 
54 |     # Parse the command line arguments
55 |     args = parser.parse_args()
56 | 
57 |     # Run the main function with the provided or default arguments, not passing browser_manager or AutoGenWrapper will cause the test processor to create new instances of them
58 |     asyncio.run(
59 |         run_tests(
60 |             orchestrator=None,
61 |             min_task_index=args.min_task_index,
62 |             max_task_index=args.max_task_index,
63 |             test_file=args.test_config_file,
64 |             test_results_id=args.test_results_id,
65 |             wait_time_non_headless=args.wait_time_non_headless,
66 |             take_screenshots=args.take_screenshots,
67 |         )
68 |     )
69 | 


--------------------------------------------------------------------------------
/test/tasks/two_tasks.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "sites": null,
 4 |     "task_id": 29,
 5 |     "require_login": false,
 6 |     "storage_state": null,
 7 |     "start_url": "https://www.allrecipes.com/",
 8 |     "geolocation": null,
 9 |     "intent_template": "Search for a Mediterranean-style grilled fish recipe on Allrecipes that includes ingredients like olives, has at least a 4-star rating, and more than 25 reviews. Detail the ingredients, cooking method, and total time required for preparation and cooking.",
10 |     "instantiation_dict": {},
11 |     "intent": "Search for a Mediterranean-style grilled fish recipe on Allrecipes that includes ingredients like olives, has at least a 4-star rating, and more than 25 reviews. Detail the ingredients, cooking method, and total time required for preparation and cooking.",
12 |     "require_reset": false,
13 |     "eval": {
14 |       "eval_types": ["manual"],
15 |       "reference_answers": {
16 |         "manual_check": {
17 |           "answer": "'Branzino Mediterranean', 36 reviews, <Ingredients> include olive oil, <cooking method>, Prep Time: 15 mins, Cook Time: 25 mins, Total Time: 40 mins",
18 |           "type": "possible"
19 |         }
20 |       },
21 |       "reference_url": null,
22 |       "program_html": null
23 |     },
24 |     "task_alias": "Allrecipes--29",
25 |     "task_index": 1
26 |   },
27 |   {
28 |     "sites": null,
29 |     "task_id": 72,
30 |     "require_login": false,
31 |     "storage_state": null,
32 |     "start_url": "https://www.amazon.com/",
33 |     "geolocation": null,
34 |     "intent_template": "Look for a USB-C hub on Amazon compatible with MacBook Pro, featuring at least 4 ports, including HDMI and SD card reader. The price should be under $50. Select the one after sorting by Best Sellers.",
35 |     "instantiation_dict": {},
36 |     "intent": "Look for a USB-C hub on Amazon compatible with MacBook Pro, featuring at least 4 ports, including HDMI and SD card reader. The price should be under $50. Select the one after sorting by Best Sellers.",
37 |     "require_reset": false,
38 |     "eval": {
39 |       "eval_types": ["manual"],
40 |       "reference_answers": {
41 |         "manual_check": {
42 |           "answer": "Hiearcool USB C Hub, USB C Multi-Port Adapter for MacBook Pro, 7IN1, include 4K HDMI USB3.0 and SD/TF Card Reader, $24.99",
43 |           "type": "possible"
44 |         }
45 |       },
46 |       "reference_url": null,
47 |       "program_html": null
48 |     },
49 |     "task_alias": "Amazon--27",
50 |     "task_index": 2
51 |   }
52 | ]
53 | 


--------------------------------------------------------------------------------
/test/test_config_auditor.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from typing import Any, Dict, List
 4 | 
 5 | from agentq.config.config import PROJECT_TEST_ROOT
 6 | from test.test_utils import load_config
 7 | 
 8 | 
 9 | def validate_and_update_task_ids(tasks: List[Dict[str, Any]]) -> None:
10 |     """Ensure that task IDs match their positions in the List and update them if necessary.
11 | 
12 |     Args:
13 |         tasks (List[Dict[str, Any]]): The List of tasks to process.
14 |     """
15 |     for index, task in enumerate(tasks):
16 |         task["task_id"] = index
17 | 
18 | 
def substitute_intent_templates(tasks: List[Dict[str, Any]]) -> None:
    """Fill ``{{key}}`` placeholders in each task's intent_template and store the result.

    For every task that has both an ``intent_template`` and an instantiation
    mapping, each ``{{key}}`` placeholder is replaced by ``str(value)`` and the
    result is written to ``task["intent"]``. Tasks lacking either field are
    left untouched.

    The mapping is read from ``"instantiation_dict"`` (the key the task files
    use); the legacy spelling ``"instantiation_Dict"`` is still accepted as a
    fallback. A null mapping is treated as empty.

    Args:
        tasks (List[Dict[str, Any]]): The tasks to process; mutated in place.
    """
    for task in tasks:
        if "intent_template" not in task:
            continue

        if "instantiation_dict" in task:
            substitutions = task["instantiation_dict"]
        elif "instantiation_Dict" in task:
            # Legacy key spelling kept for backward compatibility.
            substitutions = task["instantiation_Dict"]
        else:
            continue

        template = task["intent_template"]
        # "or {}" guards against a JSON null stored under the mapping key.
        for key, value in (substitutions or {}).items():
            template = template.replace("{{" + key + "}}", str(value))
        task["intent"] = template
32 | 
33 | 
def save_json_file(tasks: List[Dict[str, Any]], file_path: str) -> None:
    """Write the task list to ``file_path`` as UTF-8 JSON.

    The output uses a 4-space indent and keeps non-ASCII characters unescaped.
    An existing file at ``file_path`` is overwritten.

    Args:
        tasks (List[Dict[str, Any]]): The tasks to serialise.
        file_path (str): Destination path of the JSON file.
    """
    with open(file_path, "w", encoding="utf-8") as out_stream:
        json.dump(obj=tasks, fp=out_stream, indent=4, ensure_ascii=False)
43 | 
44 | 
def process_tasks(file_path: str) -> None:
    """Load a task file, normalise its tasks, and write it back to the same path.

    The tasks are renumbered first and then have their intent templates
    substituted; the file at ``file_path`` is overwritten with the result.

    Args:
        file_path (str): Path of the JSON task file to rewrite in place.
    """
    task_list = load_config(file_path)
    for transform in (validate_and_update_task_ids, substitute_intent_templates):
        transform(task_list)
    save_json_file(task_list, file_path)
55 | 
56 | 
if __name__ == "__main__":
    # Script entry point: rewrite tasks/test.json under PROJECT_TEST_ROOT in place.
    file_path = os.path.join(PROJECT_TEST_ROOT, "tasks", "test.json")
    process_tasks(file_path)
60 | 


--------------------------------------------------------------------------------
/test/test_tasks_formatter.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | 
 4 | # read the test configuration file, copy what is in task_id to task_alias and make task_id have an incremental numeric value, then save the file back to the same location
 5 | def format_test_config_file(test_config_file: str):
 6 |     with open(test_config_file, "r") as file:
 7 |         tasks = json.load(file)
 8 |     for i, task in enumerate(tasks):
 9 |         if "task_alias" in task:
10 |             continue
11 | 
12 |         task["task_alias"] = task["task_id"]
13 |         task["task_id"] = i
14 |         tasks[i] = task
15 |     with open(test_config_file, "w") as file:
16 |         json.dump(tasks, file, indent=4)
17 | 
def add_task_index_to_test_config_file(test_config_file: str):
    """Stamp every task with its list position as ``task_index``, rewriting the file in place.

    Any existing ``task_index`` is overwritten. The result is written back to the
    same path with a 4-space indent.

    Args:
        test_config_file (str): Path of the JSON test configuration file.
    """
    with open(test_config_file, "r") as source:
        task_list = json.load(source)

    position = 0
    for entry in task_list:
        entry["task_index"] = position
        position += 1

    with open(test_config_file, "w") as sink:
        json.dump(task_list, sink, indent=4)
26 | format_test_config_file("test/tasks/webvoyager_test.json")
27 | add_task_index_to_test_config_file("test/tasks/webvoyager_test.json")
28 | 


--------------------------------------------------------------------------------
/test/test_utils.py:
--------------------------------------------------------------------------------
  1 | """Implements helper functions to assist evaluation cases where other evaluators are not suitable."""
  2 | 
  3 | import json
  4 | import os
  5 | from datetime import datetime
  6 | from pathlib import Path
  7 | from typing import Any, Dict, List, Optional, Union
  8 | 
  9 | from dotenv import load_dotenv
 10 | from nltk.tokenize import word_tokenize  # type: ignore
 11 | from openai import OpenAI
 12 | 
# Called before the client is built; presumably loads variables from a .env file
# into the environment (see python-dotenv).
load_dotenv()
# Module-level OpenAI client, constructed at import time and used by
# generate_from_openai_chat_completion below.
client = OpenAI()
 15 | 
 16 | 
 17 | def llm_fuzzy_match(pred: str, reference: str, question: str) -> float:
 18 |     """
 19 |     Evaluates if a predicted answer matches a reference answer semantically, considering the context of a question.
 20 | 
 21 |     This function simulates a grading scenario, understanding that a student's answer may use different wording or phrasing from the reference answer. It uses GPT-4-turbo model to assess semantic equivalence.
 22 | 
 23 |     Parameters:
 24 |         pred (str): The student's predicted answer.
 25 |         reference (str): The reference answer to compare against.
 26 |         question (str): The question related to the answers.
 27 | 
 28 |     Returns:
 29 |         float: Returns 1.0 if the predicted answer is semantically equivalent to the reference, otherwise 0.0.
 30 |     """
 31 |     messages: List[Dict[str, Any]] = []
 32 |     # construct the question to ask
 33 |     message = "Help a teacher to grade the answer of a student given a question. Keep in mind that the student may use different phrasing or wording to answer the question. The goal is to evaluate whether the answer is semantically equivalent to the reference answer.\n"
 34 |     message += f"question: {question}\n"
 35 |     message += f"reference answer: {reference}\n"
 36 |     message += "all the string 'N/A' that you see is a special sequence that means 'not achievable'\n"
 37 |     message += f"student answer: {pred}\n"
 38 |     message += "Conclude the judgement by correct/incorrect/partially correct."
 39 |     messages = [
 40 |         {"role": "system", "content": "You are a helpful assistant"},
 41 |         {"role": "user", "content": message},
 42 |     ]
 43 | 
 44 |     response = generate_from_openai_chat_completion(
 45 |         model="gpt-4-turbo-preview",
 46 |         messages=messages,
 47 |         temperature=0,
 48 |         max_tokens=768,
 49 |         top_p=1.0,
 50 |         context_length=0,
 51 |     ).lower()
 52 |     if "partially correct" in response or "incorrect" in response:
 53 |         return 0.0
 54 |     else:
 55 |         assert "correct" in response
 56 |         return 1.0
 57 | 
 58 | 
 59 | def llm_ua_match(pred: str, reference: str, question: str) -> float:
 60 |     """
 61 |     Evaluates the alignment between a reported reason for a task being unachievable and the actual reason.
 62 | 
 63 |     This function reviews both the actual and reported reasons for a task's unachievability within the context of the task.
 64 |     It assesses if the reported reason is implicitly or explicitly in line with the actual reason, using GPT-turbo model.
 65 | 
 66 |     Parameters:
 67 |         pred (str): The reported unachievable reason by an individual.
 68 |         reference (str): The actual reason why the task is unachievable.
 69 |         question (str): The task in question.
 70 | 
 71 |     Returns:
 72 |         float: Returns 1.0 if the reported reason aligns with the actual reason, otherwise 0.0.
 73 |     """
 74 |     messages: List[Dict[str, Any]] = []
 75 |     # construct the question to ask
 76 |     message = ""
 77 |     message += f"task: {question}\n"
 78 |     message += f"actual unachievable reason: {reference}\n"
 79 |     message += f"reported unachievable reason: {pred}\n"
 80 |     message += (
 81 |         "The task described above is inherently unachievable due to the reason specified under 'actual unachievable reason'. "
 82 |         "An individual previously attempted this task and was unable to complete it. They provided a reason for their failure, "
 83 |         "which is Listed under 'reported unachievable reason'. Your role is to review both the actual and reported reasons. "
 84 |         "Determine if the reported reason aligns with the actual reason, even if implicitly. "
 85 |         "If the stated reason is in line with the actual reason, respond with 'same'. Otherwise, respond with 'different'."
 86 |     )
 87 |     messages = [
 88 |         {"role": "system", "content": "You are a helpful assistant"},
 89 |         {"role": "user", "content": message},
 90 |     ]
 91 | 
 92 |     response = generate_from_openai_chat_completion(
 93 |         model="gpt-4-turbo-preview",
 94 |         messages=messages,
 95 |         temperature=0,
 96 |         max_tokens=768,
 97 |         top_p=1.0,
 98 |         context_length=0,
 99 |     ).lower()
100 |     if "different" in response:
101 |         return 0.0
102 |     else:
103 |         assert "same" in response
104 |         return 1.0
105 | 
106 | 
def generate_from_openai_chat_completion(
    messages: List[Dict[str, str]],
    model: str,
    temperature: float,
    max_tokens: int,
    top_p: float,
    context_length: int,
    stop_token: Optional[str] = None,
) -> str:
    """
    Requests a single chat completion from OpenAI and returns the text of its first choice.

    The module-level client is configured from the environment on every call:
    the API key comes from 'OPENAI_API_KEY' and the organization from
    'OPENAI_ORGANIZATION' (an empty string when unset).

    Parameters:
        messages (List[dict[str, str]]): The conversation to send to the model.
        model (str): The model name to use for generating the completion.
        temperature (float): Sampling temperature for generation.
        max_tokens (int): Maximum number of tokens to generate.
        top_p (float): Nucleus sampling parameter.
        context_length (int): Accepted for interface compatibility; not used by this function's body.
        stop_token (str, optional): If truthy, passed to the API as the only stop sequence.

    Returns:
        str: The message content of the first returned choice.

    Raises:
        ValueError: If the 'OPENAI_API_KEY' environment variable is not set.
    """
    if "OPENAI_API_KEY" in os.environ:
        client.api_key = os.environ["OPENAI_API_KEY"]
    else:
        raise ValueError(
            "OPENAI_API_KEY environment variable must be set when using OpenAI API."
        )
    client.organization = os.environ.get("OPENAI_ORGANIZATION", "")

    # A falsy stop token means "no stop sequences".
    stop_sequences = [stop_token] if stop_token else None
    completion = client.chat.completions.create(
        model=model,
        messages=messages,  # type: ignore
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        n=1,
        stop=stop_sequences,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content  # type: ignore
155 | 
156 | 
def clean_answer(answer: str) -> str:
    """Normalises an answer string for comparison.

    Surrounding whitespace is stripped first, then surrounding double quotes,
    then surrounding single quotes, and the result is lower-cased.

    Parameters:
        answer (str): The answer string to clean.

    Returns:
        str: The cleaned and lowercased answer string.
    """
    trimmed = answer.strip()
    # Order matters: double quotes are peeled off before single quotes.
    for quote_char in ('"', "'"):
        trimmed = trimmed.strip(quote_char)
    return trimmed.lower()
168 | 
169 | 
def evaluate_exact_match(ref: str, pred: str) -> float:
    """Scores whether the cleaned prediction equals the cleaned reference.

    Both strings are normalised with ``clean_answer`` before being compared.

    Parameters:
        ref (str): The reference answer.
        pred (str): The predicted answer.

    Returns:
        float: 1.0 if the cleaned answers are equal, otherwise 0.0.
    """
    normalized_pred = clean_answer(pred)
    normalized_ref = clean_answer(ref)
    return 1.0 if normalized_pred == normalized_ref else 0.0
181 | 
182 | 
def evaluate_must_include(ref: str, pred: str, tokenize: bool = False) -> float:
    """Scores whether the cleaned reference occurs inside the cleaned prediction.

    By default this is a substring test. When ``tokenize`` is True and the
    cleaned reference is a single character, the prediction is split with
    ``word_tokenize`` and the reference must equal one of the tokens.

    Parameters:
        ref (str): The reference text that must be included.
        pred (str): The predicted answer to be evaluated.
        tokenize (bool, optional): Enables token-level matching for single-character references. Default is False.

    Returns:
        float: 1.0 if the reference is found, otherwise 0.0.
    """
    needle = clean_answer(ref)
    haystack = clean_answer(pred)
    if not tokenize or len(needle) != 1:
        return float(needle in haystack)
    return float(needle in word_tokenize(haystack))
200 | 
201 | 
def evaluate_fuzzy_match(ref: str, pred: str, intent: str) -> float:
    """Scores semantic similarity between prediction and reference via ``llm_fuzzy_match``.

    This is an argument-reordering wrapper: the intent is passed on as the question.

    Parameters:
        ref (str): The reference answer.
        pred (str): The predicted answer.
        intent (str): The intent or context of the question.

    Returns:
        float: The score returned by ``llm_fuzzy_match`` (1.0 or 0.0).
    """
    return llm_fuzzy_match(pred=pred, reference=ref, question=intent)
216 | 
217 | 
def evaluate_ua_match(ref: str, pred: str, intent: str) -> float:
    """Scores whether a reported unachievable-reason matches the reference via ``llm_ua_match``.

    This is an argument-reordering wrapper: the intent is passed on as the question.

    Parameters:
        ref (str): The reference reason why the task is unachievable.
        pred (str): The predicted reason reported by the model.
        intent (str): The intent or context of the task.

    Returns:
        float: The score returned by ``llm_ua_match`` (1.0 or 0.0).
    """
    return llm_ua_match(pred=pred, reference=ref, question=intent)
230 | 
231 | 
def load_config(config_file: Union[Path, str]) -> List[Dict[str, Any]]:
    """Load the test-case configuration from a JSON file.

    Args:
        config_file Union[Path, str]: Path to the config file

    Returns:
        List[dict[str, Any]]: The parsed JSON content of the config file
    """
    with open(config_file, "r") as config_stream:  # noqa: UP015
        return json.load(config_stream)
244 | 
245 | 
def task_config_validator(task_config: Dict[str, Any]) -> bool:
    """Check that a task config carries a non-empty ``intent``.

    Args:
        task_config (Dict[str, Any]): A single task configuration.

    Returns:
        bool: Always True when validation succeeds.

    Raises:
        ValueError: If ``intent`` is missing or falsy.
    """
    if task_config.get("intent"):
        return True

    raise ValueError(
        "Intent is missing in the task config file. Without it the task cannot be run."
    )
256 | 
257 | 
def get_formatted_current_timestamp(format: str = "%Y-%m-%d %H:%M:%S") -> str:
    """Return the current local time rendered with ``strftime``.

    Args:
        format (str, optional): The ``strftime`` format to apply. Defaults to "%Y-%m-%d %H:%M:%S".

    Returns:
        str: The current timestamp in the specified format.
    """
    return datetime.now().strftime(format)
273 | 


--------------------------------------------------------------------------------
/test/tests_processor.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import json
  3 | import os
  4 | import time
  5 | from typing import Any, Dict, List, Optional, Tuple
  6 | 
  7 | from playwright.async_api import Page
  8 | from tabulate import tabulate
  9 | from termcolor import colored
 10 | 
 11 | from agentq.config.config import PROJECT_TEST_ROOT
 12 | from agentq.core.agent.agentq import AgentQ
 13 | from agentq.core.agent.agentq_actor import AgentQActor
 14 | from agentq.core.agent.agentq_critic import AgentQCritic
 15 | from agentq.core.agent.browser_nav_agent import BrowserNavAgent
 16 | from agentq.core.agent.planner_agent import PlannerAgent
 17 | from agentq.core.models.models import State
 18 | from agentq.core.orchestrator.orchestrator import Orchestrator
 19 | from agentq.utils.logger import logger
 20 | from test.evaluators import evaluator_router
 21 | from test.test_utils import (
 22 |     get_formatted_current_timestamp,
 23 |     load_config,
 24 |     task_config_validator,
 25 | )
 26 | 
# Sub-directories of PROJECT_TEST_ROOT used by the test processor.
TEST_TASKS = os.path.join(PROJECT_TEST_ROOT, "tasks")  # task definition files
TEST_LOGS = os.path.join(PROJECT_TEST_ROOT, "logs")  # per-task log folders
TEST_RESULTS = os.path.join(PROJECT_TEST_ROOT, "results")  # result JSON files
 30 | 
 31 | 
 32 | def check_top_level_test_folders():
 33 |     for folder in [TEST_LOGS, TEST_RESULTS]:
 34 |         if not os.path.exists(folder):
 35 |             os.makedirs(folder)
 36 |             logger.info(f"Created folder at: {folder}")
 37 | 
 38 | 
 39 | def create_test_results_id(test_results_id: Optional[str], test_file: str) -> str:
 40 |     prefix = "test_results_for_"
 41 |     if test_results_id:
 42 |         return f"{prefix}{test_results_id}"
 43 |     test_file_base = os.path.basename(test_file)
 44 |     test_file_name = os.path.splitext(test_file_base)[0]
 45 |     return f"{prefix}{test_file_name}"
 46 | 
 47 | 
 48 | def create_task_log_folders(task_id: str, test_results_id: str) -> Dict[str, str]:
 49 |     task_log_dir = os.path.join(
 50 |         TEST_LOGS, f"{test_results_id}", f"logs_for_task_{task_id}"
 51 |     )
 52 |     task_screenshots_dir = os.path.join(task_log_dir, "snapshots")
 53 |     for directory in [task_log_dir, task_screenshots_dir]:
 54 |         if not os.path.exists(directory):
 55 |             os.makedirs(directory)
 56 |             logger.info(f"Created directory at: {directory}")
 57 |     return {
 58 |         "task_log_folder": task_log_dir,
 59 |         "task_screenshots_folder": task_screenshots_dir,
 60 |     }
 61 | 
 62 | 
 63 | def create_results_dir(test_file: str, test_results_id: Optional[str]) -> str:
 64 |     if test_results_id:
 65 |         results_dir = os.path.join(TEST_RESULTS, f"results_for_{test_results_id}")
 66 |     else:
 67 |         test_file_base = os.path.basename(test_file)
 68 |         test_file_name = os.path.splitext(test_file_base)[0]
 69 |         results_dir = os.path.join(
 70 |             TEST_RESULTS, f"results_for_test_file_{test_file_name}"
 71 |         )
 72 |     if not os.path.exists(results_dir):
 73 |         os.makedirs(results_dir)
 74 |         logger.info(f"Created results directory: {results_dir}")
 75 |     return results_dir
 76 | 
 77 | 
 78 | def dump_log(task_id: str, messages: Dict[str, Any], logs_dir: str):
 79 |     file_name = os.path.join(logs_dir, f"execution_logs_{task_id}.json")
 80 |     with open(file_name, "w", encoding="utf-8") as f:
 81 |         json.dump(messages, f, ensure_ascii=False, indent=4)
 82 | 
 83 | 
 84 | def save_test_results(test_results: List[Dict[str, Any]], test_results_id: str):
 85 |     file_name = os.path.join(TEST_RESULTS, f"test_results_{test_results_id}.json")
 86 |     with open(file_name, "w", encoding="utf-8") as f:
 87 |         json.dump(test_results, f, ensure_ascii=False, indent=4)
 88 |     logger.info(f"Test results dumped to: {file_name}")
 89 | 
 90 | 
 91 | def save_individual_test_result(test_result: Dict[str, Any], results_dir: str):
 92 |     task_id = test_result["task_id"]
 93 |     file_name = os.path.join(results_dir, f"test_result_{task_id}.json")
 94 |     with open(file_name, "w", encoding="utf-8") as f:
 95 |         json.dump(test_result, f, ensure_ascii=False, indent=4)
 96 |     logger.info(f"Test result for task {task_id} dumped to: {file_name}")
 97 | 
 98 | 
 99 | def print_progress_bar(current: int, total: int, bar_length: int = 50) -> None:
100 |     percent = float(current) * 100 / total
101 |     arrow = "-" * int(percent / 100 * bar_length - 1) + ">"
102 |     spaces = " " * (bar_length - len(arrow))
103 |     print(f"\rProgress: [{arrow}{spaces}] {current}/{total} ({percent:.2f}%)", end="")
104 | 
105 | 
def determine_status_and_color(score: float) -> Tuple[str, str]:
    """Map an evaluator score to a display status and a colour name.

    Args:
        score: Score of a task. Exactly ``1`` means pass, any negative value
            means the task was skipped, everything else counts as a failure.

    Returns:
        A ``(status, color)`` pair: ``("Pass", "green")``,
        ``("Skip", "yellow")`` or ``("Fail", "red")``.
    """
    if score < 0:
        return "Skip", "yellow"
    if score == 1:
        return "Pass", "green"
    return "Fail", "red"
113 | 
114 | 
def print_test_result(task_result: Dict[str, Any], index: int, total: int) -> None:
    """Print a one-row grid table describing a finished task.

    Args:
        task_result: Result dict providing ``"score"``, ``"task_id"``,
            ``"intent"`` and ``"tct"`` (time taken in seconds).
        index: Number shown in the "Test Index" column.
        total: Total number of tests; accepted but not used by this function.
    """
    status, color = determine_status_and_color(task_result["score"])
    header_row = ["Test Index", "Task ID", "Intent", "Status", "Time Taken (s)"]
    value_row = [
        index,
        task_result["task_id"],
        task_result["intent"],
        colored(status, color),
        round(task_result["tct"], 2),
    ]
    rendered = tabulate([header_row, value_row], headers="firstrow", tablefmt="grid")
    # Leading newline moves off the progress-bar line, which has no newline.
    print("\n" + rendered)
128 | 
129 | 
async def execute_single_task(
    task_config: Dict[str, Any],
    orchestrator: Orchestrator,
    page: Page,
    logs_dir: str,
) -> Dict[str, Any]:
    """Run one task through the orchestrator and evaluate the outcome.

    Steps: validate the config, optionally navigate to ``start_url``, execute
    the task's ``intent`` via ``orchestrator.execute_command``, dump the
    command/result pair to ``logs_dir`` and score the run with the evaluator
    chosen by ``evaluator_router``.

    Args:
        task_config: Task definition; ``intent``, ``task_id``, ``task_index``
            and ``start_url`` are read from it.
        orchestrator: Orchestrator used to execute the intent.
        page: Browser page the task runs in and the evaluator inspects.
        logs_dir: Existing folder for the execution log file.

    Returns:
        A result dict with ``task_id``, ``task_index``, ``start_url``,
        ``intent``, ``last_url``, ``tct`` (seconds), ``start_ts``,
        ``completion_ts``, ``last_statement``, ``score`` and ``reason``.
    """
    task_config_validator(task_config)
    command = task_config.get("intent", "")
    task_id = task_config.get("task_id")
    task_index = task_config.get("task_index")
    start_url = task_config.get("start_url")
    logger.info(f"Intent: {command}, Task ID: {task_id}")

    if start_url:
        await page.goto(start_url, wait_until="load", timeout=30000)

    # Capture the start timestamp BEFORE running the command; sampling it
    # afterwards would make it identical to the completion timestamp.
    start_ts = get_formatted_current_timestamp()
    start_time = time.time()
    command_exec_result = await orchestrator.execute_command(command)
    end_time = time.time()
    completion_ts = get_formatted_current_timestamp()
    elapsed = end_time - start_time

    single_task_result = {
        "task_id": task_id,
        "task_index": task_index,
        "start_url": start_url,
        "intent": str(command),
        "last_url": page.url,
        "tct": elapsed,
        "start_ts": start_ts,
        "completion_ts": completion_ts,
    }

    logger.info(f'Command "{command}" took: {round(elapsed, 2)} seconds.')
    logger.info(f"Task {task_id} completed.")

    single_task_result["last_statement"] = command_exec_result

    dump_log(
        str(task_id), {"command": command, "result": command_exec_result}, logs_dir
    )

    evaluator = evaluator_router(task_config)
    # The evaluator reuses the existing page; no separate CDP session is
    # created, hence None is passed as the client.
    cdp_session = None
    evaluator_result = await evaluator(
        task_config=task_config,
        page=page,
        client=cdp_session,
        answer=command_exec_result,
    )

    single_task_result["score"] = evaluator_result["score"]
    single_task_result["reason"] = evaluator_result["reason"]

    return single_task_result
185 | 
186 | 
async def run_tests(
    orchestrator: Orchestrator,
    min_task_index: int,
    max_task_index: int,
    test_file: str = "",
    test_results_id: str = "",
    wait_time_non_headless: int = 5,
    take_screenshots: bool = True,
) -> List[Dict[str, Any]]:
    """Execute a slice of the configured tasks and print a report.

    Args:
        orchestrator: Started orchestrator whose ``playwright_manager``
            provides the page and screenshot handling.
        min_task_index: Index of the first task to run (inclusive).
        max_task_index: Index one past the last task to run; a falsy value
            means "up to the end of the task file".
        test_file: Path of the task file. Defaults to ``test.json`` inside
            ``TEST_TASKS`` when empty.
        test_results_id: Optional identifier for this run; derived from the
            task file name when empty.
        wait_time_non_headless: Seconds to pause after each task when the
            browser is not headless.
        take_screenshots: Whether screenshots are recorded per task.

    Returns:
        The result dicts of all executed tasks, in execution order.
    """
    check_top_level_test_folders()

    if not test_file:
        test_file = os.path.join(TEST_TASKS, "test.json")

    logger.info(f"Loading test configurations from: {test_file}")
    test_configurations = load_config(test_file)
    test_results_id = create_test_results_id(test_results_id, test_file)
    # NOTE(review): the id returned above is always non-empty and already
    # prefixed, so create_results_dir always names the folder
    # "results_for_test_results_for_..." - confirm this is intended.
    results_dir = create_results_dir(test_file, test_results_id)

    page = await orchestrator.playwright_manager.get_current_page()
    test_results = []
    max_task_index = len(test_configurations) if not max_task_index else max_task_index
    # Count the tasks that are actually selected: max_task_index may exceed the
    # number of configured tasks, in which case "max - min" overstates the
    # total and skews the progress bar and the averages.
    selected_tasks = test_configurations[min_task_index:max_task_index]
    total_tests = len(selected_tasks)

    for index, task_config in enumerate(selected_tasks, start=min_task_index):
        task_id = str(task_config.get("task_id"))
        log_folders = create_task_log_folders(task_id, test_results_id)

        orchestrator.playwright_manager.set_take_screenshots(take_screenshots)
        if take_screenshots:
            orchestrator.playwright_manager.set_screenshots_dir(
                log_folders["task_screenshots_folder"]
            )

        print_progress_bar(index - min_task_index, total_tests)
        task_result = await execute_single_task(
            task_config, orchestrator, page, log_folders["task_log_folder"]
        )
        test_results.append(task_result)
        save_individual_test_result(task_result, results_dir)
        print_test_result(task_result, index + 1, total_tests)

        if not orchestrator.playwright_manager.isheadless:
            await asyncio.sleep(wait_time_non_headless)

        await orchestrator.playwright_manager.take_screenshots("final", None)
        await orchestrator.playwright_manager.close_except_specified_tab(page)

    if total_tests:
        # Skipped for an empty selection, where a 0/0 bar would divide by zero.
        print_progress_bar(total_tests, total_tests)
    print("\n\nAll tests completed.")

    # Classify every result once so the detailed table and the summary
    # counters cannot disagree (e.g. for scores above 1).
    classified = [determine_status_and_color(result["score"]) for result in test_results]

    print("\nDetailed Test Results:")
    detailed_results_table = [
        ["Test Index", "Task ID", "Intent", "Status", "Time Taken (s)"]
    ]
    for idx, (result, (status, color)) in enumerate(zip(test_results, classified), 1):
        detailed_results_table.append(
            [
                idx,
                result["task_id"],
                result["intent"],
                colored(status, color),
                round(result["tct"], 2),
            ]
        )

    print(tabulate(detailed_results_table, headers="firstrow", tablefmt="grid"))

    statuses = [status for status, _color in classified]
    total_time = sum(result["tct"] for result in test_results)
    # Guard the average against an empty run.
    average_time = total_time / len(test_results) if test_results else 0.0

    summary_table = [
        [
            "Total Tests",
            "Passed",
            "Failed",
            "Skipped",
            "Average Time Taken (s)",
            "Total Time Taken (s)",
        ],
        [
            total_tests,
            statuses.count("Pass"),
            statuses.count("Fail"),
            statuses.count("Skip"),
            round(average_time, 2),
            round(total_time, 2),
        ],
    ]

    print("\nSummary Report:")
    print(tabulate(summary_table, headers="firstrow", tablefmt="grid"))

    return test_results
289 | 
290 | 
291 | # Main execution function (if needed)
async def main():
    """Start an orchestrator in eval mode, run the test suite, shut it down.

    Builds one agent per orchestrator state, runs the tasks with indices
    0 to 28 of the default task file and always shuts the orchestrator down
    afterwards, even when the run raises.
    """
    state_to_agent_map = {
        State.PLAN: PlannerAgent(),
        State.BROWSE: BrowserNavAgent(),
        State.AGENTQ_BASE: AgentQ(),
        State.AGENTQ_ACTOR: AgentQActor(),
        State.AGENTQ_CRITIC: AgentQCritic(),
    }
    orchestrator = Orchestrator(state_to_agent_map=state_to_agent_map, eval_mode=True)
    await orchestrator.start()
    try:
        # Example: run the tasks at indices 0..28 (max index is exclusive).
        await run_tests(orchestrator, 0, 29)
    finally:
        # Shut down even if a task fails, so the started orchestrator is not
        # left running.
        await orchestrator.shutdown()
304 | 
305 | 
# Run the test suite when this module is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
308 | 


--------------------------------------------------------------------------------