├── .env.example ├── .gitignore ├── .python-version ├── .vscode ├── extensions.json └── settings.json ├── AgentQ.txt ├── LICENSE ├── README.md ├── agentq ├── __init__.py ├── __main__.py ├── config │ ├── __init__.py │ └── config.py ├── core │ ├── agent │ │ ├── __init__.py │ │ ├── agentq.py │ │ ├── agentq_actor.py │ │ ├── agentq_critic.py │ │ ├── base.py │ │ ├── browser_nav_agent.py │ │ ├── captcha_agent.py │ │ ├── eval_agent.py │ │ ├── planner_agent.py │ │ └── vision_agent.py │ ├── mcts │ │ ├── __init__.py │ │ ├── browser_mcts.py │ │ ├── core │ │ │ ├── base.py │ │ │ └── mcts.py │ │ ├── example │ │ │ └── grid.py │ │ └── visualization │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── tree_log.py │ │ │ ├── tree_snapshot.py │ │ │ └── visualizer_client.py │ ├── memory │ │ ├── __init__.py │ │ └── ltm.py │ ├── models │ │ ├── __init__.py │ │ └── models.py │ ├── orchestrator │ │ └── orchestrator.py │ ├── prompts │ │ ├── __init__.py │ │ └── prompts.py │ ├── skills │ │ ├── __init__.py │ │ ├── click_using_selector.py │ │ ├── enter_text_and_click.py │ │ ├── enter_text_using_selector.py │ │ ├── get_dom_with_content_type.py │ │ ├── get_screenshot.py │ │ ├── get_url.py │ │ ├── get_user_input.py │ │ ├── open_url.py │ │ ├── pdf_text_extractor.py │ │ ├── press_key_combination.py │ │ ├── solve_captcha.py │ │ └── upload_file.py │ └── web_driver │ │ ├── __init__.py │ │ └── playwright.py ├── user_preferences │ └── user_preferences.txt └── utils │ ├── __init__.py │ ├── _pydantic.py │ ├── cli_helper.py │ ├── dom_helper.py │ ├── dom_mutation_observer.py │ ├── extract_json.py │ ├── function_utils.py │ ├── get_detailed_accessibility_tree.py │ ├── logger.py │ ├── message_type.py │ └── ui_messagetype.py ├── dpo_pairs.jsonl ├── logs.txt ├── output.txt ├── poetry.lock ├── pyproject.toml ├── requirements.txt ├── server.py └── test ├── __init__.py ├── evaluators.py ├── run_tests.py ├── tasks ├── annotator_dry_run_webvoyager_tasks_30.json ├── test.json ├── two_tasks.json ├── 
webvoyager_sampled_data.json └── webvoyager_test.json ├── test_config_auditor.py ├── test_tasks_formatter.py ├── test_utils.py └── tests_processor.py /.env.example: -------------------------------------------------------------------------------- 1 | # the model name must be gpt-4o-2024-08-06 as it is dependent on structured output from open ai 2 | MODEL_NAME="gpt-4o-2024-08-06" 3 | 4 | LITELLM_LOG="ERROR" 5 | 6 | OPENAI_API_KEY="" 7 | 8 | # you can skip adding langfuse api keys. refer to the readme on how to disable tracing with langfuse. 9 | LANGFUSE_SECRET_KEY="sk-lf-" 10 | LANGFUSE_PUBLIC_KEY="pk-lf-" 11 | LANGFUSE_HOST="https://cloud.langfuse.com" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv/ 3 | __pycache__ 4 | log_files/ 5 | logs/ 6 | .DS_STORE 7 | results/ 8 | output.txt -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.13 -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": ["charliermarsh.ruff"] 3 | } 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.formatOnSave": true, 4 | "editor.defaultFormatter": "charliermarsh.ruff", 5 | "editor.codeActionsOnSave": { 6 | "source.fixAll": "explicit", 7 | "source.organizeImports": "explicit" 8 | } 9 | }, 10 | "notebook.formatOnSave.enabled": true, 11 | "notebook.codeActionsOnSave": { 12 | "notebook.source.fixAll": "explicit", 13 | "notebook.source.organizeImports": "explicit" 14 | }, 15 | 
"ruff.nativeServer": "on" 16 | } 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Sentient Engineering 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # agentq - advanced reasoning and learning for autonomous AI agents 2 | 3 | agentq utilises various kinds of agentic architectures to complete a task on the web reliably. 4 | it has 5 | 6 | ``` 7 | 1. a planner <> navigator multi-agent architecutre 8 | 2. a solo planner-actor agent 9 | 3. an actor <> critic multi-agent architecture 10 | 4. 
actor <> critic architecture + monte carlo tree search based reinforcement learning + dpo finetuning 11 | ``` 12 | 13 | this repo also contains an oss implementation of the research paper [agent q](https://arxiv.org/abs/2408.07199) - thus the name. 14 | 15 | ### setup 16 | 17 | 1. we recommend installing poetry before proceeding with the next steps. you can install poetry using these [instructions](https://python-poetry.org/docs/#installation) 18 | 19 | 2. install dependencies 20 | 21 | ```bash 22 | poetry install 23 | ``` 24 | 25 | 3. start chrome in dev mode - in a separate terminal, use the command to start a chrome instance and do necessary logins to job websites like linkedin/ wellfound, etc. 26 | 27 | for mac, use command - 28 | 29 | ```bash 30 | sudo /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 31 | ``` 32 | 33 | for linux - 34 | 35 | ```bash 36 | google-chrome --remote-debugging-port=9222 37 | ``` 38 | 39 | for windows - 40 | 41 | ```bash 42 | "C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 43 | ``` 44 | 45 | 4. set up env - add openai and [langfuse](https://langfuse.com) keys to .env file. you can refer to .env.example. currently adding langfuse is required. If you do not want tracing - then you can do the following changes 46 | 47 | - directly import open ai client via `import openai` rather than `from langfuse.openai import openai` in the `./agentq/core/agent/base.py` file. 48 | - you would also have to comment out the @observe decorator and the below piece of code from the `run` function in the same file 49 | 50 | ```python 51 | langfuse_context.update_current_trace( 52 | name=self.agent_name, 53 | session_id=session_id 54 | ) 55 | ``` 56 | 57 | 5. 
run the agent 58 | 59 | ```bash 60 | python -u -m agentq 61 | ``` 62 | 63 | ### run evals 64 | 65 | ```bash 66 | python -m test.tests_processor --orchestrator_type fsm 67 | ``` 68 | 69 | ### generate dpo pairs for RL 70 | 71 | ```bash 72 | python -m agentq.core.mcts.browser_mcts 73 | ``` 74 | 75 | #### citations 76 | 77 | a bunch of amazing work in the space has inspired this. 78 | 79 | ``` 80 | @misc{putta2024agentqadvancedreasoning, 81 | title={Agent Q: Advanced Reasoning and Learning for Autonomous AI Agents}, 82 | author={Pranav Putta and Edmund Mills and Naman Garg and Sumeet Motwani and Chelsea Finn and Divyansh Garg and Rafael Rafailov}, 83 | year={2024}, 84 | eprint={2408.07199}, 85 | archivePrefix={arXiv}, 86 | primaryClass={cs.AI}, 87 | url={https://arxiv.org/abs/2408.07199}, 88 | } 89 | ``` 90 | 91 | ``` 92 | @inproceedings{yao2022webshop, 93 | bibtex_show = {true}, 94 | title = {WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents}, 95 | author = {Yao, Shunyu and Chen, Howard and Yang, John and Narasimhan, Karthik}, 96 | booktitle = {ArXiv}, 97 | year = {preprint}, 98 | html = {https://arxiv.org/abs/2207.01206}, 99 | tag = {NLP} 100 | } 101 | ``` 102 | 103 | ``` 104 | @article{he2024webvoyager, 105 | title={WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models}, 106 | author={He, Hongliang and Yao, Wenlin and Ma, Kaixin and Yu, Wenhao and Dai, Yong and Zhang, Hongming and Lan, Zhenzhong and Yu, Dong}, 107 | journal={arXiv preprint arXiv:2401.13919}, 108 | year={2024} 109 | } 110 | ``` 111 | 112 | ``` 113 | @misc{abuelsaad2024-agente, 114 | title={Agent-E: From Autonomous Web Navigation to Foundational Design Principles in Agentic Systems}, 115 | author={Tamer Abuelsaad and Deepak Akkil and Prasenjit Dey and Ashish Jagmohan and Aditya Vempaty and Ravi Kokku}, 116 | year={2024}, 117 | eprint={2407.13032}, 118 | archivePrefix={arXiv}, 119 | primaryClass={cs.AI}, 120 | url={https://arxiv.org/abs/2407.13032}, 
def run_agent_sync(command):
    """Run :func:`run_agent` to completion from synchronous code.

    The thread's current event loop is reused when one exists and is still
    open; otherwise a fresh loop is created and installed as the current one.

    :param command: The command forwarded unchanged to ``run_agent``.
    :return: Whatever ``run_agent(command)`` returns.
    """
    try:
        # Look the loop up once; the original queried it twice.
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # asyncio raises when the calling thread has no current loop
        # (e.g. worker threads) instead of creating one implicitly.
        loop = None

    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    return loop.run_until_complete(run_agent(command))
# config.py at the project source code root
#
# Resolves the project's well-known directories relative to this file and
# makes sure the writable ones exist as a side effect of importing the module.
import os


def _ensure_directory(path: str, label: str) -> None:
    """Create *path* when it is missing and report the creation on stdout.

    :param path: Directory to create (intermediate directories included).
    :param label: Human-readable folder name used in the printed message.
    """
    if os.path.exists(path):
        return
    # exist_ok=True: another process importing this module may create the
    # folder between the check above and this call; that must not raise.
    os.makedirs(path, exist_ok=True)
    print(f"Created {label} folder at: {path}")


# Get the absolute path of the current file (config.py)
CURRENT_FILE_PATH = os.path.abspath(__file__)

# Get the project root directory: three dirname() hops from this file
# (config.py -> config/ -> agentq/ -> project root)
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(CURRENT_FILE_PATH)))

# Define other paths relative to the project root
PROJECT_SOURCE_ROOT = os.path.join(PROJECT_ROOT, "agentq")
SOURCE_LOG_FOLDER_PATH = os.path.join(PROJECT_SOURCE_ROOT, "log_files")
PROJECT_TEMP_PATH = os.path.join(PROJECT_SOURCE_ROOT, "temp")
USER_PREFERENCES_PATH = os.path.join(PROJECT_SOURCE_ROOT, "user_preferences")
PROJECT_TEST_ROOT = os.path.join(PROJECT_ROOT, "test")

# Create the log, user preferences and temp folders if they do not exist
_ensure_directory(SOURCE_LOG_FOLDER_PATH, "log")
_ensure_directory(USER_PREFERENCES_PATH, "user preferences")
_ensure_directory(PROJECT_TEMP_PATH, "temp")
class AgentQActor(BaseAgent):
    """Agent registered under the name ``"actor"``.

    Its system prompt is ``LLM_PROMPTS["AGENTQ_ACTOR_PROMPT"]`` with the user's
    long-term memory substituted in and the current date/weekday appended.
    The prompt is built once, when the instance is constructed.
    """

    def __init__(self):
        self.name = "actor"
        # Fetch the long-term memory first: the system prompt depends on it.
        self.ltm = self.__get_ltm()
        self.system_prompt = self.__modify_system_prompt(self.ltm)
        super().__init__(
            name=self.name,
            system_prompt=self.system_prompt,
            input_format=AgentQActorInput,
            output_format=AgentQActorOutput,
            keep_message_history=False,
        )

    @staticmethod
    def __get_ltm():
        """Return the user's long-term memory as provided by the ``ltm`` module."""
        return ltm.get_user_ltm()

    def __modify_system_prompt(self, user_memory):
        """Render the actor prompt for ``user_memory`` and stamp it with today's date.

        :param user_memory: Long-term memory text, or ``None`` to substitute an
            empty string.
        :return: The finished system prompt.
        """
        template = Template(LLM_PROMPTS["AGENTQ_ACTOR_PROMPT"])
        # safe_substitute leaves unknown placeholders untouched instead of raising.
        rendered = template.safe_substitute(
            basic_user_information="" if user_memory is None else user_memory
        )

        now = datetime.now()
        return (
            f"{rendered}"
            f"\nToday's date is: {now.strftime('%d/%m/%Y')}"
            f"\nCurrent weekday is: {now.strftime('%A')}"
        )
class BaseAgent:
    """Shared foundation for the LLM-backed agents in this package.

    Holds the system prompt and the running message list, the pydantic input
    and output models, an ``instructor``-wrapped OpenAI-compatible client
    (JSON mode) and an optional registry of callable tools.
    """

    def __init__(
        self,
        name: str,
        system_prompt: str,
        input_format: Type[BaseModel],
        output_format: Type[BaseModel],
        tools: Optional[List[Tuple[Callable, str]]] = None,
        keep_message_history: bool = True,
        client: str = "openai",
    ):
        """Configure prompt, I/O models, LLM client and tools.

        :param name: Agent name, stored as ``self.agent_name``.
        :param system_prompt: System message; may be empty when the subclass
            builds it later and fills ``self.messages`` itself.
        :param input_format: Pydantic model that ``run`` inputs must be instances of.
        :param output_format: Pydantic model the LLM response is parsed into.
        :param tools: Optional ``(callable, description)`` pairs to register.
        :param keep_message_history: When ``False``, ``run`` resets the message
            list to just the system prompt before every call.
        :param client: ``"openai"`` or ``"together"``.
        :raises ValueError: If ``client`` is not one of the supported names.
        :raises KeyError: If ``client == "together"`` and ``TOGETHER_API_KEY``
            is not set in the environment.
        """
        # Metadata
        self.agent_name = name

        # Messages
        self.system_prompt = system_prompt
        self.keep_message_history = keep_message_history
        # handling the case where agent has to do async initialisation as system prompt depends on some async functions.
        # in those cases, we do init with empty system prompt string and then handle adding system prompt to messages array in the agent itself
        if self.system_prompt:
            self._initialize_messages()

        # Input-output format
        self.input_format = input_format
        self.output_format = output_format

        # Set global configurations for litellm
        litellm.logging = True
        litellm.set_verbose = True

        # Llm client
        if client == "openai":
            llm_client = openai.Client()
        elif client == "together":
            llm_client = openai.OpenAI(
                base_url="https://api.together.xyz/v1",
                api_key=os.environ["TOGETHER_API_KEY"],
            )
        else:
            # Previously an unknown name fell through and failed later with an
            # AttributeError on ``self.client``; fail with a clear message instead.
            raise ValueError(
                f"Unsupported client '{client}'; expected 'openai' or 'together'"
            )

        self.client = instructor.from_openai(llm_client, mode=Mode.JSON)

        # Tools
        self.tools_list = []
        self.executable_functions_list = {}
        if tools:
            self._initialize_tools(tools)

    def _initialize_tools(self, tools: List[Tuple[Callable, str]]):
        """Register each tool's schema and keep the callable indexed by its ``__name__``."""
        for func, func_desc in tools:
            self.tools_list.append(get_function_schema(func, description=func_desc))
            self.executable_functions_list[func.__name__] = func

    def _initialize_messages(self):
        """Reset the conversation to a single system message."""
        self.messages = [{"role": "system", "content": self.system_prompt}]

    @traceable(run_type="chain", name="agent_run")
    async def run(
        self,
        input_data: BaseModel,
        screenshot: Optional[str] = None,
        session_id: Optional[str] = None,
        model: str = "gpt-4o-2024-08-06",
    ) -> BaseModel:
        """Send ``input_data`` to the LLM and return the parsed response.

        :param input_data: Instance of ``self.input_format``.
        :param screenshot: Optional image URL attached to the user message.
        :param session_id: Accepted for tracing callers; not used in this method.
        :param model: Model name passed to the chat-completions call.
        :return: An instance of ``self.output_format``.
        :raises ValueError: If ``input_data`` is not an ``input_format`` instance.
        :raises TypeError: If the client returns something other than an
            ``output_format`` instance.
        """
        if not isinstance(input_data, self.input_format):
            raise ValueError(f"Input data must be of type {self.input_format.__name__}")

        # Handle message history.
        if not self.keep_message_history:
            self._initialize_messages()

        # DOM and URL are excluded here; they go in a separate message below.
        payload = input_data.model_dump_json(
            exclude={"current_page_dom", "current_page_url"}
        )
        if screenshot:
            user_content = [
                {"type": "text", "text": payload},
                {"type": "image_url", "image_url": {"url": screenshot}},
            ]
        else:
            user_content = payload
        self.messages.append({"role": "user", "content": user_content})

        # input dom and current page url in a separate message so that the LLM can pay attention to completed tasks better. *based on personal vibe check*
        if hasattr(input_data, "current_page_dom") and hasattr(
            input_data, "current_page_url"
        ):
            self.messages.append(
                {
                    "role": "user",
                    "content": f"Current page URL:\n{input_data.current_page_url}\n\n Current page DOM:\n{input_data.current_page_dom}",
                }
            )

        # NOTE: the former ``while True`` wrapper always returned or raised on
        # its first pass (the tool-call continuation is disabled because
        # instructor's JSON mode does not support function calls), so a single
        # request is issued directly.
        # TODO:
        # 1. exception handling while calling the client
        # 2. remove the else block as JSON mode in instructor won't allow us to pass in tools.
        if len(self.tools_list) == 0:
            response = self.client.chat.completions.create(
                model=model,
                messages=self.messages,
                response_model=self.output_format,
                max_retries=4,
            )
        else:
            response = self.client.chat.completions.create(
                model=model,
                messages=self.messages,
                response_model=self.output_format,
                tool_choice="auto",
                tools=self.tools_list,
            )

        # instructor returns the parsed model directly rather than
        # response.choices[0].message. Validate explicitly: an ``assert`` would
        # be stripped when Python runs with -O.
        if not isinstance(response, self.output_format):
            raise TypeError(
                f"Expected response_message to be of type {self.output_format.__name__}, but got {type(response).__name__}"
            )
        return response

    async def _append_tool_response(self, tool_call):
        """Execute the tool requested by ``tool_call`` and append its result message.

        On failure the error is logged and a ``tool`` message asking the model
        to retry (including the error text) is appended instead of raising.

        :param tool_call: Object exposing ``id``, ``function.name`` and
            ``function.arguments`` (a JSON string of keyword arguments).
        :raises KeyError: If the named tool was never registered.
        """
        function_name = tool_call.function.name
        function_to_call = self.executable_functions_list[function_name]
        function_args = json.loads(tool_call.function.arguments)
        try:
            function_response = await function_to_call(**function_args)
            content = str(function_response)
        except Exception as e:
            logger.error(f"Error occurred calling the tool {function_name}: {str(e)}")
            # The original built this with str(msg, function_response), which
            # raises TypeError and referenced a variable that is unbound when
            # the tool call itself failed. Report the exception text instead.
            content = (
                "The tool responded with an error, please try again with a different tool or modify the parameters of the tool"
                f". Error: {e}"
            )

        self.messages.append(
            {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": function_name,
                "content": content,
            }
        )
class CaptchaAgent(BaseAgent):
    """Agent registered under the name ``"captcha_solver"``.

    Uses ``LLM_PROMPTS["CAPTCHA_AGENT_PROMPT"]`` as its system prompt and is
    constructed with ``keep_message_history=False``.
    """

    def __init__(self):
        self.name = "captcha_solver"
        # Collect the base-class configuration in one place before delegating.
        agent_settings = dict(
            name=self.name,
            system_prompt=LLM_PROMPTS["CAPTCHA_AGENT_PROMPT"],
            input_format=CaptchaAgentInput,
            output_format=CaptchaAgentOutput,
            keep_message_history=False,
        )
        super().__init__(**agent_settings)
class PlannerAgent(BaseAgent):
    """Agent registered under the name ``"planner"``.

    Its system prompt is ``LLM_PROMPTS["PLANNER_AGENT_PROMPT"]`` with the
    user's long-term memory substituted in and the current date/weekday
    appended. The prompt is built once, when the instance is constructed.
    """

    def __init__(self):
        # Named ``user_ltm`` so the local does not shadow the imported ``ltm`` module.
        user_ltm: Optional[str] = self.__get_ltm()
        system_prompt = self.__modify_system_prompt(user_ltm)
        self.name = "planner"

        super().__init__(
            name=self.name,
            system_prompt=system_prompt,
            input_format=PlannerInput,
            output_format=PlannerOutput,
            keep_message_history=False,
        )

    def __get_ltm(self):
        """Return the user's long-term memory as provided by the ``ltm`` module."""
        return ltm.get_user_ltm()

    def __modify_system_prompt(self, user_ltm):
        """Render the planner prompt for ``user_ltm`` and stamp it with today's date.

        :param user_ltm: Long-term memory text, or ``None`` when there is none.
        :return: The finished system prompt.
        """
        system_prompt: str = LLM_PROMPTS["PLANNER_AGENT_PROMPT"]

        # Add user ltm to system prompt. A missing memory becomes an empty
        # string; substituting ``None`` directly would put the literal text
        # "None" into the prompt.
        user_information = "" if user_ltm is None else "\n" + user_ltm

        # safe_substitute, as in the sibling agents: an unrelated "$" in the
        # prompt must not raise while the agent is being constructed.
        system_prompt = Template(system_prompt).safe_substitute(
            basic_user_information=user_information
        )

        # Add today's day & date to the system prompt
        today = datetime.now()
        today_date = today.strftime("%d/%m/%Y")
        weekday = today.strftime("%A")
        system_prompt += f"\nToday's date is: {today_date}"
        system_prompt += f"\nCurrent weekday is: {weekday}"

        return system_prompt
# Type parameters shared by the search abstractions below.
State = TypeVar("State")
Action = TypeVar("Action")
Example = TypeVar("Example")
# A trace is a (states, actions) pair of lists.
Trace = tuple[list[State], list[Action]]


class WorldModel(ABC, Generic[State, Action, Example]):
    """Abstract environment: yields an initial state and advances it by actions.

    ``example`` and ``prompt`` start as ``None`` and are filled in through
    :meth:`update_example`.
    """

    def __init__(self) -> None:
        self.example = None
        self.prompt = None

    @abstractmethod
    async def init_state(self) -> State:
        """Return the state a search starts from."""
        ...

    @abstractmethod
    async def step(
        self, state: State, action: Action
    ) -> Union[State, Tuple[State, dict]]:
        """Returns the next state and optionally an auxiliary data dict

        :param state: The current state
        :param action: The action to take
        :return: The next state and optionally an auxiliary data dict
        """
        ...

    @abstractmethod
    async def is_terminal(self, state: State) -> bool:
        """Return whether ``state`` ends the search."""
        ...

    def update_example(self, example: Example, prompt=None) -> None:
        """Store ``example``; replace the stored prompt only when one is given."""
        if prompt is not None:
            self.prompt = prompt
        self.example = example


class DefaultWorldModel(WorldModel):
    """Default world model whose state is simply the list of actions taken."""

    def __init__(self, base_model) -> None:
        super().__init__()
        self.base_model = base_model

    async def init_state(self):
        return []

    async def step(self, state, action):
        # Build a new list so the previous state is left untouched.
        return state + [action], {}

    async def is_terminal(self, state):
        # By default the state is never terminal
        return False


class SearchConfig(ABC, Generic[State, Action, Example]):
    """Abstract search policy: enumerates actions and scores them."""

    def __init__(self) -> None:
        self.example = None
        self.prompt = None

    @abstractmethod
    async def get_actions(self, state: State) -> list[Action]:
        """Return the candidate actions available in ``state``."""
        ...

    def fast_reward(self, state: State, action: Action) -> tuple[float, dict]:
        """Return a ``(reward, details)`` pair; the default is ``(0.0, {})``."""
        return 0.0, {}

    @abstractmethod
    async def reward(self, state, action, **kwargs) -> tuple[float, dict]:
        """Return the ``(reward, details)`` pair for taking ``action`` in ``state``."""
        ...

    def update_example(self, example: Example, prompt=None) -> None:
        """Store ``example``; replace the stored prompt only when one is given."""
        if prompt is not None:
            self.prompt = prompt
        self.example = example


@runtime_checkable
class AlgorithmOutput(Protocol[State]):
    """Structural type for search results: a terminal state and a trace."""

    terminal_state: State
    trace: Trace


class SearchAlgorithm(ABC):
    """Abstract search procedure, awaited with a world model and a search config."""

    def __init__(self, **kwargs): ...

    @abstractmethod
    async def __call__(
        self, world_model: WorldModel, search_config: SearchConfig, **kwargs
    ) -> AlgorithmOutput: ...


class Reasoner(ABC, Generic[State, Action, Example]):
    """Bundles a world model, a search config and a search algorithm."""

    def __init__(
        self,
        world_model: WorldModel[State, Action, Example],
        search_config: SearchConfig[State, Action, Example],
        search_algo: SearchAlgorithm,
    ) -> None:
        self.world_model = world_model
        self.search_config = search_config
        self.search_algo = search_algo

    async def __call__(
        self, example: Example, prompt=None, **kwargs
    ) -> AlgorithmOutput[State]:
        """Push ``example``/``prompt`` into both components, then run the search.

        Extra keyword arguments are forwarded to the search algorithm.
        """
        self.world_model.update_example(example, prompt=prompt)
        self.search_config.update_example(example, prompt=prompt)
        return await self.search_algo(self.world_model, self.search_config, **kwargs)


# --------------------------------------------------------------------------------
# /agentq/core/mcts/example/grid.py:
# --------------------------------------------------------------------------------
import asyncio
from typing import List, NamedTuple, Tuple

from agentq.core.mcts.core.base import Reasoner, SearchConfig, WorldModel
from agentq.core.mcts.core.mcts import MCTS, MCTSResult

# (row, column) offset for every supported direction name, in the order the
# actions are offered to the search.
_MOVES = {
    "up": (-1, 0),
    "down": (1, 0),
    "left": (0, -1),
    "right": (0, 1),
}


class GridState(NamedTuple):
    """A position on the grid together with the grid itself."""

    position: Tuple[int, int]
    grid: List[List[int]]


class GridAction(NamedTuple):
    """A single move on the grid."""

    direction: str  # up, down, left, right


class GridWorldModel(WorldModel[GridState, GridAction, None]):
    """World model for a rectangular grid of integer cell codes.

    Cell codes used by this module: ``1`` blocks movement, ``2`` marks the
    start cell and ``3`` marks the terminal cell.
    """

    def __init__(self, grid: List[List[int]]):
        """Remember the grid and its dimensions.

        :param grid: Non-empty list of rows; the width is taken from row 0.
        :raises ValueError: If the grid has no rows or the first row is empty.
        """
        if not grid or not grid[0]:
            raise ValueError("grid must contain at least one row and one column")
        # Initialise example/prompt on the base class so they always exist.
        super().__init__()
        self.grid = grid
        self.height = len(grid)
        self.width = len(grid[0])

    async def init_state(self) -> GridState:
        """Return the state for the first cell (row-major) whose value is 2.

        :raises ValueError: If no cell holds the start marker.
        """
        for row in range(self.height):
            for col in range(self.width):
                if self.grid[row][col] == 2:
                    return GridState((row, col), self.grid)
        raise ValueError("No initial position (2) found in the grid")

    async def step(
        self, state: GridState, action: GridAction
    ) -> Tuple[GridState, dict]:
        """Apply ``action``; stay in place when the target is outside or blocked.

        :return: The new state and an empty auxiliary dict.
        :raises ValueError: If the direction is not one of the supported names.
        """
        try:
            d_row, d_col = _MOVES[action.direction]
        except KeyError:
            raise ValueError(f"Invalid action: {action}") from None

        row, col = state.position
        new_row, new_col = row + d_row, col + d_col

        # A move is accepted only when it stays inside the grid and does not
        # land on a blocked cell; otherwise the position is unchanged.
        inside = 0 <= new_row < self.height and 0 <= new_col < self.width
        if inside and state.grid[new_row][new_col] != 1:
            new_position = (new_row, new_col)
        else:
            new_position = state.position

        return GridState(new_position, state.grid), {}

    async def is_terminal(self, state: GridState) -> bool:
        """Return whether ``state`` sits on the terminal cell."""
        return is_terminal(state)


class GridSearchConfig(SearchConfig[GridState, GridAction, None]):
    """Search config offering all four moves and a step-penalised reward."""

    def __init__(self):
        super().__init__()

    async def get_actions(self, state: GridState) -> List[GridAction]:
        """Return the four moves in the order up, down, left, right."""
        return [GridAction(direction) for direction in _MOVES]

    async def reward(
        self, state: GridState, action: GridAction, **kwargs
    ) -> Tuple[float, dict]:
        """Return 1.0 on the terminal cell, otherwise a small step penalty."""
        if is_terminal(state):
            return 1.0, {}  # good move
        return -0.01, {}  # small penalty for each step to encourage shorter path


def is_terminal(state: GridState) -> bool:
    """Return whether the cell under ``state.position`` holds the exit code 3."""
    row, col = state.position
    return state.grid[row][col] == 3


class MCTSGridWrapper(Reasoner[GridState, GridAction, None]):
    """Reasoner wiring the grid world model and search config to ``MCTS``."""

    def __init__(
        self,
        grid: List[List[int]],
        n_iterations: int = 1000,
        exploration_weight: float = 1.0,
    ) -> None:
        """Build the components for ``grid``.

        :param grid: Grid of cell codes handed to :class:`GridWorldModel`.
        :param n_iterations: Passed to ``MCTS`` as ``n_iters``.
        :param exploration_weight: Passed to ``MCTS`` as ``w_exp``.
        """
        self.grid = grid
        world_model = GridWorldModel(grid)
        search_config = GridSearchConfig()
        search_algo = MCTS(
            n_iters=n_iterations,
            w_exp=exploration_weight,
            cum_reward=sum,
            simulate_strategy="random",
            output_strategy="max_reward",
            # The depth limit is set to the number of cells in the grid.
            depth_limit=len(grid) * len(grid[0]),
        )
        super().__init__(world_model, search_config, search_algo)

    async def __call__(self) -> MCTSResult:
        """Run the search; no example or prompt is supplied."""
        return await super().__call__(example=None)

    @staticmethod
    def print_path(result: MCTSResult):
        """Print every step of ``result.trace``, or a notice when there is none."""
        if not result.trace:
            print("No valid path found")
            return

        states, actions = result.trace
        if not states:
            # Without this guard states[-1] below would raise IndexError.
            print("No valid path found")
            return

        print("Path found: ")
        for i, (state, action) in enumerate(zip(states, actions)):
            print(f"Step{i}: Position {state.position}, Action: {action.direction}")

        print(f"Final position: {states[-1].position}")
        print(f"Cumulative reward: {result.cum_reward}")


async def main():
    """Run the search on a sample 5x5 grid and print the resulting path."""
    # 0: Empty cell
    # 1: Blocked cell
    # 2: Initial position
    # 3: Exit (terminal state)
    grid = [
        [0, 0, 0, 0, 0],
        [0, 1, 0, 1, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 3, 1, 2],
    ]

    mcts_wrapper = MCTSGridWrapper(grid, n_iterations=10000, exploration_weight=1.0)
    result = await mcts_wrapper()

    MCTSGridWrapper.print_path(result)


if __name__ == "__main__":
    print("[DEBUG] Script started")
    asyncio.run(main())
    print("[DEBUG] Script finished")
# --------------------------------------------------------------------------------
# --------------------------------------------------------------------------------
# /agentq/core/mcts/visualization/__init__.py:
# --------------------------------------------------------------------------------
# (no content shown in this dump)
# https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/mcts/visualization/__init__.py

# --------------------------------------------------------------------------------
# /agentq/core/mcts/visualization/__main__.py:
# --------------------------------------------------------------------------------
def main():
    """Upload a tree-log file and print the URL at which it can be viewed.

    Command line: ``tree_log`` (path of the file to upload) and an optional
    ``--base_url`` overriding the client's default API endpoint. Exits with
    status 1 when the upload does not yield a receipt.
    """
    import argparse
    import sys

    # Import from the defining module: the package __init__ is shown without
    # any re-exports in this dump, so importing from the package would fail.
    from agentq.core.mcts.visualization.visualizer_client import VisualizerClient

    parser = argparse.ArgumentParser()
    parser.add_argument("tree_log", type=str)
    parser.add_argument("--base_url", type=str)
    args = parser.parse_args()

    if args.base_url is None:
        client = VisualizerClient()
    else:
        client = VisualizerClient(args.base_url)

    with open(args.tree_log, encoding="utf-8") as f:
        data = f.read()
    result = client.post_log(data)
    if result is None:
        # post_log has already printed the failure; there is no URL to show.
        sys.exit(1)
    print(result.access_url)


# Without this guard the module only defined main() and never ran it.
if __name__ == "__main__":
    main()


# --------------------------------------------------------------------------------
# /agentq/core/mcts/visualization/tree_log.py:
# --------------------------------------------------------------------------------
import json
from enum import Enum
from typing import Callable, Optional, Sequence, Union

from agentq.core.mcts.core.mcts import MCTSNode, MCTSResult
from agentq.core.mcts.visualization.tree_snapshot import (
    EdgeData,
    EdgeId,
    NodeData,
    NodeId,
    TreeSnapshot,
)


class TreeLogEncoder(json.JSONEncoder):
    """JSON encoder for tree logs, snapshots and the objects stored in them."""

    def default(self, o):
        """Return a JSON-serialisable stand-in for ``o``.

        Anything not matched by a specific rule is encoded via its
        ``__dict__`` when it has one, and via ``str(o)`` otherwise.
        """
        from numpy import float32

        if isinstance(o, (TreeSnapshot.Node, TreeSnapshot.Edge)):
            return o.__dict__
        if isinstance(o, TreeSnapshot):
            # TreeSnapshot defines __dict__ as a method returning nodes/edges.
            return o.__dict__()
        if isinstance(o, float32):
            return float(o)
        if isinstance(o, TreeLog):
            return {"logs": list(o)}
        # Enum members carry a __dict__, so this test has to come before the
        # generic __dict__ fallback or it can never be reached.
        if isinstance(o, Enum):
            return o.value
        if hasattr(o, "__dict__"):
            return o.__dict__
        return str(o)


class TreeLog:
    """Read-only sequence wrapper around a list of tree snapshots."""

    def __init__(self, tree_snapshots: Sequence[TreeSnapshot]) -> None:
        self._tree_snapshots = tree_snapshots

    def __getitem__(self, item):
        return self._tree_snapshots[item]

    def __iter__(self):
        return iter(self._tree_snapshots)

    def __len__(self):
        return len(self._tree_snapshots)

    def __str__(self):
        return json.dumps(self, cls=TreeLogEncoder, indent=2)

    @classmethod
    def from_mcts_results(
        cls,
        mcts_results: MCTSResult,
        node_data_factory: Optional[Callable[[MCTSNode], NodeData]] = None,
        edge_data_factory: Optional[Callable[[MCTSNode], EdgeData]] = None,
    ) -> "TreeLog":
        """Build one snapshot per recorded tree state of ``mcts_results``.

        :param mcts_results: Result whose ``tree_state_after_each_iter`` (or,
            when that is ``None``, ``tree_state``) supplies the tree roots and
            whose ``trace_in_each_iter`` optionally supplies the chosen path.
        :param node_data_factory: Maps a node to its display data; a default
            based on the node's state is used when omitted.
        :param edge_data_factory: Maps a child node to the data of the edge
            leading to it; a default based on Q/reward is used when omitted.
        :return: A log holding the snapshots in iteration order.
        """

        def get_reward_details(n: MCTSNode) -> Union[dict, None]:
            if hasattr(n, "reward_details"):
                return n.reward_details
            return n.fast_reward_details if hasattr(n, "fast_reward_details") else None

        def default_node_data_factory(n: MCTSNode) -> NodeData:
            if not n.state:
                return NodeData({})
            if hasattr(n.state, "_asdict"):
                state_dict = n.state._asdict()
            elif isinstance(n.state, list):
                state_dict = dict(enumerate(n.state))
            else:
                try:
                    state_dict = json.loads(json.dumps(n.state, cls=TreeLogEncoder))
                except TypeError:
                    state_dict = str(n.state)

            # The fallbacks above can produce a string, number or list; wrap
            # those so the colour entry below can always be added.
            if not isinstance(state_dict, dict):
                state_dict = {"state": state_dict}

            # Add color information to the node data
            state_dict["color"] = "green"
            return NodeData(state_dict)

        def default_edge_data_factory(n: MCTSNode) -> EdgeData:
            # Reward details may be missing (None); treat that as "no extras".
            edge_data = {
                "Q": n.Q,
                "reward": n.reward,
                **(get_reward_details(n) or {}),
            }

            # Add color information to the edge data
            edge_data["color"] = "brown"
            return EdgeData(edge_data)

        node_data_factory = node_data_factory or default_node_data_factory
        edge_data_factory = edge_data_factory or default_edge_data_factory

        def collect(node: MCTSNode, nodes: dict, edges: list) -> None:
            """Depth-first walk that fills ``nodes`` and ``edges`` in place."""
            node_id = NodeId(node.id)
            nodes[node_id] = TreeSnapshot.Node(node_id, node_data_factory(node))
            if node.children is None:
                return
            for child in node.children:
                # Edge ids are assigned in discovery order.
                edge_id = EdgeId(len(edges))
                edges.append(
                    TreeSnapshot.Edge(
                        edge_id, node.id, child.id, edge_data_factory(child)
                    )
                )
                collect(child, nodes, edges)

        if mcts_results.tree_state_after_each_iter is None:
            tree_states = [mcts_results.tree_state]
        else:
            tree_states = mcts_results.tree_state_after_each_iter

        snapshots = []
        for step, root in enumerate(tree_states):
            nodes = {}
            edges = []
            collect(root, nodes, edges)
            tree = TreeSnapshot(list(nodes.values()), edges)

            # Mark the edges along this iteration's trace as selected.
            if mcts_results.trace_in_each_iter:
                trace = mcts_results.trace_in_each_iter[step]
                for in_node, out_node in zip(trace, trace[1:]):
                    for edge in tree.out_edges(in_node.id):
                        if edge.target == out_node.id:
                            nodes[in_node.id].selected_edge = edge.id
                            break

            # Every other inner node selects its outgoing edge with the
            # largest "Q" entry.
            for node in tree.nodes.values():
                if node.selected_edge is None and tree.children(node.id):
                    node.selected_edge = max(
                        tree.out_edges(node.id),
                        key=lambda edge: edge.data.get("Q", -float("inf")),
                    ).id

            snapshots.append(tree)

        return cls(snapshots)


# --------------------------------------------------------------------------------
# /agentq/core/mcts/visualization/tree_snapshot.py:
# --------------------------------------------------------------------------------
from collections import defaultdict
from dataclasses import dataclass
from typing import NewType, Optional, Collection

NodeId = NewType("NodeId", int)
EdgeId = NewType("EdgeId", int)
NodeData = NewType("NodeData", dict)
EdgeData = NewType("EdgeData", dict)


class TreeSnapshot:
    """A rooted tree given as nodes and directed parent-to-child edges."""

    @dataclass
    class Node:
        id: NodeId
        data: NodeData
        selected_edge: Optional[EdgeId] = None

    @dataclass
    class Edge:
        id: EdgeId
        source: NodeId
        target: NodeId
        data: EdgeData

    def __init__(self, nodes: Collection[Node], edges: Collection[Edge]) -> None:
        """Index the nodes and edges and check that they form a single tree.

        :raises AssertionError: If the edge count does not match a tree over
            ``nodes`` or some node cannot be reached from the root.
        """
        self.nodes: dict[NodeId, TreeSnapshot.Node] = {node.id: node for node in nodes}
        self.edges: dict[EdgeId, TreeSnapshot.Edge] = {edge.id: edge for edge in edges}
        self._parent = {}
        self._children: dict[NodeId, set[NodeId]] = defaultdict(set)

        for edge in edges:
            self._parent[edge.target] = edge.source
            self._children[edge.source].add(edge.target)

        assert len(self._parent) == len(self.nodes) - 1, (
            "a tree needs exactly one parent for every node but the root"
        )
        assert self._connected(), "every node must be reachable from the root"

    def _connected(self) -> bool:
        """Return whether every node is reachable from the unique root."""
        # The root is the one node that is not the target of any edge; starting
        # anywhere else would wrongly report a valid tree as disconnected.
        roots = [node_id for node_id in self.nodes if node_id not in self._parent]
        if len(roots) != 1:
            return False

        visited = set()
        stack = roots
        while stack:
            node = stack.pop()
            visited.add(node)
            # .get avoids inserting empty entries into the defaultdict.
            stack.extend(self._children.get(node, set()) - visited)
        return visited == set(self.nodes)

    def node(self, node_id: NodeId) -> Node:
        return self.nodes[node_id]

    def edge(self, edge_id: EdgeId) -> Edge:
        return self.edges[edge_id]

    def out_edges(self, node_id: NodeId) -> Collection[Edge]:
        """Return the edges whose source is ``node_id``."""
        return [edge for edge in self.edges.values() if edge.source == node_id]

    def in_edges(self, node_id: NodeId) -> Collection[Edge]:
        """Return the edges whose target is ``node_id``."""
        return [edge for edge in self.edges.values() if edge.target == node_id]

    def parent(self, node_id: NodeId) -> NodeId:
        """Return the parent id; raises ``KeyError`` for a node without one."""
        return self._parent[node_id]

    def children(self, node_id: NodeId) -> Collection[NodeId]:
        """Return the ids of the direct children (empty for a leaf)."""
        return self._children.get(node_id, set())

    def __dict__(self):
        # Deliberately a method: TreeLogEncoder calls it to serialise a snapshot.
        return {
            "nodes": self.nodes,
            "edges": self.edges,
        }


# --------------------------------------------------------------------------------
# /agentq/core/mcts/visualization/visualizer_client.py:
# --------------------------------------------------------------------------------
import dataclasses
import json
import webbrowser
from typing import Optional, Union

import requests

from agentq.core.mcts.core.mcts import MCTSResult
from agentq.core.mcts.visualization.tree_log import TreeLog, TreeLogEncoder

_API_DEFAULT_BASE_URL = "https://2wz3t0av30.execute-api.us-west-1.amazonaws.com/staging"
_VISUALIZER_DEFAULT_BASE_URL = "https://www.llm-reasoners.net"


class VisualizerClient:
    """Small HTTP client that uploads tree logs to the visualizer API."""

    def __init__(
        self, base_url: str = _API_DEFAULT_BASE_URL, timeout: float = 30.0
    ) -> None:
        """
        :param base_url: Root URL of the log API.
        :param timeout: Seconds passed to ``requests`` as the request timeout,
            so a stalled server cannot block the caller forever.
        """
        self.base_url = base_url
        self.timeout = timeout

    @dataclasses.dataclass
    class TreeLogReceipt:
        id: str
        access_key: str

        @property
        def access_url(self) -> str:
            return f"{_VISUALIZER_DEFAULT_BASE_URL}/visualizer/{self.id}?accessKey={self.access_key}"

    def post_log(self, data: Union[TreeLog, str, dict]) -> Optional[TreeLogReceipt]:
        """POST ``data`` to ``<base_url>/logs``.

        :param data: A ``TreeLog`` or dict (serialised with ``TreeLogEncoder``)
            or an already-serialised JSON string.
        :return: The receipt built from the JSON response, or ``None`` (after
            printing the status and body) when the status code is not 200.
        """
        if isinstance(data, (TreeLog, dict)):
            data = json.dumps(data, cls=TreeLogEncoder)

        url = f"{self.base_url}/logs"
        headers = {"Content-Type": "application/json"}
        response = requests.post(
            url, headers=headers, data=data, timeout=self.timeout
        )

        if response.status_code != 200:
            print(
                f"POST Log failed with status code: {response.status_code}, message: {response.text}"
            )
            return None

        return self.TreeLogReceipt(**response.json())


def present_visualizer(receipt: VisualizerClient.TreeLogReceipt):
    """Print the receipt's access URL and open it in the web browser."""
    print(f"Visualizer URL: {receipt.access_url}")
    webbrowser.open(receipt.access_url)


def visualize(result: Union[TreeLog, MCTSResult], **kwargs):
    """Upload ``result`` with a default client and open it when that succeeds.

    Keyword arguments are forwarded to ``TreeLog.from_mcts_results`` when
    ``result`` is an ``MCTSResult``.

    :raises TypeError: If ``result`` is neither a ``TreeLog`` nor an
        ``MCTSResult``.
    """
    tree_log: TreeLog

    if isinstance(result, TreeLog):
        tree_log = result
    elif isinstance(result, MCTSResult):
        tree_log = TreeLog.from_mcts_results(result, **kwargs)
    else:
        raise TypeError(f"Unsupported result type: {type(result)}")

    receipt = VisualizerClient().post_log(tree_log)

    if receipt is not None:
        present_visualizer(receipt)


# --------------------------------------------------------------------------------
# /agentq/core/memory/__init__.py:
# --------------------------------------------------------------------------------
# (no content shown in this dump)
# https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/memory/__init__.py

# --------------------------------------------------------------------------------
# /agentq/core/memory/ltm.py:
# --------------------------------------------------------------------------------
import os
from typing import Optional

from agentq.config.config import USER_PREFERENCES_PATH
from agentq.utils.logger import logger


def get_user_ltm() -> Optional[str]:
    """Return the text of ``user_preferences.txt``, or ``None`` if it is missing.

    The file is looked up inside ``USER_PREFERENCES_PATH``; a missing file is
    logged as a warning rather than raised.
    """
    user_preference_file_name = "user_preferences.txt"
    user_preference_file = os.path.join(
        USER_PREFERENCES_PATH, user_preference_file_name
    )
    try:
        # Explicit encoding so the result does not depend on the platform locale.
        with open(user_preference_file, encoding="utf-8") as file:
            return file.read()
    except FileNotFoundError:
        logger.warning(f"User preference file not found: {user_preference_file}")

    return None


# --------------------------------------------------------------------------------
# /agentq/core/models/__init__.py:
# --------------------------------------------------------------------------------
# (no content shown in this dump)
# https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/models/__init__.py

# --------------------------------------------------------------------------------
# /agentq/core/models/models.py:
# --------------------------------------------------------------------------------
# NOTE: the classes below are documented with `#` comments only; pydantic copies
# class docstrings into the generated JSON schema, so docstrings would change it.
from enum import Enum, IntEnum
from typing import List, Literal, Optional, Union

from pydantic import BaseModel
from pydantic.fields import Field


# Global
# Stages stored in Memory.current_state.
class State(str, Enum):
    PLAN = "plan"
    BROWSE = "browse"
    COMPLETED = "completed"
    AGENTQ_BASE = "agentq_base"
    AGENTQ_ACTOR = "agentq_actor"
    AGENTQ_CRITIC = "agentq_critic"


# Discriminator values for the action models below.
class ActionType(str, Enum):
    CLICK = "CLICK"
    TYPE = "TYPE"
    GOTO_URL = "GOTO_URL"
    ENTER_TEXT_AND_CLICK = "ENTER_TEXT_AND_CLICK"
    SOLVE_CAPTCHA = "SOLVE_CAPTCHA"
    # GET_DOM_TEXT_CONTENT = "GET_DOM_TEXT_CONTENT"
    # GET_DOM_INPUT_FILEDS = "GET_DOM_INPUT_FILEDS"
    # GET_DOM_ALL_CONTENTS = "GET_DOM_ALL_CONTENTS"
    # GET_CURRENT_URL = "GET_CURRENT_URL"


class ClickAction(BaseModel):
    type: Literal[ActionType.CLICK] = Field(
        description="""Executes a click action on the element matching the given mmid attribute value. MMID is always a number. Returns Success if click was successful or appropriate error message if the element could not be clicked."""
    )
    mmid: int = Field(
        description="The mmid number of the element that needs to be clicked e.g. 114. mmid will always be a number"
    )
    wait_before_execution: Optional[float] = Field(
        description="Optional wait time in seconds before executing the click event logic"
    )


class TypeAction(BaseModel):
    # NOTE(review): this description spans two source lines; the indentation of
    # its second line is not recoverable from this dump — confirm against the
    # repository before relying on the exact whitespace.
    type: Literal[ActionType.TYPE] = Field(
        description="""Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else.
Returns Success if text entry was successful or appropriate error message if text could not be entered."""
    )
    mmid: int = Field(
        description="The mmid number of the element that needs to be clicked e.g. 114. mmid will always be a number"
    )
    content: str = Field(
        description="The text to enter in the element identified by the query_selector."
    )


class GotoAction(BaseModel):
    type: Literal[ActionType.GOTO_URL] = Field(
        description="Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened."
    )
    website: str = Field(
        description="The URL to navigate to. Value must include the protocol (http:// or https://)."
    )
    timeout: Optional[float] = Field(
        description="Additional wait time in seconds after initial load."
    )


class EnterTextAndClickAction(BaseModel):
    type: Literal[ActionType.ENTER_TEXT_AND_CLICK] = Field(
        description="""Enters text into a specified element and clicks another element, both identified by their mmid. Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands. Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered."""
    )
    text_element_mmid: int = Field(
        description="The mmid number of the element where the text will be entered"
    )
    text_to_enter: str = Field(
        description="The text that will be entered into the element specified by text_element_mmid"
    )
    click_element_mmid: int = Field(
        description="The mmid number of the element that will be clicked after text entry."
    )
    wait_before_click_execution: Optional[float] = Field(
        description="Optional wait time in seconds before executing the click event logic"
    )


class SolveCaptcha(BaseModel):
    type: Literal[ActionType.SOLVE_CAPTCHA] = Field(
        description="""Solve captcha, enters the solve captcha into a specified element and clicks another element, both identified by their mmid. Ideal for captcha solving ,entering captcha and clicking submit.Successfully completes when all three actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered."""
    )
    text_element_mmid: int = Field(
        description="The mmid number of the element where the captcha will be entered"
    )

    click_element_mmid: int = Field(
        description="The mmid number of the element that will be clicked after the catcha entry to submit"
    )

    wait_before_click_execution: Optional[float] = Field(
        description="Optional wait time in seconds before executing the click event logic"
    )


class Score(IntEnum):
    FAIL = 0
    PASS = 1


# class GetDomTextAction(BaseModel):
#     type: Literal[ActionType.GET_DOM_TEXT_CONTENT]


# class GetDomInputsAction(BaseModel):
#     type: Literal[ActionType.GET_DOM_INPUT_FILEDS]


# class GetDomAllAction(BaseModel):
#     type: Literal[ActionType.GET_DOM_ALL_CONTENTS]


# class GetCurrentUrlAction(BaseModel):
#     type: Literal[ActionType.GET_CURRENT_URL]


# Any one of the concrete action models defined above.
Action = Union[
    ClickAction,
    TypeAction,
    GotoAction,
    EnterTextAndClickAction,
    SolveCaptcha,
    # GetDomTextAction,
    # GetDomInputsAction,
    # GetDomAllAction,
    # GetCurrentUrlAction,
]


class Task(BaseModel):
    id: int
    description: str
    url: Optional[str]
    result: Optional[str]


class TaskWithActions(BaseModel):
    id: int
    description: str
    actions_to_be_performed: Optional[List[Action]]
    result: Optional[str]


class Memory(BaseModel):
    objective: str
    current_state: State
    plan: Optional[Union[List[Task], List[TaskWithActions]]]
    thought: str
    completed_tasks: Optional[Union[List[Task], List[TaskWithActions]]]
    current_task: Optional[Union[Task, TaskWithActions]]
    final_response: Optional[str]
    current_tasks_for_eval: Optional[List[TaskWithActions]]
    sorted_tasks: Optional[List[TaskWithActions]]

    class Config:
        use_enum_values = True


# Planner
class PlannerInput(BaseModel):
    objective: str
    completed_tasks: Optional[List[Task]]
    task_for_review: Optional[Task]


class PlannerOutput(BaseModel):
    plan: Optional[List[Task]]
    thought: str
    next_task: Optional[Task]
    is_complete: bool
    final_response: Optional[str]


# Executor
class BrowserNavInput(BaseModel):
    task: Task


class BrowserNavOutput(BaseModel):
    completed_task: Task


# AgentQ
class AgentQBaseInput(BaseModel):
    objective: str
    completed_tasks: Optional[List[Task]]
    current_page_url: str
    current_page_dom: str


class AgentQBaseOutput(BaseModel):
    thought: str
    plan: List[Task]
    next_task: Optional[Task]
    next_task_actions: Optional[List[Action]]
    is_complete: bool
    final_response: Optional[str]


# Actor
class AgentQActorInput(BaseModel):
    objective: str
    completed_tasks: Optional[List[TaskWithActions]]
    current_page_url: str
    current_page_dom: str


class AgentQActorOutput(BaseModel):
    thought: str
    proposed_tasks: Optional[List[TaskWithActions]]
    is_complete: bool
    final_response: Optional[str]


# Critic
class AgentQCriticInput(BaseModel):
    objective: str
    completed_tasks: Optional[List[TaskWithActions]]
    tasks_for_eval: List[TaskWithActions]
    current_page_url: str
    current_page_dom: str


class AgentQCriticOutput(BaseModel):
    thought: str
    top_task: TaskWithActions


#
Vision 236 | class VisionInput(BaseModel): 237 | objective: str 238 | 239 | 240 | class VisionOutput(BaseModel): 241 | is_terminal: bool 242 | 243 | 244 | class EvalAgentInput(BaseModel): 245 | objective: str 246 | agent_output: str 247 | current_page_url: str 248 | current_page_dom: str 249 | 250 | 251 | class EvalAgentOutput(BaseModel): 252 | score: Score 253 | 254 | 255 | class CaptchaAgentInput(BaseModel): 256 | objective: str 257 | 258 | 259 | class CaptchaAgentOutput(BaseModel): 260 | captcha: str 261 | success: bool 262 | 263 | 264 | # Monte-Carlo 265 | class BrowserState(BaseModel): 266 | dom: str 267 | url: str 268 | objective: str 269 | completed_tasks: Optional[List[TaskWithActions]] 270 | 271 | 272 | class BrowserAction(BaseModel): 273 | task_with_action: TaskWithActions 274 | rank: float = Field(description="The rank of this action, higher is better") 275 | 276 | 277 | class DPOState(BaseModel): 278 | objective: str 279 | dom: str 280 | 281 | 282 | class DPOAction(BaseModel): 283 | description: str 284 | action: Action 285 | 286 | 287 | class DPOPair(BaseModel): 288 | state: DPOState 289 | winning_action: DPOAction 290 | losing_action: DPOAction 291 | -------------------------------------------------------------------------------- /agentq/core/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/prompts/__init__.py -------------------------------------------------------------------------------- /agentq/core/skills/__init__.py: -------------------------------------------------------------------------------- 1 | from agentq.core.skills.click_using_selector import ( 2 | click, 3 | do_click, 4 | is_element_present, 5 | perform_javascript_click, 6 | perform_playwright_click, 7 | ) 8 | from agentq.core.skills.enter_text_and_click import enter_text_and_click 9 | from 
agentq.core.skills.enter_text_using_selector import ( 10 | bulk_enter_text, 11 | custom_fill_element, 12 | do_entertext, 13 | ) 14 | from agentq.core.skills.get_dom_with_content_type import get_dom_with_content_type 15 | from agentq.core.skills.get_url import geturl 16 | from agentq.core.skills.get_user_input import get_user_input 17 | from agentq.core.skills.open_url import openurl 18 | from agentq.core.skills.press_key_combination import press_key_combination 19 | from agentq.core.skills.solve_captcha import solve_captcha 20 | 21 | __all__ = ( 22 | click, 23 | do_click, 24 | is_element_present, 25 | perform_javascript_click, 26 | perform_playwright_click, 27 | enter_text_and_click, 28 | bulk_enter_text, 29 | custom_fill_element, 30 | do_entertext, 31 | get_dom_with_content_type, 32 | geturl, 33 | get_user_input, 34 | openurl, 35 | press_key_combination, 36 | solve_captcha, 37 | ) 38 | -------------------------------------------------------------------------------- /agentq/core/skills/click_using_selector.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | import traceback 4 | from typing import Dict 5 | 6 | from playwright.async_api import ElementHandle, Page 7 | from playwright.async_api import TimeoutError as PlaywrightTimeoutError 8 | from typing_extensions import Annotated 9 | 10 | from agentq.core.web_driver.playwright import PlaywrightManager 11 | from agentq.utils.dom_mutation_observer import ( 12 | subscribe, # type: ignore 13 | unsubscribe, # type: ignore 14 | ) 15 | from agentq.utils.logger import logger 16 | 17 | 18 | async def click( 19 | selector: Annotated[ 20 | str, 21 | "The properly formed query selector string to identify the element for the click action (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector. 
selector mmid will always be a number", 22 | ], 23 | wait_before_execution: Annotated[ 24 | float, 25 | "Optional wait time in seconds before executing the click event logic.", 26 | float, 27 | ], 28 | ) -> Annotated[str, "A message indicating success or failure of the click."]: 29 | """ 30 | Executes a click action on the element matching the given query selector string within the currently open web page. 31 | If there is no page open, it will raise a ValueError. An optional wait time can be specified before executing the click logic. Use this to wait for the page to load especially when the last action caused the DOM/Page to load. 32 | 33 | Parameters: 34 | - selector: The query selector string to identify the element for the click action. 35 | - wait_before_execution: Optional wait time in seconds before executing the click event logic. Defaults to 0.0 seconds. 36 | 37 | Returns: 38 | - Success if the click was successful, Appropriate error message otherwise. 39 | """ 40 | logger.info(f'Executing ClickElement with "{selector}" as the selector') 41 | 42 | # Initialize PlaywrightManager and get the active browser page 43 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 44 | page = await browser_manager.get_current_page() 45 | 46 | if page is None: 47 | raise ValueError("No active page found. 
OpenURL command opens a new page.") 48 | 49 | function_name = inspect.currentframe().f_code.co_name 50 | 51 | await browser_manager.take_screenshots(f"{function_name}_start", page) 52 | 53 | await browser_manager.highlight_element(selector, True) 54 | 55 | dom_changes_detected = None 56 | 57 | def detect_dom_changes(changes: str): 58 | nonlocal dom_changes_detected 59 | dom_changes_detected = changes 60 | 61 | subscribe(detect_dom_changes) 62 | 63 | # Wrap the click action and subsequent operations in a try-except block 64 | try: 65 | # Set up navigation expectation with a shorter timeout 66 | async with page.expect_navigation(wait_until="domcontentloaded", timeout=10000): 67 | result = await do_click(page, selector, wait_before_execution) 68 | 69 | # Wait for a short time to ensure the page has settled 70 | await asyncio.sleep(1) 71 | except PlaywrightTimeoutError: 72 | # If navigation times out, it might be a single-page app or a slow-loading page 73 | logger.warning( 74 | "Navigation timeout occurred, but the click might have been successful." 75 | ) 76 | result = { 77 | "summary_message": "Click executed, but no full page navigation detected", 78 | "detailed_message": "Click executed successfully, but no full page navigation was detected. 
This might be normal for single-page applications or slow-loading pages.", 79 | } 80 | except Exception as e: 81 | logger.error(f"Error during click operation: {e}") 82 | result = { 83 | "summary_message": "Click executed, but encountered an error", 84 | "detailed_message": f"Click executed, but encountered an error: {str(e)}", 85 | } 86 | 87 | await asyncio.sleep( 88 | 0.1 89 | ) # sleep for 100ms to allow the mutation observer to detect changes 90 | unsubscribe(detect_dom_changes) 91 | await browser_manager.take_screenshots(f"{function_name}_end", page) 92 | 93 | if dom_changes_detected: 94 | return f"Success: {result['summary_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action to click {selector} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction." 95 | return result["detailed_message"] 96 | 97 | 98 | async def do_click( 99 | page: Page, selector: str, wait_before_execution: float 100 | ) -> Dict[str, str]: 101 | """ 102 | Executes the click action on the element with the given selector within the provided page. 103 | 104 | Parameters: 105 | - page: The Playwright page instance. 106 | - selector: The query selector string to identify the element for the click action. 107 | - wait_before_execution: Optional wait time in seconds before executing the click event logic. 108 | 109 | Returns: 110 | Dict[str,str] - Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'. 111 | """ 112 | logger.info( 113 | f'Executing ClickElement with "{selector}" as the selector. Wait time before execution: {wait_before_execution} seconds.' 114 | ) 115 | 116 | # Wait before execution if specified 117 | if wait_before_execution > 0: 118 | await asyncio.sleep(wait_before_execution) 119 | 120 | # Wait for the selector to be present and ensure it's attached and visible. 
If timeout, try javascript click 121 | try: 122 | logger.info( 123 | f'Executing ClickElement with "{selector}" as the selector. Waiting for the element to be attached and visible.' 124 | ) 125 | 126 | element = await asyncio.wait_for( 127 | page.wait_for_selector(selector, state="attached", timeout=2000), 128 | timeout=2000, 129 | ) 130 | if element is None: 131 | raise ValueError(f'Element with selector: "{selector}" not found') 132 | 133 | logger.info( 134 | f'Element with selector: "{selector}" is attached. scrolling it into view if needed.' 135 | ) 136 | try: 137 | await element.scroll_into_view_if_needed(timeout=200) 138 | logger.info( 139 | f'Element with selector: "{selector}" is attached and scrolled into view. Waiting for the element to be visible.' 140 | ) 141 | except Exception: 142 | # If scrollIntoView fails, just move on, not a big deal 143 | pass 144 | 145 | try: 146 | await element.wait_for_element_state("visible", timeout=200) 147 | logger.info( 148 | f'Executing ClickElement with "{selector}" as the selector. Element is attached and visible. Clicking the element.' 
149 | ) 150 | except Exception: 151 | # If the element is not visible, try to click it anyway 152 | pass 153 | 154 | element_tag_name = await element.evaluate( 155 | "element => element.tagName.toLowerCase()" 156 | ) 157 | 158 | if element_tag_name == "option": 159 | element_value = await element.get_attribute( 160 | "value" 161 | ) # get the text that is in the value of the option 162 | parent_element = await element.evaluate_handle( 163 | "element => element.parentNode" 164 | ) 165 | await parent_element.select_option(value=element_value) # type: ignore 166 | 167 | logger.info(f'Select menu option "{element_value}" selected') 168 | 169 | return { 170 | "summary_message": f'Select menu option "{element_value}" selected', 171 | "detailed_message": f'Select menu option "{element_value}" selected.', 172 | } 173 | 174 | msg = await perform_javascript_click(page, selector) 175 | return { 176 | "summary_message": msg, 177 | "detailed_message": f"{msg} Click action completed, page may have navigated.", 178 | } 179 | except Exception as e: 180 | logger.error(f'Unable to click element with selector: "{selector}". Error: {e}') 181 | traceback.print_exc() 182 | msg = f'Unable to click element with selector: "{selector}" since the selector is invalid.' 183 | return {"summary_message": msg, "detailed_message": f"{msg}. Error: {e}"} 184 | 185 | 186 | async def is_element_present(page: Page, selector: str) -> bool: 187 | """ 188 | Checks if an element is present on the page. 189 | 190 | Parameters: 191 | - page: The Playwright page instance. 192 | - selector: The query selector string to identify the element. 193 | 194 | Returns: 195 | - True if the element is present, False otherwise. 196 | """ 197 | element = await page.query_selector(selector) 198 | return element is not None 199 | 200 | 201 | async def perform_playwright_click(element: ElementHandle, selector: str): 202 | """ 203 | Performs a click action on the element using Playwright's click method. 
204 | 205 | Parameters: 206 | - element: The Playwright ElementHandle instance representing the element to be clicked. 207 | - selector: The query selector string of the element. 208 | 209 | Returns: 210 | - None 211 | """ 212 | logger.info( 213 | f"Performing first Step: Playwright Click on element with selector: {selector}" 214 | ) 215 | await element.click(force=False, timeout=200) 216 | 217 | 218 | async def perform_javascript_click(page: Page, selector: str): 219 | """ 220 | Performs a click action on the element using JavaScript. 221 | 222 | Parameters: 223 | - page: The Playwright page instance. 224 | - selector: The query selector string of the element. 225 | 226 | Returns: 227 | - A string describing the result of the click action. 228 | """ 229 | js_code = """(selector) => { 230 | let element = document.querySelector(selector); 231 | 232 | if (!element) { 233 | console.log(`perform_javascript_click: Element with selector ${selector} not found`); 234 | return `perform_javascript_click: Element with selector ${selector} not found`; 235 | } 236 | 237 | if (element.tagName.toLowerCase() === "option") { 238 | let value = element.text; 239 | let parent = element.parentElement; 240 | 241 | parent.value = element.value; // Directly set the value if possible 242 | // Trigger change event if necessary 243 | let event = new Event('change', { bubbles: true }); 244 | parent.dispatchEvent(event); 245 | 246 | console.log("Select menu option", value, "selected"); 247 | return "Select menu option: "+ value+ " selected"; 248 | } 249 | else { 250 | console.log("About to click selector", selector); 251 | // If the element is a link, make it open in the same tab 252 | if (element.tagName.toLowerCase() === "a") { 253 | element.target = "_self"; 254 | // #TODO: Consider removing this in the future if it causes issues with intended new tab behavior 255 | element.removeAttribute('target'); 256 | element.removeAttribute('rel'); 257 | } 258 | let ariaExpandedBeforeClick = 
element.getAttribute('aria-expanded'); 259 | element.click(); 260 | let ariaExpandedAfterClick = element.getAttribute('aria-expanded'); 261 | if (ariaExpandedBeforeClick === 'false' && ariaExpandedAfterClick === 'true') { 262 | return "Executed JavaScript Click on element with selector: "+selector +". Very important: As a consequence a menu has appeared where you may need to make further selection. Very important: Get all_fields DOM to complete the action."; 263 | } 264 | return "Executed JavaScript Click on element with selector: "+selector; 265 | } 266 | }""" 267 | try: 268 | logger.info(f"Executing JavaScript click on element with selector: {selector}") 269 | result: str = await page.evaluate(js_code, selector) 270 | logger.debug(f"Executed JavaScript Click on element with selector: {selector}") 271 | return result 272 | except Exception as e: 273 | logger.error( 274 | f"Error executing JavaScript click on element with selector: {selector}. Error: {e}" 275 | ) 276 | traceback.print_exc() 277 | return f"Error executing JavaScript click: {str(e)}" 278 | -------------------------------------------------------------------------------- /agentq/core/skills/enter_text_and_click.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | 4 | from typing_extensions import Annotated 5 | 6 | from agentq.core.web_driver.playwright import PlaywrightManager 7 | from agentq.core.skills.click_using_selector import do_click 8 | from agentq.core.skills.enter_text_using_selector import do_entertext 9 | from agentq.core.skills.press_key_combination import do_press_key_combination 10 | from agentq.utils.logger import logger 11 | 12 | 13 | async def enter_text_and_click( 14 | text_selector: Annotated[ 15 | str, 16 | "The properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use mmid attribute. 
mmid will always be a number", 17 | ], 18 | text_to_enter: Annotated[ 19 | str, 20 | "The text that will be entered into the element specified by text_selector.", 21 | ], 22 | click_selector: Annotated[ 23 | str, 24 | "The properly formatted DOM selector query, for example [mmid='1234'], for the element that will be clicked after text entry. mmid will always be a number", 25 | ], 26 | wait_before_click_execution: Annotated[ 27 | float, "Optional wait time in seconds before executing the click.", float 28 | ], 29 | ) -> Annotated[ 30 | str, "A message indicating success or failure of the text entry and click." 31 | ]: 32 | """ 33 | Enters text into an element and then clicks on another element. 34 | 35 | Parameters: 36 | - text_selector: The selector for the element to enter text into. It should be a properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use the mmid attribute. 37 | - text_to_enter: The text to enter into the element specified by text_selector. 38 | - click_selector: The selector for the element to click. It should be a properly formatted DOM selector query, for example [mmid='1234']. 39 | - wait_before_click_execution: Optional wait time in seconds before executing the click action. Default is 0.0. 40 | 41 | Returns: 42 | - A message indicating the success or failure of the text entry and click. 43 | 44 | Raises: 45 | - ValueError: If no active page is found. The OpenURL command opens a new page. 46 | 47 | Example usage: 48 | ``` 49 | await enter_text_and_click("[mmid='1234']", "Hello, World!", "[mmid='5678']", wait_before_click_execution=1.5) 50 | ``` 51 | """ 52 | logger.info( 53 | f"Entering text '{text_to_enter}' into element with selector '{text_selector}' and then clicking element with selector '{click_selector}'." 
54 | ) 55 | 56 | # Initialize PlaywrightManager and get the active browser page 57 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 58 | page = await browser_manager.get_current_page() 59 | if page is None: # type: ignore 60 | logger.error("No active page found") 61 | raise ValueError("No active page found. OpenURL command opens a new page.") 62 | 63 | await browser_manager.highlight_element(text_selector, True) 64 | 65 | function_name = inspect.currentframe().f_code.co_name # type: ignore 66 | await browser_manager.take_screenshots(f"{function_name}_start", page) 67 | 68 | text_entry_result = await do_entertext( 69 | page, text_selector, text_to_enter, use_keyboard_fill=True 70 | ) 71 | 72 | # await browser_manager.notify_user(text_entry_result["summary_message"]) 73 | if not text_entry_result["summary_message"].startswith("Success"): 74 | await browser_manager.take_screenshots(f"{function_name}_end", page) 75 | return f"Failed to enter text '{text_to_enter}' into element with selector '{text_selector}'. Check that the selctor is valid." 76 | 77 | result = text_entry_result 78 | 79 | # if the text_selector is the same as the click_selector, press the Enter key instead of clicking 80 | if text_selector == click_selector: 81 | do_press_key_combination_result = await do_press_key_combination( 82 | browser_manager, page, "Enter" 83 | ) 84 | if do_press_key_combination_result: 85 | result["detailed_message"] += ( 86 | f' Instead of click, pressed the Enter key successfully on element: "{click_selector}".' 87 | ) 88 | # await browser_manager.notify_user( 89 | # f'Pressed the Enter key successfully on element: "{click_selector}".', 90 | # message_type=MessageType.ACTION, 91 | # ) 92 | else: 93 | result["detailed_message"] += ( 94 | f' Clicking the same element after entering text in it, is of no value. Tried pressing the Enter key on element "{click_selector}" instead of click and failed.' 
95 | ) 96 | # await browser_manager.notify_user( 97 | # 'Failed to press the Enter key on element "{click_selector}".', 98 | # message_type=MessageType.ACTION, 99 | # ) 100 | else: 101 | await browser_manager.highlight_element(click_selector, True) 102 | 103 | do_click_result = await do_click( 104 | page, click_selector, wait_before_click_execution 105 | ) 106 | result["detailed_message"] += f' {do_click_result["detailed_message"]}' 107 | # await browser_manager.notify_user(do_click_result["summary_message"]) 108 | 109 | await asyncio.sleep( 110 | 0.1 111 | ) # sleep for 100ms to allow the mutation observer to detect changes 112 | 113 | await browser_manager.take_screenshots(f"{function_name}_end", page) 114 | 115 | return result["detailed_message"] 116 | -------------------------------------------------------------------------------- /agentq/core/skills/enter_text_using_selector.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | import traceback 4 | from dataclasses import dataclass 5 | from typing import ( 6 | Dict, 7 | List, # noqa: UP035 8 | ) 9 | 10 | from playwright.async_api import Page 11 | from typing_extensions import Annotated 12 | 13 | from agentq.core.web_driver.playwright import PlaywrightManager 14 | from agentq.core.skills.press_key_combination import press_key_combination 15 | from agentq.utils.dom_helper import get_element_outer_html 16 | from agentq.utils.dom_mutation_observer import subscribe, unsubscribe 17 | from agentq.utils.logger import logger 18 | 19 | 20 | @dataclass 21 | class EnterTextEntry: 22 | """ 23 | Represents an entry for text input. 24 | 25 | Attributes: 26 | query_selector (str): A valid DOM selector query. Use the mmid attribute. 27 | text (str): The text to enter in the element identified by the query_selector. 
28 | """ 29 | 30 | query_selector: str 31 | text: str 32 | 33 | def __getitem__(self, key: str) -> str: 34 | if key == "query_selector": 35 | return self.query_selector 36 | elif key == "text": 37 | return self.text 38 | else: 39 | raise KeyError(f"{key} is not a valid key") 40 | 41 | 42 | async def custom_fill_element(page: Page, selector: str, text_to_enter: str): 43 | """ 44 | Sets the value of a DOM element to a specified text without triggering keyboard input events. 45 | 46 | This function directly sets the 'value' property of a DOM element identified by the given CSS selector, 47 | effectively changing its current value to the specified text. This approach bypasses the need for 48 | simulating keyboard typing, providing a more efficient and reliable way to fill in text fields, 49 | especially in automated testing scenarios where speed and accuracy are paramount. 50 | 51 | Args: 52 | page (Page): The Playwright Page object representing the browser tab in which the operation will be performed. 53 | selector (str): The CSS selector string used to locate the target DOM element. The function will apply the 54 | text change to the first element that matches this selector. 55 | text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten. 56 | 57 | Example: 58 | await custom_fill_element(page, '#username', 'test_user') 59 | 60 | Note: 61 | This function does not trigger input-related events (like 'input' or 'change'). If application logic 62 | relies on these events being fired, additional steps may be needed to simulate them. 
63 | """ 64 | selector = f"{selector}" # Ensures the selector is treated as a string 65 | try: 66 | result = await page.evaluate( 67 | """(inputParams) => { 68 | const selector = inputParams.selector; 69 | let text_to_enter = inputParams.text_to_enter; 70 | text_to_enter = text_to_enter.trim(); 71 | const element = document.querySelector(selector); 72 | if (!element) { 73 | throw new Error(`Element not found: ${selector}`); 74 | } 75 | element.value = text_to_enter; 76 | return `Value set for ${selector}`; 77 | }""", 78 | {"selector": selector, "text_to_enter": text_to_enter}, 79 | ) 80 | logger.debug(f"custom_fill_element result: {result}") 81 | except Exception as e: 82 | logger.error(f"Error in custom_fill_element: {str(e)}") 83 | logger.error(f"Selector: {selector}, Text: {text_to_enter}") 84 | raise 85 | 86 | 87 | async def entertext( 88 | entry: Annotated[ 89 | EnterTextEntry, 90 | "An object containing 'query_selector' (DOM selector query using mmid attribute e.g. [mmid='114']) and 'text' (text to enter on the element). mmid will always be a number", 91 | ], 92 | ) -> Annotated[str, "Explanation of the outcome of this operation."]: 93 | """ 94 | Enters text into a DOM element identified by a CSS selector. 95 | 96 | This function enters the specified text into a DOM element identified by the given CSS selector. 97 | It uses the Playwright library to interact with the browser and perform the text entry operation. 98 | The function supports both direct setting of the 'value' property and simulating keyboard typing. 99 | 100 | Args: 101 | entry (EnterTextEntry): An object containing 'query_selector' (DOM selector query using mmid attribute) 102 | and 'text' (text to enter on the element). 103 | 104 | Returns: 105 | str: Explanation of the outcome of this operation. 
106 | 107 | Example: 108 | entry = EnterTextEntry(query_selector='#username', text='test_user') 109 | result = await entertext(entry) 110 | 111 | Note: 112 | - The 'query_selector' should be a valid CSS selector that uniquely identifies the target element. 113 | - The 'text' parameter specifies the text to be entered into the element. 114 | - The function uses the PlaywrightManager to manage the browser instance. 115 | - If no active page is found, an error message is returned. 116 | - The function internally calls the 'do_entertext' function to perform the text entry operation. 117 | - The 'do_entertext' function applies a pulsating border effect to the target element during the operation. 118 | - The function first clears any existing text in the input field before entering the new text. 119 | - The 'use_keyboard_fill' parameter in 'do_entertext' determines whether to simulate keyboard typing or not. 120 | - If 'use_keyboard_fill' is set to True, the function uses the 'page.keyboard.type' method to enter the text. 121 | - If 'use_keyboard_fill' is set to False, the function uses the 'custom_fill_element' method to enter the text. 122 | """ 123 | logger.info(f"Entering text: {entry}") 124 | 125 | if isinstance(entry, Dict): 126 | query_selector: str = entry["query_selector"] 127 | text_to_enter: str = entry["text"] 128 | elif isinstance(entry, EnterTextEntry): 129 | query_selector: str = entry.query_selector 130 | text_to_enter: str = entry.text 131 | else: 132 | raise ValueError( 133 | "Invalid input type for 'entry'. Expected EnterTextEntry or dict." 
134 | ) 135 | 136 | if not isinstance(query_selector, str) or not isinstance(text_to_enter, str): 137 | raise ValueError("query_selector and text must be strings") 138 | 139 | # logger.info( 140 | # f"######### Debug: query_selector={query_selector}, text_to_enter={text_to_enter}" 141 | # ) 142 | 143 | # Create and use the PlaywrightManager 144 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 145 | page = await browser_manager.get_current_page() 146 | if page is None: # type: ignore 147 | return "Error: No active page found. OpenURL command opens a new page." 148 | 149 | function_name = inspect.currentframe().f_code.co_name # type: ignore 150 | 151 | await browser_manager.take_screenshots(f"{function_name}_start", page) 152 | 153 | await browser_manager.highlight_element(query_selector, True) 154 | 155 | dom_changes_detected = None 156 | 157 | def detect_dom_changes(changes: str): # type: ignore 158 | nonlocal dom_changes_detected 159 | dom_changes_detected = changes # type: ignore 160 | 161 | subscribe(detect_dom_changes) 162 | 163 | # Clear existing text before entering new text 164 | # await page.evaluate(f"document.querySelector('{query_selector}').value = '';") 165 | # logger.info( 166 | # f"######### About to page.evaluate: selector={query_selector}, text={text_to_enter}" 167 | # ) 168 | await page.evaluate( 169 | """ 170 | (selector) => { 171 | const element = document.querySelector(selector); 172 | if (element) { 173 | element.value = ''; 174 | } else { 175 | console.error('Element not found:', selector); 176 | } 177 | } 178 | """, 179 | query_selector, 180 | ) 181 | # logger.info( 182 | # f"######### About to call do_entertext with: selector={query_selector}, text={text_to_enter}" 183 | # ) 184 | result = await do_entertext(page, query_selector, text_to_enter) 185 | # logger.info(f"#########do_entertext returned: {result}") 186 | await asyncio.sleep( 187 | 0.1 188 | ) # sleep for 100ms to allow the mutation observer to detect 
changes 189 | unsubscribe(detect_dom_changes) 190 | 191 | await browser_manager.take_screenshots(f"{function_name}_end", page) 192 | 193 | if dom_changes_detected: 194 | return f"{result['detailed_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action of entering text {text_to_enter} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction." 195 | return result["detailed_message"] 196 | 197 | 198 | async def do_entertext( 199 | page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool = True 200 | ): 201 | """ 202 | Performs the text entry operation on a DOM element. 203 | 204 | This function performs the text entry operation on a DOM element identified by the given CSS selector. 205 | It applies a pulsating border effect to the element during the operation for visual feedback. 206 | The function supports both direct setting of the 'value' property and simulating keyboard typing. 207 | 208 | Args: 209 | page (Page): The Playwright Page object representing the browser tab in which the operation will be performed. 210 | selector (str): The CSS selector string used to locate the target DOM element. 211 | text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten. 212 | use_keyboard_fill (bool, optional): Determines whether to simulate keyboard typing or not. 213 | Defaults to True. 214 | 215 | Returns: 216 | Dict[str, str]: Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'. 217 | 218 | Example: 219 | result = await do_entertext(page, '#username', 'test_user') 220 | 221 | Note: 222 | - The 'use_keyboard_fill' parameter determines whether to simulate keyboard typing or not. 223 | - If 'use_keyboard_fill' is set to True, the function uses the 'page.keyboard.type' method to enter the text. 
224 | - If 'use_keyboard_fill' is set to False, the function uses the 'custom_fill_element' method to enter the text. 225 | """ 226 | try: 227 | elem = await page.query_selector(selector) 228 | 229 | if elem is None: 230 | error = f"Error: Selector {selector} not found. Unable to continue." 231 | return {"summary_message": error, "detailed_message": error} 232 | 233 | # logger.info(f"######### Found selector {selector} to enter text") 234 | element_outer_html = await get_element_outer_html(elem, page) 235 | 236 | if use_keyboard_fill: 237 | await elem.focus() 238 | await asyncio.sleep(0.1) 239 | await press_key_combination("Control+A") 240 | await asyncio.sleep(0.1) 241 | await press_key_combination("Backspace") 242 | await asyncio.sleep(0.1) 243 | logger.debug(f"Focused element with selector {selector} to enter text") 244 | # type the text with a 1ms delay between key presses 245 | await page.keyboard.type(text_to_enter, delay=1) 246 | else: 247 | await custom_fill_element(page, selector, text_to_enter) 248 | await elem.focus() 249 | logger.info( 250 | f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}' 251 | ) 252 | success_msg = f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}' 253 | return { 254 | "summary_message": success_msg, 255 | "detailed_message": f"{success_msg} and outer HTML: {element_outer_html}.", 256 | } 257 | 258 | except Exception as e: 259 | traceback.print_exc() 260 | error = f"Error entering text in selector {selector}." 
261 | # logger.info("Error in do_entertext", error) 262 | return {"summary_message": error, "detailed_message": f"{error} Error: {e}"} 263 | 264 | 265 | async def bulk_enter_text( 266 | entries: Annotated[ 267 | List[Dict[str, str]], 268 | "List of objects, each containing 'query_selector' and 'text'.", 269 | ], # noqa: UP006 270 | ) -> Annotated[ 271 | List[Dict[str, str]], 272 | "List of dictionaries, each containing 'query_selector' and the result of the operation.", 273 | ]: # noqa: UP006 274 | """ 275 | Enters text into multiple DOM elements using a bulk operation. 276 | 277 | This function enters text into multiple DOM elements using a bulk operation. 278 | It takes a list of dictionaries, where each dictionary contains a 'query_selector' and 'text' pair. 279 | The function internally calls the 'entertext' function to perform the text entry operation for each entry. 280 | 281 | Args: 282 | entries: List of objects, each containing 'query_selector' and 'text'. 283 | 284 | Returns: 285 | List of dictionaries, each containing 'query_selector' and the result of the operation. 286 | 287 | Example: 288 | entries = [ 289 | {"query_selector": "#username", "text": "test_user"}, 290 | {"query_selector": "#password", "text": "test_password"} 291 | ] 292 | results = await bulk_enter_text(entries) 293 | 294 | Note: 295 | - Each entry in the 'entries' list should be a dictionary with 'query_selector' and 'text' keys. 296 | - The result is a list of dictionaries, where each dictionary contains the 'query_selector' and the result of the operation. 
297 | """ 298 | 299 | results: List[Dict[str, str]] = [] # noqa: UP006 300 | logger.info("Executing bulk Enter Text Command") 301 | for entry in entries: 302 | query_selector = entry["query_selector"] 303 | text_to_enter = entry["text"] 304 | logger.info( 305 | f"Entering text: {text_to_enter} in element with selector: {query_selector}" 306 | ) 307 | result = await entertext( 308 | EnterTextEntry(query_selector=query_selector, text=text_to_enter) 309 | ) 310 | 311 | results.append({"query_selector": query_selector, "result": result}) 312 | 313 | return results 314 | -------------------------------------------------------------------------------- /agentq/core/skills/get_dom_with_content_type.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Dict, Optional, Union 4 | 5 | from playwright.async_api import Page 6 | from typing_extensions import Annotated 7 | 8 | from agentq.config.config import SOURCE_LOG_FOLDER_PATH 9 | from agentq.core.web_driver.playwright import PlaywrightManager 10 | from agentq.utils.dom_helper import wait_for_non_loading_dom_state 11 | from agentq.utils.get_detailed_accessibility_tree import do_get_accessibility_info 12 | from agentq.utils.logger import logger 13 | 14 | 15 | async def get_dom_with_content_type( 16 | content_type: Annotated[ 17 | str, 18 | "The type of content to extract: 'text_only': Extracts the innerText of the highest element in the document and responds with text, or 'input_fields': Extracts the text input and button elements in the dom.", 19 | ], 20 | webpage: Optional[Page] = None, 21 | ) -> Annotated[ 22 | Union[Dict[str, Any], str, None], 23 | "The output based on the specified content type.", 24 | ]: 25 | """ 26 | Retrieves and processes the DOM of the active page in a browser instance based on the specified content type. 27 | 28 | Parameters 29 | ---------- 30 | content_type : str 31 | The type of content to extract. 
Possible values are: 32 | - 'text_only': Extracts the innerText of the highest element in the document and responds with text. 33 | - 'input_fields': Extracts the text input and button elements in the DOM and responds with a JSON object. 34 | - 'all_fields': Extracts all the fields in the DOM and responds with a JSON object. 35 | 36 | Returns 37 | ------- 38 | Dict[str, Any] | str | None 39 | The processed content based on the specified content type. This could be: 40 | - A JSON object for 'input_fields' with just inputs. 41 | - Plain text for 'text_only'. 42 | - A minified DOM represented as a JSON object for 'all_fields'. 43 | 44 | Raises 45 | ------ 46 | ValueError 47 | If an unsupported content_type is provided. 48 | """ 49 | 50 | logger.info(f"Executing Get DOM Command based on content_type: {content_type}") 51 | start_time = time.time() 52 | # Create and use the PlaywrightManager 53 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 54 | 55 | if webpage is not None: 56 | page = webpage 57 | else: 58 | page = await browser_manager.get_current_page() 59 | 60 | if page is None: # type: ignore 61 | raise ValueError("No active page found. OpenURL command opens a new page.") 62 | 63 | extracted_data = None 64 | await wait_for_non_loading_dom_state( 65 | page, 5000 66 | ) # wait for the DOM to be ready, non loading means external resources do not need to be loaded 67 | user_success_message = "" 68 | if content_type == "all_fields": 69 | user_success_message = "Fetched all the fields in the DOM" 70 | extracted_data = await do_get_accessibility_info(page, only_input_fields=False) 71 | elif content_type == "input_fields": 72 | logger.debug("Fetching DOM for input_fields") 73 | extracted_data = await do_get_accessibility_info(page, only_input_fields=True) 74 | if extracted_data is None: 75 | return "Could not fetch input fields. Please consider trying with content_type all_fields." 
76 | user_success_message = "Fetched only input fields in the DOM" 77 | elif content_type == "text_only": 78 | # Extract text from the body or the highest-level element 79 | logger.debug("Fetching DOM for text_only") 80 | text_content = await get_filtered_text_content(page) 81 | with open( 82 | os.path.join(SOURCE_LOG_FOLDER_PATH, "text_only_dom.txt"), 83 | "w", 84 | encoding="utf-8", 85 | ) as f: 86 | f.write(text_content) 87 | extracted_data = text_content 88 | user_success_message = "Fetched the text content of the DOM" 89 | else: 90 | raise ValueError(f"Unsupported content_type: {content_type}") 91 | 92 | elapsed_time = time.time() - start_time 93 | logger.info(f"Get DOM Command executed in {elapsed_time} seconds") 94 | # await browser_manager.notify_user( 95 | # user_success_message, message_type=MessageType.ACTION 96 | # ) 97 | return extracted_data # type: ignore 98 | 99 | 100 | async def get_filtered_text_content(page: Page) -> str: 101 | text_content = await page.evaluate(""" 102 | () => { 103 | // Array of query selectors to filter out 104 | const selectorsToFilter = ['#agente-overlay']; 105 | 106 | // Store the original visibility values to revert later 107 | const originalStyles = []; 108 | 109 | // Hide the elements matching the query selectors 110 | selectorsToFilter.forEach(selector => { 111 | const elements = document.querySelectorAll(selector); 112 | elements.forEach(element => { 113 | originalStyles.push({ element: element, originalStyle: element.style.visibility }); 114 | element.style.visibility = 'hidden'; 115 | }); 116 | }); 117 | 118 | // Get the text content of the page 119 | let textContent = document?.body?.innerText || document?.documentElement?.innerText || ""; 120 | 121 | // Get all the alt text from images on the page 122 | let altTexts = Array.from(document.querySelectorAll('img')).map(img => img.alt); 123 | altTexts="Other Alt Texts in the page: " + altTexts.join(' '); 124 | 125 | // Revert the visibility changes 126 | 
originalStyles.forEach(entry => { 127 | entry.element.style.visibility = entry.originalStyle; 128 | }); 129 | textContent=textContent+" "+altTexts; 130 | return textContent; 131 | } 132 | """) 133 | return text_content 134 | -------------------------------------------------------------------------------- /agentq/core/skills/get_screenshot.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 3 | from typing_extensions import Annotated, Optional 4 | 5 | from agentq.core.web_driver.playwright import PlaywrightManager 6 | from agentq.utils.logger import logger 7 | from playwright.async_api import Page 8 | 9 | 10 | async def get_screenshot( 11 | webpage: Optional[Page] = None 12 | ) -> ( 13 | Annotated[ 14 | str, "Returns a base64 encoded screenshot of the current active web page." 15 | ] 16 | ): 17 | """ 18 | Captures and returns a base64 encoded screenshot of the current page (only the visible viewport and not the full page) 19 | 20 | Returns: 21 | - Base64 encoded string of the screenshot image. 22 | """ 23 | 24 | try: 25 | # Create and use the PlaywrightManager 26 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 27 | if webpage is not None: 28 | page = webpage 29 | else: 30 | page = await browser_manager.get_current_page() 31 | logger.info("page {page}") 32 | 33 | if not page: 34 | logger.info("No active page found. OpenURL command opens a new page.") 35 | raise ValueError("No active page found. 
OpenURL command opens a new page.") 36 | 37 | await page.wait_for_load_state("domcontentloaded") 38 | 39 | # Capture the screenshot 40 | logger.info("about to capture") 41 | screenshot_bytes = await page.screenshot(full_page=False) 42 | 43 | # Encode the screenshot as base64 44 | base64_screenshot = base64.b64encode(screenshot_bytes).decode("utf-8") 45 | 46 | return f"data:image/png;base64,{base64_screenshot}" 47 | 48 | except Exception as e: 49 | raise ValueError( 50 | "Failed to capture screenshot. Make sure a page is open and accessible." 51 | ) from e 52 | -------------------------------------------------------------------------------- /agentq/core/skills/get_url.py: -------------------------------------------------------------------------------- 1 | from playwright.async_api import Page 2 | from typing_extensions import Annotated, Optional 3 | 4 | from agentq.core.web_driver.playwright import PlaywrightManager 5 | 6 | 7 | async def geturl( 8 | webpage: Optional[Page] = None, 9 | ) -> Annotated[str, "Returns the full URL of the current active web site/page."]: 10 | """ 11 | Returns the full URL of the current page 12 | 13 | Parameters: 14 | 15 | Returns: 16 | - Full URL the browser's active page. 17 | """ 18 | 19 | try: 20 | # Create and use the PlaywrightManager 21 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 22 | if webpage is not None: 23 | page = webpage 24 | else: 25 | page = await browser_manager.get_current_page() 26 | 27 | if not page: 28 | raise ValueError("No active page found. OpenURL command opens a new page.") 29 | 30 | await page.wait_for_load_state("domcontentloaded") 31 | 32 | # Get the URL of the current page 33 | try: 34 | title = await page.title() 35 | current_url = page.url 36 | if len(current_url) > 250: 37 | current_url = current_url[:250] + "..." 
38 | return f"Current Page: {current_url}, Title: {title}" # type: ignore 39 | except: # noqa: E722 40 | current_url = page.url 41 | return f"Current Page: {current_url}" 42 | 43 | except Exception as e: 44 | raise ValueError( 45 | "No active page found. OpenURL command opens a new page." 46 | ) from e 47 | -------------------------------------------------------------------------------- /agentq/core/skills/get_user_input.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Dict, 3 | List, # noqa: UP035, 4 | ) 5 | 6 | from typing_extensions import Annotated 7 | 8 | from agentq.core.web_driver.playwright import PlaywrightManager 9 | from agentq.utils.cli_helper import answer_questions_over_cli 10 | 11 | 12 | async def get_user_input( 13 | questions: Annotated[ 14 | List[str], "List of questions to ask the user each one represented as a string" 15 | ], 16 | ) -> Dict[str, str]: # noqa: UP006 17 | """ 18 | Asks the user a list of questions and returns the answers in a dictionary. 19 | 20 | Parameters: 21 | - questions: A list of questions to ask the user ["What is Username?", "What is your password?"]. 
22 | 23 | Returns: 24 | - Newline separated list of questions to ask the user 25 | """ 26 | 27 | answers: Dict[str, str] = {} 28 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 29 | if browser_manager.ui_manager: 30 | for question in questions: 31 | answers[question] = await browser_manager.prompt_user( 32 | f"Question: {question}" 33 | ) 34 | else: 35 | answers = await answer_questions_over_cli(questions) 36 | return answers 37 | -------------------------------------------------------------------------------- /agentq/core/skills/open_url.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | 4 | from playwright.async_api import TimeoutError as PlaywrightTimeoutError 5 | from typing_extensions import Annotated 6 | 7 | from agentq.core.web_driver.playwright import PlaywrightManager 8 | from agentq.utils.logger import logger 9 | 10 | 11 | async def openurl( 12 | url: Annotated[ 13 | str, 14 | "The URL to navigate to. Value must include the protocol (http:// or https://).", 15 | ], 16 | timeout: Annotated[int, "Additional wait time in seconds after initial load."], 17 | max_retries: Annotated[int, "Maximum number of retry attempts"] = 3, 18 | ) -> Annotated[str, "Returns the result of this request in text form"]: 19 | """ 20 | Opens a specified URL in the active browser instance. Waits for an initial load event, then waits for either 21 | the 'domcontentloaded' event or a configurable timeout, whichever comes first. 22 | 23 | Parameters: 24 | - url: The URL to navigate to. 25 | - timeout: Additional time in seconds to wait after the initial load before considering the navigation successful. 26 | - max_retries: Maximum number of retry attempts (default: 3). 27 | 28 | Returns: 29 | - URL of the new page. 
30 | """ 31 | logger.info(f"Opening URL: {url}") 32 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 33 | await browser_manager.get_browser_context() 34 | page = await browser_manager.get_current_page() 35 | # Navigate to the URL with a short timeout to ensure the initial load starts 36 | function_name = inspect.currentframe().f_code.co_name # type: ignore 37 | url = ensure_protocol(url) 38 | 39 | for attempt in range(max_retries): 40 | try: 41 | await browser_manager.take_screenshots(f"{function_name}_start", page) 42 | 43 | # set extra headers for bypassing ngrok 44 | await page.set_extra_http_headers({"User-Agent": "AgentQ-Sentient"}) 45 | 46 | # Use a longer timeout for navigation 47 | await page.goto( 48 | url, timeout=max(30000, timeout * 1000), wait_until="domcontentloaded" 49 | ) 50 | 51 | # Wait for network idle to ensure page is fully loaded 52 | await page.wait_for_load_state( 53 | "networkidle", timeout=max(30000, timeout * 1000) 54 | ) 55 | 56 | await browser_manager.take_screenshots(f"{function_name}_end", page) 57 | 58 | title = await page.title() 59 | final_url = page.url 60 | logger.info(f"Successfully loaded page: {final_url}") 61 | return f"Page loaded: {final_url}, Title: {title}" 62 | 63 | except PlaywrightTimeoutError as e: 64 | logger.warning(f"Timeout error on attempt {attempt + 1}: {e}") 65 | if attempt == max_retries - 1: 66 | logger.error(f"Failed to load {url} after {max_retries} attempts") 67 | return f"Failed to load page: {url}. Error: Timeout after {max_retries} attempts" 68 | await asyncio.sleep(2) # Wait before retrying 69 | 70 | except Exception as e: 71 | logger.error(f"Error navigating to {url}: {e}") 72 | return f"Failed to load page: {url}. 
Error: {str(e)}" 73 | 74 | await browser_manager.take_screenshots(f"{function_name}_end", page) 75 | 76 | # await browser_manager.notify_user( 77 | # f"Opened URL: {url}", message_type=MessageType.ACTION 78 | # ) 79 | # Get the page title 80 | title = await page.title() 81 | url = page.url 82 | return f"Page loaded: {url}, Title: {title}" # type: ignore 83 | 84 | 85 | def ensure_protocol(url: str) -> str: 86 | """ 87 | Ensures that a URL has a protocol (http:// or https://). If it doesn't have one, 88 | https:// is added by default. 89 | 90 | Parameters: 91 | - url: The URL to check and modify if necessary. 92 | 93 | Returns: 94 | - A URL string with a protocol. 95 | """ 96 | if not url.startswith(("http://", "https://")): 97 | url = "https://" + url # Default to http if no protocol is specified 98 | logger.info( 99 | f"Added 'https://' protocol to URL because it was missing. New URL is: {url}" 100 | ) 101 | return url 102 | -------------------------------------------------------------------------------- /agentq/core/skills/pdf_text_extractor.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import httpx 4 | import pdfplumber 5 | from typing_extensions import Annotated 6 | 7 | from agentq.config.config import PROJECT_TEMP_PATH 8 | from agentq.core.web_driver.playwright import PlaywrightManager 9 | from agentq.utils.logger import logger 10 | from agentq.utils.message_type import MessageType 11 | 12 | 13 | async def extract_text_from_pdf( 14 | pdf_url: Annotated[str, "The URL of the PDF file to extract text from."], 15 | ) -> Annotated[str, "All the text found in the PDF file."]: 16 | """ 17 | Extract text from a PDF file. 18 | pdf_url: str - The URL of the PDF file to extract text from. 19 | returns: str - All the text found in the PDF. 
20 | """ 21 | file_path = os.path.join( 22 | PROJECT_TEMP_PATH, "downloaded_file.pdf" 23 | ) # fixed file path for downloading the PDF 24 | 25 | try: 26 | # Create and use the PlaywrightManager 27 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 28 | 29 | # Download the PDF 30 | download_result = await download_pdf(pdf_url, file_path) 31 | if not os.path.exists(download_result): 32 | return download_result # Return error message if download failed 33 | 34 | # Open the PDF using pdfplumber and extract text 35 | text = "" 36 | with pdfplumber.open(download_result) as pdf: 37 | for page in pdf.pages: 38 | page_text = page.extract_text() 39 | if page_text: 40 | text += page_text + "\n" 41 | extracted_text = text.strip() 42 | word_count = len(extracted_text.split()) 43 | await browser_manager.notify_user( 44 | f"Extracted text from the PDF successfully. Found {word_count} words.", 45 | message_type=MessageType.ACTION, 46 | ) 47 | return "Text found in the PDF:\n" + extracted_text 48 | except httpx.HTTPStatusError as e: 49 | logger.error( 50 | f"An error occurred while downloading the PDF from {pdf_url}: {str(e)}" 51 | ) 52 | return f"An error occurred while downloading the PDF: {str(e)}" 53 | except Exception as e: 54 | logger.error( 55 | f"An error occurred while extracting text from the PDF that was downloaded from {pdf_url}: {str(e)}" 56 | ) 57 | return f"An error occurred while extracting text: {str(e)}" 58 | finally: 59 | # Cleanup: Ensure the downloaded file is removed 60 | cleanup_temp_files(file_path) 61 | 62 | 63 | def cleanup_temp_files(*file_paths: str) -> None: 64 | """ 65 | Remove the specified temporary files. 66 | 67 | *file_paths: str - One or more file paths to be removed. 
68 | """ 69 | for file_path in file_paths: 70 | if os.path.exists(file_path): 71 | try: 72 | os.remove(file_path) 73 | logger.debug(f"Cleaned file from the filesystem: {file_path}") 74 | except Exception as e: 75 | logger.error(f"Failed to remove {file_path}: {str(e)}") 76 | else: 77 | logger.debug( 78 | f"File not found. Unable to clean it from the filesystem: {file_path}" 79 | ) 80 | 81 | 82 | async def download_pdf(pdf_url: str, file_path: str) -> str: 83 | """ 84 | Download the PDF file from the given URL and save it to the specified path. 85 | 86 | pdf_url: str - The URL of the PDF file to download. 87 | file_path: str - The local path to save the downloaded PDF. 88 | 89 | returns: str - The file path of the downloaded PDF if successful, otherwise an error message. 90 | raises: Exception - If an error occurs during the download process. 91 | """ 92 | try: 93 | logger.info(f"Downloading PDF from: {pdf_url} to: {file_path}") 94 | async with httpx.AsyncClient() as client: 95 | response = await client.get(pdf_url) 96 | response.raise_for_status() # Ensure the request was successful 97 | with open(file_path, "wb") as pdf_file: 98 | pdf_file.write(response.content) 99 | return file_path 100 | # except httpx.HTTPStatusError as e: 101 | # raise e 102 | except Exception as e: 103 | raise e 104 | -------------------------------------------------------------------------------- /agentq/core/skills/press_key_combination.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | 4 | from playwright.async_api import Page # type: ignore 5 | from typing_extensions import Annotated 6 | 7 | from agentq.core.web_driver.playwright import PlaywrightManager 8 | from agentq.utils.dom_mutation_observer import ( 9 | subscribe, # type: ignore 10 | unsubscribe, # type: ignore 11 | ) 12 | from agentq.utils.logger import logger 13 | 14 | 15 | async def press_key_combination( 16 | key_combination: Annotated[str, "The key to 
press, e.g., Enter, PageDown etc"], 17 | ) -> str: 18 | """ 19 | Presses a key combination on the current active page managed by PlaywrightManager. 20 | 21 | This function simulates the pressing of a key or a combination of keys on the current active web page. 22 | The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination. 23 | For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows. 24 | 25 | Parameters: 26 | - key_combination (Annotated[str, "The key combination to press, e.g., 'Control+C'."]): The key combination to press, represented as a string. For combinations, use '+' as a separator. 27 | 28 | Raises: 29 | - ValueError: If no active page is found. 30 | 31 | Returns: 32 | str: status of the operation expressed as a string 33 | """ 34 | 35 | logger.info(f"Executing press_key_combination with key combo: {key_combination}") 36 | # Create and use the PlaywrightManager 37 | browser_manager = PlaywrightManager() 38 | page = await browser_manager.get_current_page() 39 | 40 | if page is None: # type: ignore 41 | raise ValueError("No active page found. 
OpenURL command opens a new page.") 42 | 43 | # Split the key combination if it's a combination of keys 44 | keys = key_combination.split("+") 45 | 46 | dom_changes_detected = None 47 | 48 | def detect_dom_changes(changes: str): # type: ignore 49 | nonlocal dom_changes_detected 50 | dom_changes_detected = changes # type: ignore 51 | 52 | subscribe(detect_dom_changes) 53 | # If it's a combination, hold down the modifier keys 54 | for key in keys[:-1]: # All keys except the last one are considered modifier keys 55 | await page.keyboard.down(key) 56 | 57 | # Press the last key in the combination 58 | await page.keyboard.press(keys[-1]) 59 | 60 | # Release the modifier keys 61 | for key in keys[:-1]: 62 | await page.keyboard.up(key) 63 | await asyncio.sleep( 64 | 0.1 65 | ) # sleep for 100ms to allow the mutation observer to detect changes 66 | unsubscribe(detect_dom_changes) 67 | 68 | if dom_changes_detected: 69 | return f"Key {key_combination} executed successfully.\n As a consequence of this action, new elements have appeared in view:{dom_changes_detected}. This means that the action is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction." 70 | 71 | # await browser_manager.notify_user( 72 | # f"Key {key_combination} executed successfully", message_type=MessageType.ACTION 73 | # ) 74 | return f"Key {key_combination} executed successfully" 75 | 76 | 77 | async def do_press_key_combination( 78 | browser_manager: PlaywrightManager, page: Page, key_combination: str 79 | ) -> bool: 80 | """ 81 | Presses a key combination on the provided page. 82 | 83 | This function simulates the pressing of a key or a combination of keys on a web page. 84 | The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination. 85 | For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows. 
86 | 87 | Parameters: 88 | - browser_manager (PlaywrightManager): The PlaywrightManager instance. 89 | - page (Page): The Playwright page instance. 90 | - key_combination (str): The key combination to press, represented as a string. For combinations, use '+' as a separator. 91 | 92 | Returns: 93 | bool: True if success and False if failed 94 | """ 95 | 96 | logger.info(f"Executing press_key_combination with key combo: {key_combination}") 97 | try: 98 | function_name = inspect.currentframe().f_code.co_name # type: ignore 99 | await browser_manager.take_screenshots(f"{function_name}_start", page) 100 | # Split the key combination if it's a combination of keys 101 | keys = key_combination.split("+") 102 | 103 | # If it's a combination, hold down the modifier keys 104 | for key in keys[ 105 | :-1 106 | ]: # All keys except the last one are considered modifier keys 107 | await page.keyboard.down(key) 108 | 109 | # Press the last key in the combination 110 | await page.keyboard.press(keys[-1]) 111 | 112 | # Release the modifier keys 113 | for key in keys[:-1]: 114 | await page.keyboard.up(key) 115 | 116 | except Exception as e: 117 | logger.error(f'Error executing press_key_combination "{key_combination}": {e}') 118 | return False 119 | 120 | await browser_manager.take_screenshots(f"{function_name}_end", page) 121 | 122 | return True 123 | -------------------------------------------------------------------------------- /agentq/core/skills/solve_captcha.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | from typing import Annotated 3 | 4 | from agentq.core.agent.captcha_agent import CaptchaAgent 5 | from agentq.core.models.models import CaptchaAgentInput, CaptchaAgentOutput 6 | from agentq.core.skills.enter_text_and_click import enter_text_and_click 7 | from agentq.core.skills.get_screenshot import get_screenshot 8 | from agentq.core.web_driver.playwright import PlaywrightManager 9 | from agentq.utils.logger import 
logger 10 | 11 | 12 | async def solve_captcha( 13 | text_selector: Annotated[ 14 | str, 15 | "The properly formatted DOM selector query, for example [mmid='1234'], where the captcha text will be entered. Use mmid attribute. mmid will always be a number", 16 | ], 17 | click_selector: Annotated[ 18 | str, 19 | "The properly formatted DOM selector query, for example [mmid='1234'], for the element that will be clicked after captch text entry. mmmid will be alwayes be a number", 20 | ], 21 | wait_before_click_execution: Annotated[ 22 | float, "Optional wait time in seconds before executing the click." 23 | ], 24 | ) -> Annotated[ 25 | str, "A message indicating success of failure of the captcha solving and submitting" 26 | ]: 27 | """ 28 | Solves a captcha, enters into the text element and submits it by clicking another element. 29 | 30 | Parameters: 31 | - text_selector: The selector for the element to enter the captcha into. It should be a properly formatted DOM selector query, for example [mmid='1234'], where the captcha text will be entered. Use the mmid attribute. 32 | - click_selector: The selector for the element to click post captcha is entered. It should be a properly formatted DOM selector query, for example [mmid='1234']. 33 | - wait_before_click_execution: Optional wait time in seconds before executing the click action. Default is 0.0. 34 | 35 | Returns: 36 | - A message indicating the success or failure of the cathcha entry and click. 37 | 38 | Raises: 39 | - ValueError: If no active page is found. The OpenURL command opens a new page. 
40 | 41 | Example usage: 42 | ``` 43 | await solve_captcha("[mmid='1234']", "[mmid='5678']", wait_before_click_execution=1.5) 44 | ``` 45 | - 46 | """ 47 | logger.info("Solving captcha") 48 | 49 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 50 | 51 | page = await browser_manager.get_current_page() 52 | 53 | if page is None: 54 | logger.error("No active page found") 55 | raise ValueError("No active page found. OpenURL command opens a new page") 56 | 57 | # Take ss for logging 58 | function_name = inspect.currentframe().f_code.co_name 59 | await browser_manager.highlight_element(text_selector, True) 60 | await browser_manager.take_screenshots(f"{function_name}_start", page=page) 61 | 62 | screenshot = await get_screenshot() 63 | captcha_agent = CaptchaAgent() 64 | input: CaptchaAgentInput = CaptchaAgentInput(objective="Solve this captcha") 65 | 66 | try: 67 | captcha_output: CaptchaAgentOutput = await captcha_agent.run(input, screenshot) 68 | except Exception as e: 69 | await browser_manager.take_screenshots(f"{function_name}_end", page=page) 70 | logger.error(f"Error in captcha_agent.run: {str(e)}") 71 | return "Failed to solve the captcha. Error in running the Captcha Agent" 72 | 73 | if not captcha_output.success: 74 | await browser_manager.take_screenshots(f"{function_name}_end", page=page) 75 | return "Failed to solve the captcha. Captcha agent did not succeed." 76 | 77 | success_msg = ( 78 | f"Success. 
Successfully solved the captcha {captcha_output.captcha}.\n" 79 | ) 80 | result = { 81 | "summary_message": success_msg, 82 | "detailed_message": f"{success_msg}", 83 | } 84 | 85 | # enter text and click 86 | enter_text_and_click_result = await enter_text_and_click( 87 | text_selector=text_selector, 88 | text_to_enter=captcha_output.captcha, 89 | click_selector=click_selector, 90 | wait_before_click_execution=wait_before_click_execution, 91 | ) 92 | 93 | if not enter_text_and_click_result.startswith("Success"): 94 | await browser_manager.take_screenshots(f"{function_name}_end", page) 95 | return f"Solved the captcha but failed to enter it & click '{enter_text_and_click_result}' into element with text selector '{text_selector} & click selector {click_selector}'. Check that the selctor is valid." 96 | 97 | result["detailed_message"] += f"{enter_text_and_click_result}" 98 | 99 | return result["detailed_message"] 100 | -------------------------------------------------------------------------------- /agentq/core/skills/upload_file.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | 3 | from agentq.core.web_driver.playwright import PlaywrightManager 4 | from agentq.utils.logger import logger 5 | 6 | 7 | async def upload_file( 8 | # label: Annotated[str, "Label for the element on which upload should happen"], 9 | selector: Annotated[ 10 | str, 11 | "The properly formed query selector string to identify the file input element (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector. mmid will always be a number", 12 | ], 13 | file_path: Annotated[str, "Path on the local system for the file to be uploaded"], 14 | ) -> Annotated[str, "A meesage indicating if the file uplaod was successful"]: 15 | """ 16 | Uploads a file. 17 | 18 | Parameters: 19 | - file_path: Path of the file that needs to be uploaded. 
20 | 21 | Returns: 22 | - A message indicating the success or failure of the file upload 23 | """ 24 | logger.info( 25 | f"Uploading file onto the page from {file_path} using selector {selector}" 26 | ) 27 | print("naman-selector") 28 | # print(label) 29 | # label = "Add File" 30 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 31 | page = await browser_manager.get_current_page() 32 | 33 | if not page: 34 | raise ValueError("No active page found. OpenURL command opens a new page") 35 | 36 | await page.wait_for_load_state("domcontentloaded") 37 | 38 | try: 39 | await page.locator(selector).set_input_files(file_path) 40 | # await page.get_by_label(label).set_input_files(file_path) 41 | logger.info( 42 | "File upload was successful. I can confirm it. Please proceed ahead with next step." 43 | ) 44 | except Exception as e: 45 | logger.error(f"Failed to upload file: {e}") 46 | return f"File upload failed {e}" 47 | -------------------------------------------------------------------------------- /agentq/core/web_driver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/core/web_driver/__init__.py -------------------------------------------------------------------------------- /agentq/user_preferences/user_preferences.txt: -------------------------------------------------------------------------------- 1 | 1. Your job is to find the relevant product asked by the user on a locally hosted e-commerce website: http://localhost:3000/abc 2 | 2. Your search query should be small and should only have important keywords. In general, you should try to checkout the first couple of results shown in the search results.  3 | 3. Remember, a lot of custom filters like size, color, flavor, quantity, etc. 
are only available on the product page, and the product title on the search page may not exactly fit requirements but can loosely relate to the search query, so check out the first couple of search results that match most closely.  4 | E.g., you are asked to buy strawberry biscuits from britannia - but when you search britannia strawberry biscuits, you get search results with chocolate-flavored britannia biscuits. You should still click on it, as there could be an option on the product page to change flavor from chocolate to strawberry because the brand is the same and the product is the same: "britannia and biscuits." 5 | 4. If you do not find the relevant product in the first couple of search results pages, you should go back to the search page and try to search with a different query. 6 | 5. Make sure to pay attention to all the attributes like size, color, quantity, etc. mentioned in the user's query and select appropriate attributes on the product details page before buying it.  7 | 6. Ultimately, your task will only end when you click on the "Buy Now" button of the right product. 
-------------------------------------------------------------------------------- /agentq/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/agentq/utils/__init__.py -------------------------------------------------------------------------------- /agentq/utils/_pydantic.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Tuple, Union, get_args 2 | 3 | from pydantic import BaseModel 4 | from pydantic.version import VERSION as PYDANTIC_VERSION 5 | from typing_extensions import get_origin 6 | 7 | __all__ = ( 8 | "JsonSchemaValue", 9 | "model_dump", 10 | "model_dump_json", 11 | "type2schema", 12 | "evaluate_forwardref", 13 | ) 14 | 15 | PYDANTIC_V1 = PYDANTIC_VERSION.startswith("1.") 16 | 17 | if not PYDANTIC_V1: 18 | from pydantic import TypeAdapter 19 | from pydantic._internal._typing_extra import ( 20 | eval_type_lenient as evaluate_forwardref, 21 | ) 22 | from pydantic.json_schema import JsonSchemaValue 23 | 24 | def type2schema(t: Any) -> JsonSchemaValue: 25 | """Convert a type to a JSON schema 26 | 27 | Args: 28 | t (Type): The type to convert 29 | 30 | Returns: 31 | JsonSchemaValue: The JSON schema 32 | """ 33 | return TypeAdapter(t).json_schema() 34 | 35 | def model_dump(model: BaseModel) -> Dict[str, Any]: 36 | """Convert a pydantic model to a dict 37 | 38 | Args: 39 | model (BaseModel): The model to convert 40 | 41 | Returns: 42 | Dict[str, Any]: The dict representation of the model 43 | 44 | """ 45 | return model.model_dump() 46 | 47 | def model_dump_json(model: BaseModel) -> str: 48 | """Convert a pydantic model to a JSON string 49 | 50 | Args: 51 | model (BaseModel): The model to convert 52 | 53 | Returns: 54 | str: The JSON string representation of the model 55 | """ 56 | return model.model_dump_json() 57 | 58 | 59 | # Remove this once we 
drop support for pydantic 1.x 60 | else: # pragma: no cover 61 | from pydantic import TypeAdapter 62 | from pydantic.typing import ( 63 | evaluate_forwardref as evaluate_forwardref, # type: ignore[no-redef] 64 | ) 65 | 66 | JsonSchemaValue = Dict[str, Any] # type: ignore[misc] 67 | 68 | def type2schema(t: Any) -> JsonSchemaValue: 69 | """Convert a type to a JSON schema 70 | 71 | Args: 72 | t (Type): The type to convert 73 | 74 | Returns: 75 | JsonSchemaValue: The JSON schema 76 | """ 77 | if PYDANTIC_V1: 78 | if t is None: 79 | return {"type": "null"} 80 | elif get_origin(t) is Union: 81 | return {"anyOf": [type2schema(tt) for tt in get_args(t)]} 82 | elif get_origin(t) in [Tuple, tuple]: 83 | prefixItems = [type2schema(tt) for tt in get_args(t)] 84 | return { 85 | "maxItems": len(prefixItems), 86 | "minItems": len(prefixItems), 87 | "prefixItems": prefixItems, 88 | "type": "array", 89 | } 90 | 91 | d = TypeAdapter.json_schema(t) 92 | if "title" in d: 93 | d.pop("title") 94 | if "description" in d: 95 | d.pop("description") 96 | 97 | return d 98 | 99 | def model_dump(model: BaseModel) -> Dict[str, Any]: 100 | """Convert a pydantic model to a dict 101 | 102 | Args: 103 | model (BaseModel): The model to convert 104 | 105 | Returns: 106 | Dict[str, Any]: The dict representation of the model 107 | 108 | """ 109 | return model.dict() 110 | 111 | def model_dump_json(model: BaseModel) -> str: 112 | """Convert a pydantic model to a JSON string 113 | 114 | Args: 115 | model (BaseModel): The model to convert 116 | 117 | Returns: 118 | str: The JSON string representation of the model 119 | """ 120 | return model.json() 121 | -------------------------------------------------------------------------------- /agentq/utils/cli_helper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from asyncio import Future 3 | from typing import Dict, List 4 | 5 | 6 | def async_input(prompt: str) -> Future: # type: ignore 7 | """ 8 | 
Display a prompt to the user and wait for input in an asynchronous manner. 9 | 10 | Parameters: 11 | - prompt: The message to display to the user. 12 | 13 | Returns: 14 | - A Future object that will be fulfilled with the user's input. 15 | """ 16 | loop = asyncio.get_event_loop() 17 | return loop.run_in_executor(None, input, prompt) 18 | 19 | 20 | async def answer_questions_over_cli(questions: List[str]) -> Dict[str, str]: 21 | """ 22 | Asks a question over the command line and awaits the user's response. 23 | 24 | Parameters: 25 | - questions: A list of questions to ask the user, e.g., ["What is your favorite site?", "What do you want to search for?"]. 26 | 27 | Returns: 28 | - A dictionary where each key is a question and each value is the user's response. 29 | """ 30 | answers: Dict[str, str] = {} 31 | print("*********************************") 32 | for question in questions: 33 | answers[question] = await async_input("Question: " + str(question) + " : ") 34 | print("*********************************") 35 | return answers 36 | -------------------------------------------------------------------------------- /agentq/utils/dom_helper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List, Optional 3 | 4 | from playwright.async_api import ElementHandle, Page 5 | 6 | from agentq.utils.logger import logger 7 | 8 | 9 | async def wait_for_non_loading_dom_state(page: Page, max_wait_millis: int): 10 | max_wait_seconds = max_wait_millis / 1000 11 | end_time = asyncio.get_event_loop().time() + max_wait_seconds 12 | while asyncio.get_event_loop().time() < end_time: 13 | dom_state = await page.evaluate("document.readyState") 14 | if dom_state != "loading": 15 | logger.debug(f"DOM state is not 'loading': {dom_state}") 16 | break # Exit the loop if the DOM state is not 'loading' 17 | 18 | await asyncio.sleep(0.05) 19 | 20 | 21 | async def get_element_outer_html( 22 | element: ElementHandle, page: Page, 
element_tag_name: Optional[str] = None 23 | ) -> str: 24 | """ 25 | Constructs the opening tag of an HTML element along with its attributes. 26 | 27 | Args: 28 | element (ElementHandle): The element to retrieve the opening tag for. 29 | page (Page): The page object associated with the element. 30 | element_tag_name (str, optional): The tag name of the element. Defaults to None. If not passed, it will be retrieved from the element. 31 | 32 | Returns: 33 | str: The opening tag of the HTML element, including a select set of attributes. 34 | """ 35 | tag_name: str = ( 36 | element_tag_name 37 | if element_tag_name 38 | else await page.evaluate("element => element.tagName.toLowerCase()", element) 39 | ) 40 | 41 | attributes_of_interest: List[str] = [ 42 | "id", 43 | "name", 44 | "aria-label", 45 | "placeholder", 46 | "href", 47 | "src", 48 | "aria-autocomplete", 49 | "role", 50 | "type", 51 | "data-testid", 52 | "value", 53 | "selected", 54 | "aria-labelledby", 55 | "aria-describedby", 56 | "aria-haspopup", 57 | ] 58 | opening_tag: str = f"<{tag_name}" 59 | 60 | for attr in attributes_of_interest: 61 | value: str = await element.get_attribute(attr) # type: ignore 62 | if value: 63 | opening_tag += f' {attr}="{value}"' 64 | opening_tag += ">" 65 | 66 | return opening_tag 67 | -------------------------------------------------------------------------------- /agentq/utils/dom_mutation_observer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from typing import Callable, List # noqa: UP035 4 | 5 | from playwright.async_api import Page 6 | 7 | # Create an event loop 8 | loop = asyncio.get_event_loop() 9 | 10 | DOM_change_callback: List[Callable[[str], None]] = [] 11 | 12 | 13 | def subscribe(callback: Callable[[str], None]) -> None: 14 | DOM_change_callback.append(callback) 15 | 16 | 17 | def unsubscribe(callback: Callable[[str], None]) -> None: 18 | DOM_change_callback.remove(callback) 19 | 20 | 21 | 
async def add_mutation_observer(page: Page): 22 | """ 23 | Adds a mutation observer to the page to detect changes in the DOM. 24 | When changes are detected, the observer calls the dom_mutation_change_detected function in the browser context. 25 | This changes can be detected by subscribing to the dom_mutation_change_detected function by individual skills. 26 | 27 | Current implementation only detects when a new node is added to the DOM. 28 | However, in many cases, the change could be a change in the style or class of an existing node (e.g. toggle visibility of a hidden node). 29 | """ 30 | 31 | await page.evaluate(""" 32 | console.log('Adding a mutation observer for DOM changes'); 33 | new MutationObserver((mutationsList, observer) => { 34 | let changes_detected = []; 35 | for(let mutation of mutationsList) { 36 | if (mutation.type === 'childList') { 37 | let allAddedNodes=mutation.addedNodes; 38 | for(let node of allAddedNodes) { 39 | if(node.tagName && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.tagName) && !node.closest('#agentDriveAutoOverlay')) { 40 | let visibility=true; 41 | let content = node.innerText.trim(); 42 | if(visibility && node.innerText.trim()){ 43 | if(content) { 44 | changes_detected.push({tag: node.tagName, content: content}); 45 | } 46 | } 47 | } 48 | } 49 | } else if (mutation.type === 'characterData') { 50 | let node = mutation.target; 51 | if(node.parentNode && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.parentNode.tagName) && !node.parentNode.closest('#agentDriveAutoOverlay')) { 52 | let visibility=true; 53 | let content = node.data.trim(); 54 | if(visibility && content && window.getComputedStyle(node.parentNode).display !== 'none'){ 55 | if(content && !changes_detected.some(change => change.content.includes(content))) { 56 | changes_detected.push({tag: node.parentNode.tagName, content: content}); 57 | } 58 | } 59 | } 60 | } 61 | } 62 | if(changes_detected.length > 0) { 63 | 
def extract_json(message: str) -> Dict[str, Any]:
    """
    Parse the response from the browser agent and return the response as a dictionary.

    The message is first stripped of an optional Markdown code fence and an
    optional leading ``json`` tag, then parsed with ``json.loads``. If parsing
    fails, a best-effort text scan recovers the ``plan``, ``next_step``,
    ``terminate`` and ``final_response`` fields by slicing between those
    keywords.

    Args:
        message: Raw text returned by the LLM.

    Returns:
        The parsed JSON value when the message is valid JSON; otherwise a
        dictionary holding whichever of the known fields could be salvaged
        (possibly empty).
    """
    json_response: Dict[str, Any] = {}

    # Remove Markdown code block delimiters if present
    message = message.strip()
    if message.startswith("```"):
        # Drop the whole opening fence line (e.g. "```json"). When the fence
        # shares its line with the payload, drop only the backticks instead of
        # indexing past the end of a one-element split.
        _, newline, remainder = message.partition("\n")
        message = remainder if newline else message[3:]
    if message.endswith("```"):
        # Drop the closing fence line only when it holds nothing but the
        # fence; otherwise strip just the backticks so that payload sharing
        # the line with the fence is kept.
        head, newline, last_line = message.rpartition("\n")
        if newline and last_line.strip() == "```":
            message = head
        else:
            message = message[:-3]

    # Remove any leading "json" tag
    if message.lstrip().startswith("json"):
        message = message.lstrip()[4:].lstrip()

    try:
        return json.loads(message)
    except json.JSONDecodeError as e:
        # Logger.warn is a deprecated alias; warning() is the supported name.
        logger.warning(
            f"LLM response was not properly formed JSON. Error: {e}. "
            f'LLM response: "{message}"'
        )

    # Fallback: flatten the text and slice out the known fields by keyword.
    message = message.replace("\\n", "\n").replace("\n", " ")

    def _clean(fragment: str) -> str:
        return fragment.replace('"', "").strip()

    def _between(start_key: str, end_key: str) -> str:
        start = message.index(start_key) + len(start_key)
        return _clean(message[start : message.index(end_key)])

    def _after(key: str) -> str:
        start = message.index(key) + len(key)
        # The final character is dropped (presumably the closing brace of the
        # malformed JSON object).
        return _clean(message[start : len(message) - 1])

    has_terminate = "terminate" in message

    if "plan" in message and "next_step" in message:
        json_response["plan"] = _between("plan", "next_step")
    if "next_step" in message and has_terminate:
        json_response["next_step"] = _between("next_step", "terminate")
    if has_terminate and "final_response" in message:
        terminate_text = _between("terminate", "final_response")
        json_response["terminate"] = "yes" if "yes" in terminate_text else "no"
        json_response["final_response"] = _after("final_response")
    elif has_terminate:
        json_response["terminate"] = "yes" if "yes" in _after("terminate") else "no"

    return json_response
def get_typed_annotation(annotation: Any, globalns: Dict[str, Any]) -> Any:
    """Resolve a parameter annotation that may be given as a string.

    Args:
        annotation: The annotation of the parameter
        globalns: The global namespace of the function

    Returns:
        ``annotation`` unchanged when it is not a string; otherwise the result
        of evaluating it as a forward reference, with ``globalns`` used as
        both the global and the local namespace.
    """
    if not isinstance(annotation, str):
        return annotation

    forward_ref = ForwardRef(annotation)
    return evaluate_forwardref(forward_ref, globalns, globalns)
def get_param_annotations(
    typed_signature: inspect.Signature,
) -> Dict[str, Union[Annotated[Type[Any], str], Type[Any]]]:
    """Collect the annotations of every annotated parameter of a signature.

    Args:
        typed_signature: The signature of the function with type annotations

    Returns:
        A dictionary mapping parameter name to annotation, in signature
        order; parameters without an annotation are left out.
    """
    annotated = {}
    for param_name, param in typed_signature.parameters.items():
        if param.annotation is inspect.Signature.empty:
            continue
        annotated[param_name] = param.annotation
    return annotated
def get_parameter_json_schema(
    k: str, v: Any, default_values: Dict[str, Any]
) -> JsonSchemaValue:
    """Build the JSON schema for a single function parameter.

    Args:
        k: The parameter name; used as the description when the annotation
            carries none.
        v: The type annotation of the parameter.
        default_values: Default values of the function's parameters. Accepted
            for interface compatibility; not consulted here.

    Returns:
        The JSON schema for the parameter, with a ``description`` added.
        Object schemas (top-level or as array items) are closed with
        ``additionalProperties: False`` and always carry a ``properties`` key.
    """
    if isinstance(v, type) and issubclass(v, Page):
        # Skip schema generation for Page objects - some tools take page as an
        # optional input (this is only utilised during evals when page object
        # is passed to functions like get_dom_content)
        return {
            "type": "object",
            "description": "Playwright Page object",
        }

    # Handle Optional types
    if get_origin(v) is Union and type(None) in get_args(v):
        non_none_type = next(arg for arg in get_args(v) if arg is not type(None))
        if isinstance(non_none_type, type) and issubclass(non_none_type, Page):
            # Skip schema generation for Optional[Page]
            return {
                "type": "object",
                "description": "Optional Playwright Page object",
            }

    def type2description(k: str, v: Union[Annotated[Type[Any], str], Type[Any]]) -> str:
        # Use the string metadata of Annotated[T, "text"]; else the name.
        if get_origin(v) is Annotated:
            args = get_args(v)
            if len(args) > 1 and isinstance(args[1], str):
                return args[1]
        return k

    schema = type2schema(v)
    schema["description"] = type2description(k, v)

    # A generated schema is not guaranteed to have a top-level "type" (for
    # example a union rendered as "anyOf"), so look it up defensively instead
    # of raising KeyError.
    schema_type = schema.get("type")

    if schema_type == "object":
        schema["additionalProperties"] = False
        schema.setdefault("properties", {})
    elif schema_type == "array":
        if "items" not in schema:
            schema["items"] = {
                "type": "object",
                "properties": {},
                "additionalProperties": False,
            }
        else:
            items = schema["items"]
            # Only dict-shaped item schemas can be inspected and closed.
            if isinstance(items, dict) and items.get("type") == "object":
                items.setdefault("properties", {})
                items["additionalProperties"] = False

    return schema
and any( 314 | isinstance(arg, type) and issubclass(arg, Page) 315 | for arg in get_args(v_type) 316 | ) 317 | ): 318 | continue 319 | 320 | 321 | if get_origin(v_type) is List: 322 | item_type = get_args(v_type)[0] 323 | properties[k] = { 324 | "type": "array", 325 | "items": get_parameter_json_schema(k, item_type, default_values), 326 | "description": v_desc, 327 | } 328 | else: 329 | properties[k] = get_parameter_json_schema(k, v_type, default_values) 330 | properties[k]["description"] = v_desc 331 | 332 | return Parameters( 333 | properties=properties, 334 | required=list(properties.keys()), # All properties are required 335 | additionalProperties=False, 336 | ) 337 | 338 | 339 | def get_missing_annotations( 340 | typed_signature: inspect.Signature, required: List[str] 341 | ) -> Tuple[Set[str], Set[str]]: 342 | """Get the missing annotations of a function 343 | 344 | Ignores the parameters with default values as they are not required to be annotated, but logs a warning. 345 | Args: 346 | typed_signature: The signature of the function with type annotations 347 | required: The required parameters of the function 348 | 349 | Returns: 350 | A set of the missing annotations of the function 351 | """ 352 | all_missing = { 353 | k 354 | for k, v in typed_signature.parameters.items() 355 | if v.annotation is inspect.Signature.empty 356 | } 357 | missing = all_missing.intersection(set(required)) 358 | unannotated_with_default = all_missing.difference(missing) 359 | return missing, unannotated_with_default 360 | 361 | 362 | def get_function_schema( 363 | f: Callable[..., Any], *, name: Optional[str] = None, description: str 364 | ) -> Dict[str, Any]: 365 | """Get a JSON schema for a function as defined by the OpenAI API 366 | 367 | Args: 368 | f: The function to get the JSON schema for 369 | name: The name of the function 370 | description: The description of the function 371 | 372 | Returns: 373 | A JSON schema for the function 374 | 375 | Raises: 376 | TypeError: If 
the function is not annotated 377 | 378 | Examples: 379 | 380 | ```python 381 | def f(a: Annotated[str, "Parameter a"], b: int = 2, c: Annotated[float, "Parameter c"] = 0.1) -> None: 382 | pass 383 | 384 | get_function_schema(f, description="function f") 385 | 386 | # {'type': 'function', 387 | # 'function': {'description': 'function f', 388 | # 'name': 'f', 389 | # 'parameters': {'type': 'object', 390 | # 'properties': {'a': {'type': 'str', 'description': 'Parameter a'}, 391 | # 'b': {'type': 'int', 'description': 'b'}, 392 | # 'c': {'type': 'float', 'description': 'Parameter c'}}, 393 | # 'required': ['a']}}} 394 | ``` 395 | 396 | """ 397 | typed_signature = get_typed_signature(f) 398 | required = get_required_params(typed_signature) 399 | default_values = get_default_values(typed_signature) 400 | param_annotations = get_param_annotations(typed_signature) 401 | return_annotation = get_typed_return_annotation(f) 402 | missing, unannotated_with_default = get_missing_annotations( 403 | typed_signature, required 404 | ) 405 | 406 | if return_annotation is None: 407 | logger.warning( 408 | f"The return type of the function '{f.__name__}' is not annotated. Although annotating it is " 409 | + "optional, the function should return either a string, a subclass of 'pydantic.BaseModel'." 410 | ) 411 | 412 | if unannotated_with_default != set(): 413 | unannotated_with_default_s = [ 414 | f"'{k}'" for k in sorted(unannotated_with_default) 415 | ] 416 | logger.warning( 417 | f"The following parameters of the function '{f.__name__}' with default values are not annotated: " 418 | + f"{', '.join(unannotated_with_default_s)}." 419 | ) 420 | 421 | if missing != set(): 422 | missing_s = [f"'{k}'" for k in sorted(missing)] 423 | raise TypeError( 424 | f"All parameters of the function '{f.__name__}' without default values must be annotated. 
def load_basemodels_if_needed(func: Callable[..., Any]) -> Callable[..., Any]:
    """A decorator to load the parameters of a function if they are Pydantic models

    Keyword arguments whose annotation is a Pydantic model (optionally wrapped
    in ``Annotated``) are converted from a dictionary into a model instance
    before ``func`` is called. Only keyword arguments are converted;
    positional arguments and omitted parameters are passed through untouched.

    Args:
        func: The function with annotated parameters

    Returns:
        A function that loads the parameters before calling the original
        function. It is a coroutine function when ``func`` is one.
    """
    # get the type annotations of the parameters
    typed_signature = get_typed_signature(func)
    param_annotations = get_param_annotations(typed_signature)

    # keep a loader only for the parameters that actually need one
    kwargs_mapping = {}
    for k, t in param_annotations.items():
        loader = get_load_param_if_needed_function(t)
        if loader is not None:
            kwargs_mapping[k] = loader

    def _load_models(kwargs: Dict[str, Any]) -> None:
        # Shared by the sync and async wrappers. A model-typed parameter may
        # be passed positionally or left at its default, in which case it is
        # absent from kwargs and must be skipped rather than raise KeyError.
        for k, loader in kwargs_mapping.items():
            if k in kwargs:
                kwargs[k] = loader(kwargs[k], param_annotations[k])

    # a function that loads the parameters before calling the original function
    @functools.wraps(func)
    def _load_parameters_if_needed(*args: Any, **kwargs: Any) -> Any:
        _load_models(kwargs)
        return func(*args, **kwargs)

    @functools.wraps(func)
    async def _a_load_parameters_if_needed(*args: Any, **kwargs: Any) -> Any:
        _load_models(kwargs)
        return await func(*args, **kwargs)

    if inspect.iscoroutinefunction(func):
        return _a_load_parameters_if_needed
    return _load_parameters_if_needed
def set_log_level(level: Union[str, int]) -> None:
    """
    Set the log level for the logger.

    Parameters:
    - level (Union[str, int]): Either a level name such as 'debug', 'info', 'warning', 'error', or 'critical' (case-insensitive), or a numeric logging constant like logging.DEBUG, logging.INFO, etc.

    Raises:
    - ValueError: If a string is given that does not name an integer attribute of the logging module.
    """
    # Non-string levels are handed to the logger as-is.
    if not isinstance(level, str):
        logger.setLevel(level)
        return

    level_name = level.upper()
    resolved = getattr(logging, level_name, None)
    if not isinstance(resolved, int):
        raise ValueError(f"Invalid log level: {level_name}")
    logger.setLevel(resolved)
-------------------------------------------------------------------------------- /dpo_pairs.jsonl: -------------------------------------------------------------------------------- 1 | {"prompt": "Objective: go to football page on bbc\nCurrent DOM: {'role': 'WebArea', 'name': 'Google', 'children': [{'name': 'About', 'mmid': '42', 'tag': 'a'}, {'name': 'Store', 'mmid': '43', 'tag': 'a'}, {'name': 'Gmail ', 'mmid': '52', 'tag': 'a'}, {'name': 'Search for Images ', 'mmid': '54', 'tag': 'a'}, {'role': 'button', 'name': 'Search Labs', 'mmid': '59', 'tag': 'a'}, {'role': 'button', 'name': 'Google apps', 'mmid': '64', 'tag': 'a'}, {'role': 'button', 'name': 'Google Account: Naman Jain (namanbhulawat@gmail.com)', 'mmid': '70', 'tag': 'a', 'aria-label': 'Google Account: Naman Jain \\n(namanbhulawat@gmail.com)'}, {'role': 'region', 'name': 'Celebrating popcorn', 'children': [{'name': 'Celebrating popcorn', 'mmid': '109', 'tag': 'button'}], 'mmid': '101', 'tag': 'div'}, {'role': 'combobox', 'name': 'q', 'description': 'Search', 'focused': True, 'autocomplete': 'both', 'mmid': '170', 'tag': 'textarea', 'aria-label': 'Search'}, {'role': 'button', 'name': 'Search by voice', 'mmid': '186', 'tag': 'div'}, {'role': 'button', 'name': 'Search by im...", "chosen": "Action: {\"type\":\"ENTER_TEXT_AND_CLICK\",\"text_element_mmid\":170,\"text_to_enter\":\"BBC Sport football page\",\"click_element_mmid\":383,\"wait_before_click_execution\":null}\nDescription: Search for 'BBC Sport football page' in Google search bar and click the search button", "rejected": "Action: {\"type\":\"GOTO_URL\",\"website\":\"https://www.bbc.com/sport\",\"timeout\":2.0}\nDescription: Directly go to BBC Sport page by visiting 'https://www.bbc.com/sport'"} 2 | {"prompt": "Objective: go to football page on bbc\nCurrent DOM: {'role': 'WebArea', 'name': 'Google', 'children': [{'name': 'About', 'mmid': '42', 'tag': 'a'}, {'name': 'Store', 'mmid': '43', 'tag': 'a'}, {'name': 'Gmail ', 'mmid': '52', 'tag': 'a'}, {'name': 
'Search for Images ', 'mmid': '54', 'tag': 'a'}, {'role': 'button', 'name': 'Search Labs', 'mmid': '59', 'tag': 'a'}, {'role': 'button', 'name': 'Google apps', 'mmid': '64', 'tag': 'a'}, {'role': 'button', 'name': 'Google Account: Naman Jain (namanbhulawat@gmail.com)', 'mmid': '70', 'tag': 'a', 'aria-label': 'Google Account: Naman Jain \\n(namanbhulawat@gmail.com)'}, {'role': 'region', 'name': 'Celebrating popcorn', 'children': [{'name': 'Celebrating popcorn', 'mmid': '109', 'tag': 'button'}], 'mmid': '101', 'tag': 'div'}, {'role': 'combobox', 'name': 'q', 'description': 'Search', 'focused': True, 'autocomplete': 'both', 'mmid': '170', 'tag': 'textarea', 'aria-label': 'Search'}, {'role': 'button', 'name': 'Search by voice', 'mmid': '186', 'tag': 'div'}, {'role': 'button', 'name': 'Search by im...", "chosen": "Action: {\"type\":\"ENTER_TEXT_AND_CLICK\",\"text_element_mmid\":170,\"text_to_enter\":\"BBC Sport football page\",\"click_element_mmid\":383,\"wait_before_click_execution\":null}\nDescription: Search for 'BBC Sport football page' in Google search bar and click the search button", "rejected": "Action: {\"type\":\"ENTER_TEXT_AND_CLICK\",\"text_element_mmid\":170,\"text_to_enter\":\"BBC football\",\"click_element_mmid\":383,\"wait_before_click_execution\":null}\nDescription: Search for 'BBC football' on Google and click the search button"} 3 | {"prompt": "Objective: go to football page on bbc\nCurrent DOM: Error: Unable to retrieve DOM...", "chosen": "Action: {\"type\":\"GOTO_URL\",\"website\":\"https://www.bbc.co.uk/sport/football\",\"timeout\":3.0}\nDescription: Navigate directly to the BBC Sport Football page via URL.", "rejected": "Action: {\"type\":\"GOTO_URL\",\"website\":\"https://www.bbc.co.uk\",\"timeout\":3.0}\nDescription: Navigate to the BBC homepage to find the sports section."} 4 | {"prompt": "Objective: go to football page on bbc\nCurrent DOM: Error: Unable to retrieve DOM...", "chosen": "Action: 
{\"type\":\"GOTO_URL\",\"website\":\"https://www.bbc.co.uk/sport/football\",\"timeout\":3.0}\nDescription: Navigate directly to the BBC Sport Football page via URL.", "rejected": "Action: {\"type\":\"ENTER_TEXT_AND_CLICK\",\"text_element_mmid\":170,\"text_to_enter\":\"BBC Sport football\",\"click_element_mmid\":383,\"wait_before_click_execution\":null}\nDescription: Refine search on Google for the specific BBC Sport football page."} 5 | -------------------------------------------------------------------------------- /logs.txt: -------------------------------------------------------------------------------- 1 | Starting System Orchestrator... 2 | Browser profile /Users/namanjain/Library/Application Support/Google/Chrome 3 | Browser started and ready. 4 | Enter your command (or type 'exit' to quit): -------------------------------------------------------------------------------- /output.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/output.txt -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "agentq" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["nischalj10 "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.10,<4.0" 10 | litellm = "^1.42.9" 11 | pydantic = "^2.8.2" 12 | pytest-playwright = "^0.5.1" 13 | pdfplumber = "0.11.2" 14 | typing-extensions = "^4.12.2" 15 | ruff = "^0.5.6" 16 | playwright-stealth = "^1.0.6" 17 | setuptools = "^72.1.0" 18 | openai = "^1.40.1" 19 | boto3 = "^1.34.157" 20 | python-json-logger = "^2.0.7" 21 | aiohttp = "^3.10.2" 22 | colorama = "^0.4.6" 23 | tiktoken = "^0.7.0" 24 | agentops = "^0.3.10" 25 | termcolor = "^2.4.0" 26 | tabulate = "^0.9.0" 27 | nltk = "^3.9.1" 28 | langsmith = "^0.1.104" 29 | 
instructor = "^1.4.0" 30 | flask = "^3.0.3" 31 | numpy = "^2.1.0" 32 | 33 | 34 | [build-system] 35 | requires = ["poetry-core"] 36 | build-backend = "poetry.core.masonry.api" 37 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | agentops==0.3.10 ; python_version >= "3.10" and python_version < "4.0" 2 | aiohappyeyeballs==2.4.0 ; python_version >= "3.10" and python_version < "4.0" 3 | aiohttp==3.10.5 ; python_version >= "3.10" and python_version < "4.0" 4 | aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "4.0" 5 | annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "4.0" 6 | anyio==4.4.0 ; python_version >= "3.10" and python_version < "4.0" 7 | async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.11" 8 | attrs==24.2.0 ; python_version >= "3.10" and python_version < "4.0" 9 | boto3==1.35.1 ; python_version >= "3.10" and python_version < "4.0" 10 | botocore==1.35.1 ; python_version >= "3.10" and python_version < "4.0" 11 | certifi==2024.7.4 ; python_version >= "3.10" and python_version < "4.0" 12 | cffi==1.17.0 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy" 13 | charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "4.0" 14 | click==8.1.7 ; python_version >= "3.10" and python_version < "4.0" 15 | colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0" 16 | cryptography==43.0.0 ; python_version >= "3.10" and python_version < "4.0" 17 | distro==1.9.0 ; python_version >= "3.10" and python_version < "4.0" 18 | docstring-parser==0.16 ; python_version >= "3.10" and python_version < "4.0" 19 | exceptiongroup==1.2.2 ; python_version >= "3.10" and python_version < "3.11" 20 | filelock==3.15.4 ; python_version >= "3.10" and python_version < "4.0" 21 | frozenlist==1.4.1 ; python_version >= "3.10" and 
python_version < "4.0" 22 | fsspec==2024.6.1 ; python_version >= "3.10" and python_version < "4.0" 23 | greenlet==3.0.3 ; python_version >= "3.10" and python_version < "4.0" 24 | h11==0.14.0 ; python_version >= "3.10" and python_version < "4.0" 25 | httpcore==1.0.5 ; python_version >= "3.10" and python_version < "4.0" 26 | httpx==0.27.0 ; python_version >= "3.10" and python_version < "4.0" 27 | huggingface-hub==0.24.6 ; python_version >= "3.10" and python_version < "4.0" 28 | idna==3.7 ; python_version >= "3.10" and python_version < "4.0" 29 | importlib-metadata==8.3.0 ; python_version >= "3.10" and python_version < "4.0" 30 | iniconfig==2.0.0 ; python_version >= "3.10" and python_version < "4.0" 31 | instructor==1.4.0 ; python_version >= "3.10" and python_version < "4.0" 32 | jinja2==3.1.4 ; python_version >= "3.10" and python_version < "4.0" 33 | jiter==0.4.2 ; python_version >= "3.10" and python_version < "4.0" 34 | jmespath==1.0.1 ; python_version >= "3.10" and python_version < "4.0" 35 | joblib==1.4.2 ; python_version >= "3.10" and python_version < "4.0" 36 | jsonschema-specifications==2023.12.1 ; python_version >= "3.10" and python_version < "4.0" 37 | jsonschema==4.23.0 ; python_version >= "3.10" and python_version < "4.0" 38 | langsmith==0.1.104 ; python_version >= "3.10" and python_version < "4.0" 39 | litellm==1.43.18 ; python_version >= "3.10" and python_version < "4.0" 40 | markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0" 41 | markupsafe==2.1.5 ; python_version >= "3.10" and python_version < "4.0" 42 | mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0" 43 | multidict==6.0.5 ; python_version >= "3.10" and python_version < "4.0" 44 | nltk==3.9.1 ; python_version >= "3.10" and python_version < "4.0" 45 | openai==1.41.1 ; python_version >= "3.10" and python_version < "4.0" 46 | orjson==3.10.7 ; python_version >= "3.10" and python_version < "4.0" 47 | packaging==23.2 ; python_version >= "3.10" and python_version < 
"4.0" 48 | pdfminer-six==20231228 ; python_version >= "3.10" and python_version < "4.0" 49 | pdfplumber==0.11.2 ; python_version >= "3.10" and python_version < "4.0" 50 | pillow==10.4.0 ; python_version >= "3.10" and python_version < "4.0" 51 | playwright-stealth==1.0.6 ; python_version >= "3.10" and python_version < "4.0" 52 | playwright==1.46.0 ; python_version >= "3.10" and python_version < "4.0" 53 | pluggy==1.5.0 ; python_version >= "3.10" and python_version < "4.0" 54 | psutil==5.9.8 ; python_version >= "3.10" and python_version < "4.0" 55 | pycparser==2.22 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy" 56 | pydantic-core==2.20.1 ; python_version >= "3.10" and python_version < "4.0" 57 | pydantic==2.8.2 ; python_version >= "3.10" and python_version < "4.0" 58 | pyee==11.1.0 ; python_version >= "3.10" and python_version < "4.0" 59 | pygments==2.18.0 ; python_version >= "3.10" and python_version < "4.0" 60 | pypdfium2==4.30.0 ; python_version >= "3.10" and python_version < "4.0" 61 | pytest-base-url==2.1.0 ; python_version >= "3.10" and python_version < "4.0" 62 | pytest-playwright==0.5.1 ; python_version >= "3.10" and python_version < "4.0" 63 | pytest==8.3.2 ; python_version >= "3.10" and python_version < "4.0" 64 | python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "4.0" 65 | python-dotenv==1.0.1 ; python_version >= "3.10" and python_version < "4.0" 66 | python-json-logger==2.0.7 ; python_version >= "3.10" and python_version < "4.0" 67 | python-slugify==8.0.4 ; python_version >= "3.10" and python_version < "4.0" 68 | pyyaml==6.0.1 ; python_version >= "3.10" and python_version < "4.0" 69 | referencing==0.35.1 ; python_version >= "3.10" and python_version < "4.0" 70 | regex==2024.7.24 ; python_version >= "3.10" and python_version < "4.0" 71 | requests==2.31.0 ; python_version >= "3.10" and python_version < "4.0" 72 | rich==13.8.0 ; python_version >= "3.10" and python_version < "4.0" 
73 | rpds-py==0.20.0 ; python_version >= "3.10" and python_version < "4.0" 74 | ruff==0.5.7 ; python_version >= "3.10" and python_version < "4.0" 75 | s3transfer==0.10.2 ; python_version >= "3.10" and python_version < "4.0" 76 | setuptools==72.2.0 ; python_version >= "3.10" and python_version < "4.0" 77 | shellingham==1.5.4 ; python_version >= "3.10" and python_version < "4.0" 78 | six==1.16.0 ; python_version >= "3.10" and python_version < "4.0" 79 | sniffio==1.3.1 ; python_version >= "3.10" and python_version < "4.0" 80 | tabulate==0.9.0 ; python_version >= "3.10" and python_version < "4.0" 81 | tenacity==8.5.0 ; python_version >= "3.10" and python_version < "4.0" 82 | termcolor==2.4.0 ; python_version >= "3.10" and python_version < "4.0" 83 | text-unidecode==1.3 ; python_version >= "3.10" and python_version < "4.0" 84 | tiktoken==0.7.0 ; python_version >= "3.10" and python_version < "4.0" 85 | tokenizers==0.20.0 ; python_version >= "3.10" and python_version < "4.0" 86 | tomli==2.0.1 ; python_version >= "3.10" and python_version < "3.11" 87 | tqdm==4.66.5 ; python_version >= "3.10" and python_version < "4.0" 88 | typer==0.12.5 ; python_version >= "3.10" and python_version < "4.0" 89 | typing-extensions==4.12.2 ; python_version >= "3.10" and python_version < "4.0" 90 | urllib3==2.2.2 ; python_version >= "3.10" and python_version < "4.0" 91 | yarl==1.9.4 ; python_version >= "3.10" and python_version < "4.0" 92 | zipp==3.20.0 ; python_version >= "3.10" and python_version < "4.0" 93 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from flask import Flask, jsonify, request 4 | 5 | from agentq.__main__ import run_agent_sync 6 | from agentq.core.mcts.browser_mcts import main as run_browser_mcts 7 | 8 | app = Flask(__name__) 9 | 10 | 11 | @app.route("/execute", methods=["GET"]) 12 | def execute_command(): 13 | goal = 
request.args.get("goal") 14 | if not goal: 15 | return jsonify({"error": "No command provided"}), 400 16 | 17 | # Ensure we have an event loop 18 | try: 19 | loop = asyncio.get_event_loop() 20 | except RuntimeError: 21 | loop = asyncio.new_event_loop() 22 | asyncio.set_event_loop(loop) 23 | 24 | # Run the agent asynchronously 25 | result = run_agent_sync(command=goal) 26 | return jsonify({"result": result}) 27 | 28 | 29 | @app.route("/execute_mcts", methods=["GET"]) 30 | def run_mcts(): 31 | objective = request.args.get("goal") 32 | if not objective: 33 | return jsonify({"error": "No objective provided"}), 400 34 | 35 | # Ensure we have an event loop 36 | try: 37 | loop = asyncio.get_event_loop() 38 | except RuntimeError: 39 | loop = asyncio.new_event_loop() 40 | asyncio.set_event_loop(loop) 41 | 42 | # Run the MCTS algorithm asynchronously 43 | result = loop.run_until_complete(run_browser_mcts(objective, eval_mode=True)) 44 | return result 45 | 46 | 47 | if __name__ == "__main__": 48 | app.run(host="0.0.0.0", port=8000, threaded=False) 49 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/agent-q/6050777f833f43c36421398cb2f524ea9709c839/test/__init__.py -------------------------------------------------------------------------------- /test/run_tests.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import asyncio 3 | 4 | from test.tests_processor import run_tests 5 | 6 | if __name__ == "__main__": 7 | # Create the parser 8 | parser = argparse.ArgumentParser( 9 | description="Run test suite for specified range of test tasks." 
10 | ) 11 | 12 | # Add arguments 13 | parser.add_argument( 14 | "-s", 15 | "--take_screenshots", 16 | type=bool, 17 | default=False, 18 | help="Take screenshots after every operation performed (default: False)", 19 | ) 20 | parser.add_argument( 21 | "-wait", 22 | "--wait_time_non_headless", 23 | type=int, 24 | default=5, 25 | help="Time to wait between test tasks when running in non-headless mode (default: 10 seconds)", 26 | ) 27 | parser.add_argument( 28 | "-min", 29 | "--min_task_index", 30 | type=int, 31 | default=0, 32 | help="Minimum task index to start tests from (default: 0)", 33 | ) 34 | parser.add_argument( 35 | "-max", 36 | "--max_task_index", 37 | type=int, 38 | help="Maximum task index to end tests with, non-inclusive (default is all the tests in the file).", 39 | ) 40 | parser.add_argument( 41 | "-id", 42 | "--test_results_id", 43 | type=str, 44 | default="", 45 | help="A unique identifier for the test results. If not provided, a timestamp is used.", 46 | ) 47 | parser.add_argument( 48 | "-config", 49 | "--test_config_file", 50 | type=str, 51 | help='Path to the test configuration file. 
Default is "test/tasks/test.json" in the project root.', 52 | ) 53 | 54 | # Parse the command line arguments 55 | args = parser.parse_args() 56 | 57 | # Run the main function with the provided or default arguments, not passing browser_manager or AutoGenWrapper will cause the test processor to create new instances of them 58 | asyncio.run( 59 | run_tests( 60 | orchestrator=None, 61 | min_task_index=args.min_task_index, 62 | max_task_index=args.max_task_index, 63 | test_file=args.test_config_file, 64 | test_results_id=args.test_results_id, 65 | wait_time_non_headless=args.wait_time_non_headless, 66 | take_screenshots=args.take_screenshots, 67 | ) 68 | ) 69 | -------------------------------------------------------------------------------- /test/tasks/two_tasks.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "sites": null, 4 | "task_id": 29, 5 | "require_login": false, 6 | "storage_state": null, 7 | "start_url": "https://www.allrecipes.com/", 8 | "geolocation": null, 9 | "intent_template": "Search for a Mediterranean-style grilled fish recipe on Allrecipes that includes ingredients like olives, has at least a 4-star rating, and more than 25 reviews. Detail the ingredients, cooking method, and total time required for preparation and cooking.", 10 | "instantiation_dict": {}, 11 | "intent": "Search for a Mediterranean-style grilled fish recipe on Allrecipes that includes ingredients like olives, has at least a 4-star rating, and more than 25 reviews. 
Detail the ingredients, cooking method, and total time required for preparation and cooking.", 12 | "require_reset": false, 13 | "eval": { 14 | "eval_types": ["manual"], 15 | "reference_answers": { 16 | "manual_check": { 17 | "answer": "'Branzino Mediterranean', 36 reviews, include olive oil, , Prep Time: 15 mins, Cook Time: 25 mins, Total Time: 40 mins", 18 | "type": "possible" 19 | } 20 | }, 21 | "reference_url": null, 22 | "program_html": null 23 | }, 24 | "task_alias": "Allrecipes--29", 25 | "task_index": 1 26 | }, 27 | { 28 | "sites": null, 29 | "task_id": 72, 30 | "require_login": false, 31 | "storage_state": null, 32 | "start_url": "https://www.amazon.com/", 33 | "geolocation": null, 34 | "intent_template": "Look for a USB-C hub on Amazon compatible with MacBook Pro, featuring at least 4 ports, including HDMI and SD card reader. The price should be under $50. Select the one after sorting by Best Sellers.", 35 | "instantiation_dict": {}, 36 | "intent": "Look for a USB-C hub on Amazon compatible with MacBook Pro, featuring at least 4 ports, including HDMI and SD card reader. The price should be under $50. 
Select the one after sorting by Best Sellers.", 37 | "require_reset": false, 38 | "eval": { 39 | "eval_types": ["manual"], 40 | "reference_answers": { 41 | "manual_check": { 42 | "answer": "Hiearcool USB C Hub, USB C Multi-Port Adapter for MacBook Pro, 7IN1, include 4K HDMI USB3.0 and SD/TF Card Reader, $24.99", 43 | "type": "possible" 44 | } 45 | }, 46 | "reference_url": null, 47 | "program_html": null 48 | }, 49 | "task_alias": "Amazon--27", 50 | "task_index": 2 51 | } 52 | ] 53 | -------------------------------------------------------------------------------- /test/test_config_auditor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Any, Dict, List 4 | 5 | from agentq.config.config import PROJECT_TEST_ROOT 6 | from test.test_utils import load_config 7 | 8 | 9 | def validate_and_update_task_ids(tasks: List[Dict[str, Any]]) -> None: 10 | """Ensure that task IDs match their positions in the List and update them if necessary. 11 | 12 | Args: 13 | tasks (List[Dict[str, Any]]): The List of tasks to process. 14 | """ 15 | for index, task in enumerate(tasks): 16 | task["task_id"] = index 17 | 18 | 19 | def substitute_intent_templates(tasks: List[Dict[str, Any]]) -> None: 20 | """Substitute intent_template patterns with values from instantiation_Dict. 21 | 22 | Args: 23 | tasks (List[Dict[str, Any]]): The List of tasks to process. 24 | """ 25 | for task in tasks: 26 | if "intent_template" in task and "instantiation_Dict" in task: 27 | template = task["intent_template"] 28 | for key, value in task["instantiation_Dict"].items(): 29 | placeholder = "{{" + key + "}}" 30 | template = template.replace(placeholder, str(value)) 31 | task["intent"] = template 32 | 33 | 34 | def save_json_file(tasks: List[Dict[str, Any]], file_path: str) -> None: 35 | """Save the modified List of tasks back to a JSON file. 36 | 37 | Args: 38 | tasks (List[Dict[str, Any]]): The List of modified tasks. 
39 | file_path (str): The path to save the JSON file. 40 | """ 41 | with open(file_path, "w", encoding="utf-8") as file: 42 | json.dump(tasks, file, ensure_ascii=False, indent=4) 43 | 44 | 45 | def process_tasks(file_path: str) -> None: 46 | """Load, process, and save tasks from/to a JSON file. 47 | 48 | Args: 49 | file_path (str): The path to the JSON file containing tasks. 50 | """ 51 | tasks = load_config(file_path) 52 | validate_and_update_task_ids(tasks) 53 | substitute_intent_templates(tasks) 54 | save_json_file(tasks, file_path) 55 | 56 | 57 | if __name__ == "__main__": 58 | file_path = os.path.join(PROJECT_TEST_ROOT, "tasks", "test.json") 59 | process_tasks(file_path) 60 | -------------------------------------------------------------------------------- /test/test_tasks_formatter.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | # read the test configuration file, copy what is in task_id to task_alias and make task_id have an incremental numeric value, then save the file back to the same location 5 | def format_test_config_file(test_config_file: str): 6 | with open(test_config_file, "r") as file: 7 | tasks = json.load(file) 8 | for i, task in enumerate(tasks): 9 | if "task_alias" in task: 10 | continue 11 | 12 | task["task_alias"] = task["task_id"] 13 | task["task_id"] = i 14 | tasks[i] = task 15 | with open(test_config_file, "w") as file: 16 | json.dump(tasks, file, indent=4) 17 | 18 | def add_task_index_to_test_config_file(test_config_file: str): 19 | with open(test_config_file, "r") as file: 20 | tasks = json.load(file) 21 | for i, task in enumerate(tasks): 22 | task["task_index"] = i 23 | tasks[i] = task 24 | with open(test_config_file, "w") as file: 25 | json.dump(tasks, file, indent=4) 26 | format_test_config_file("test/tasks/webvoyager_test.json") 27 | add_task_index_to_test_config_file("test/tasks/webvoyager_test.json") 28 | 
-------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | """Implements helper functions to assist evaluation cases where other evaluators are not suitable.""" 2 | 3 | import json 4 | import os 5 | from datetime import datetime 6 | from pathlib import Path 7 | from typing import Any, Dict, List, Optional, Union 8 | 9 | from dotenv import load_dotenv 10 | from nltk.tokenize import word_tokenize # type: ignore 11 | from openai import OpenAI 12 | 13 | load_dotenv() 14 | client = OpenAI() 15 | 16 | 17 | def llm_fuzzy_match(pred: str, reference: str, question: str) -> float: 18 | """ 19 | Evaluates if a predicted answer matches a reference answer semantically, considering the context of a question. 20 | 21 | This function simulates a grading scenario, understanding that a student's answer may use different wording or phrasing from the reference answer. It uses GPT-4-turbo model to assess semantic equivalence. 22 | 23 | Parameters: 24 | pred (str): The student's predicted answer. 25 | reference (str): The reference answer to compare against. 26 | question (str): The question related to the answers. 27 | 28 | Returns: 29 | float: Returns 1.0 if the predicted answer is semantically equivalent to the reference, otherwise 0.0. 30 | """ 31 | messages: List[Dict[str, Any]] = [] 32 | # construct the question to ask 33 | message = "Help a teacher to grade the answer of a student given a question. Keep in mind that the student may use different phrasing or wording to answer the question. 
The goal is to evaluate whether the answer is semantically equivalent to the reference answer.\n" 34 | message += f"question: {question}\n" 35 | message += f"reference answer: {reference}\n" 36 | message += "all the string 'N/A' that you see is a special sequence that means 'not achievable'\n" 37 | message += f"student answer: {pred}\n" 38 | message += "Conclude the judgement by correct/incorrect/partially correct." 39 | messages = [ 40 | {"role": "system", "content": "You are a helpful assistant"}, 41 | {"role": "user", "content": message}, 42 | ] 43 | 44 | response = generate_from_openai_chat_completion( 45 | model="gpt-4-turbo-preview", 46 | messages=messages, 47 | temperature=0, 48 | max_tokens=768, 49 | top_p=1.0, 50 | context_length=0, 51 | ).lower() 52 | if "partially correct" in response or "incorrect" in response: 53 | return 0.0 54 | else: 55 | assert "correct" in response 56 | return 1.0 57 | 58 | 59 | def llm_ua_match(pred: str, reference: str, question: str) -> float: 60 | """ 61 | Evaluates the alignment between a reported reason for a task being unachievable and the actual reason. 62 | 63 | This function reviews both the actual and reported reasons for a task's unachievability within the context of the task. 64 | It assesses if the reported reason is implicitly or explicitly in line with the actual reason, using GPT-turbo model. 65 | 66 | Parameters: 67 | pred (str): The reported unachievable reason by an individual. 68 | reference (str): The actual reason why the task is unachievable. 69 | question (str): The task in question. 70 | 71 | Returns: 72 | float: Returns 1.0 if the reported reason aligns with the actual reason, otherwise 0.0. 
73 | """ 74 | messages: List[Dict[str, Any]] = [] 75 | # construct the question to ask 76 | message = "" 77 | message += f"task: {question}\n" 78 | message += f"actual unachievable reason: {reference}\n" 79 | message += f"reported unachievable reason: {pred}\n" 80 | message += ( 81 | "The task described above is inherently unachievable due to the reason specified under 'actual unachievable reason'. " 82 | "An individual previously attempted this task and was unable to complete it. They provided a reason for their failure, " 83 | "which is Listed under 'reported unachievable reason'. Your role is to review both the actual and reported reasons. " 84 | "Determine if the reported reason aligns with the actual reason, even if implicitly. " 85 | "If the stated reason is in line with the actual reason, respond with 'same'. Otherwise, respond with 'different'." 86 | ) 87 | messages = [ 88 | {"role": "system", "content": "You are a helpful assistant"}, 89 | {"role": "user", "content": message}, 90 | ] 91 | 92 | response = generate_from_openai_chat_completion( 93 | model="gpt-4-turbo-preview", 94 | messages=messages, 95 | temperature=0, 96 | max_tokens=768, 97 | top_p=1.0, 98 | context_length=0, 99 | ).lower() 100 | if "different" in response: 101 | return 0.0 102 | else: 103 | assert "same" in response 104 | return 1.0 105 | 106 | 107 | def generate_from_openai_chat_completion( 108 | messages: List[Dict[str, str]], 109 | model: str, 110 | temperature: float, 111 | max_tokens: int, 112 | top_p: float, 113 | context_length: int, 114 | stop_token: Optional[str] = None, 115 | ) -> str: 116 | """ 117 | Generates a response from OpenAI's chat completions based on a conversation constructed from a List of messages. 118 | 119 | This function makes a call to the OpenAI API using specified parameters to control the generation. 120 | It requires an API key to be set in the environment variables. 
121 | 122 | Parameters: 123 | messages (List[dict[str, str]]): A List of messages to construct the conversation context. 124 | model (str): The model name to use for generating the completion. 125 | temperature (float): Sampling temperature for generation. 126 | max_tokens (int): Maximum number of tokens to generate. 127 | top_p (float): Nucleus sampling parameter controlling the size of the probability mass to sample from. 128 | context_length (int): The maximum number of tokens from `messages` to use for context. 129 | stop_token (str, optional): A token at which to stop generating further tokens. 130 | 131 | Returns: 132 | str: The generated response as a string. 133 | 134 | Raises: 135 | ValueError: If the 'OPENAI_API_KEY' environment variable is not set. 136 | """ 137 | if "OPENAI_API_KEY" not in os.environ: 138 | raise ValueError( 139 | "OPENAI_API_KEY environment variable must be set when using OpenAI API." 140 | ) 141 | client.api_key = os.environ["OPENAI_API_KEY"] 142 | client.organization = os.environ.get("OPENAI_ORGANIZATION", "") 143 | 144 | response = client.chat.completions.create( 145 | model=model, 146 | messages=messages, # type: ignore 147 | temperature=temperature, 148 | max_tokens=max_tokens, 149 | top_p=top_p, 150 | n=1, 151 | stop=[stop_token] if stop_token else None, 152 | ) 153 | answer: str = response.choices[0].message.content # type: ignore 154 | return answer 155 | 156 | 157 | def clean_answer(answer: str) -> str: 158 | """Cleans and preprocesses the answer string for evaluation. 159 | 160 | Parameters: 161 | answer (str): The answer string to clean. 162 | 163 | Returns: 164 | str: The cleaned and lowercased answer string. 165 | """ 166 | answer = answer.strip().strip('"').strip("'").lower() 167 | return answer 168 | 169 | 170 | def evaluate_exact_match(ref: str, pred: str) -> float: 171 | """Evaluates if the predicted answer exactly matches the reference answer. 172 | 173 | Parameters: 174 | ref (str): The reference answer. 
175 | pred (str): The predicted answer. 176 | 177 | Returns: 178 | float: 1.0 if the answers match exactly, otherwise 0.0. 179 | """ 180 | return float(clean_answer(pred) == clean_answer(ref)) 181 | 182 | 183 | def evaluate_must_include(ref: str, pred: str, tokenize: bool = False) -> float: 184 | """Checks if the predicted answer includes all phrases from the reference answer. 185 | 186 | Parameters: 187 | ref (str): The reference answer containing phrases that must be included. 188 | pred (str): The predicted answer to be evaluated. 189 | tokenize (bool, optional): Tokenizes the answers before evaluation if True. Default is False. 190 | 191 | Returns: 192 | float: 1.0 if all phrases are included, otherwise 0.0. 193 | """ 194 | clean_ref = clean_answer(ref) 195 | clean_pred = clean_answer(pred) 196 | if tokenize and len(clean_ref) == 1: 197 | return float(clean_ref in word_tokenize(clean_pred)) 198 | else: 199 | return float(clean_ref in clean_pred) 200 | 201 | 202 | def evaluate_fuzzy_match(ref: str, pred: str, intent: str) -> float: 203 | """Evaluates if the predicted answer is semantically similar to the reference answer. 204 | 205 | Uses a large language model to assess similarity based on the intent of the question. 206 | 207 | Parameters: 208 | ref (str): The reference answer. 209 | pred (str): The predicted answer. 210 | intent (str): The intent or context of the question. 211 | 212 | Returns: 213 | float: 1.0 if the answers are considered semantically similar, otherwise 0.0. 214 | """ 215 | return llm_fuzzy_match(pred, ref, intent) 216 | 217 | 218 | def evaluate_ua_match(ref: str, pred: str, intent: str) -> float: 219 | """Evaluates if the predicted reason for a task being unachievable matches the reference reason. 220 | 221 | Parameters: 222 | ref (str): The reference reason why the task is unachievable. 223 | pred (str): The predicted reason reported by the model. 224 | intent (str): The intent or context of the task. 
225 | 226 | Returns: 227 | float: 1.0 if the reasons match, otherwise 0.0. 228 | """ 229 | return llm_ua_match(pred, ref, intent) 230 | 231 | 232 | def load_config(config_file: Union[Path, str]) -> List[Dict[str, Any]]: 233 | """Load the confiufiguration for the test cases 234 | 235 | Args: 236 | config_file Union[Path, str]: Path to the config file 237 | 238 | Returns: 239 | List[dict[str, Any]]: All the test cases in the config file 240 | """ 241 | with open(config_file, "r") as f: # noqa: UP015 242 | configs = json.load(f) 243 | return configs 244 | 245 | 246 | def task_config_validator(task_config: Dict[str, Any]) -> bool: 247 | # Access the attributes 248 | command = task_config.get("intent") 249 | 250 | if not command: 251 | raise ValueError( 252 | "Intent is missing in the task config file. Without it the task cannot be run." 253 | ) 254 | 255 | return True 256 | 257 | 258 | def get_formatted_current_timestamp(format: str = "%Y-%m-%d %H:%M:%S") -> str: 259 | """Get the current timestamp in the specified format. 260 | 261 | Args: 262 | format (str, optional): The format of the timestamp. Defaults to "%Y-%m-%d %H:%M:%S". 263 | 264 | Returns: 265 | str: The current timestamp in the specified format. 
266 | """ 267 | # Get the current time 268 | current_time = datetime.now() 269 | 270 | # Format the timestamp as a human-readable string 271 | timestamp_str = current_time.strftime(format) 272 | return timestamp_str 273 | -------------------------------------------------------------------------------- /test/tests_processor.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import os 4 | import time 5 | from typing import Any, Dict, List, Optional, Tuple 6 | 7 | from playwright.async_api import Page 8 | from tabulate import tabulate 9 | from termcolor import colored 10 | 11 | from agentq.config.config import PROJECT_TEST_ROOT 12 | from agentq.core.agent.agentq import AgentQ 13 | from agentq.core.agent.agentq_actor import AgentQActor 14 | from agentq.core.agent.agentq_critic import AgentQCritic 15 | from agentq.core.agent.browser_nav_agent import BrowserNavAgent 16 | from agentq.core.agent.planner_agent import PlannerAgent 17 | from agentq.core.models.models import State 18 | from agentq.core.orchestrator.orchestrator import Orchestrator 19 | from agentq.utils.logger import logger 20 | from test.evaluators import evaluator_router 21 | from test.test_utils import ( 22 | get_formatted_current_timestamp, 23 | load_config, 24 | task_config_validator, 25 | ) 26 | 27 | TEST_TASKS = os.path.join(PROJECT_TEST_ROOT, "tasks") 28 | TEST_LOGS = os.path.join(PROJECT_TEST_ROOT, "logs") 29 | TEST_RESULTS = os.path.join(PROJECT_TEST_ROOT, "results") 30 | 31 | 32 | def check_top_level_test_folders(): 33 | for folder in [TEST_LOGS, TEST_RESULTS]: 34 | if not os.path.exists(folder): 35 | os.makedirs(folder) 36 | logger.info(f"Created folder at: {folder}") 37 | 38 | 39 | def create_test_results_id(test_results_id: Optional[str], test_file: str) -> str: 40 | prefix = "test_results_for_" 41 | if test_results_id: 42 | return f"{prefix}{test_results_id}" 43 | test_file_base = os.path.basename(test_file) 44 | 
test_file_name = os.path.splitext(test_file_base)[0] 45 | return f"{prefix}{test_file_name}" 46 | 47 | 48 | def create_task_log_folders(task_id: str, test_results_id: str) -> Dict[str, str]: 49 | task_log_dir = os.path.join( 50 | TEST_LOGS, f"{test_results_id}", f"logs_for_task_{task_id}" 51 | ) 52 | task_screenshots_dir = os.path.join(task_log_dir, "snapshots") 53 | for directory in [task_log_dir, task_screenshots_dir]: 54 | if not os.path.exists(directory): 55 | os.makedirs(directory) 56 | logger.info(f"Created directory at: {directory}") 57 | return { 58 | "task_log_folder": task_log_dir, 59 | "task_screenshots_folder": task_screenshots_dir, 60 | } 61 | 62 | 63 | def create_results_dir(test_file: str, test_results_id: Optional[str]) -> str: 64 | if test_results_id: 65 | results_dir = os.path.join(TEST_RESULTS, f"results_for_{test_results_id}") 66 | else: 67 | test_file_base = os.path.basename(test_file) 68 | test_file_name = os.path.splitext(test_file_base)[0] 69 | results_dir = os.path.join( 70 | TEST_RESULTS, f"results_for_test_file_{test_file_name}" 71 | ) 72 | if not os.path.exists(results_dir): 73 | os.makedirs(results_dir) 74 | logger.info(f"Created results directory: {results_dir}") 75 | return results_dir 76 | 77 | 78 | def dump_log(task_id: str, messages: Dict[str, Any], logs_dir: str): 79 | file_name = os.path.join(logs_dir, f"execution_logs_{task_id}.json") 80 | with open(file_name, "w", encoding="utf-8") as f: 81 | json.dump(messages, f, ensure_ascii=False, indent=4) 82 | 83 | 84 | def save_test_results(test_results: List[Dict[str, Any]], test_results_id: str): 85 | file_name = os.path.join(TEST_RESULTS, f"test_results_{test_results_id}.json") 86 | with open(file_name, "w", encoding="utf-8") as f: 87 | json.dump(test_results, f, ensure_ascii=False, indent=4) 88 | logger.info(f"Test results dumped to: {file_name}") 89 | 90 | 91 | def save_individual_test_result(test_result: Dict[str, Any], results_dir: str): 92 | task_id = test_result["task_id"] 93 | 
# NOTE(review): the statements up to the first blank line are the tail of a
# function whose header precedes this excerpt; they are reproduced unchanged.
file_name = os.path.join(results_dir, f"test_result_{task_id}.json")
with open(file_name, "w", encoding="utf-8") as f:
    json.dump(test_result, f, ensure_ascii=False, indent=4)
logger.info(f"Test result for task {task_id} dumped to: {file_name}")


# Column headers shared by the per-task table and the final detailed table.
_RESULT_TABLE_HEADERS = ["Test Index", "Task ID", "Intent", "Status", "Time Taken (s)"]


def print_progress_bar(current: int, total: int, bar_length: int = 50) -> None:
    """Redraw a single-line textual progress bar on stdout.

    Args:
        current: Number of items finished so far.
        total: Total number of items. A non-positive total is rendered as a
            completed bar instead of raising ``ZeroDivisionError``.
        bar_length: Width of the bar in characters.
    """
    percent = 100.0 if total <= 0 else float(current) * 100 / total
    # The bar always shows at least the ">" head and never grows past
    # bar_length, even if current overshoots total.
    filled = min(bar_length, max(1, int(percent / 100 * bar_length)))
    arrow = "-" * (filled - 1) + ">"
    spaces = " " * (bar_length - len(arrow))
    print(f"\rProgress: [{arrow}{spaces}] {current}/{total} ({percent:.2f}%)", end="")


def determine_status_and_color(score: float) -> Tuple[str, str]:
    """Map an evaluator score to a ``(status, color)`` pair.

    A score of exactly 1 is "Pass"/"green", a negative score is
    "Skip"/"yellow", and every other score is "Fail"/"red".
    """
    if score == 1:
        return "Pass", "green"
    elif score < 0:
        return "Skip", "yellow"
    else:
        return "Fail", "red"


def _result_row(index: int, task_result: Dict[str, Any]) -> List[Any]:
    """Build one table row (matching ``_RESULT_TABLE_HEADERS``) for a result."""
    status, color = determine_status_and_color(task_result["score"])
    return [
        index,
        task_result["task_id"],
        task_result["intent"],
        colored(status, color),
        round(task_result["tct"], 2),
    ]


def print_test_result(task_result: Dict[str, Any], index: int, total: int) -> None:
    """Print a one-row grid table describing a single finished task.

    Args:
        task_result: Result dict; reads "score", "task_id", "intent", "tct".
        index: Value shown in the "Test Index" column.
        total: Unused; kept so existing callers keep working.
    """
    result_table = [_RESULT_TABLE_HEADERS, _result_row(index, task_result)]
    print("\n" + tabulate(result_table, headers="firstrow", tablefmt="grid"))


async def execute_single_task(
    task_config: Dict[str, Any],
    orchestrator: Orchestrator,
    page: Page,
    logs_dir: str,
) -> Dict[str, Any]:
    """Run one task through the orchestrator and score it with its evaluator.

    Args:
        task_config: Task definition; reads "intent", "task_id", "task_index"
            and "start_url" after passing it to ``task_config_validator``.
        orchestrator: Orchestrator used to execute the task's intent.
        page: Browser page; navigated to "start_url" when one is configured.
        logs_dir: Directory handed to ``dump_log`` for the command/result log.

    Returns:
        Dict with task metadata, timing ("tct", "start_ts", "completion_ts"),
        "last_url", "last_statement", and the evaluator's "score"/"reason".
    """
    task_config_validator(task_config)
    command = task_config.get("intent", "")
    task_id = task_config.get("task_id")
    task_index = task_config.get("task_index")
    start_url = task_config.get("start_url")
    logger.info(f"Intent: {command}, Task ID: {task_id}")

    if start_url:
        await page.goto(start_url, wait_until="load", timeout=30000)

    # Capture the start timestamp *before* running the command; previously
    # both timestamps were taken after completion and were always identical.
    start_ts = get_formatted_current_timestamp()
    start_time = time.time()
    command_exec_result = await orchestrator.execute_command(command)
    end_time = time.time()
    completion_ts = get_formatted_current_timestamp()
    elapsed = end_time - start_time

    single_task_result = {
        "task_id": task_id,
        "task_index": task_index,
        "start_url": start_url,
        "intent": str(command),
        "last_url": page.url,
        "tct": elapsed,
        "start_ts": start_ts,
        "completion_ts": completion_ts,
    }

    logger.info(f'Command "{command}" took: {round(elapsed, 2)} seconds.')
    logger.info(f"Task {task_id} completed.")

    single_task_result["last_statement"] = command_exec_result

    dump_log(
        str(task_id), {"command": command, "result": command_exec_result}, logs_dir
    )

    evaluator = evaluator_router(task_config)
    # we will use the existing client and not have another one created. thus None CDP session
    cdp_session = None
    evaluator_result = await evaluator(
        task_config=task_config,
        page=page,
        client=cdp_session,
        answer=command_exec_result,
    )

    single_task_result["score"] = evaluator_result["score"]
    single_task_result["reason"] = evaluator_result["reason"]

    return single_task_result


async def run_tests(
    orchestrator: Orchestrator,
    min_task_index: int,
    max_task_index: int,
    test_file: str = "",
    test_results_id: str = "",
    wait_time_non_headless: int = 5,
    take_screenshots: bool = True,
) -> List[Dict[str, Any]]:
    """Execute a slice of the configured tasks and print a report.

    Args:
        orchestrator: Started orchestrator whose playwright manager supplies
            the page and screenshot handling.
        min_task_index: Index of the first task to run (inclusive).
        max_task_index: Index to stop at (exclusive); a falsy value means
            "through the last task".
        test_file: Path of the task configuration file; defaults to
            ``test.json`` under ``TEST_TASKS``.
        test_results_id: Identifier passed to ``create_test_results_id``.
        wait_time_non_headless: Seconds to sleep after each task when the
            browser is not headless.
        take_screenshots: Whether the playwright manager should take
            screenshots during the run.

    Returns:
        The list of per-task result dicts, in execution order.
    """
    check_top_level_test_folders()

    if not test_file:
        test_file = os.path.join(TEST_TASKS, "test.json")

    logger.info(f"Loading test configurations from: {test_file}")
    test_configurations = load_config(test_file)
    test_results_id = create_test_results_id(test_results_id, test_file)
    results_dir = create_results_dir(test_file, test_results_id)

    page = await orchestrator.playwright_manager.get_current_page()
    test_results = []
    max_task_index = max_task_index or len(test_configurations)
    # Count the tasks actually selected: the requested range may extend past
    # the end of the configuration list, and the totals below must not.
    selected_tasks = test_configurations[min_task_index:max_task_index]
    total_tests = len(selected_tasks)

    for index, task_config in enumerate(selected_tasks, start=min_task_index):
        task_id = str(task_config.get("task_id"))
        log_folders = create_task_log_folders(task_id, test_results_id)

        orchestrator.playwright_manager.set_take_screenshots(take_screenshots)
        if take_screenshots:
            orchestrator.playwright_manager.set_screenshots_dir(
                log_folders["task_screenshots_folder"]
            )

        print_progress_bar(index - min_task_index, total_tests)
        task_result = await execute_single_task(
            task_config, orchestrator, page, log_folders["task_log_folder"]
        )
        test_results.append(task_result)
        save_individual_test_result(task_result, results_dir)
        print_test_result(task_result, index + 1, total_tests)

        if not orchestrator.playwright_manager.isheadless:
            await asyncio.sleep(wait_time_non_headless)

        await orchestrator.playwright_manager.take_screenshots("final", None)
        await orchestrator.playwright_manager.close_except_specified_tab(page)

    print_progress_bar(total_tests, total_tests)
    print("\n\nAll tests completed.")

    print("\nDetailed Test Results:")
    detailed_results_table = [_RESULT_TABLE_HEADERS]
    detailed_results_table.extend(
        _result_row(idx, result) for idx, result in enumerate(test_results, 1)
    )
    print(tabulate(detailed_results_table, headers="firstrow", tablefmt="grid"))

    # Classify with the same rule used for the per-task tables so that
    # Passed + Failed + Skipped always equals the number of tests run.
    statuses = [
        determine_status_and_color(result["score"])[0] for result in test_results
    ]
    total_time = sum(result["tct"] for result in test_results)
    average_time = round(total_time / total_tests, 2) if total_tests else 0.0

    summary_table = [
        [
            "Total Tests",
            "Passed",
            "Failed",
            "Skipped",
            "Average Time Taken (s)",
            "Total Time Taken (s)",
        ],
        [
            total_tests,
            statuses.count("Pass"),
            statuses.count("Fail"),
            statuses.count("Skip"),
            average_time,
            round(total_time, 2),
        ],
    ]

    print("\nSummary Report:")
    print(tabulate(summary_table, headers="firstrow", tablefmt="grid"))

    return test_results


# Main execution function (if needed)
async def main():
    """Start an orchestrator in eval mode, run the tests, then shut it down."""
    state_to_agent_map = {
        State.PLAN: PlannerAgent(),
        State.BROWSE: BrowserNavAgent(),
        State.AGENTQ_BASE: AgentQ(),
        State.AGENTQ_ACTOR: AgentQActor(),
        State.AGENTQ_CRITIC: AgentQCritic(),
    }
    orchestrator = Orchestrator(state_to_agent_map=state_to_agent_map, eval_mode=True)
    await orchestrator.start()
    try:
        # Runs the tasks at indices 0..28 (at most 29 tasks).
        await run_tests(orchestrator, 0, 29)
    finally:
        # Shut down even when a task raises.
        await orchestrator.shutdown()


if __name__ == "__main__":
    asyncio.run(main())