├── .gitignore ├── LICENSE ├── README.md ├── cookbook.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── sentient ├── __init__.py ├── __main__.py ├── config ├── __init__.py └── config.py ├── core ├── agent │ ├── __init__.py │ ├── agent.py │ └── base.py ├── memory │ ├── __init__.py │ └── ltm.py ├── models │ ├── __init__.py │ └── models.py ├── orchestrator │ └── orchestrator.py ├── prompts │ ├── __init__.py │ └── prompts.py ├── skills │ ├── __init__.py │ ├── click_using_selector.py │ ├── enter_text_and_click.py │ ├── enter_text_using_selector.py │ ├── get_dom_with_content_type.py │ ├── get_screenshot.py │ ├── get_url.py │ ├── get_user_input.py │ ├── open_url.py │ ├── pdf_text_extractor.py │ ├── press_key_combination.py │ └── upload_file.py └── web_driver │ ├── __init__.py │ └── playwright.py ├── task_instructions └── task_instructions.txt └── utils ├── __init__.py ├── _pydantic.py ├── cli_helper.py ├── dom_helper.py ├── dom_mutation_observer.py ├── extract_json.py ├── function_utils.py ├── get_detailed_accessibility_tree.py ├── logger.py ├── message_type.py ├── providers.py └── ui_messagetype.py /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv/ 3 | __pycache__ 4 | log_files/ 5 | logs/ 6 | .DS_STORE 7 | results/ 8 | dist/ 9 | test.py 10 | test_instructor.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Sentient Engineering 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sentient - browser controlling agents in 3 lines of code 2 | 3 | [beta] 4 | 5 | ```python 6 | from sentient import sentient 7 | import asyncio 8 | result = asyncio.run(sentient.invoke(goal="play shape of you on youtube")) 9 | ``` 10 | 11 | ### setup 12 | 13 | 1. install sentient `pip install sentient` 14 | 15 | 2. currently, you need to start chrome in dev mode - in a seaparate terminal on the port 9222. use the below commands to start the chrome instance and do necesssary logins if needed 16 | 17 | for mac, use command - 18 | 19 | ```bash 20 | sudo /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 21 | ``` 22 | 23 | to run brave browser (mac) - 24 | 25 | ```bash 26 | sudo /Applications/Brave\ Browser.app/Contents/MacOS/Brave\ Browser --remote-debugging-port=9222 --guest 27 | ``` 28 | 29 | for linux - 30 | 31 | ```bash 32 | google-chrome --remote-debugging-port=9222 33 | ``` 34 | 35 | for windows - 36 | 37 | ```bash 38 | "C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222 39 | ``` 40 | 41 | 4. setup open ai api key in a .env file or `export OPENAI_API_KEY="sk-proj-"` 42 | 43 | 5. run the agent 44 | 45 | ```python 46 | from sentient import sentient 47 | import asyncio 48 | 49 | # if you wanna run in a jupyter notebook, uncomment the following two lines : 50 | #import nest_asyncio 51 | #nest_asyncio.apply() 52 | 53 | result = asyncio.run(sentient.invoke("play shape of you on youtube")) 54 | ``` 55 | 56 | 6. note - by default we use `gpt-4o-2024-08-06` from `openai` to run sentient as it is the best performing model. you can also use other models like `gpt4o` or `gpt4o-mini` but the reliabilty may take some hit. 57 | 58 | --- 59 | 60 | ### setting custom task specific instructions 61 | 62 | you can customise the agent's behaviour by providing natural language descripition of how it should naviagate or what all things it should keep in mind while executing a particualr task. 63 | this is helpful in improving the accuracy and reliability of the agent on your specific task. 64 | 65 | ``` 66 | from sentient import sentient 67 | import asyncio 68 | 69 | custom_instructions = """ 70 | 1. Directly go to youtube.com rather than searching for the song on google! 71 | """ 72 | 73 | #use with open ai 74 | result = asyncio.run(sentient.invoke( 75 | goal="play shape of you on youtube", 76 | task_instructions=custom_instructions, 77 | provider="openai", 78 | model="gpt-4o-2024-08-06")) 79 | ``` 80 | 81 | --- 82 | 83 | ### using providers other than open ai 84 | 85 | we currently support a few providers. if you wish to have others included, please create a new issue. you can pass custom instructions in a similar fashion as shown above. you can also refer the [cookbook](cookbook.py) for seeing all examples of using sentient with various providers. 86 | 87 | > **Note** - the reliability of agent is dependent on whether the model is able to produce reliable json. we reccommend using open ai's latest gpt4o models for most tasks. claude 3.5 sonnet and some other instruction tuned models are also good. small local models might not produce reliable json - thus leading to failures more often. 88 | 89 | #### using anthropic 90 | 91 | 1. set API key - `export ANTHROPIC_API_KEY="sk-ant..."` 92 | 93 | 2. pass provider and model options to the invoke command. 94 | 95 | ```python 96 | #using with anthropic 97 | result = asyncio.run(sentient.invoke( 98 | goal="play shape of you on youtube", 99 | provider="anthropic", 100 | model="claude-3-5-sonnet-20240620")) 101 | ``` 102 | 103 | #### using ollama 104 | 105 | 1. ensure the ollama server is on. you just need to pass the name of the model. 106 | 107 | ```python 108 | #use with ollama 109 | result = asyncio.run(sentient.invoke( 110 | goal="play shape of you on youtube", 111 | provider="ollama", 112 | model="llama3")) 113 | ``` 114 | 115 | #### using groq 116 | 117 | 1. set groq API key - `export GROQ_API_KEY="gsk..."` 118 | 119 | 2. pass provider and model options to the invoke command. NOTE: only llama-3.1-70b-versatile has context window large enough to support the agent. also, the model does not produce reliable outputs. we recommend using groq only for testing purposes. 120 | 121 | ```python 122 | # use with groq models 123 | result = asyncio.run(sentient.invoke( 124 | goal="play shape of you on youtube", 125 | provider="groq", 126 | model="llama-3.1-70b-versatile")) 127 | ``` 128 | 129 | #### using together ai 130 | 131 | 1. set API key for Together AI - `export TOGETHER_API_KEY="your-api-key"` 132 | 133 | 2. pass provider and model options to the invoke command. 134 | 135 | ```python 136 | #use with together ai 137 | result = asyncio.run(sentient.invoke( 138 | goal="play shape of you on youtube", 139 | provider="together", 140 | model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo")) 141 | ``` 142 | 143 | #### using a custom open ai compatible server 144 | 145 | 1. you can use this to use any open ai api compatible server (like vllm/ ollama running on a different machine. etc) 146 | 147 | 2. set API key for your custom server - `export CUSTOM_API_KEY="your-api-key"`. fill in any random value if there is no api key needed. 148 | 149 | 3. pass in the custom base url and model name to the invoke command. 150 | 151 | ```python 152 | #use with custom server 153 | result = asyncio.run(sentient.invoke( 154 | goal="play shape of you on youtube", 155 | provider="custom", 156 | custom_base_url="http://localhost:8080/v1", 157 | model="model_name")) 158 | ``` 159 | 160 | #### using open-router 161 | 162 | 1. set API key for open router - `export OPENROUTER_API_KEY="your-api-key"` 163 | 164 | 2. we use litellm to call openrouter. so if you want to disable litellm logging - `export LITELLM_LOG="ERROR"` 165 | 166 | 3. pass provider and model options to the invoke command. model name should be passed as openrouter/your-model-name 167 | 168 | ```python 169 | # use with open-router 170 | result = asyncio.run(sentient.invoke( 171 | goal="play shape of you on youtube", 172 | provider="openrouter", 173 | model="openrouter/anthropic/claude-3.5-sonnet")) 174 | ``` 175 | -------------------------------------------------------------------------------- /cookbook.py: -------------------------------------------------------------------------------- 1 | from sentient import sentient 2 | import asyncio 3 | 4 | custom_instructions = """ 5 | 1. Directly go to youtube.com rather than searching for the song on google! 6 | """ 7 | 8 | # #use with open ai 9 | result = asyncio.run(sentient.invoke( 10 | goal="play shape of you on youtube", 11 | task_instructions=custom_instructions, 12 | provider="openai", 13 | model="gpt-4o-2024-08-06")) 14 | 15 | # #use with together ai 16 | result = asyncio.run(sentient.invoke( 17 | goal="play shape of you on youtube", 18 | task_instructions=custom_instructions, 19 | provider="together", 20 | model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo")) 21 | 22 | # #use with ollama 23 | result = asyncio.run(sentient.invoke( 24 | goal="play shape of you on youtube", 25 | task_instructions=custom_instructions, 26 | provider="ollama", 27 | model="llama3")) 28 | 29 | #using anthropic 30 | result = asyncio.run(sentient.invoke( 31 | goal="play shape of you on youtube", 32 | task_instructions=custom_instructions, 33 | provider="anthropic", 34 | model="claude-3-5-sonnet-20240620")) 35 | 36 | # use with groq models 37 | result = asyncio.run(sentient.invoke( 38 | goal="play shape of you on youtube", 39 | task_instructions=custom_instructions, 40 | provider="groq", 41 | model="llama-3.1-70b-versatile")) 42 | 43 | #using a custom endpoint (like remotely hosted vLLM/ ollama servers) 44 | #endpoint must be openai compatible 45 | result = asyncio.run(sentient.invoke( 46 | goal="play shape of you on youtube", 47 | task_instructions=custom_instructions, 48 | provider="custom", 49 | custom_base_url="http://localhost:8080/v1", 50 | model="llama3.1")) 51 | 52 | print(result) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "sentient" 3 | version = "0.1.10" 4 | description = "" 5 | authors = ["nischalj10 ", "thebhulawat "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.9,<4.0" 10 | pydantic = "^2.8.2" 11 | pytest-playwright = "^0.5.1" 12 | pdfplumber = "0.11.2" 13 | typing-extensions = "^4.12.2" 14 | ruff = "^0.5.6" 15 | setuptools = "^72.1.0" 16 | openai = "^1.40.1" 17 | boto3 = "^1.34.157" 18 | python-json-logger = "^2.0.7" 19 | aiohttp = "^3.10.2" 20 | colorama = "^0.4.6" 21 | tiktoken = "^0.7.0" 22 | termcolor = "^2.4.0" 23 | tabulate = "^0.9.0" 24 | langsmith = "^0.1.104" 25 | instructor = "1.4.2" 26 | python-dotenv = "^1.0.1" 27 | google-generativeai = "^0.8.1" 28 | groq = "^0.11.0" 29 | jsonref = "^1.1.0" 30 | eval-type-backport = "^0.2.0" 31 | anthropic = "^0.34.2" 32 | litellm = "^1.48.8" 33 | 34 | 35 | [build-system] 36 | requires = ["poetry-core"] 37 | build-backend = "poetry.core.masonry.api" 38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | agentops==0.3.10 ; python_version >= "3.10" and python_version < "4.0" 2 | aiohappyeyeballs==2.4.0 ; python_version >= "3.10" and python_version < "4.0" 3 | aiohttp==3.10.5 ; python_version >= "3.10" and python_version < "4.0" 4 | aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "4.0" 5 | annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "4.0" 6 | anyio==4.4.0 ; python_version >= "3.10" and python_version < "4.0" 7 | async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.11" 8 | attrs==24.2.0 ; python_version >= "3.10" and python_version < "4.0" 9 | boto3==1.35.1 ; python_version >= "3.10" and python_version < "4.0" 10 | botocore==1.35.1 ; python_version >= "3.10" and python_version < "4.0" 11 | certifi==2024.7.4 ; python_version >= "3.10" and python_version < "4.0" 12 | cffi==1.17.0 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy" 13 | charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "4.0" 14 | click==8.1.7 ; python_version >= "3.10" and python_version < "4.0" 15 | colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0" 16 | cryptography==43.0.0 ; python_version >= "3.10" and python_version < "4.0" 17 | distro==1.9.0 ; python_version >= "3.10" and python_version < "4.0" 18 | docstring-parser==0.16 ; python_version >= "3.10" and python_version < "4.0" 19 | exceptiongroup==1.2.2 ; python_version >= "3.10" and python_version < "3.11" 20 | filelock==3.15.4 ; python_version >= "3.10" and python_version < "4.0" 21 | frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0" 22 | fsspec==2024.6.1 ; python_version >= "3.10" and python_version < "4.0" 23 | greenlet==3.0.3 ; python_version >= "3.10" and python_version < "4.0" 24 | h11==0.14.0 ; python_version >= "3.10" and python_version < "4.0" 25 | httpcore==1.0.5 ; python_version >= "3.10" and python_version < "4.0" 26 | httpx==0.27.0 ; python_version >= "3.10" and python_version < "4.0" 27 | huggingface-hub==0.24.6 ; python_version >= "3.10" and python_version < "4.0" 28 | idna==3.7 ; python_version >= "3.10" and python_version < "4.0" 29 | importlib-metadata==8.3.0 ; python_version >= "3.10" and python_version < "4.0" 30 | iniconfig==2.0.0 ; python_version >= "3.10" and python_version < "4.0" 31 | instructor==1.4.0 ; python_version >= "3.10" and python_version < "4.0" 32 | jinja2==3.1.4 ; python_version >= "3.10" and python_version < "4.0" 33 | jiter==0.4.2 ; python_version >= "3.10" and python_version < "4.0" 34 | jmespath==1.0.1 ; python_version >= "3.10" and python_version < "4.0" 35 | joblib==1.4.2 ; python_version >= "3.10" and python_version < "4.0" 36 | jsonschema-specifications==2023.12.1 ; python_version >= "3.10" and python_version < "4.0" 37 | jsonschema==4.23.0 ; python_version >= "3.10" and python_version < "4.0" 38 | langsmith==0.1.104 ; python_version >= "3.10" and python_version < "4.0" 39 | litellm==1.43.18 ; python_version >= "3.10" and python_version < "4.0" 40 | markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0" 41 | markupsafe==2.1.5 ; python_version >= "3.10" and python_version < "4.0" 42 | mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0" 43 | multidict==6.0.5 ; python_version >= "3.10" and python_version < "4.0" 44 | nltk==3.9.1 ; python_version >= "3.10" and python_version < "4.0" 45 | openai==1.41.1 ; python_version >= "3.10" and python_version < "4.0" 46 | orjson==3.10.7 ; python_version >= "3.10" and python_version < "4.0" 47 | packaging==23.2 ; python_version >= "3.10" and python_version < "4.0" 48 | pdfminer-six==20231228 ; python_version >= "3.10" and python_version < "4.0" 49 | pdfplumber==0.11.2 ; python_version >= "3.10" and python_version < "4.0" 50 | pillow==10.4.0 ; python_version >= "3.10" and python_version < "4.0" 51 | playwright-stealth==1.0.6 ; python_version >= "3.10" and python_version < "4.0" 52 | playwright==1.46.0 ; python_version >= "3.10" and python_version < "4.0" 53 | pluggy==1.5.0 ; python_version >= "3.10" and python_version < "4.0" 54 | psutil==5.9.8 ; python_version >= "3.10" and python_version < "4.0" 55 | pycparser==2.22 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy" 56 | pydantic-core==2.20.1 ; python_version >= "3.10" and python_version < "4.0" 57 | pydantic==2.8.2 ; python_version >= "3.10" and python_version < "4.0" 58 | pyee==11.1.0 ; python_version >= "3.10" and python_version < "4.0" 59 | pygments==2.18.0 ; python_version >= "3.10" and python_version < "4.0" 60 | pypdfium2==4.30.0 ; python_version >= "3.10" and python_version < "4.0" 61 | pytest-base-url==2.1.0 ; python_version >= "3.10" and python_version < "4.0" 62 | pytest-playwright==0.5.1 ; python_version >= "3.10" and python_version < "4.0" 63 | pytest==8.3.2 ; python_version >= "3.10" and python_version < "4.0" 64 | python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "4.0" 65 | python-dotenv==1.0.1 ; python_version >= "3.10" and python_version < "4.0" 66 | python-json-logger==2.0.7 ; python_version >= "3.10" and python_version < "4.0" 67 | python-slugify==8.0.4 ; python_version >= "3.10" and python_version < "4.0" 68 | pyyaml==6.0.1 ; python_version >= "3.10" and python_version < "4.0" 69 | referencing==0.35.1 ; python_version >= "3.10" and python_version < "4.0" 70 | regex==2024.7.24 ; python_version >= "3.10" and python_version < "4.0" 71 | requests==2.31.0 ; python_version >= "3.10" and python_version < "4.0" 72 | rich==13.8.0 ; python_version >= "3.10" and python_version < "4.0" 73 | rpds-py==0.20.0 ; python_version >= "3.10" and python_version < "4.0" 74 | ruff==0.5.7 ; python_version >= "3.10" and python_version < "4.0" 75 | s3transfer==0.10.2 ; python_version >= "3.10" and python_version < "4.0" 76 | setuptools==72.2.0 ; python_version >= "3.10" and python_version < "4.0" 77 | shellingham==1.5.4 ; python_version >= "3.10" and python_version < "4.0" 78 | six==1.16.0 ; python_version >= "3.10" and python_version < "4.0" 79 | sniffio==1.3.1 ; python_version >= "3.10" and python_version < "4.0" 80 | tabulate==0.9.0 ; python_version >= "3.10" and python_version < "4.0" 81 | tenacity==8.5.0 ; python_version >= "3.10" and python_version < "4.0" 82 | termcolor==2.4.0 ; python_version >= "3.10" and python_version < "4.0" 83 | text-unidecode==1.3 ; python_version >= "3.10" and python_version < "4.0" 84 | tiktoken==0.7.0 ; python_version >= "3.10" and python_version < "4.0" 85 | tokenizers==0.20.0 ; python_version >= "3.10" and python_version < "4.0" 86 | tomli==2.0.1 ; python_version >= "3.10" and python_version < "3.11" 87 | tqdm==4.66.5 ; python_version >= "3.10" and python_version < "4.0" 88 | typer==0.12.5 ; python_version >= "3.10" and python_version < "4.0" 89 | typing-extensions==4.12.2 ; python_version >= "3.10" and python_version < "4.0" 90 | urllib3==2.2.2 ; python_version >= "3.10" and python_version < "4.0" 91 | yarl==1.9.4 ; python_version >= "3.10" and python_version < "4.0" 92 | zipp==3.20.0 ; python_version >= "3.10" and python_version < "4.0" 93 | -------------------------------------------------------------------------------- /sentient/__init__.py: -------------------------------------------------------------------------------- 1 | from sentient.core.orchestrator.orchestrator import Orchestrator 2 | from sentient.core.agent.agent import Agent 3 | from sentient.core.models.models import State 4 | from sentient.core.memory import ltm 5 | from sentient.utils.providers import get_provider 6 | 7 | class Sentient: 8 | def __init__(self): 9 | self.orchestrator = None 10 | 11 | def _create_state_to_agent_map(self, provider: str, model: str, custom_base_url: str = None): 12 | provider_instance = get_provider(provider, custom_base_url) 13 | return { 14 | State.BASE_AGENT: Agent(provider=provider_instance, model_name=model), 15 | } 16 | 17 | async def _initialize(self, provider: str, model: str, custom_base_url: str = None): 18 | if not self.orchestrator: 19 | state_to_agent_map = self._create_state_to_agent_map(provider, model, custom_base_url) 20 | self.orchestrator = Orchestrator(state_to_agent_map=state_to_agent_map) 21 | await self.orchestrator.start() 22 | 23 | async def invoke( 24 | self, 25 | goal: str, 26 | provider: str = "openai", 27 | model: str = "gpt-4o-2024-08-06", 28 | task_instructions: str = None, 29 | custom_base_url: str = None 30 | ): 31 | if task_instructions: 32 | ltm.set_task_instructions(task_instructions) 33 | await self._initialize(provider, model, custom_base_url) 34 | result = await self.orchestrator.execute_command(goal) 35 | return result 36 | 37 | async def shutdown(self): 38 | if self.orchestrator: 39 | await self.orchestrator.shutdown() 40 | 41 | sentient = Sentient() -------------------------------------------------------------------------------- /sentient/__main__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from sentient.core.agent.agent import Agent 4 | from sentient.core.models.models import State 5 | from sentient.core.orchestrator.orchestrator import Orchestrator 6 | 7 | 8 | async def main(): 9 | # Define state machine 10 | state_to_agent_map = { 11 | State.BASE_AGENT: Agent(), 12 | } 13 | 14 | orchestrator = Orchestrator(state_to_agent_map=state_to_agent_map) 15 | await orchestrator.start() 16 | 17 | 18 | if __name__ == "__main__": 19 | asyncio.run(main()) 20 | -------------------------------------------------------------------------------- /sentient/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/config/__init__.py -------------------------------------------------------------------------------- /sentient/config/config.py: -------------------------------------------------------------------------------- 1 | # config.py at the project source code root 2 | import os 3 | 4 | # Get the absolute path of the current file (config.py) 5 | CURRENT_FILE_PATH = os.path.abspath(__file__) 6 | 7 | # Get the project root directory (two levels up from config.py) 8 | PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(CURRENT_FILE_PATH))) 9 | 10 | # Define other paths relative to the project root 11 | PROJECT_SOURCE_ROOT = os.path.join(PROJECT_ROOT, "sentient") 12 | SOURCE_LOG_FOLDER_PATH = os.path.join(PROJECT_SOURCE_ROOT, "log_files") 13 | PROJECT_TEMP_PATH = os.path.join(PROJECT_SOURCE_ROOT, "temp") 14 | TASK_INSTRUCTION_PATH = os.path.join(PROJECT_SOURCE_ROOT, "task_instructions") 15 | PROJECT_TEST_ROOT = os.path.join(PROJECT_SOURCE_ROOT, "test") 16 | 17 | # Check if the log folder exists, and if not, create it 18 | if not os.path.exists(SOURCE_LOG_FOLDER_PATH): 19 | os.makedirs(SOURCE_LOG_FOLDER_PATH) 20 | print(f"Created log folder at: {SOURCE_LOG_FOLDER_PATH}") 21 | 22 | # create user prefernces folder if it does not exist 23 | if not os.path.exists(TASK_INSTRUCTION_PATH): 24 | os.makedirs(TASK_INSTRUCTION_PATH) 25 | print(f"Created task instruction folder at: {TASK_INSTRUCTION_PATH}") 26 | 27 | if not os.path.exists(PROJECT_TEMP_PATH): 28 | os.makedirs(PROJECT_TEMP_PATH) 29 | print(f"Created temp folder at: {PROJECT_TEMP_PATH}") 30 | -------------------------------------------------------------------------------- /sentient/core/agent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/agent/__init__.py -------------------------------------------------------------------------------- /sentient/core/agent/agent.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from string import Template 3 | 4 | from sentient.core.agent.base import BaseAgent 5 | from sentient.core.memory import ltm 6 | from sentient.core.models.models import AgentInput, AgentOutput 7 | from sentient.core.prompts.prompts import LLM_PROMPTS 8 | from sentient.utils.providers import LLMProvider 9 | 10 | 11 | class Agent(BaseAgent): 12 | def __init__(self, provider:LLMProvider, model_name: str): 13 | self.name = "sentient" 14 | self.ltm = None 15 | self.ltm = self.__get_ltm() 16 | self.system_prompt = self.__modify_system_prompt(self.ltm) 17 | super().__init__( 18 | name=self.name, 19 | system_prompt=self.system_prompt, 20 | input_format=AgentInput, 21 | output_format=AgentOutput, 22 | keep_message_history=False, 23 | provider=provider, 24 | model_name=model_name, 25 | ) 26 | 27 | @staticmethod 28 | def __get_ltm(): 29 | return ltm.get_task_instructions() 30 | 31 | def __modify_system_prompt(self, ltm): 32 | system_prompt: str = LLM_PROMPTS["BASE_AGENT_PROMPT"] 33 | 34 | substitutions = { 35 | "task_information": ltm if ltm is not None else "", 36 | } 37 | 38 | # Use safe_substitute to avoid KeyError 39 | system_prompt = Template(system_prompt).safe_substitute(substitutions) 40 | 41 | # Add today's day & date to the system prompt 42 | today = datetime.now() 43 | today_date = today.strftime("%d/%m/%Y") 44 | weekday = today.strftime("%A") 45 | system_prompt += f"\nToday's date is: {today_date}" 46 | system_prompt += f"\nCurrent weekday is: {weekday}" 47 | 48 | return system_prompt 49 | -------------------------------------------------------------------------------- /sentient/core/agent/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Callable, List, Optional, Tuple, Type 3 | 4 | import instructor 5 | import instructor.patch 6 | import openai 7 | from instructor import Mode 8 | from instructor.exceptions import InstructorRetryException 9 | from pydantic import BaseModel 10 | from groq import Groq 11 | from anthropic import Anthropic 12 | from litellm import completion 13 | 14 | from sentient.utils.function_utils import get_function_schema 15 | from sentient.utils.logger import logger 16 | from sentient.utils.providers import LLMProvider 17 | 18 | class BaseAgent: 19 | def __init__( 20 | self, 21 | name: str, 22 | system_prompt: str, 23 | input_format: Type[BaseModel], 24 | output_format: Type[BaseModel], 25 | tools: Optional[List[Tuple[Callable, str]]] = None, 26 | keep_message_history: bool = True, 27 | provider: LLMProvider = None, 28 | model_name: str = None, 29 | ): 30 | # Metdata 31 | self.agent_name = name 32 | 33 | # Messages 34 | self.system_prompt = system_prompt 35 | if self.system_prompt: 36 | self._initialize_messages() 37 | self.keep_message_history = keep_message_history 38 | 39 | # Input-output format 40 | self.input_format = input_format 41 | self.output_format = output_format 42 | 43 | # Llm client 44 | self.provider_name = provider.get_provider_name() 45 | self.provider = provider 46 | client_config = self.provider.get_client_config() 47 | 48 | # if self.provider_name == "google": 49 | # self.client = instructor.from_gemini( 50 | # client=genai.GenerativeModel( 51 | # model_name=model_name, 52 | # ) 53 | # ) 54 | if self.provider_name == "groq": 55 | self.client = Groq(**client_config) 56 | self.client = instructor.from_groq(self.client, mode=Mode.TOOLS) 57 | elif self.provider_name == "anthropic": 58 | self.client = instructor.from_anthropic(Anthropic()) 59 | elif self.provider_name == "openrouter": 60 | # use litellm for openrouter as instructor currently does not seem to have support for openrouter 61 | self.client = instructor.from_litellm(completion=completion) 62 | elif self.provider_name == "together": 63 | self.client = openai.Client(**client_config) 64 | self.client = instructor.from_openai(self.client, mode=Mode.JSON) 65 | else: 66 | self.client = openai.Client(**client_config) 67 | self.client = instructor.from_openai(self.client, mode=Mode.TOOLS) 68 | 69 | # Set model name 70 | self.model_name = model_name 71 | 72 | # Tools 73 | self.tools_list = [] 74 | self.executable_functions_list = {} 75 | if tools: 76 | self._initialize_tools(tools) 77 | 78 | def _initialize_tools(self, tools: List[Tuple[Callable, str]]): 79 | for func, func_desc in tools: 80 | self.tools_list.append(get_function_schema(func, description=func_desc)) 81 | self.executable_functions_list[func.__name__] = func 82 | 83 | def _initialize_messages(self): 84 | self.messages = [{"role": "user", "content": self.system_prompt}] 85 | self.messages.append( 86 | { 87 | "role": "assistant", 88 | "content": "Understood. I will properly follow the instructions given. Can you provide me with the objective and other details in JSON format?", 89 | } 90 | ) 91 | 92 | # @traceable(run_type="chain", name="agent_run") 93 | async def run( 94 | self, input_data: BaseModel, screenshot: str = None 95 | ) -> BaseModel: 96 | if not isinstance(input_data, self.input_format): 97 | raise ValueError(f"Input data must be of type {self.input_format.__name__}") 98 | 99 | # Handle message history. 100 | if not self.keep_message_history: 101 | self._initialize_messages() 102 | 103 | if screenshot: 104 | self.messages.append( 105 | { 106 | "role": "user", 107 | "content": [ 108 | { 109 | "type": "text", 110 | "text": input_data.model_dump_json( 111 | exclude={"current_page_dom", "current_page_url"} 112 | ), 113 | }, 114 | {"type": "image_url", "image_url": {"url": screenshot}}, 115 | ], 116 | } 117 | ) 118 | else: 119 | self.messages.append( 120 | { 121 | "role": "user", 122 | "content": input_data.model_dump_json( 123 | exclude={"current_page_dom", "current_page_url"} 124 | ), 125 | } 126 | ) 127 | 128 | self.messages.append( 129 | { 130 | "role": "assistant", 131 | "content": "Understood. I will properly follow the instructions given. Can you provide me with the current page DOM and URL please?", 132 | } 133 | ) 134 | 135 | # input dom and current page url in a separate message so that the LLM can pay attention to completed tasks better. *based on personal vibe check* 136 | if hasattr(input_data, "current_page_dom") and hasattr( 137 | input_data, "current_page_url" 138 | ): 139 | self.messages.append( 140 | { 141 | "role": "user", 142 | "content": f"Current page URL:\n{input_data.current_page_url}\n\n Current page DOM:\n{input_data.current_page_dom}", 143 | } 144 | ) 145 | 146 | while True: 147 | # TODO: 148 | # 1. better exeception handling and messages while calling the client 149 | # 2. remove the else block as JSON mode in instrutor won't allow us to pass in tools. 150 | # 3. add a max_turn here to prevent a inifinite fallout 151 | try: 152 | response = None 153 | if len(self.tools_list) == 0: 154 | try: 155 | response: self.output_format = self.client.chat.completions.create( 156 | model=self.model_name, 157 | messages=self.messages, 158 | response_model=self.output_format, 159 | max_retries=3, 160 | max_tokens=1000 if self.provider_name == "anthropic" else None, 161 | ) 162 | except InstructorRetryException as e: 163 | print(f"InstructorRetryException: client - {self.provider_name} model - {self.model_name}") 164 | print(f"Error: {str(e)}") 165 | print(f"Error details: {e.__dict__}") 166 | except Exception as e: 167 | print("Error in output", e) 168 | else: 169 | response = self.client.chat.completions.create( 170 | model=self.model_name, 171 | messages=self.messages, 172 | response_model=self.output_format, 173 | tool_choice="auto", 174 | tools=self.tools_list, 175 | ) 176 | 177 | assert isinstance(response, self.output_format) 178 | return response 179 | 180 | # instructor directly outputs response.choices[0].message. so we will do response_message = response 181 | # response_message = response.choices[0].message 182 | 183 | # instructor does not support funciton in JSON mode 184 | # if response_message.tool_calls: 185 | # tool_calls = response_message.tool_calls 186 | 187 | # if tool_calls: 188 | # self.messages.append(response_message) 189 | # for tool_call in tool_calls: 190 | # await self._append_tool_response(tool_call) 191 | # continue 192 | 193 | # parsed_response_content: self.output_format = response_message.parsed 194 | 195 | except AssertionError: 196 | raise TypeError( 197 | f"Expected response_message to be of type {self.output_format.__name__}, but got {type(response).__name__}") 198 | except Exception as e: 199 | logger.error(f"Unexpected error: {str(e)}") 200 | raise 201 | 202 | 203 | 204 | async def _append_tool_response(self, tool_call): 205 | function_name = tool_call.function.name 206 | function_to_call = self.executable_functions_list[function_name] 207 | function_args = json.loads(tool_call.function.arguments) 208 | try: 209 | function_response = await function_to_call(**function_args) 210 | # print(function_response) 211 | self.messages.append( 212 | { 213 | "tool_call_id": tool_call.id, 214 | "role": "tool", 215 | "name": function_name, 216 | "content": str(function_response), 217 | } 218 | ) 219 | except Exception as e: 220 | logger.error(f"Error occurred calling the tool {function_name}: {str(e)}") 221 | self.messages.append( 222 | { 223 | "tool_call_id": tool_call.id, 224 | "role": "tool", 225 | "name": function_name, 226 | "content": str( 227 | "The tool responded with an error, please try again with a different tool or modify the parameters of the tool", 228 | function_response, 229 | ), 230 | } 231 | ) 232 | -------------------------------------------------------------------------------- /sentient/core/memory/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/memory/__init__.py -------------------------------------------------------------------------------- /sentient/core/memory/ltm.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from sentient.config.config import TASK_INSTRUCTION_PATH 4 | from sentient.utils.logger import logger 5 | 6 | task_instruction_file_name = "task_instructions.txt" 7 | task_instruction_file = os.path.join( 8 | TASK_INSTRUCTION_PATH, task_instruction_file_name 9 | ) 10 | 11 | def get_task_instructions(): 12 | try: 13 | with open(task_instruction_file) as file: 14 | user_pref = file.read() 15 | logger.info(f"Task instructions loaded from: {task_instruction_file}") 16 | return user_pref 17 | except FileNotFoundError: 18 | logger.warning(f"Task instruction file not found: {task_instruction_file}") 19 | 20 | return None 21 | 22 | def set_task_instructions(instructions: str): 23 | try: 24 | # clear and write new instructions 25 | with open(task_instruction_file, 'w') as file: 26 | file.write(instructions) 27 | logger.info(f"Task instructions updated in: {task_instruction_file}") 28 | except IOError: 29 | logger.error(f"Failed to write task instructions to: {task_instruction_file}") -------------------------------------------------------------------------------- /sentient/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/models/__init__.py -------------------------------------------------------------------------------- /sentient/core/models/models.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import List, Literal, Optional, Union 3 | 4 | from pydantic import BaseModel 5 | from pydantic.fields import Field 6 | 7 | #Global 8 | class State(str, Enum): 9 | COMPLETED = "completed" 10 | BASE_AGENT = "agentq_base" 11 | 12 | 13 | class ActionType(str, Enum): 14 | CLICK = "CLICK" 15 | TYPE = "TYPE" 16 | GOTO_URL = "GOTO_URL" 17 | ENTER_TEXT_AND_CLICK = "ENTER_TEXT_AND_CLICK" 18 | 19 | 20 | class ClickAction(BaseModel): 21 | type: Literal[ActionType.CLICK] = Field( 22 | description="""Executes a click action on the element matching the given mmid attribute value. MMID is always a number. Returns Success if click was successful or appropriate error message if the element could not be clicked.""" 23 | ) 24 | mmid: int = Field( 25 | description="The mmid number of the element that needs to be clicked e.g. 114. mmid will always be a number" 26 | ) 27 | wait_before_execution: Optional[float] = Field( 28 | description="Optional wait time in seconds before executing the click event logic" 29 | ) 30 | 31 | 32 | class TypeAction(BaseModel): 33 | type: Literal[ActionType.TYPE] = Field( 34 | description="""Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else. 35 | Returns Success if text entry was successful or appropriate error message if text could not be entered.""" 36 | ) 37 | mmid: int = Field( 38 | description="The mmid number of the element that needs to be clicked e.g. 114. mmid will always be a number" 39 | ) 40 | content: str = Field( 41 | description="The text to enter in the element identified by the query_selector." 42 | ) 43 | 44 | 45 | class GotoAction(BaseModel): 46 | type: Literal[ActionType.GOTO_URL] = Field( 47 | description="Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened." 48 | ) 49 | website: str = Field( 50 | description="The URL to navigate to. Value must include the protocol (http:// or https://)." 51 | ) 52 | timeout: Optional[float] = Field( 53 | description="Additional wait time in seconds after initial load." 54 | ) 55 | 56 | class EnterTextAndClickAction(BaseModel): 57 | type: Literal[ActionType.ENTER_TEXT_AND_CLICK] = Field( 58 | description="""Enters text into a specified element and clicks another element, both identified by their mmid. Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands. Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered.""" 59 | ) 60 | text_element_mmid: int = Field( 61 | description="The mmid number of the element where the text will be entered" 62 | ) 63 | text_to_enter: str = Field( 64 | description="The text that will be entered into the element specified by text_element_mmid" 65 | ) 66 | click_element_mmid: int = Field( 67 | description="The mmid number of the element that will be clicked after text entry." 68 | ) 69 | wait_before_click_execution: Optional[float] = Field( 70 | description="Optional wait time in seconds before executing the click event logic" 71 | ) 72 | 73 | Action = Union[ 74 | ClickAction, 75 | TypeAction, 76 | GotoAction, 77 | EnterTextAndClickAction, 78 | ] 79 | 80 | 81 | class Task(BaseModel): 82 | id: int 83 | description: str 84 | url: Optional[str] = Field(default=None, description="Optional URL of the page on which task will happen") 85 | result: Optional[str] = Field(default=None, description="Optional result of the task") 86 | 87 | 88 | class TaskWithActions(BaseModel): 89 | id: int 90 | description: str 91 | actions_to_be_performed: Optional[List[Action]] = Field(default=None) 92 | result: Optional[str] = Field(default=None) 93 | 94 | 95 | class Memory(BaseModel): 96 | objective: str 97 | current_state: State 98 | plan: Optional[Union[List[Task], List[TaskWithActions]]] = Field(default=None) 99 | thought: str 100 | completed_tasks: Optional[Union[List[Task], List[TaskWithActions]]] = Field(default=None) 101 | current_task: Optional[Union[Task, TaskWithActions]] = Field(default=None) 102 | final_response: Optional[str] = Field(default=None) 103 | 104 | class Config: 105 | use_enum_values = True 106 | 107 | 108 | 109 | # Agent 110 | class AgentInput(BaseModel): 111 | objective: str 112 | completed_tasks: Optional[List[Task]] = Field(default=None) 113 | current_page_url: str 114 | current_page_dom: str 115 | 116 | class AgentOutput(BaseModel): 117 | thought: str 118 | plan: List[Task] 119 | next_task: Optional[Task] = Field(default=None, description="The next task to be executed") 120 | next_task_actions: Optional[List[Action]] = Field(default=None, description="List of actions for the next task") 121 | is_complete: bool 122 | final_response: Optional[str] = Field(default=None, description="Final response of the agent") -------------------------------------------------------------------------------- /sentient/core/orchestrator/orchestrator.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import textwrap 3 | from typing import Dict, List 4 | 5 | from colorama import Fore, init 6 | from dotenv import load_dotenv 7 | from langsmith import traceable 8 | 9 | from sentient.core.agent.base import BaseAgent 10 | from sentient.core.models.models import ( 11 | Action, 12 | ActionType, 13 | AgentInput, 14 | AgentOutput, 15 | Memory, 16 | State, 17 | Task, 18 | ) 19 | from sentient.core.skills.click_using_selector import click 20 | from sentient.core.skills.enter_text_using_selector import EnterTextEntry, entertext 21 | from sentient.core.skills.get_dom_with_content_type import get_dom_with_content_type 22 | from sentient.core.skills.get_url import geturl 23 | from sentient.core.skills.open_url import openurl 24 | from sentient.core.skills.enter_text_and_click import enter_text_and_click 25 | from sentient.core.web_driver.playwright import PlaywrightManager 26 | 27 | init(autoreset=True) 28 | 29 | 30 | class Orchestrator: 31 | def __init__( 32 | self, state_to_agent_map: Dict[State, BaseAgent], eval_mode: bool = False 33 | ): 34 | load_dotenv() 35 | self.state_to_agent_map = state_to_agent_map 36 | self.playwright_manager = PlaywrightManager() 37 | self.eval_mode = eval_mode 38 | self.shutdown_event = asyncio.Event() 39 | # self.session_id = str(uuid.uuid4()) 40 | 41 | async def start(self): 42 | print("Starting orchestrator") 43 | await self.playwright_manager.async_initialize(eval_mode=self.eval_mode) 44 | print("Browser started and ready") 45 | 46 | # if not self.eval_mode: 47 | # await self._command_loop() 48 | 49 | @classmethod 50 | async def invoke(cls, command: str): 51 | orchestrator = cls() 52 | await orchestrator.start() 53 | result = await orchestrator.execute_command(command) 54 | return result 55 | 56 | async def _command_loop(self): 57 | while not self.shutdown_event.is_set(): 58 | try: 59 | command = await self._get_user_input() 60 | if command.strip().lower() == "exit": 61 | await self.shutdown() 62 | else: 63 | await self.execute_command(command) 64 | except asyncio.CancelledError: 65 | break 66 | except Exception as e: 67 | print(f"An error occurred: {e}") 68 | 69 | async def _get_user_input(self): 70 | return await asyncio.get_event_loop().run_in_executor( 71 | None, input, "Enter your command (or type 'exit' to quit) " 72 | ) 73 | 74 | # @traceable(run_type="chain", name="execute_command") 75 | async def execute_command(self, command: str): 76 | try: 77 | # Create initial memory 78 | self.memory = Memory( 79 | objective=command, 80 | current_state=State.BASE_AGENT, 81 | plan=[], 82 | thought="", 83 | completed_tasks=[], 84 | current_task=None, 85 | final_response=None, 86 | ) 87 | print(f"Executing command {self.memory.objective}") 88 | while self.memory.current_state != State.COMPLETED: 89 | await self._handle_state() 90 | self._print_final_response() 91 | return self.memory.final_response 92 | except Exception as e: 93 | print(f"Error executing the command {self.memory.objective}: {e}") 94 | 95 | def run(self) -> Memory: 96 | while self.memory.current_state != State.COMPLETED: 97 | self._handle_state() 98 | 99 | self._print_final_response() 100 | return self.memory 101 | 102 | async def _handle_state(self): 103 | current_state = self.memory.current_state 104 | 105 | if current_state not in self.state_to_agent_map: 106 | raise ValueError(f"Unhandled state! No agent for {current_state}") 107 | 108 | if current_state == State.BASE_AGENT: 109 | await self._handle_agent() 110 | else: 111 | raise ValueError(f"Unhandled state: {current_state}") 112 | 113 | 114 | async def _handle_agent(self): 115 | agent = self.state_to_agent_map[State.BASE_AGENT] 116 | self._print_memory_and_agent(agent.name) 117 | 118 | # repesenting state with dom representation 119 | dom = await get_dom_with_content_type(content_type="all_fields") 120 | url = await geturl() 121 | 122 | input_data = AgentInput( 123 | objective=self.memory.objective, 124 | completed_tasks=self.memory.completed_tasks, 125 | current_page_url=str(url), 126 | current_page_dom=str(dom), 127 | ) 128 | 129 | try: 130 | output: AgentOutput = await agent.run(input_data) 131 | await self._update_memory_from_agent(output) 132 | print(f"{Fore.MAGENTA}Base Agent Q has updated the memory.") 133 | except Exception as e: 134 | print(f"{Fore.RED}Unexpected Error in Agent Execution:") 135 | print(str(e)) 136 | 137 | 138 | async def _update_memory_from_agent(self, agentq_output: AgentOutput): 139 | if agentq_output.is_complete: 140 | self.memory.current_state = State.COMPLETED 141 | self.memory.final_response = agentq_output.final_response 142 | elif agentq_output.next_task: 143 | self.memory.current_state = State.BASE_AGENT 144 | if agentq_output.next_task_actions: 145 | action_results = await self.handle_agent_actions( 146 | agentq_output.next_task_actions 147 | ) 148 | print("Action results:", action_results) 149 | flattened_results = "; ".join(action_results) 150 | agentq_output.next_task.result = flattened_results 151 | 152 | self.memory.completed_tasks.append(agentq_output.next_task) 153 | self.memory.plan = agentq_output.plan 154 | self.memory.thought = agentq_output.thought 155 | current_task_id = len(self.memory.completed_tasks) + 1 156 | self.memory.current_task = Task( 157 | id=current_task_id, 158 | description=agentq_output.next_task.description, 159 | url=None, 160 | result=None, 161 | ) 162 | else: 163 | raise ValueError("Planner did not provide next task or completion status") 164 | 165 | async def handle_agent_actions(self, actions: List[Action]): 166 | results = [] 167 | for action in actions: 168 | if action.type == ActionType.GOTO_URL: 169 | result = await openurl(url=action.website, timeout=action.timeout or 1) 170 | print("Action - GOTO") 171 | elif action.type == ActionType.TYPE: 172 | entry = EnterTextEntry( 173 | query_selector=f"[mmid='{action.mmid}']", text=action.content 174 | ) 175 | result = await entertext(entry) 176 | print("Action - TYPE") 177 | elif action.type == ActionType.CLICK: 178 | result = await click( 179 | selector=f"[mmid='{action.mmid}']", 180 | wait_before_execution=action.wait_before_execution or 1, 181 | ) 182 | print("Action - CLICK") 183 | elif action.type == ActionType.ENTER_TEXT_AND_CLICK: 184 | result = await enter_text_and_click( 185 | text_selector=f"[mmid='{action.text_element_mmid}']", 186 | text_to_enter=action.text_to_enter, 187 | click_selector=f"[mmid='{action.click_element_mmid}']", 188 | wait_before_click_execution=action.wait_before_click_execution 189 | or 1.5, 190 | ) 191 | print("Action - ENTER TEXT AND CLICK") 192 | else: 193 | result = f"Unsupported action type: {action.type}" 194 | 195 | results.append(result) 196 | 197 | return results 198 | 199 | async def shutdown(self): 200 | print("Shutting down orchestrator!") 201 | self.shutdown_event.set() 202 | await self.playwright_manager.stop_playwright() 203 | 204 | def _print_memory_and_agent(self, agent_type: str): 205 | print(f"{Fore.CYAN}{'='*50}") 206 | print(f"{Fore.YELLOW}Current State: {Fore.GREEN}{self.memory.current_state}") 207 | print(f"{Fore.YELLOW}Agent: {Fore.GREEN}{agent_type}") 208 | print(f"{Fore.YELLOW}Current Thought: {Fore.GREEN}{self.memory.thought}") 209 | if len(self.memory.plan) == 0: 210 | print(f"{Fore.YELLOW}Plan:{Fore.GREEN} none") 211 | else: 212 | print(f"{Fore.YELLOW}Plan:") 213 | for task in self.memory.plan: 214 | print(f"{Fore.GREEN} {task.id}. {task.description}") 215 | if self.memory.current_task: 216 | print( 217 | f"{Fore.YELLOW}Current Task: {Fore.GREEN}{self.memory.current_task.description}" 218 | ) 219 | if len(self.memory.completed_tasks) == 0: 220 | print(f"{Fore.YELLOW}Completed Tasks:{Fore.GREEN} none") 221 | else: 222 | print(f"{Fore.YELLOW}Completed Tasks:") 223 | for task in self.memory.completed_tasks: 224 | status = "✓" if task.result else " " 225 | print(f"{Fore.GREEN} [{status}] {task.id}. {task.description}") 226 | print(f"{Fore.CYAN}{'='*50}") 227 | 228 | def _print_task_result(self, task: Task): 229 | print(f"{Fore.CYAN}{'='*50}") 230 | print(f"{Fore.YELLOW}Task Completed: {Fore.GREEN}{task.description}") 231 | print(f"{Fore.YELLOW}Result:") 232 | wrapped_result = textwrap.wrap(task.result, width=80) 233 | for line in wrapped_result: 234 | print(f"{Fore.WHITE}{line}") 235 | print(f"{Fore.CYAN}{'='*50}") 236 | 237 | def _print_final_response(self): 238 | print(f"\n{Fore.GREEN}{'='*50}") 239 | print(f"{Fore.GREEN}Objective Completed!") 240 | print(f"{Fore.GREEN}{'='*50}") 241 | print(f"{Fore.YELLOW}Final Response:") 242 | wrapped_response = textwrap.wrap(self.memory.final_response, width=80) 243 | for line in wrapped_response: 244 | print(f"{Fore.WHITE}{line}") 245 | print(f"{Fore.GREEN}{'='*50}") 246 | -------------------------------------------------------------------------------- /sentient/core/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/prompts/__init__.py -------------------------------------------------------------------------------- /sentient/core/prompts/prompts.py: -------------------------------------------------------------------------------- 1 | LLM_PROMPTS = { 2 | "BASE_AGENT_PROMPT": """ 3 | You are a web automation planner. Your role is to receive an objective from the user and plan the next steps to complete the overall objective. You are part of an overall larger system where the actions you output are completed by a browser actuation system. 4 | 5 | ## Execution Flow Guidelines: ## 6 | 1. You will look at the tasks that have been done till now, their successes/ failures. If no tasks have been completed till now, that means you have to start from scratch. 7 | 2. Once you have carefully observed the completed tasks and their results, then think step by step and break down the objective into a sequence of simple tasks and come up with a plan needed to complete the overall objective. 8 | 3. Identify the next overall task and the actions that are needed to be taken on the browser to complete the next task. These actions will be given to a browser actuation system which will actually perform these actions and provide you with the result of these actions. 9 | 10 | Your input and output will strictly be a well-formatted JSON with attributes as mentioned below. 11 | 12 | Input: 13 | - objective: Mandatory string representing the main objective to be achieved via web automation 14 | - completed_tasks: Optional list of all tasks that have been completed so far in order to complete the objective. This also has the result of each of the task/action that was done previously. The result can be successful or unsuccessful. In either cases, CAREFULLY OBSERVE this array of tasks and update plan accordingly to meet the objective. 15 | - current_page_url: Mandatory string containing the URL of the current web page. 16 | - current_page_dom : Mandatory string containing a DOM represntation of the current web page. It has mmid attached to all the elements which would be helpful for you to find elements for performing actions for the next task. 17 | 18 | Output: 19 | - thought - A Mandatory string specifying your thoughts on how did you come up with the plan to solve the objective. How did you come up with the next task and why did you choose particular actions to achieve the next task. reiterate the objective here so that you can always remember what's your eventual aim. Reason deeply and think step by step to illustrate your thoughts here. 20 | - plan: Mandaory List of tasks that need be performed to achieve the objective. Think step by step. Update this based on the overall objective, tasks completed till now and their results and the current state of the webpage. You will also be provided with a DOM representation of the browser page to plan better. 21 | - next_task: Optional String representing detailed next task to be executed. Next task is consistent with the plan. This needs to be present for every response except when objective has been achieved. SEND THE next_task from the OVERALL plan. MAKE SURE to look at the provided DOM representation to adjust the appropriate next task. 22 | - next_task_actions - You have to output here a list of strings indicating the actions that need to be done in order to complete the above next task. 23 | - is_complete: Mandatory boolean indicating whether the entire objective has been achieved. Return True when the exact objective is complete without any compromises or you are absolutely convinced that the objective cannot be completed, no otherwise. This is mandatory for every response. 24 | - final_response: Optional string representing the summary of the completed work. This is to be returned only if the objective is COMPLETE. This is the final answer string that will be returned to the user. Use the plan and result to come with final response for the objective provided by the user. 25 | 26 | Format of task object: 27 | - id: Mandatory Integer representing the id of the task 28 | - description: Mandatory string representing the description of the task 29 | - url: String representing the URL on which task has been performed 30 | - result: String representing the result of the task. It should be a short summary of the actions you performed to accomplish the task, and what worked and what did not. 31 | 32 | Actions available and their description - 33 | 1. CLICK[MMID, WAIT_BEFORE_EXECUTION] - Executes a click action on the element matching the given mmid attribute value. MMID is always a number. Returns Success if click was successful or appropriate error message if the element could not be clicked. 34 | 2. TYPE[MMID, CONTENT] - Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else. Returns Success if text entry was successful or appropriate error message if text could not be entered. 35 | 3. GOTO_URL[URL, TIMEOUT] - Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened. 36 | 4. ENTER_TEXT_AND_CLICK[TEXT_ELEMENT_MMID, TEXT_TO_ENTER, CLICK_ELEMENT_MMID, WAIT_BEFORE_CLICK_EXECUTION] - This action enters text into a specified element and clicks another element, both identified by their mmid. Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands. Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered. Always prefer this dual-action skill for tasks that combine text input and element clicking to leverage its streamlined operation. 37 | 38 | ## Planning Guidelines: ## 39 | 1. If you know the direct URL, use it directly instead of searching for it (e.g. go to www.espn.com). Optimise the plan to avoid unnecessary steps. 40 | 2. Do not combine multiple tasks into one. A task should be strictly as simple as interacting with a single element or navigating to a page. If you need to interact with multiple elements or perform multiple actions, you will break it down into multiple tasks. 41 | 3. ## VERY IMPORTANT ## - Add verification as part of the plan, after each step and specifically before terminating to ensure that the task is completed successfully. Use the provided DOM or get the webpage DOM by calling an action to verify that the task at hand is completing successfully. If not, modify the plan accordingly. 42 | 4. If the task requires multiple informations, all of them are equally important and should be gathered before terminating the task. You will strive to meet all the requirements of the task. 43 | 5. If one plan fails, you MUST revise the plan and try a different approach. You will NOT terminate a task untill you are absolutely convinced that the task is impossible to accomplish. 44 | 6. Think critically if the task has been actually been achieved before doing the final termination. 45 | 7. Make sure to take into account task sepcific information. 46 | 47 | ## Web Navigation guidelines ## 48 | 1. Based on the actions you output, web navigation will be done, which may include logging into websites and interacting with any web content 49 | 2. Use the provided DOM representation for element location or text summarization. 50 | 3. Interact with pages using only the "mmid" attribute in DOM elements. mmid will always be a number. 51 | 4. Execute Actions sequentially to avoid navigation timing issues. 52 | 5. The given actions are NOT parallelizable. They are intended for sequential execution. 53 | 6. When inputing information, remember to follow the format of the input field. For example, if the input field is a date field, you will enter the date in the correct format (e.g. YYYY-MM-DD), you may get clues from the placeholder text in the input field. 54 | 7. Individual function will reply with action success and if any changes were observed as a consequence. Adjust your approach based on this feedback. 55 | 8. Ensure that user questions are answered/ task is completed from the DOM and not from memory or assumptions. 56 | 9. Do not repeat the same action multiple times if it fails. Instead, if something did not work after a few attempts, terminate the task. 57 | 10. When being asked to play a song/ video/ some other content - it is essential to know that lot of websites like youtube autoplay the content. In such cases, you should not unncessarily click play/ pause repeatedly. 58 | 11. The only way you can extract information from a webpage is by looking at the DOM already provided to you. Do NOT call any actions to try and extract information. Extract XYZ info from the webpage is NOT a valid next task or action. 59 | 60 | ## Complexities of web navigation: ## 61 | 1. Many forms have mandatory fields that need to be filled up before they can be submitted. Have a look at what fields look mandatory. 62 | 2. In many websites, there are multiple options to filter or sort results. First try to list elements on the page which will help the task (e.g. any links or interactive elements that may lead me to the support page?). 63 | 3. Always keep in mind complexities such as filtering, advanced search, sorting, and other features that may be present on the website. Use them when the task requires it. 64 | 4. Very often list of items such as, search results, list of products, list of reviews, list of people etc. may be divided into multiple pages. If you need complete information, it is critical to explicitly go through all the pages. 65 | 5. Sometimes search capabilities available on the page will not yield the optimal results. Revise the search query to either more specific or more generic. 66 | 6. When a page refreshes or navigates to a new page, information entered in the previous page may be lost. Check that the information needs to be re-entered (e.g. what are the values in source and destination on the page?). 67 | 7. Sometimes some elements may not be visible or be disabled until some other action is performed. Check if there are any other fields that may need to be interacted for elements to appear or be enabled. 68 | 8. Be extra careful with elements like date and time selectors, dropdowns, etc. because they might be made differently and dom might update differently. so make sure that once you call a function to select a date, re verify if it has actually been selected. if not, retry in another way. 69 | 70 | Example 1: 71 | Input: { 72 | "objective": "Find the cheapest premium economy flights from Helsinki to Stockholm on 15 March on Skyscanner.", 73 | "completed_tasks": [], 74 | "current_page_dom" : "{'role': 'WebArea', 'name': 'Google', 'children': [{'name': 'About', 'mmid': '26', 'tag': 'a'}, {'name': 'Store', 'mmid': '27', 'tag': 'a'}, {'name': 'Gmail ', 'mmid': '36', 'tag': 'a'}, {'name': 'Search for Images ', 'mmid': '38', 'tag': 'a'}, {'role': 'button', 'name': 'Search Labs', 'mmid': '43', 'tag': 'a'}, {'role': 'button', 'name': 'Google apps', 'mmid': '48', 'tag': 'a'}, {'role': 'button', 'name': 'Google Account: Nischal (nischalj10@gmail.com)', 'mmid': '54', 'tag': 'a', 'aria-label': 'Google Account: Nischal \\n(nischalj10@gmail.com)'}, {'role': 'link', 'name': 'Paris Games August Most Searched Playground', 'mmid': 79}, {'name': 'Share', 'mmid': '85', 'tag': 'button', 'additional_info': [{}]}, {'role': 'combobox', 'name': 'q', 'description': 'Search', 'focused': True, 'autocomplete': 'both', 'mmid': '142', 'tag': 'textarea', 'aria-label': 'Search'}, {'role': 'button', 'name': 'Search by voice', 'mmid': '154', 'tag': 'div'}, {'role': 'button', 'name': 'Search by image', 'mmid': '161', 'tag': 'div'}, {'role': 'button', 'name': 'btnK', 'description': 'Google Search', 'mmid': '303', 'tag': 'input', 'tag_type': 'submit', 'aria-label': 'Google Search'}, {'role': 'button', 'name': 'btnI', 'description': \"I'm Feeling Lucky\", 'mmid': '304', 'tag': 'input', 'tag_type': 'submit', 'aria-label': \"I'm Feeling Lucky\"}, {'role': 'text', 'name': 'Google offered in: '}, {'name': 'हिन्दी', 'mmid': '320', 'tag': 'a'}, {'name': 'বাংলা', 'mmid': '321', 'tag': 'a'}, {'name': 'తెలుగు', 'mmid': '322', 'tag': 'a'}, {'name': 'मराठी', 'mmid': '323', 'tag': 'a'}, {'name': 'தமிழ்', 'mmid': '324', 'tag': 'a'}, {'name': 'ગુજરાતી', 'mmid': '325', 'tag': 'a'}, {'name': 'ಕನ್ನಡ', 'mmid': '326', 'tag': 'a'}, {'name': 'മലയാളം', 'mmid': '327', 'tag': 'a'}, {'name': 'ਪੰਜਾਬੀ', 'mmid': '328', 'tag': 'a'}, {'role': 'text', 'name': 'India'}, {'name': 'Advertising', 'mmid': '336', 'tag': 'a'}, {'name': 'Business', 'mmid': '337', 'tag': 'a'}, {'name': 'How Search works', 'mmid': '338', 'tag': 'a'}, {'name': 'Privacy', 'mmid': '340', 'tag': 'a'}, {'name': 'Terms', 'mmid': '341', 'tag': 'a'}, {'role': 'button', 'name': 'Settings', 'mmid': '347', 'tag': 'div'}]}" 75 | } 76 | 77 | Output - 78 | { 79 | "thought" : "I see it look like the google homepage in the provided DOM representation. In order to book flight, I should go to a website like skyscanner and carry my searches over there. 80 | Once I am there, I should correctly set the origin city, destination city, day of travel, number of passengers, journey type (one way/ round trip), and seat type (premium economy) in the shown filters based on the objective. 81 | If I do not see some filters, I will try to search for them in the next step once some results are shown from initial filters. Maybe the UI of website does not provide all the filters in on go for better user experience. 82 | Post that I should see some results from skyscanner. I should also probably apply a price low to high filter if the flights are shown in a different order. If I am able to do all this, I should be able to complete the objective fairly easily. 83 | I will start with naviagting to skyscanner home page", 84 | "plan": [ 85 | {"id": 1, "description": "Go to www.skyscanner.com", "url": "https://www.skyscanner.com"}, 86 | {"id": 2, "description": "List the interaction options available on skyscanner page relevant for flight reservation along with their default values"}, 87 | {"id": 3, "description": "Select the journey option to one-way (if not default)"}, 88 | {"id": 4, "description": "Set number of passengers to 1 (if not default)"}, 89 | {"id": 5, "description": "Set the departure date to 15 March 2025"}, 90 | {"id": 6, "description": "Set ticket type to Economy Premium"}, 91 | {"id": 7, "description": "Set from airport to 'Helsinki'"}, 92 | {"id": 8, "description": "Set destination airport to Stockholm"}, 93 | {"id": 9, "description": "Confirm that current values in the source airport, destination airport and departure date fields are Helsinki, Stockholm and 15 March 2025 respectively"}, 94 | {"id": 10, "description": "Click on the search button to get the search results"}, 95 | {"id": 11, "description": "Confirm that you are on the search results page"}, 96 | {"id": 12, "description": "Extract the price of the cheapest flight from Helsinki to Stockholm from the search results"} 97 | ], 98 | "next_task" : {"id": 1, "url": null, "description": "Go to www.skyscanner.com", "result": null}, 99 | "next_task_actions" : [{"type":"GOTO_URL","website":"https://www.skyscanner.com", "timeout":"2"}], 100 | "is_complete": False, 101 | } 102 | 103 | Notice above how there is confirmation after each step and how interaction (e.g. setting source and destination) with each element is a separate step. Follow same pattern. 104 | 105 | Some task sepcific information that you MUST take into account: \n $task_information 106 | 107 | ## SOME VERY IMPORTANT POINTS TO ALWAYS REMEMBER ## 108 | 1. NEVER ASK WHAT TO DO NEXT or HOW would you like to proceed to the user. 109 | 2. ONLY do one task at a time. 110 | """, 111 | "OPEN_URL_PROMPT": """Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened.""", 112 | "ENTER_TEXT_AND_CLICK_PROMPT": """ 113 | This skill enters text into a specified element and clicks another element, both identified by their DOM selector queries. 114 | Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands. 115 | Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered. 116 | Always prefer this dual-action skill for tasks that combine text input and element clicking to leverage its streamlined operation. 117 | """, 118 | "GET_DOM_WITH_CONTENT_TYPE_PROMPT": """ 119 | Retrieves the DOM of the current web site based on the given content type. 120 | The DOM representation returned contains items ordered in the same way they appear on the page. Keep this in mind when executing user requests that contain ordinals or numbered items. 121 | text_only - returns plain text representing all the text in the web site. Use this for any information retrieval task. This will contain the most complete textual information. 122 | input_fields - returns a JSON string containing a list of objects representing text input html elements with mmid attribute. Use this strictly for interaction purposes with text input fields. 123 | all_fields - returns a JSON string containing a list of objects representing all interactive elements and their attributes with mmid attribute. Use this strictly to identify and interact with any type of elements on page. 124 | If information is not available in one content type, you must try another content_type. 125 | """, 126 | "CLICK_PROMPT": """Executes a click action on the element matching the given mmid attribute value. It is best to use mmid attribute as the selector. 127 | Returns Success if click was successful or appropriate error message if the element could not be clicked. 128 | """, 129 | "GET_URL_PROMPT": """Get the full URL of the current web page/site. If the user command seems to imply an action that would be suitable for an already open website in their browser, use this to fetch current website URL.""", 130 | "ENTER_TEXT_PROMPT": """Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else. 131 | Returns Success if text entry was successful or appropriate error message if text could not be entered. 132 | """, 133 | "BULK_ENTER_TEXT_PROMPT": """Bulk enter text in multiple DOM fields. To be used when there are multiple fields to be filled on the same page. Typically use this when you see a form to fill with multiple inputs. Make sure to have mmid from a get DOM tool before hand. 134 | Enters text in the DOM elements matching the given mmid attribute value. 135 | The input will receive a list of objects containing the DOM query selector and the text to enter. 136 | This will only enter the text and not press enter or anything else. 137 | Returns each selector and the result for attempting to enter text. 138 | """, 139 | "PRESS_KEY_COMBINATION_PROMPT": """Presses the given key on the current web page. 140 | This is useful for pressing the enter button to submit a search query, PageDown to scroll, ArrowDown to change selection in a focussed list etc. 141 | """, 142 | "EXTRACT_TEXT_FROM_PDF_PROMPT": """Extracts text from a PDF file hosted at the given URL.""", 143 | "UPLOAD_FILE_PROMPT": """This skill uploads a file on the page opened by the web browser instance""", 144 | } -------------------------------------------------------------------------------- /sentient/core/skills/__init__.py: -------------------------------------------------------------------------------- 1 | from sentient.core.skills.click_using_selector import ( 2 | click, 3 | do_click, 4 | is_element_present, 5 | perform_javascript_click, 6 | perform_playwright_click, 7 | ) 8 | from sentient.core.skills.enter_text_and_click import enter_text_and_click 9 | from sentient.core.skills.enter_text_using_selector import ( 10 | bulk_enter_text, 11 | custom_fill_element, 12 | do_entertext, 13 | ) 14 | from sentient.core.skills.get_dom_with_content_type import get_dom_with_content_type 15 | from sentient.core.skills.get_url import geturl 16 | from sentient.core.skills.get_user_input import get_user_input 17 | from sentient.core.skills.open_url import openurl 18 | from sentient.core.skills.press_key_combination import press_key_combination 19 | 20 | __all__ = ( 21 | click, 22 | do_click, 23 | is_element_present, 24 | perform_javascript_click, 25 | perform_playwright_click, 26 | enter_text_and_click, 27 | bulk_enter_text, 28 | custom_fill_element, 29 | do_entertext, 30 | get_dom_with_content_type, 31 | geturl, 32 | get_user_input, 33 | openurl, 34 | press_key_combination, 35 | ) 36 | -------------------------------------------------------------------------------- /sentient/core/skills/click_using_selector.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | import traceback 4 | from typing import Dict 5 | 6 | from playwright.async_api import ElementHandle, Page 7 | from playwright.async_api import TimeoutError as PlaywrightTimeoutError 8 | from typing_extensions import Annotated 9 | 10 | from sentient.core.web_driver.playwright import PlaywrightManager 11 | from sentient.utils.dom_mutation_observer import ( 12 | subscribe, # type: ignore 13 | unsubscribe, # type: ignore 14 | ) 15 | from sentient.utils.logger import logger 16 | 17 | async def click( 18 | selector: Annotated[ 19 | str, 20 | "The properly formed query selector string to identify the element for the click action (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector. selector mmid will always be a number", 21 | ], 22 | wait_before_execution: Annotated[ 23 | float, 24 | "Optional wait time in seconds before executing the click event logic.", 25 | float, 26 | ], 27 | ) -> Annotated[str, "A message indicating success or failure of the click."]: 28 | """ 29 | Executes a click action on the element matching the given query selector string within the currently open web page. 30 | If there is no page open, it will raise a ValueError. An optional wait time can be specified before executing the click logic. Use this to wait for the page to load especially when the last action caused the DOM/Page to load. 31 | 32 | Parameters: 33 | - selector: The query selector string to identify the element for the click action. 34 | - wait_before_execution: Optional wait time in seconds before executing the click event logic. Defaults to 0.0 seconds. 35 | 36 | Returns: 37 | - Success if the click was successful, Appropriate error message otherwise. 38 | """ 39 | logger.info(f'Executing ClickElement with "{selector}" as the selector') 40 | 41 | # Initialize PlaywrightManager and get the active browser page 42 | browser_manager = PlaywrightManager() 43 | page = await browser_manager.get_current_page() 44 | 45 | if page is None: 46 | raise ValueError("No active page found. OpenURL command opens a new page.") 47 | 48 | function_name = inspect.currentframe().f_code.co_name 49 | 50 | await browser_manager.take_screenshots(f"{function_name}_start", page) 51 | 52 | await browser_manager.highlight_element(selector, True) 53 | 54 | dom_changes_detected = None 55 | 56 | def detect_dom_changes(changes: str): 57 | nonlocal dom_changes_detected 58 | dom_changes_detected = changes 59 | 60 | subscribe(detect_dom_changes) 61 | 62 | # Wrap the click action and subsequent operations in a try-except block 63 | try: 64 | # Set up navigation expectation with a shorter timeout 65 | async with page.expect_navigation(wait_until="domcontentloaded", timeout=10000): 66 | result = await do_click(page, selector, wait_before_execution) 67 | 68 | # Wait for a short time to ensure the page has settled 69 | await asyncio.sleep(1) 70 | except PlaywrightTimeoutError: 71 | # If navigation times out, it might be a single-page app or a slow-loading page 72 | logger.warning("Navigation timeout occurred, but the click might have been successful.") 73 | result = { 74 | "summary_message": "Click executed, but no full page navigation detected", 75 | "detailed_message": "Click executed successfully, but no full page navigation was detected. This might be normal for single-page applications or slow-loading pages.", 76 | } 77 | except Exception as e: 78 | logger.error(f"Error during click operation: {e}") 79 | result = { 80 | "summary_message": "Click executed, but encountered an error", 81 | "detailed_message": f"Click executed, but encountered an error: {str(e)}", 82 | } 83 | 84 | await asyncio.sleep(0.1) # sleep for 100ms to allow the mutation observer to detect changes 85 | unsubscribe(detect_dom_changes) 86 | await browser_manager.take_screenshots(f"{function_name}_end", page) 87 | 88 | if dom_changes_detected: 89 | return f"Success: {result['summary_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action to click {selector} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction." 90 | return result["detailed_message"] 91 | 92 | async def do_click( 93 | page: Page, selector: str, wait_before_execution: float 94 | ) -> Dict[str, str]: 95 | """ 96 | Executes the click action on the element with the given selector within the provided page. 97 | 98 | Parameters: 99 | - page: The Playwright page instance. 100 | - selector: The query selector string to identify the element for the click action. 101 | - wait_before_execution: Optional wait time in seconds before executing the click event logic. 102 | 103 | Returns: 104 | Dict[str,str] - Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'. 105 | """ 106 | logger.info( 107 | f'Executing ClickElement with "{selector}" as the selector. Wait time before execution: {wait_before_execution} seconds.' 108 | ) 109 | 110 | # Wait before execution if specified 111 | if wait_before_execution > 0: 112 | await asyncio.sleep(wait_before_execution) 113 | 114 | # Wait for the selector to be present and ensure it's attached and visible. If timeout, try javascript click 115 | try: 116 | logger.info( 117 | f'Executing ClickElement with "{selector}" as the selector. Waiting for the element to be attached and visible.' 118 | ) 119 | 120 | element = await asyncio.wait_for( 121 | page.wait_for_selector(selector, state="attached", timeout=2000), 122 | timeout=2000, 123 | ) 124 | if element is None: 125 | raise ValueError(f'Element with selector: "{selector}" not found') 126 | 127 | logger.info( 128 | f'Element with selector: "{selector}" is attached. scrolling it into view if needed.' 129 | ) 130 | try: 131 | await element.scroll_into_view_if_needed(timeout=200) 132 | logger.info( 133 | f'Element with selector: "{selector}" is attached and scrolled into view. Waiting for the element to be visible.' 134 | ) 135 | except Exception: 136 | # If scrollIntoView fails, just move on, not a big deal 137 | pass 138 | 139 | try: 140 | await element.wait_for_element_state("visible", timeout=200) 141 | logger.info( 142 | f'Executing ClickElement with "{selector}" as the selector. Element is attached and visible. Clicking the element.' 143 | ) 144 | except Exception: 145 | # If the element is not visible, try to click it anyway 146 | pass 147 | 148 | element_tag_name = await element.evaluate( 149 | "element => element.tagName.toLowerCase()" 150 | ) 151 | 152 | if element_tag_name == "option": 153 | element_value = await element.get_attribute( 154 | "value" 155 | ) # get the text that is in the value of the option 156 | parent_element = await element.evaluate_handle( 157 | "element => element.parentNode" 158 | ) 159 | await parent_element.select_option(value=element_value) # type: ignore 160 | 161 | logger.info(f'Select menu option "{element_value}" selected') 162 | 163 | return { 164 | "summary_message": f'Select menu option "{element_value}" selected', 165 | "detailed_message": f'Select menu option "{element_value}" selected.', 166 | } 167 | 168 | msg = await perform_javascript_click(page, selector) 169 | return { 170 | "summary_message": msg, 171 | "detailed_message": f"{msg} Click action completed, page may have navigated.", 172 | } 173 | except Exception as e: 174 | logger.error(f'Unable to click element with selector: "{selector}". Error: {e}') 175 | traceback.print_exc() 176 | msg = f'Unable to click element with selector: "{selector}" since the selector is invalid.' 177 | return {"summary_message": msg, "detailed_message": f"{msg}. Error: {e}"} 178 | 179 | 180 | async def is_element_present(page: Page, selector: str) -> bool: 181 | """ 182 | Checks if an element is present on the page. 183 | 184 | Parameters: 185 | - page: The Playwright page instance. 186 | - selector: The query selector string to identify the element. 187 | 188 | Returns: 189 | - True if the element is present, False otherwise. 190 | """ 191 | element = await page.query_selector(selector) 192 | return element is not None 193 | 194 | 195 | async def perform_playwright_click(element: ElementHandle, selector: str): 196 | """ 197 | Performs a click action on the element using Playwright's click method. 198 | 199 | Parameters: 200 | - element: The Playwright ElementHandle instance representing the element to be clicked. 201 | - selector: The query selector string of the element. 202 | 203 | Returns: 204 | - None 205 | """ 206 | logger.info( 207 | f"Performing first Step: Playwright Click on element with selector: {selector}" 208 | ) 209 | await element.click(force=False, timeout=200) 210 | 211 | 212 | async def perform_javascript_click(page: Page, selector: str): 213 | """ 214 | Performs a click action on the element using JavaScript. 215 | 216 | Parameters: 217 | - page: The Playwright page instance. 218 | - selector: The query selector string of the element. 219 | 220 | Returns: 221 | - A string describing the result of the click action. 222 | """ 223 | js_code = """(selector) => { 224 | let element = document.querySelector(selector); 225 | 226 | if (!element) { 227 | console.log(`perform_javascript_click: Element with selector ${selector} not found`); 228 | return `perform_javascript_click: Element with selector ${selector} not found`; 229 | } 230 | 231 | if (element.tagName.toLowerCase() === "option") { 232 | let value = element.text; 233 | let parent = element.parentElement; 234 | 235 | parent.value = element.value; // Directly set the value if possible 236 | // Trigger change event if necessary 237 | let event = new Event('change', { bubbles: true }); 238 | parent.dispatchEvent(event); 239 | 240 | console.log("Select menu option", value, "selected"); 241 | return "Select menu option: "+ value+ " selected"; 242 | } 243 | else { 244 | console.log("About to click selector", selector); 245 | // If the element is a link, make it open in the same tab 246 | if (element.tagName.toLowerCase() === "a") { 247 | element.target = "_self"; 248 | // #TODO: Consider removing this in the future if it causes issues with intended new tab behavior 249 | element.removeAttribute('target'); 250 | element.removeAttribute('rel'); 251 | } 252 | let ariaExpandedBeforeClick = element.getAttribute('aria-expanded'); 253 | element.click(); 254 | let ariaExpandedAfterClick = element.getAttribute('aria-expanded'); 255 | if (ariaExpandedBeforeClick === 'false' && ariaExpandedAfterClick === 'true') { 256 | return "Executed JavaScript Click on element with selector: "+selector +". Very important: As a consequence a menu has appeared where you may need to make further selection. Very important: Get all_fields DOM to complete the action."; 257 | } 258 | return "Executed JavaScript Click on element with selector: "+selector; 259 | } 260 | }""" 261 | try: 262 | logger.info(f"Executing JavaScript click on element with selector: {selector}") 263 | result: str = await page.evaluate(js_code, selector) 264 | logger.debug(f"Executed JavaScript Click on element with selector: {selector}") 265 | return result 266 | except Exception as e: 267 | logger.error( 268 | f"Error executing JavaScript click on element with selector: {selector}. Error: {e}" 269 | ) 270 | traceback.print_exc() 271 | return f"Error executing JavaScript click: {str(e)}" 272 | -------------------------------------------------------------------------------- /sentient/core/skills/enter_text_and_click.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | 4 | from typing_extensions import Annotated 5 | 6 | from sentient.core.web_driver.playwright import PlaywrightManager 7 | from sentient.core.skills.click_using_selector import do_click 8 | from sentient.core.skills.enter_text_using_selector import do_entertext 9 | from sentient.core.skills.press_key_combination import do_press_key_combination 10 | from sentient.utils.logger import logger 11 | 12 | 13 | async def enter_text_and_click( 14 | text_selector: Annotated[ 15 | str, 16 | "The properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use mmid attribute. mmid will always be a number", 17 | ], 18 | text_to_enter: Annotated[ 19 | str, 20 | "The text that will be entered into the element specified by text_selector.", 21 | ], 22 | click_selector: Annotated[ 23 | str, 24 | "The properly formatted DOM selector query, for example [mmid='1234'], for the element that will be clicked after text entry. mmid will always be a number", 25 | ], 26 | wait_before_click_execution: Annotated[ 27 | float, "Optional wait time in seconds before executing the click.", float 28 | ], 29 | ) -> Annotated[ 30 | str, "A message indicating success or failure of the text entry and click." 31 | ]: 32 | """ 33 | Enters text into an element and then clicks on another element. 34 | 35 | Parameters: 36 | - text_selector: The selector for the element to enter text into. It should be a properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use the mmid attribute. 37 | - text_to_enter: The text to enter into the element specified by text_selector. 38 | - click_selector: The selector for the element to click. It should be a properly formatted DOM selector query, for example [mmid='1234']. 39 | - wait_before_click_execution: Optional wait time in seconds before executing the click action. Default is 0.0. 40 | 41 | Returns: 42 | - A message indicating the success or failure of the text entry and click. 43 | 44 | Raises: 45 | - ValueError: If no active page is found. The OpenURL command opens a new page. 46 | 47 | Example usage: 48 | ``` 49 | await enter_text_and_click("[mmid='1234']", "Hello, World!", "[mmid='5678']", wait_before_click_execution=1.5) 50 | ``` 51 | """ 52 | logger.info( 53 | f"Entering text '{text_to_enter}' into element with selector '{text_selector}' and then clicking element with selector '{click_selector}'." 54 | ) 55 | 56 | # Initialize PlaywrightManager and get the active browser page 57 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 58 | page = await browser_manager.get_current_page() 59 | if page is None: # type: ignore 60 | logger.error("No active page found") 61 | raise ValueError("No active page found. OpenURL command opens a new page.") 62 | 63 | await browser_manager.highlight_element(text_selector, True) 64 | 65 | function_name = inspect.currentframe().f_code.co_name # type: ignore 66 | await browser_manager.take_screenshots(f"{function_name}_start", page) 67 | 68 | text_entry_result = await do_entertext( 69 | page, text_selector, text_to_enter, use_keyboard_fill=True 70 | ) 71 | 72 | # await browser_manager.notify_user(text_entry_result["summary_message"]) 73 | if not text_entry_result["summary_message"].startswith("Success"): 74 | await browser_manager.take_screenshots(f"{function_name}_end", page) 75 | return f"Failed to enter text '{text_to_enter}' into element with selector '{text_selector}'. Check that the selctor is valid." 76 | 77 | result = text_entry_result 78 | 79 | # if the text_selector is the same as the click_selector, press the Enter key instead of clicking 80 | try: 81 | if text_selector == click_selector: 82 | do_press_key_combination_result = await do_press_key_combination( 83 | browser_manager, page, "Enter" 84 | ) 85 | if do_press_key_combination_result: 86 | result["detailed_message"] += ( 87 | f' Instead of click, pressed the Enter key successfully on element: "{click_selector}".' 88 | ) 89 | # await browser_manager.notify_user( 90 | # f'Pressed the Enter key successfully on element: "{click_selector}".', 91 | # message_type=MessageType.ACTION, 92 | # ) 93 | else: 94 | result["detailed_message"] += ( 95 | f' Clicking the same element after entering text in it, is of no value. Tried pressing the Enter key on element "{click_selector}" instead of click and failed.' 96 | ) 97 | # await browser_manager.notify_user( 98 | # 'Failed to press the Enter key on element "{click_selector}".', 99 | # message_type=MessageType.ACTION, 100 | # ) 101 | else: 102 | await browser_manager.highlight_element(click_selector, True) 103 | 104 | do_click_result = await do_click( 105 | page, click_selector, wait_before_click_execution 106 | ) 107 | result["detailed_message"] += f' {do_click_result["detailed_message"]}' 108 | 109 | await page.wait_for_load_state("domcontentloaded") # Wait for DOM content to be loaded after the action 110 | # await browser_manager.notify_user(do_click_result["summary_message"]) 111 | await asyncio.sleep(0.5) # sleep for 1 sec to allow the mutation observer to detect changes 112 | await browser_manager.take_screenshots(f"{function_name}_end", page) 113 | return result["detailed_message"] 114 | except Exception as e: 115 | error_message = f"An error occurred during the click action: {str(e)}. This may be due to page navigation." 116 | logger.error(error_message) 117 | return error_message 118 | -------------------------------------------------------------------------------- /sentient/core/skills/enter_text_using_selector.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | import traceback 4 | from dataclasses import dataclass 5 | from typing import ( 6 | Dict, 7 | List, # noqa: UP035 8 | ) 9 | 10 | from playwright.async_api import Page 11 | from typing_extensions import Annotated 12 | 13 | from sentient.core.web_driver.playwright import PlaywrightManager 14 | from sentient.core.skills.press_key_combination import press_key_combination 15 | from sentient.utils.dom_helper import get_element_outer_html 16 | from sentient.utils.dom_mutation_observer import subscribe, unsubscribe 17 | from sentient.utils.logger import logger 18 | 19 | 20 | @dataclass 21 | class EnterTextEntry: 22 | """ 23 | Represents an entry for text input. 24 | 25 | Attributes: 26 | query_selector (str): A valid DOM selector query. Use the mmid attribute. 27 | text (str): The text to enter in the element identified by the query_selector. 28 | """ 29 | 30 | query_selector: str 31 | text: str 32 | 33 | def __getitem__(self, key: str) -> str: 34 | if key == "query_selector": 35 | return self.query_selector 36 | elif key == "text": 37 | return self.text 38 | else: 39 | raise KeyError(f"{key} is not a valid key") 40 | 41 | 42 | async def custom_fill_element(page: Page, selector: str, text_to_enter: str): 43 | """ 44 | Sets the value of a DOM element to a specified text without triggering keyboard input events. 45 | 46 | This function directly sets the 'value' property of a DOM element identified by the given CSS selector, 47 | effectively changing its current value to the specified text. This approach bypasses the need for 48 | simulating keyboard typing, providing a more efficient and reliable way to fill in text fields, 49 | especially in automated testing scenarios where speed and accuracy are paramount. 50 | 51 | Args: 52 | page (Page): The Playwright Page object representing the browser tab in which the operation will be performed. 53 | selector (str): The CSS selector string used to locate the target DOM element. The function will apply the 54 | text change to the first element that matches this selector. 55 | text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten. 56 | 57 | Example: 58 | await custom_fill_element(page, '#username', 'test_user') 59 | 60 | Note: 61 | This function does not trigger input-related events (like 'input' or 'change'). If application logic 62 | relies on these events being fired, additional steps may be needed to simulate them. 63 | """ 64 | selector = f"{selector}" # Ensures the selector is treated as a string 65 | try: 66 | result = await page.evaluate( 67 | """(inputParams) => { 68 | const selector = inputParams.selector; 69 | let text_to_enter = inputParams.text_to_enter; 70 | text_to_enter = text_to_enter.trim(); 71 | const element = document.querySelector(selector); 72 | if (!element) { 73 | throw new Error(`Element not found: ${selector}`); 74 | } 75 | element.value = text_to_enter; 76 | return `Value set for ${selector}`; 77 | }""", 78 | {"selector": selector, "text_to_enter": text_to_enter}, 79 | ) 80 | logger.debug(f"custom_fill_element result: {result}") 81 | except Exception as e: 82 | logger.error(f"Error in custom_fill_element: {str(e)}") 83 | logger.error(f"Selector: {selector}, Text: {text_to_enter}") 84 | raise 85 | 86 | 87 | async def entertext( 88 | entry: Annotated[ 89 | EnterTextEntry, 90 | "An object containing 'query_selector' (DOM selector query using mmid attribute e.g. [mmid='114']) and 'text' (text to enter on the element). mmid will always be a number", 91 | ], 92 | ) -> Annotated[str, "Explanation of the outcome of this operation."]: 93 | """ 94 | Enters text into a DOM element identified by a CSS selector. 95 | 96 | This function enters the specified text into a DOM element identified by the given CSS selector. 97 | It uses the Playwright library to interact with the browser and perform the text entry operation. 98 | The function supports both direct setting of the 'value' property and simulating keyboard typing. 99 | 100 | Args: 101 | entry (EnterTextEntry): An object containing 'query_selector' (DOM selector query using mmid attribute) 102 | and 'text' (text to enter on the element). 103 | 104 | Returns: 105 | str: Explanation of the outcome of this operation. 106 | 107 | Example: 108 | entry = EnterTextEntry(query_selector='#username', text='test_user') 109 | result = await entertext(entry) 110 | 111 | Note: 112 | - The 'query_selector' should be a valid CSS selector that uniquely identifies the target element. 113 | - The 'text' parameter specifies the text to be entered into the element. 114 | - The function uses the PlaywrightManager to manage the browser instance. 115 | - If no active page is found, an error message is returned. 116 | - The function internally calls the 'do_entertext' function to perform the text entry operation. 117 | - The 'do_entertext' function applies a pulsating border effect to the target element during the operation. 118 | - The function first clears any existing text in the input field before entering the new text. 119 | - The 'use_keyboard_fill' parameter in 'do_entertext' determines whether to simulate keyboard typing or not. 120 | - If 'use_keyboard_fill' is set to True, the function uses the 'page.keyboard.type' method to enter the text. 121 | - If 'use_keyboard_fill' is set to False, the function uses the 'custom_fill_element' method to enter the text. 122 | """ 123 | logger.info(f"Entering text: {entry}") 124 | 125 | if isinstance(entry, Dict): 126 | query_selector: str = entry["query_selector"] 127 | text_to_enter: str = entry["text"] 128 | elif isinstance(entry, EnterTextEntry): 129 | query_selector: str = entry.query_selector 130 | text_to_enter: str = entry.text 131 | else: 132 | raise ValueError( 133 | "Invalid input type for 'entry'. Expected EnterTextEntry or dict." 134 | ) 135 | 136 | if not isinstance(query_selector, str) or not isinstance(text_to_enter, str): 137 | raise ValueError("query_selector and text must be strings") 138 | 139 | # logger.info( 140 | # f"######### Debug: query_selector={query_selector}, text_to_enter={text_to_enter}" 141 | # ) 142 | 143 | # Create and use the PlaywrightManager 144 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 145 | page = await browser_manager.get_current_page() 146 | if page is None: # type: ignore 147 | return "Error: No active page found. OpenURL command opens a new page." 148 | 149 | function_name = inspect.currentframe().f_code.co_name # type: ignore 150 | 151 | await browser_manager.take_screenshots(f"{function_name}_start", page) 152 | 153 | await browser_manager.highlight_element(query_selector, True) 154 | 155 | dom_changes_detected = None 156 | 157 | def detect_dom_changes(changes: str): # type: ignore 158 | nonlocal dom_changes_detected 159 | dom_changes_detected = changes # type: ignore 160 | 161 | subscribe(detect_dom_changes) 162 | 163 | # Clear existing text before entering new text 164 | # await page.evaluate(f"document.querySelector('{query_selector}').value = '';") 165 | # logger.info( 166 | # f"######### About to page.evaluate: selector={query_selector}, text={text_to_enter}" 167 | # ) 168 | await page.evaluate( 169 | """ 170 | (selector) => { 171 | const element = document.querySelector(selector); 172 | if (element) { 173 | element.value = ''; 174 | } else { 175 | console.error('Element not found:', selector); 176 | } 177 | } 178 | """, 179 | query_selector, 180 | ) 181 | # logger.info( 182 | # f"######### About to call do_entertext with: selector={query_selector}, text={text_to_enter}" 183 | # ) 184 | result = await do_entertext(page, query_selector, text_to_enter) 185 | # logger.info(f"#########do_entertext returned: {result}") 186 | await asyncio.sleep( 187 | 0.1 188 | ) # sleep for 100ms to allow the mutation observer to detect changes 189 | unsubscribe(detect_dom_changes) 190 | 191 | await browser_manager.take_screenshots(f"{function_name}_end", page) 192 | 193 | if dom_changes_detected: 194 | return f"{result['detailed_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action of entering text {text_to_enter} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction." 195 | return result["detailed_message"] 196 | 197 | 198 | async def do_entertext( 199 | page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool = True 200 | ): 201 | """ 202 | Performs the text entry operation on a DOM element. 203 | 204 | This function performs the text entry operation on a DOM element identified by the given CSS selector. 205 | It applies a pulsating border effect to the element during the operation for visual feedback. 206 | The function supports both direct setting of the 'value' property and simulating keyboard typing. 207 | 208 | Args: 209 | page (Page): The Playwright Page object representing the browser tab in which the operation will be performed. 210 | selector (str): The CSS selector string used to locate the target DOM element. 211 | text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten. 212 | use_keyboard_fill (bool, optional): Determines whether to simulate keyboard typing or not. 213 | Defaults to False. 214 | 215 | Returns: 216 | Dict[str, str]: Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'. 217 | 218 | Example: 219 | result = await do_entertext(page, '#username', 'test_user') 220 | 221 | Note: 222 | - The 'use_keyboard_fill' parameter determines whether to simulate keyboard typing or not. 223 | - If 'use_keyboard_fill' is set to True, the function uses the 'page.keyboard.type' method to enter the text. 224 | - If 'use_keyboard_fill' is set to False, the function uses the 'custom_fill_element' method to enter the text. 225 | """ 226 | try: 227 | elem = await page.query_selector(selector) 228 | 229 | if elem is None: 230 | error = f"Error: Selector {selector} not found. Unable to continue." 231 | return {"summary_message": error, "detailed_message": error} 232 | 233 | # logger.info(f"######### Found selector {selector} to enter text") 234 | element_outer_html = await get_element_outer_html(elem, page) 235 | 236 | if use_keyboard_fill: 237 | await elem.focus() 238 | await asyncio.sleep(0.1) 239 | await press_key_combination("Control+A") 240 | await asyncio.sleep(0.1) 241 | await press_key_combination("Backspace") 242 | await asyncio.sleep(0.1) 243 | logger.debug(f"Focused element with selector {selector} to enter text") 244 | # add a 100ms delay 245 | await page.keyboard.type(text_to_enter, delay=1) 246 | else: 247 | await custom_fill_element(page, selector, text_to_enter) 248 | await elem.focus() 249 | logger.info( 250 | f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}' 251 | ) 252 | success_msg = f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}' 253 | return { 254 | "summary_message": success_msg, 255 | "detailed_message": f"{success_msg} and outer HTML: {element_outer_html}.", 256 | } 257 | 258 | except Exception as e: 259 | traceback.print_exc() 260 | error = f"Error entering text in selector {selector}." 261 | # logger.info("Error in do_entertext", error) 262 | return {"summary_message": error, "detailed_message": f"{error} Error: {e}"} 263 | 264 | 265 | async def bulk_enter_text( 266 | entries: Annotated[ 267 | List[Dict[str, str]], 268 | "List of objects, each containing 'query_selector' and 'text'.", 269 | ], # noqa: UP006 270 | ) -> Annotated[ 271 | List[Dict[str, str]], 272 | "List of dictionaries, each containing 'query_selector' and the result of the operation.", 273 | ]: # noqa: UP006 274 | """ 275 | Enters text into multiple DOM elements using a bulk operation. 276 | 277 | This function enters text into multiple DOM elements using a bulk operation. 278 | It takes a list of dictionaries, where each dictionary contains a 'query_selector' and 'text' pair. 279 | The function internally calls the 'entertext' function to perform the text entry operation for each entry. 280 | 281 | Args: 282 | entries: List of objects, each containing 'query_selector' and 'text'. 283 | 284 | Returns: 285 | List of dictionaries, each containing 'query_selector' and the result of the operation. 286 | 287 | Example: 288 | entries = [ 289 | {"query_selector": "#username", "text": "test_user"}, 290 | {"query_selector": "#password", "text": "test_password"} 291 | ] 292 | results = await bulk_enter_text(entries) 293 | 294 | Note: 295 | - Each entry in the 'entries' list should be a dictionary with 'query_selector' and 'text' keys. 296 | - The result is a list of dictionaries, where each dictionary contains the 'query_selector' and the result of the operation. 297 | """ 298 | 299 | results: List[Dict[str, str]] = [] # noqa: UP006 300 | logger.info("Executing bulk Enter Text Command") 301 | for entry in entries: 302 | query_selector = entry["query_selector"] 303 | text_to_enter = entry["text"] 304 | logger.info( 305 | f"Entering text: {text_to_enter} in element with selector: {query_selector}" 306 | ) 307 | result = await entertext( 308 | EnterTextEntry(query_selector=query_selector, text=text_to_enter) 309 | ) 310 | 311 | results.append({"query_selector": query_selector, "result": result}) 312 | 313 | return results 314 | -------------------------------------------------------------------------------- /sentient/core/skills/get_dom_with_content_type.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from typing import Any, Union, Dict 4 | 5 | from playwright.async_api import Page 6 | from typing_extensions import Annotated 7 | 8 | from sentient.config.config import SOURCE_LOG_FOLDER_PATH 9 | from sentient.core.web_driver.playwright import PlaywrightManager 10 | from sentient.utils.dom_helper import wait_for_non_loading_dom_state 11 | from sentient.utils.get_detailed_accessibility_tree import do_get_accessibility_info 12 | from sentient.utils.logger import logger 13 | 14 | 15 | async def get_dom_with_content_type( 16 | content_type: Annotated[ 17 | str, 18 | "The type of content to extract: 'text_only': Extracts the innerText of the highest element in the document and responds with text, or 'input_fields': Extracts the text input and button elements in the dom.", 19 | ], 20 | ) -> Annotated[ 21 | Union[Dict[str, Any], str, None], 22 | "The output based on the specified content type.", 23 | ]: 24 | """ 25 | Retrieves and processes the DOM of the active page in a browser instance based on the specified content type. 26 | 27 | Parameters 28 | ---------- 29 | content_type : str 30 | The type of content to extract. Possible values are: 31 | - 'text_only': Extracts the innerText of the highest element in the document and responds with text. 32 | - 'input_fields': Extracts the text input and button elements in the DOM and responds with a JSON object. 33 | - 'all_fields': Extracts all the fields in the DOM and responds with a JSON object. 34 | 35 | Returns 36 | ------- 37 | Dict[str, Any] | str | None 38 | The processed content based on the specified content type. This could be: 39 | - A JSON object for 'input_fields' with just inputs. 40 | - Plain text for 'text_only'. 41 | - A minified DOM represented as a JSON object for 'all_fields'. 42 | 43 | Raises 44 | ------ 45 | ValueError 46 | If an unsupported content_type is provided. 47 | """ 48 | 49 | logger.info(f"Executing Get DOM Command based on content_type: {content_type}") 50 | start_time = time.time() 51 | # Create and use the PlaywrightManager 52 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 53 | page = await browser_manager.get_current_page() 54 | if page is None: # type: ignore 55 | raise ValueError("No active page found. OpenURL command opens a new page.") 56 | 57 | extracted_data = None 58 | await wait_for_non_loading_dom_state( 59 | page, 2000 60 | ) # wait for the DOM to be ready, non loading means external resources do not need to be loaded 61 | user_success_message = "" 62 | if content_type == "all_fields": 63 | user_success_message = "Fetched all the fields in the DOM" 64 | extracted_data = await do_get_accessibility_info(page, only_input_fields=False) 65 | elif content_type == "input_fields": 66 | logger.debug("Fetching DOM for input_fields") 67 | extracted_data = await do_get_accessibility_info(page, only_input_fields=True) 68 | if extracted_data is None: 69 | return "Could not fetch input fields. Please consider trying with content_type all_fields." 70 | user_success_message = "Fetched only input fields in the DOM" 71 | elif content_type == "text_only": 72 | # Extract text from the body or the highest-level element 73 | logger.debug("Fetching DOM for text_only") 74 | text_content = await get_filtered_text_content(page) 75 | with open( 76 | os.path.join(SOURCE_LOG_FOLDER_PATH, "text_only_dom.txt"), 77 | "w", 78 | encoding="utf-8", 79 | ) as f: 80 | f.write(text_content) 81 | extracted_data = text_content 82 | user_success_message = "Fetched the text content of the DOM" 83 | else: 84 | raise ValueError(f"Unsupported content_type: {content_type}") 85 | 86 | elapsed_time = time.time() - start_time 87 | logger.info(f"Get DOM Command executed in {elapsed_time} seconds") 88 | # await browser_manager.notify_user( 89 | # user_success_message, message_type=MessageType.ACTION 90 | # ) 91 | return extracted_data # type: ignore 92 | 93 | 94 | async def get_filtered_text_content(page: Page) -> str: 95 | text_content = await page.evaluate(""" 96 | () => { 97 | // Array of query selectors to filter out 98 | const selectorsToFilter = ['#agente-overlay']; 99 | 100 | // Store the original visibility values to revert later 101 | const originalStyles = []; 102 | 103 | // Hide the elements matching the query selectors 104 | selectorsToFilter.forEach(selector => { 105 | const elements = document.querySelectorAll(selector); 106 | elements.forEach(element => { 107 | originalStyles.push({ element: element, originalStyle: element.style.visibility }); 108 | element.style.visibility = 'hidden'; 109 | }); 110 | }); 111 | 112 | // Get the text content of the page 113 | let textContent = document?.body?.innerText || document?.documentElement?.innerText || ""; 114 | 115 | // Get all the alt text from images on the page 116 | let altTexts = Array.from(document.querySelectorAll('img')).map(img => img.alt); 117 | altTexts="Other Alt Texts in the page: " + altTexts.join(' '); 118 | 119 | // Revert the visibility changes 120 | originalStyles.forEach(entry => { 121 | entry.element.style.visibility = entry.originalStyle; 122 | }); 123 | textContent=textContent+" "+altTexts; 124 | return textContent; 125 | } 126 | """) 127 | return text_content 128 | -------------------------------------------------------------------------------- /sentient/core/skills/get_screenshot.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 3 | from typing_extensions import Annotated 4 | 5 | from sentient.core.web_driver.playwright import PlaywrightManager 6 | from sentient.utils.logger import logger 7 | 8 | 9 | async def get_screenshot() -> ( 10 | Annotated[ 11 | str, "Returns a base64 encoded screenshot of the current active web page." 12 | ] 13 | ): 14 | """ 15 | Captures and returns a base64 encoded screenshot of the current page (only the visible viewport and not the full page) 16 | 17 | Returns: 18 | - Base64 encoded string of the screenshot image. 19 | """ 20 | 21 | try: 22 | # Create and use the PlaywrightManager 23 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 24 | page = await browser_manager.get_current_page() 25 | logger.info("page {page}") 26 | 27 | if not page: 28 | logger.info("No active page found. OpenURL command opens a new page.") 29 | raise ValueError("No active page found. OpenURL command opens a new page.") 30 | 31 | await page.wait_for_load_state("domcontentloaded") 32 | 33 | # Capture the screenshot 34 | logger.info("about to capture") 35 | screenshot_bytes = await page.screenshot(full_page=False) 36 | 37 | # Encode the screenshot as base64 38 | base64_screenshot = base64.b64encode(screenshot_bytes).decode("utf-8") 39 | 40 | return f"data:image/png;base64,{base64_screenshot}" 41 | 42 | except Exception as e: 43 | raise ValueError( 44 | "Failed to capture screenshot. Make sure a page is open and accessible." 45 | ) from e 46 | -------------------------------------------------------------------------------- /sentient/core/skills/get_url.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | 3 | from sentient.core.web_driver.playwright import PlaywrightManager 4 | 5 | 6 | async def geturl() -> ( 7 | Annotated[str, "Returns the full URL of the current active web site/page."] 8 | ): 9 | """ 10 | Returns the full URL of the current page 11 | 12 | Parameters: 13 | 14 | Returns: 15 | - Full URL the browser's active page. 16 | """ 17 | 18 | try: 19 | # Create and use the PlaywrightManager 20 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 21 | page = await browser_manager.get_current_page() 22 | 23 | if not page: 24 | raise ValueError("No active page found. OpenURL command opens a new page.") 25 | 26 | await page.wait_for_load_state("domcontentloaded") 27 | 28 | # Get the URL of the current page 29 | try: 30 | title = await page.title() 31 | current_url = page.url 32 | if len(current_url) > 250: 33 | current_url = current_url[:250] + "..." 34 | return f"Current Page: {current_url}, Title: {title}" # type: ignore 35 | except: # noqa: E722 36 | current_url = page.url 37 | return f"Current Page: {current_url}" 38 | 39 | except Exception as e: 40 | raise ValueError( 41 | "No active page found. OpenURL command opens a new page." 42 | ) from e 43 | -------------------------------------------------------------------------------- /sentient/core/skills/get_user_input.py: -------------------------------------------------------------------------------- 1 | from typing import ( 2 | Dict, 3 | List, # noqa: UP035, 4 | ) 5 | 6 | from typing_extensions import Annotated 7 | 8 | from sentient.core.web_driver.playwright import PlaywrightManager 9 | from sentient.utils.cli_helper import answer_questions_over_cli 10 | 11 | 12 | async def get_user_input( 13 | questions: Annotated[ 14 | List[str], "List of questions to ask the user each one represented as a string" 15 | ], 16 | ) -> Dict[str, str]: # noqa: UP006 17 | """ 18 | Asks the user a list of questions and returns the answers in a dictionary. 19 | 20 | Parameters: 21 | - questions: A list of questions to ask the user ["What is Username?", "What is your password?"]. 22 | 23 | Returns: 24 | - Newline separated list of questions to ask the user 25 | """ 26 | 27 | answers: Dict[str, str] = {} 28 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 29 | if browser_manager.ui_manager: 30 | for question in questions: 31 | answers[question] = await browser_manager.prompt_user( 32 | f"Question: {question}" 33 | ) 34 | else: 35 | answers = await answer_questions_over_cli(questions) 36 | return answers 37 | -------------------------------------------------------------------------------- /sentient/core/skills/open_url.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | 4 | from playwright.async_api import TimeoutError as PlaywrightTimeoutError 5 | from typing_extensions import Annotated 6 | 7 | from sentient.core.web_driver.playwright import PlaywrightManager 8 | from sentient.utils.logger import logger 9 | 10 | 11 | async def openurl( 12 | url: Annotated[ 13 | str, 14 | "The URL to navigate to. Value must include the protocol (http:// or https://).", 15 | ], 16 | timeout: Annotated[int, "Additional wait time in seconds after initial load."], 17 | max_retries: Annotated[int, "Maximum number of retry attempts"] = 3, 18 | ) -> Annotated[str, "Returns the result of this request in text form"]: 19 | """ 20 | Opens a specified URL in the active browser instance. Waits for an initial load event, then waits for either 21 | the 'domcontentloaded' event or a configurable timeout, whichever comes first. 22 | 23 | Parameters: 24 | - url: The URL to navigate to. 25 | - timeout: Additional time in seconds to wait after the initial load before considering the navigation successful. 26 | - max_retries: Maximum number of retry attempts (default: 3). 27 | 28 | Returns: 29 | - URL of the new page. 30 | """ 31 | logger.info(f"Opening URL: {url}") 32 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 33 | await browser_manager.get_browser_context() 34 | page = await browser_manager.get_current_page() 35 | # Navigate to the URL with a short timeout to ensure the initial load starts 36 | function_name = inspect.currentframe().f_code.co_name # type: ignore 37 | url = ensure_protocol(url) 38 | 39 | for attempt in range(max_retries): 40 | try: 41 | await browser_manager.take_screenshots(f"{function_name}_start", page) 42 | 43 | # Use a longer timeout for navigation 44 | await page.goto( 45 | url, timeout=max(30000, timeout * 1000), wait_until="domcontentloaded" 46 | ) 47 | 48 | # Wait for network idle to ensure page is fully loaded 49 | await page.wait_for_load_state( 50 | "domcontentloaded", timeout=max(30000, timeout * 1000) 51 | ) 52 | 53 | await browser_manager.take_screenshots(f"{function_name}_end", page) 54 | 55 | title = await page.title() 56 | final_url = page.url 57 | logger.info(f"Successfully loaded page: {final_url}") 58 | return f"Page loaded: {final_url}, Title: {title}" 59 | 60 | except PlaywrightTimeoutError as e: 61 | logger.warning(f"Timeout error on attempt {attempt + 1}: {e}") 62 | if attempt == max_retries - 1: 63 | logger.error(f"Failed to load {url} after {max_retries} attempts") 64 | return f"Failed to load page: {url}. Error: Timeout after {max_retries} attempts" 65 | await asyncio.sleep(2) # Wait before retrying 66 | 67 | except Exception as e: 68 | logger.error(f"Error navigating to {url}: {e}") 69 | return f"Failed to load page: {url}. Error: {str(e)}" 70 | 71 | await browser_manager.take_screenshots(f"{function_name}_end", page) 72 | 73 | # await browser_manager.notify_user( 74 | # f"Opened URL: {url}", message_type=MessageType.ACTION 75 | # ) 76 | # Get the page title 77 | title = await page.title() 78 | url = page.url 79 | return f"Page loaded: {url}, Title: {title}" # type: ignore 80 | 81 | 82 | def ensure_protocol(url: str) -> str: 83 | """ 84 | Ensures that a URL has a protocol (http:// or https://). If it doesn't have one, 85 | https:// is added by default. 86 | 87 | Parameters: 88 | - url: The URL to check and modify if necessary. 89 | 90 | Returns: 91 | - A URL string with a protocol. 92 | """ 93 | if not url.startswith(("http://", "https://")): 94 | url = "https://" + url # Default to http if no protocol is specified 95 | logger.info( 96 | f"Added 'https://' protocol to URL because it was missing. New URL is: {url}" 97 | ) 98 | return url 99 | -------------------------------------------------------------------------------- /sentient/core/skills/pdf_text_extractor.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import httpx 4 | import pdfplumber 5 | from typing_extensions import Annotated 6 | 7 | from sentient.config.config import PROJECT_TEMP_PATH 8 | from sentient.core.web_driver.playwright import PlaywrightManager 9 | from sentient.utils.logger import logger 10 | from sentient.utils.message_type import MessageType 11 | 12 | 13 | async def extract_text_from_pdf( 14 | pdf_url: Annotated[str, "The URL of the PDF file to extract text from."], 15 | ) -> Annotated[str, "All the text found in the PDF file."]: 16 | """ 17 | Extract text from a PDF file. 18 | pdf_url: str - The URL of the PDF file to extract text from. 19 | returns: str - All the text found in the PDF. 20 | """ 21 | file_path = os.path.join( 22 | PROJECT_TEMP_PATH, "downloaded_file.pdf" 23 | ) # fixed file path for downloading the PDF 24 | 25 | try: 26 | # Create and use the PlaywrightManager 27 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 28 | 29 | # Download the PDF 30 | download_result = await download_pdf(pdf_url, file_path) 31 | if not os.path.exists(download_result): 32 | return download_result # Return error message if download failed 33 | 34 | # Open the PDF using pdfplumber and extract text 35 | text = "" 36 | with pdfplumber.open(download_result) as pdf: 37 | for page in pdf.pages: 38 | page_text = page.extract_text() 39 | if page_text: 40 | text += page_text + "\n" 41 | extracted_text = text.strip() 42 | word_count = len(extracted_text.split()) 43 | await browser_manager.notify_user( 44 | f"Extracted text from the PDF successfully. Found {word_count} words.", 45 | message_type=MessageType.ACTION, 46 | ) 47 | return "Text found in the PDF:\n" + extracted_text 48 | except httpx.HTTPStatusError as e: 49 | logger.error( 50 | f"An error occurred while downloading the PDF from {pdf_url}: {str(e)}" 51 | ) 52 | return f"An error occurred while downloading the PDF: {str(e)}" 53 | except Exception as e: 54 | logger.error( 55 | f"An error occurred while extracting text from the PDF that was downloaded from {pdf_url}: {str(e)}" 56 | ) 57 | return f"An error occurred while extracting text: {str(e)}" 58 | finally: 59 | # Cleanup: Ensure the downloaded file is removed 60 | cleanup_temp_files(file_path) 61 | 62 | 63 | def cleanup_temp_files(*file_paths: str) -> None: 64 | """ 65 | Remove the specified temporary files. 66 | 67 | *file_paths: str - One or more file paths to be removed. 68 | """ 69 | for file_path in file_paths: 70 | if os.path.exists(file_path): 71 | try: 72 | os.remove(file_path) 73 | logger.debug(f"Cleaned file from the filesystem: {file_path}") 74 | except Exception as e: 75 | logger.error(f"Failed to remove {file_path}: {str(e)}") 76 | else: 77 | logger.debug( 78 | f"File not found. Unable to clean it from the filesystem: {file_path}" 79 | ) 80 | 81 | 82 | async def download_pdf(pdf_url: str, file_path: str) -> str: 83 | """ 84 | Download the PDF file from the given URL and save it to the specified path. 85 | 86 | pdf_url: str - The URL of the PDF file to download. 87 | file_path: str - The local path to save the downloaded PDF. 88 | 89 | returns: str - The file path of the downloaded PDF if successful, otherwise an error message. 90 | raises: Exception - If an error occurs during the download process. 91 | """ 92 | try: 93 | logger.info(f"Downloading PDF from: {pdf_url} to: {file_path}") 94 | async with httpx.AsyncClient() as client: 95 | response = await client.get(pdf_url) 96 | response.raise_for_status() # Ensure the request was successful 97 | with open(file_path, "wb") as pdf_file: 98 | pdf_file.write(response.content) 99 | return file_path 100 | # except httpx.HTTPStatusError as e: 101 | # raise e 102 | except Exception as e: 103 | raise e 104 | -------------------------------------------------------------------------------- /sentient/core/skills/press_key_combination.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import inspect 3 | 4 | from playwright.async_api import Page # type: ignore 5 | from typing_extensions import Annotated 6 | 7 | from sentient.core.web_driver.playwright import PlaywrightManager 8 | from sentient.utils.dom_mutation_observer import ( 9 | subscribe, # type: ignore 10 | unsubscribe, # type: ignore 11 | ) 12 | from sentient.utils.logger import logger 13 | 14 | 15 | async def press_key_combination( 16 | key_combination: Annotated[str, "The key to press, e.g., Enter, PageDown etc"], 17 | ) -> str: 18 | """ 19 | Presses a key combination on the current active page managed by PlaywrightManager. 20 | 21 | This function simulates the pressing of a key or a combination of keys on the current active web page. 22 | The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination. 23 | For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows. 24 | 25 | Parameters: 26 | - key_combination (Annotated[str, "The key combination to press, e.g., 'Control+C'."]): The key combination to press, represented as a string. For combinations, use '+' as a separator. 27 | 28 | Raises: 29 | - ValueError: If no active page is found. 30 | 31 | Returns: 32 | str: status of the operation expressed as a string 33 | """ 34 | 35 | logger.info(f"Executing press_key_combination with key combo: {key_combination}") 36 | # Create and use the PlaywrightManager 37 | browser_manager = PlaywrightManager() 38 | page = await browser_manager.get_current_page() 39 | 40 | if page is None: # type: ignore 41 | raise ValueError("No active page found. OpenURL command opens a new page.") 42 | 43 | # Split the key combination if it's a combination of keys 44 | keys = key_combination.split("+") 45 | 46 | dom_changes_detected = None 47 | 48 | def detect_dom_changes(changes: str): # type: ignore 49 | nonlocal dom_changes_detected 50 | dom_changes_detected = changes # type: ignore 51 | 52 | subscribe(detect_dom_changes) 53 | # If it's a combination, hold down the modifier keys 54 | for key in keys[:-1]: # All keys except the last one are considered modifier keys 55 | await page.keyboard.down(key) 56 | 57 | # Press the last key in the combination 58 | await page.keyboard.press(keys[-1]) 59 | 60 | # Release the modifier keys 61 | for key in keys[:-1]: 62 | await page.keyboard.up(key) 63 | await asyncio.sleep( 64 | 0.1 65 | ) # sleep for 100ms to allow the mutation observer to detect changes 66 | unsubscribe(detect_dom_changes) 67 | 68 | if dom_changes_detected: 69 | return f"Key {key_combination} executed successfully.\n As a consequence of this action, new elements have appeared in view:{dom_changes_detected}. This means that the action is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction." 70 | 71 | # await browser_manager.notify_user( 72 | # f"Key {key_combination} executed successfully", message_type=MessageType.ACTION 73 | # ) 74 | return f"Key {key_combination} executed successfully" 75 | 76 | 77 | async def do_press_key_combination( 78 | browser_manager: PlaywrightManager, page: Page, key_combination: str 79 | ) -> bool: 80 | """ 81 | Presses a key combination on the provided page. 82 | 83 | This function simulates the pressing of a key or a combination of keys on a web page. 84 | The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination. 85 | For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows. 86 | 87 | Parameters: 88 | - browser_manager (PlaywrightManager): The PlaywrightManager instance. 89 | - page (Page): The Playwright page instance. 90 | - key_combination (str): The key combination to press, represented as a string. For combinations, use '+' as a separator. 91 | 92 | Returns: 93 | bool: True if success and False if failed 94 | """ 95 | 96 | logger.info(f"Executing press_key_combination with key combo: {key_combination}") 97 | try: 98 | function_name = inspect.currentframe().f_code.co_name # type: ignore 99 | await browser_manager.take_screenshots(f"{function_name}_start", page) 100 | # Split the key combination if it's a combination of keys 101 | keys = key_combination.split("+") 102 | 103 | # If it's a combination, hold down the modifier keys 104 | for key in keys[ 105 | :-1 106 | ]: # All keys except the last one are considered modifier keys 107 | await page.keyboard.down(key) 108 | 109 | # Press the last key in the combination 110 | await page.keyboard.press(keys[-1]) 111 | 112 | # Release the modifier keys 113 | for key in keys[:-1]: 114 | await page.keyboard.up(key) 115 | 116 | except Exception as e: 117 | logger.error(f'Error executing press_key_combination "{key_combination}": {e}') 118 | return False 119 | 120 | await browser_manager.take_screenshots(f"{function_name}_end", page) 121 | 122 | return True 123 | -------------------------------------------------------------------------------- /sentient/core/skills/upload_file.py: -------------------------------------------------------------------------------- 1 | from typing_extensions import Annotated 2 | 3 | from sentient.core.web_driver.playwright import PlaywrightManager 4 | from sentient.utils.logger import logger 5 | 6 | 7 | async def upload_file( 8 | # label: Annotated[str, "Label for the element on which upload should happen"], 9 | selector: Annotated[ 10 | str, 11 | "The properly formed query selector string to identify the file input element (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector. mmid will always be a number", 12 | ], 13 | file_path: Annotated[str, "Path on the local system for the file to be uploaded"], 14 | ) -> Annotated[str, "A meesage indicating if the file uplaod was successful"]: 15 | """ 16 | Uploads a file. 17 | 18 | Parameters: 19 | - file_path: Path of the file that needs to be uploaded. 20 | 21 | Returns: 22 | - A message indicating the success or failure of the file upload 23 | """ 24 | logger.info( 25 | f"Uploading file onto the page from {file_path} using selector {selector}" 26 | ) 27 | print("naman-selector") 28 | # print(label) 29 | # label = "Add File" 30 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 31 | page = await browser_manager.get_current_page() 32 | 33 | if not page: 34 | raise ValueError("No active page found. OpenURL command opens a new page") 35 | 36 | await page.wait_for_load_state("domcontentloaded") 37 | 38 | try: 39 | await page.locator(selector).set_input_files(file_path) 40 | # await page.get_by_label(label).set_input_files(file_path) 41 | logger.info( 42 | "File upload was successful. I can confirm it. Please proceed ahead with next step." 43 | ) 44 | except Exception as e: 45 | logger.error(f"Failed to upload file: {e}") 46 | return f"File upload failed {e}" 47 | -------------------------------------------------------------------------------- /sentient/core/web_driver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/web_driver/__init__.py -------------------------------------------------------------------------------- /sentient/core/web_driver/playwright.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import time 3 | from typing import List, Union 4 | 5 | from playwright.async_api import BrowserContext, Page, Playwright 6 | from playwright.async_api import async_playwright as playwright 7 | 8 | from sentient.utils.dom_mutation_observer import ( 9 | dom_mutation_change_detected, 10 | handle_navigation_for_mutation_observer, 11 | ) 12 | from sentient.utils.logger import logger 13 | from sentient.utils.ui_messagetype import MessageType 14 | 15 | # TODO - Create a wrapper browser manager class that either starts a playwright manager (our solution) or a hosted browser manager like browserbase 16 | 17 | 18 | class PlaywrightManager: 19 | _homepage = "https://google.com" 20 | _playwright = None 21 | _browser_context = None 22 | __async_initialize_done = False 23 | _instance = None 24 | _take_screenshots = False 25 | _screenshots_dir = None 26 | 27 | def __new__(cls, *args, **kwargs): # type: ignore 28 | """ 29 | Ensures that only one instance of PlaywrightManager is created (singleton pattern). 30 | """ 31 | if cls._instance is None: 32 | cls._instance = super().__new__(cls) 33 | cls._instance.__initialized = False 34 | logger.debug("Browser instance created..") 35 | return cls._instance 36 | 37 | def __init__( 38 | self, 39 | browser_type: str = "chromium", 40 | headless: bool = False, 41 | gui_input_mode: bool = True, 42 | screenshots_dir: str = "", 43 | take_screenshots: bool = False, 44 | ): 45 | """ 46 | Initializes the PlaywrightManager with the specified browser type and headless mode. 47 | Initialization occurs only once due to the singleton pattern. 48 | 49 | Args: 50 | browser_type (str, optional): The type of browser to use. Defaults to "chromium". 51 | headless (bool, optional): Flag to launch the browser in headless mode or not. Defaults to False (non-headless). 52 | """ 53 | if self.__initialized: 54 | return 55 | self.browser_type = browser_type 56 | self.isheadless = headless 57 | self.__initialized = True 58 | # self.notification_manager = NotificationManager() 59 | # self.user_response_event = asyncio.Event() 60 | # if gui_input_mode: 61 | # self.ui_manager: UIManager = UIManager() 62 | self.set_take_screenshots(take_screenshots) 63 | self.set_screenshots_dir(screenshots_dir) 64 | 65 | async def async_initialize(self, eval_mode: bool = False): 66 | """ 67 | Asynchronously initialize necessary components and handlers for the browser context. 68 | """ 69 | if self.__async_initialize_done: 70 | return 71 | 72 | # Step 1: Ensure Playwright is started and browser context is created 73 | await self.start_playwright() 74 | self.eval_mode = eval_mode 75 | await self.ensure_browser_context() 76 | 77 | # Step 2: Deferred setup of handlers 78 | # await self.setup_handlers() 79 | 80 | # Step 3: Navigate to homepage 81 | await self.go_to_homepage() 82 | 83 | self.__async_initialize_done = True 84 | 85 | async def ensure_browser_context(self): 86 | """ 87 | Ensure that a browser context exists, creating it if necessary. 88 | """ 89 | if self._browser_context is None: 90 | await self.create_browser_context() 91 | 92 | # async def setup_handlers(self): 93 | # """ 94 | # Setup various handlers after the browser context has been ensured. 95 | # """ 96 | # await self.set_overlay_state_handler() 97 | # await self.set_user_response_handler() 98 | # await self.set_navigation_handler() 99 | 100 | async def start_playwright(self): 101 | """ 102 | Starts the Playwright instance if it hasn't been started yet. This method is idempotent. 103 | """ 104 | if not PlaywrightManager._playwright: 105 | PlaywrightManager._playwright: Playwright = await playwright().start() 106 | 107 | async def stop_playwright(self): 108 | """ 109 | Stops the Playwright instance and resets it to None. This method should be called to clean up resources. 110 | """ 111 | # Close the browser context if it's initialized 112 | if PlaywrightManager._browser_context is not None: 113 | await PlaywrightManager._browser_context.close() 114 | PlaywrightManager._browser_context = None 115 | 116 | # Stop the Playwright instance if it's initialized 117 | if PlaywrightManager._playwright is not None: # type: ignore 118 | await PlaywrightManager._playwright.stop() 119 | PlaywrightManager._playwright = None # type: ignore 120 | 121 | async def create_browser_context(self): 122 | # load_dotenv() 123 | # user_data_dir: str = os.environ["BROWSER_USER_DATA_DIR"] 124 | # profile_directory: str = os.environ["BROWSER_PROFILE"] 125 | # print("Browser profile", user_data_dir) 126 | # logger.info("Browser Profile - " + user_data_dir + profile_directory) 127 | try: 128 | # PlaywrightManager._browser_context = ( 129 | # await PlaywrightManager._playwright.chromium.launch_persistent_context( 130 | # user_data_dir=user_data_dir, 131 | # channel="chrome", 132 | # headless=self.isheadless, 133 | # args=[ 134 | # f"--profile-directory={profile_directory}", 135 | # "--disable-session-crashed-bubble", 136 | # "--disable-infobars", 137 | # "--no-default-browser-check", 138 | # "--no-first-run", 139 | # "--disable-popup-blocking", 140 | # "--disable-notifications", 141 | # "--disable-features=ChromeWhatsNewUI", 142 | # "--disable-blink-features=AutomationControlled", 143 | # "--disable-gpu", 144 | # "--no-sandbox", 145 | # "--disable-dev-shm-usage", 146 | # "--no-first-run", 147 | # "--no-zygote", 148 | # "--ignore-certificate-errors", 149 | # "--disable-popup-blocking", 150 | # "--remote-debugging-port=9222", 151 | # "--restore-last-session", 152 | # ], 153 | # ignore_default_args=["--enable-automation", "--bwsi"], 154 | # no_viewport=True, 155 | # ) 156 | # ) 157 | 158 | # await PlaywrightManager._playwright.chromium.launch_persistent_context( 159 | # user_data_dir=user_data_dir, 160 | # channel="chrome", 161 | # headless=False, 162 | # args=[ 163 | # f"--profile-directory={profile_directory}", 164 | # "--remote-debugging-port=9224", 165 | # ], 166 | # no_viewport=True, 167 | # ) 168 | 169 | # in eval mode - start a temp browser. 170 | if self.eval_mode: 171 | print("Starting in eval mode", self.eval_mode) 172 | new_user_dir = tempfile.mkdtemp() 173 | logger.info( 174 | f"Starting a temporary browser instance. trying to launch with a new user dir {new_user_dir}" 175 | ) 176 | PlaywrightManager._browser_context = await PlaywrightManager._playwright.chromium.launch_persistent_context( 177 | new_user_dir, 178 | channel="chrome", 179 | headless=self.isheadless, 180 | args=[ 181 | "--disable-blink-features=AutomationControlled", 182 | "--disable-session-crashed-bubble", # disable the restore session bubble 183 | "--disable-infobars", # disable informational popups, 184 | ], 185 | no_viewport=True, 186 | ) 187 | else: 188 | browser = await PlaywrightManager._playwright.chromium.connect_over_cdp( 189 | "http://localhost:9222" 190 | ) 191 | PlaywrightManager._browser_context = browser.contexts[0] 192 | 193 | # Additional step to modify the navigator.webdriver property 194 | pages = PlaywrightManager._browser_context.pages 195 | for page in pages: 196 | # await stealth_async(page) # Apply stealth to each page 197 | await page.add_init_script(""" 198 | Object.defineProperty(navigator, 'webdriver', { 199 | get: () => undefined 200 | }) 201 | """) 202 | 203 | except Exception as e: 204 | if "Target page, context or browser has been closed" in str(e): 205 | new_user_dir = tempfile.mkdtemp() 206 | # logger.error( 207 | # f"Failed to launch persistent context with user data dir {user_data_dir}: {e} Trying to launch with a new user dir {new_user_dir}" 208 | # ) 209 | logger.error( 210 | f"Failed to launch persistent context with provided user data dir: {e} Trying to launch with a new user dir {new_user_dir}" 211 | ) 212 | PlaywrightManager._browser_context = await PlaywrightManager._playwright.chromium.launch_persistent_context( 213 | new_user_dir, 214 | channel="chrome", 215 | headless=self.isheadless, 216 | args=[ 217 | "--disable-blink-features=AutomationControlled", 218 | "--disable-session-crashed-bubble", # disable the restore session bubble 219 | "--disable-infobars", # disable informational popups, 220 | ], 221 | no_viewport=True, 222 | ) 223 | # # Apply stealth to the new context 224 | # for page in PlaywrightManager._browser_context.pages: 225 | # await stealth_async(page) 226 | elif "Chromium distribution 'chrome' is not found " in str(e): 227 | raise ValueError( 228 | "Chrome is not installed on this device. Install Google Chrome or install playwright using 'playwright install chrome'. Refer to the readme for more information." 229 | ) from None 230 | else: 231 | raise e from None 232 | 233 | async def get_browser_context(self): 234 | """ 235 | Returns the existing browser context, or creates a new one if it doesn't exist. 236 | """ 237 | await self.ensure_browser_context() 238 | return self._browser_context 239 | 240 | async def get_current_url(self) -> Union[str, None]: 241 | """ 242 | Get the current URL of current page 243 | 244 | Returns: 245 | str | None: The current URL if any. 246 | """ 247 | try: 248 | current_page: Page = await self.get_current_page() 249 | return current_page.url 250 | except Exception: 251 | pass 252 | return None 253 | 254 | async def get_current_page(self) -> Page: 255 | """ 256 | Get the current page of the browser 257 | 258 | Returns: 259 | Page: The current page if any. 260 | """ 261 | try: 262 | browser: BrowserContext = await self.get_browser_context() # type: ignore 263 | # Filter out closed pages 264 | pages: List[Page] = [page for page in browser.pages if not page.is_closed()] 265 | page: Union[Page, None] = pages[-1] if pages else None 266 | logger.debug(f"Current page: {page.url if page else None}") 267 | if page is not None: 268 | return page 269 | else: 270 | page: Page = await browser.new_page() # type: ignore 271 | # await stealth_async(page) # Apply stealth to the new page 272 | return page 273 | except Exception as e: 274 | logger.warn(f"Browser context was closed. Creating a new one. {e}") 275 | except Exception as e: 276 | logger.warn(f"Browser context was closed. Creating a new one. {e}") 277 | PlaywrightManager._browser_context = None 278 | _browser: BrowserContext = await self.get_browser_context() # type: ignore 279 | page: Union[Page, None] = await self.get_current_page() 280 | return page 281 | 282 | async def close_all_tabs(self, keep_first_tab: bool = True): 283 | """ 284 | Closes all tabs in the browser context, except for the first tab if `keep_first_tab` is set to True. 285 | 286 | Args: 287 | keep_first_tab (bool, optional): Whether to keep the first tab open. Defaults to True. 288 | """ 289 | browser_context = await self.get_browser_context() 290 | pages: List[Page] = browser_context.pages # type: ignore 291 | pages_to_close: List[Page] = pages[1:] if keep_first_tab else pages # type: ignore 292 | for page in pages_to_close: # type: ignore 293 | await page.close() # type: ignore 294 | 295 | async def close_except_specified_tab(self, page_to_keep: Page): 296 | """ 297 | Closes all tabs in the browser context, except for the specified tab. 298 | 299 | Args: 300 | page_to_keep (Page): The Playwright page object representing the tab that should remain open. 301 | """ 302 | browser_context = await self.get_browser_context() 303 | for page in browser_context.pages: # type: ignore 304 | if page != page_to_keep: # Check if the current page is not the one to keep 305 | await page.close() # type: ignore 306 | 307 | async def go_to_homepage(self): 308 | page: Page = await PlaywrightManager.get_current_page(self) 309 | try: 310 | await page.goto(self._homepage, timeout=10000) # 10 seconds timeout 311 | except Exception as e: 312 | logger.error(f"Failed to navigate to homepage: {e}") 313 | # implement a retry mechanism here 314 | try: 315 | await page.goto(self._homepage, timeout=10000) # 10 seconds timeout 316 | except Exception as e: 317 | logger.error(f"Failed to navigate to homepage: {e}") 318 | # implement a retry mechanism here 319 | 320 | async def set_navigation_handler(self): 321 | page: Page = await PlaywrightManager.get_current_page(self) 322 | page.on("domcontentloaded", self.ui_manager.handle_navigation) # type: ignore 323 | page.on("domcontentloaded", handle_navigation_for_mutation_observer) # type: ignore 324 | await page.expose_function( 325 | "dom_mutation_change_detected", dom_mutation_change_detected 326 | ) # type: ignore 327 | 328 | async def set_overlay_state_handler(self): 329 | logger.debug("Setting overlay state handler") 330 | context = await self.get_browser_context() 331 | await context.expose_function( 332 | "overlay_state_changed", self.overlay_state_handler 333 | ) # type: ignore 334 | await context.expose_function( 335 | "show_steps_state_changed", self.show_steps_state_handler 336 | ) # type: ignore 337 | 338 | async def overlay_state_handler(self, is_collapsed: bool): 339 | page = await self.get_current_page() 340 | self.ui_manager.update_overlay_state(is_collapsed) 341 | if not is_collapsed: 342 | await self.ui_manager.update_overlay_chat_history(page) 343 | 344 | async def show_steps_state_handler(self, show_details: bool): 345 | page = await self.get_current_page() 346 | await self.ui_manager.update_overlay_show_details(show_details, page) 347 | 348 | async def set_user_response_handler(self): 349 | context = await self.get_browser_context() 350 | await context.expose_function("user_response", self.receive_user_response) # type: ignore 351 | 352 | # async def notify_user( 353 | # self, message: str, message_type: MessageType = MessageType.STEP 354 | # ): 355 | # """ 356 | # Notify the user with a message. 357 | 358 | # Args: 359 | # message (str): The message to notify the user with. 360 | # message_type (enum, optional): Values can be 'PLAN', 'QUESTION', 'ANSWER', 'INFO', 'STEP'. Defaults to 'STEP'. 361 | # To Do: Convert to Enum. 362 | # """ 363 | 364 | # if message.startswith(":"): 365 | # message = message[1:] 366 | 367 | # if message.endswith(","): 368 | # message = message[:-1] 369 | 370 | # if message_type == MessageType.PLAN: 371 | # message = beautify_plan_message(message) 372 | # message = "Plan:\n" + message 373 | # elif message_type == MessageType.STEP: 374 | # if "confirm" in message.lower(): 375 | # message = "Verify: " + message 376 | # else: 377 | # message = "Next step: " + message 378 | # elif message_type == MessageType.QUESTION: 379 | # message = "Question: " + message 380 | # elif message_type == MessageType.ANSWER: 381 | # message = "Response: " + message 382 | 383 | # safe_message = escape_js_message(message) 384 | # self.ui_manager.new_system_message(safe_message, message_type) 385 | 386 | # if self.ui_manager.overlay_show_details == False: # noqa: E712 387 | # if message_type not in ( 388 | # MessageType.PLAN, 389 | # MessageType.QUESTION, 390 | # MessageType.ANSWER, 391 | # MessageType.INFO, 392 | # ): 393 | # return 394 | 395 | # if self.ui_manager.overlay_show_details == True: # noqa: E712 396 | # if message_type not in ( 397 | # MessageType.PLAN, 398 | # MessageType.QUESTION, 399 | # MessageType.ANSWER, 400 | # MessageType.INFO, 401 | # MessageType.STEP, 402 | # ): 403 | # return 404 | 405 | # safe_message_type = escape_js_message(message_type.value) 406 | # try: 407 | # js_code = f"addSystemMessage({safe_message}, is_awaiting_user_response=false, message_type={safe_message_type});" 408 | # page = await self.get_current_page() 409 | # await page.evaluate(js_code) 410 | # except Exception as e: 411 | # logger.error( 412 | # f'Failed to notify user with message "{message}". However, most likey this will work itself out after the page loads: {e}' 413 | # ) 414 | 415 | # self.notification_manager.notify(message, message_type.value) 416 | 417 | async def highlight_element(self, selector: str, add_highlight: bool): 418 | try: 419 | page: Page = await self.get_current_page() 420 | if add_highlight: 421 | # Add the 'agente-ui-automation-highlight' class to the element. This class is used to apply the fading border. 422 | await page.eval_on_selector( 423 | selector, 424 | """e => { 425 | let originalBorderStyle = e.style.border; 426 | e.classList.add('agente-ui-automation-highlight'); 427 | e.addEventListener('animationend', () => { 428 | e.classList.remove('agente-ui-automation-highlight') 429 | });}""", 430 | ) 431 | logger.debug( 432 | f"Applied pulsating border to element with selector {selector} to indicate text entry operation" 433 | ) 434 | else: 435 | # Remove the 'agente-ui-automation-highlight' class from the element. 436 | await page.eval_on_selector( 437 | selector, 438 | "e => e.classList.remove('agente-ui-automation-highlight')", 439 | ) 440 | logger.debug( 441 | f"Removed pulsating border from element with selector {selector} after text entry operation" 442 | ) 443 | except Exception: 444 | # This is not significant enough to fail the operation 445 | pass 446 | 447 | # async def receive_user_response(self, response: str): 448 | # self.user_response = response # Store the response for later use. 449 | # logger.debug(f"Received user response to system prompt: {response}") 450 | # # Notify event loop that the user's response has been received. 451 | # self.user_response_event.set() 452 | 453 | # async def prompt_user(self, message: str) -> str: 454 | # """ 455 | # Prompt the user with a message and wait for a response. 456 | 457 | # Args: 458 | # message (str): The message to prompt the user with. 459 | 460 | # Returns: 461 | # str: The user's response. 462 | # """ 463 | # logger.debug(f'Prompting user with message: "{message}"') 464 | # # self.ui_manager.new_system_message(message) 465 | 466 | # page = await self.get_current_page() 467 | 468 | # await self.ui_manager.show_overlay(page) 469 | # self.log_system_message( 470 | # message, MessageType.QUESTION 471 | # ) # add the message to history after the overlay is opened to avoid double adding it. add_system_message below will add it 472 | 473 | # safe_message = escape_js_message(message) 474 | 475 | # js_code = f"addSystemMessage({safe_message}, is_awaiting_user_response=true, message_type='question');" 476 | # await page.evaluate(js_code) 477 | 478 | # await self.user_response_event.wait() 479 | # result = self.user_response 480 | # logger.info(f'User prompt reponse to "{message}": {result}') 481 | # self.user_response_event.clear() 482 | # self.user_response = "" 483 | # self.ui_manager.new_user_message(result) 484 | # return result 485 | 486 | def set_take_screenshots(self, take_screenshots: bool): 487 | self._take_screenshots = take_screenshots 488 | 489 | def get_take_screenshots(self): 490 | return self._take_screenshots 491 | 492 | def set_screenshots_dir(self, screenshots_dir: str): 493 | self._screenshots_dir = screenshots_dir 494 | 495 | def get_screenshots_dir(self): 496 | return self._screenshots_dir 497 | 498 | async def take_screenshots( 499 | self, 500 | name: str, 501 | page: Union[Page, None], 502 | full_page: bool = True, 503 | include_timestamp: bool = True, 504 | load_state: str = "domcontentloaded", 505 | take_snapshot_timeout: int = 5 * 1000, 506 | ): 507 | if not self._take_screenshots: 508 | return 509 | if page is None: 510 | page = await self.get_current_page() 511 | 512 | screenshot_name = name 513 | 514 | if include_timestamp: 515 | screenshot_name = f"{int(time.time_ns())}_{screenshot_name}" 516 | screenshot_name += ".png" 517 | screenshot_path = f"{self.get_screenshots_dir()}/{screenshot_name}" 518 | try: 519 | await page.wait_for_load_state( 520 | state=load_state, timeout=take_snapshot_timeout 521 | ) # type: ignore 522 | await page.screenshot( 523 | path=screenshot_path, 524 | full_page=full_page, 525 | timeout=take_snapshot_timeout, 526 | caret="initial", 527 | scale="device", 528 | ) 529 | logger.debug(f"Screen shot saved to: {screenshot_path}") 530 | except Exception as e: 531 | logger.error( 532 | f'Failed to take screenshot and save to "{screenshot_path}". Error: {e}' 533 | ) 534 | 535 | def log_user_message(self, message: str): 536 | """ 537 | Log the user's message. 538 | 539 | Args: 540 | message (str): The user's message to log. 541 | """ 542 | self.ui_manager.new_user_message(message) 543 | 544 | def log_system_message(self, message: str, type: MessageType = MessageType.STEP): 545 | """ 546 | Log a system message. 547 | 548 | Args: 549 | message (str): The system message to log. 550 | """ 551 | self.ui_manager.new_system_message(message, type) 552 | 553 | async def update_processing_state(self, processing_state: str): 554 | """ 555 | Update the processing state of the overlay. 556 | 557 | Args: 558 | is_processing (str): "init", "processing", "done" 559 | """ 560 | page = await self.get_current_page() 561 | 562 | await self.ui_manager.update_processing_state(processing_state, page) 563 | 564 | async def command_completed( 565 | self, command: str, elapsed_time: Union[float, None] = None 566 | ): 567 | """ 568 | Notify the overlay that the command has been completed. 569 | """ 570 | logger.debug( 571 | f'Command "{command}" has been completed. Focusing on the overlay input if it is open.' 572 | ) 573 | page = await self.get_current_page() 574 | await self.ui_manager.command_completed(page, command, elapsed_time) 575 | -------------------------------------------------------------------------------- /sentient/task_instructions/task_instructions.txt: -------------------------------------------------------------------------------- 1 | 2 | 1. Directly go to youtube.com rather than searching for the song on google! 3 | -------------------------------------------------------------------------------- /sentient/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/utils/__init__.py -------------------------------------------------------------------------------- /sentient/utils/_pydantic.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Tuple, Union, get_args 2 | 3 | from pydantic import BaseModel 4 | from pydantic.version import VERSION as PYDANTIC_VERSION 5 | from typing_extensions import get_origin 6 | 7 | __all__ = ( 8 | "JsonSchemaValue", 9 | "model_dump", 10 | "model_dump_json", 11 | "type2schema", 12 | "evaluate_forwardref", 13 | ) 14 | 15 | PYDANTIC_V1 = PYDANTIC_VERSION.startswith("1.") 16 | 17 | if not PYDANTIC_V1: 18 | from pydantic import TypeAdapter 19 | from pydantic._internal._typing_extra import ( 20 | eval_type_lenient as evaluate_forwardref, 21 | ) 22 | from pydantic.json_schema import JsonSchemaValue 23 | 24 | def type2schema(t: Any) -> JsonSchemaValue: 25 | """Convert a type to a JSON schema 26 | 27 | Args: 28 | t (Type): The type to convert 29 | 30 | Returns: 31 | JsonSchemaValue: The JSON schema 32 | """ 33 | return TypeAdapter(t).json_schema() 34 | 35 | def model_dump(model: BaseModel) -> Dict[str, Any]: 36 | """Convert a pydantic model to a dict 37 | 38 | Args: 39 | model (BaseModel): The model to convert 40 | 41 | Returns: 42 | Dict[str, Any]: The dict representation of the model 43 | 44 | """ 45 | return model.model_dump() 46 | 47 | def model_dump_json(model: BaseModel) -> str: 48 | """Convert a pydantic model to a JSON string 49 | 50 | Args: 51 | model (BaseModel): The model to convert 52 | 53 | Returns: 54 | str: The JSON string representation of the model 55 | """ 56 | return model.model_dump_json() 57 | 58 | 59 | # Remove this once we drop support for pydantic 1.x 60 | else: # pragma: no cover 61 | from pydantic import TypeAdapter 62 | from pydantic.typing import ( 63 | evaluate_forwardref as evaluate_forwardref, # type: ignore[no-redef] 64 | ) 65 | 66 | JsonSchemaValue = Dict[str, Any] # type: ignore[misc] 67 | 68 | def type2schema(t: Any) -> JsonSchemaValue: 69 | """Convert a type to a JSON schema 70 | 71 | Args: 72 | t (Type): The type to convert 73 | 74 | Returns: 75 | JsonSchemaValue: The JSON schema 76 | """ 77 | if PYDANTIC_V1: 78 | if t is None: 79 | return {"type": "null"} 80 | elif get_origin(t) is Union: 81 | return {"anyOf": [type2schema(tt) for tt in get_args(t)]} 82 | elif get_origin(t) in [Tuple, tuple]: 83 | prefixItems = [type2schema(tt) for tt in get_args(t)] 84 | return { 85 | "maxItems": len(prefixItems), 86 | "minItems": len(prefixItems), 87 | "prefixItems": prefixItems, 88 | "type": "array", 89 | } 90 | 91 | d = TypeAdapter.json_schema(t) 92 | if "title" in d: 93 | d.pop("title") 94 | if "description" in d: 95 | d.pop("description") 96 | 97 | return d 98 | 99 | def model_dump(model: BaseModel) -> Dict[str, Any]: 100 | """Convert a pydantic model to a dict 101 | 102 | Args: 103 | model (BaseModel): The model to convert 104 | 105 | Returns: 106 | Dict[str, Any]: The dict representation of the model 107 | 108 | """ 109 | return model.dict() 110 | 111 | def model_dump_json(model: BaseModel) -> str: 112 | """Convert a pydantic model to a JSON string 113 | 114 | Args: 115 | model (BaseModel): The model to convert 116 | 117 | Returns: 118 | str: The JSON string representation of the model 119 | """ 120 | return model.json() 121 | -------------------------------------------------------------------------------- /sentient/utils/cli_helper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from asyncio import Future 3 | from typing import Dict, List 4 | 5 | 6 | def async_input(prompt: str) -> Future: # type: ignore 7 | """ 8 | Display a prompt to the user and wait for input in an asynchronous manner. 9 | 10 | Parameters: 11 | - prompt: The message to display to the user. 12 | 13 | Returns: 14 | - A Future object that will be fulfilled with the user's input. 15 | """ 16 | loop = asyncio.get_event_loop() 17 | return loop.run_in_executor(None, input, prompt) 18 | 19 | 20 | async def answer_questions_over_cli(questions: List[str]) -> Dict[str, str]: 21 | """ 22 | Asks a question over the command line and awaits the user's response. 23 | 24 | Parameters: 25 | - questions: A list of questions to ask the user, e.g., ["What is your favorite site?", "What do you want to search for?"]. 26 | 27 | Returns: 28 | - A dictionary where each key is a question and each value is the user's response. 29 | """ 30 | answers: Dict[str, str] = {} 31 | print("*********************************") 32 | for question in questions: 33 | answers[question] = await async_input("Question: " + str(question) + " : ") 34 | print("*********************************") 35 | return answers 36 | -------------------------------------------------------------------------------- /sentient/utils/dom_helper.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import List, Optional 3 | 4 | from playwright.async_api import ElementHandle, Page 5 | 6 | from sentient.utils.logger import logger 7 | 8 | 9 | async def wait_for_non_loading_dom_state(page: Page, max_wait_millis: int): 10 | max_wait_seconds = max_wait_millis / 1000 11 | end_time = asyncio.get_event_loop().time() + max_wait_seconds 12 | while asyncio.get_event_loop().time() < end_time: 13 | dom_state = await page.evaluate("document.readyState") 14 | if dom_state != "loading": 15 | logger.debug(f"DOM state is not 'loading': {dom_state}") 16 | break # Exit the loop if the DOM state is not 'loading' 17 | 18 | await asyncio.sleep(0.05) 19 | 20 | 21 | async def get_element_outer_html( 22 | element: ElementHandle, page: Page, element_tag_name: Optional[str] = None 23 | ) -> str: 24 | """ 25 | Constructs the opening tag of an HTML element along with its attributes. 26 | 27 | Args: 28 | element (ElementHandle): The element to retrieve the opening tag for. 29 | page (Page): The page object associated with the element. 30 | element_tag_name (str, optional): The tag name of the element. Defaults to None. If not passed, it will be retrieved from the element. 31 | 32 | Returns: 33 | str: The opening tag of the HTML element, including a select set of attributes. 34 | """ 35 | tag_name: str = ( 36 | element_tag_name 37 | if element_tag_name 38 | else await page.evaluate("element => element.tagName.toLowerCase()", element) 39 | ) 40 | 41 | attributes_of_interest: List[str] = [ 42 | "id", 43 | "name", 44 | "aria-label", 45 | "placeholder", 46 | "href", 47 | "src", 48 | "aria-autocomplete", 49 | "role", 50 | "type", 51 | "data-testid", 52 | "value", 53 | "selected", 54 | "aria-labelledby", 55 | "aria-describedby", 56 | "aria-haspopup", 57 | ] 58 | opening_tag: str = f"<{tag_name}" 59 | 60 | for attr in attributes_of_interest: 61 | value: str = await element.get_attribute(attr) # type: ignore 62 | if value: 63 | opening_tag += f' {attr}="{value}"' 64 | opening_tag += ">" 65 | 66 | return opening_tag 67 | -------------------------------------------------------------------------------- /sentient/utils/dom_mutation_observer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | from typing import Callable, List # noqa: UP035 4 | 5 | from playwright.async_api import Page 6 | 7 | # Create an event loop 8 | loop = asyncio.get_event_loop() 9 | 10 | DOM_change_callback: List[Callable[[str], None]] = [] 11 | 12 | 13 | def subscribe(callback: Callable[[str], None]) -> None: 14 | DOM_change_callback.append(callback) 15 | 16 | 17 | def unsubscribe(callback: Callable[[str], None]) -> None: 18 | DOM_change_callback.remove(callback) 19 | 20 | 21 | async def add_mutation_observer(page: Page): 22 | """ 23 | Adds a mutation observer to the page to detect changes in the DOM. 24 | When changes are detected, the observer calls the dom_mutation_change_detected function in the browser context. 25 | This changes can be detected by subscribing to the dom_mutation_change_detected function by individual skills. 26 | 27 | Current implementation only detects when a new node is added to the DOM. 28 | However, in many cases, the change could be a change in the style or class of an existing node (e.g. toggle visibility of a hidden node). 29 | """ 30 | 31 | await page.evaluate(""" 32 | console.log('Adding a mutation observer for DOM changes'); 33 | new MutationObserver((mutationsList, observer) => { 34 | let changes_detected = []; 35 | for(let mutation of mutationsList) { 36 | if (mutation.type === 'childList') { 37 | let allAddedNodes=mutation.addedNodes; 38 | for(let node of allAddedNodes) { 39 | if(node.tagName && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.tagName) && !node.closest('#agentDriveAutoOverlay')) { 40 | let visibility=true; 41 | let content = node.innerText.trim(); 42 | if(visibility && node.innerText.trim()){ 43 | if(content) { 44 | changes_detected.push({tag: node.tagName, content: content}); 45 | } 46 | } 47 | } 48 | } 49 | } else if (mutation.type === 'characterData') { 50 | let node = mutation.target; 51 | if(node.parentNode && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.parentNode.tagName) && !node.parentNode.closest('#agentDriveAutoOverlay')) { 52 | let visibility=true; 53 | let content = node.data.trim(); 54 | if(visibility && content && window.getComputedStyle(node.parentNode).display !== 'none'){ 55 | if(content && !changes_detected.some(change => change.content.includes(content))) { 56 | changes_detected.push({tag: node.parentNode.tagName, content: content}); 57 | } 58 | } 59 | } 60 | } 61 | } 62 | if(changes_detected.length > 0) { 63 | window.dom_mutation_change_detected(JSON.stringify(changes_detected)); 64 | } 65 | }).observe(document, {subtree: true, childList: true, characterData: true}); 66 | """) 67 | 68 | 69 | async def handle_navigation_for_mutation_observer(page: Page): 70 | await add_mutation_observer(page) 71 | 72 | 73 | async def dom_mutation_change_detected(changes_detected: str): 74 | """ 75 | Detects changes in the DOM (new nodes added) and emits the event to all subscribed callbacks. 76 | The changes_detected is a string in JSON formatt containing the tag and content of the new nodes added to the DOM. 77 | 78 | e.g. The following will be detected when autocomplete recommendations show up when one types Nelson Mandela on google search 79 | [{'tag': 'SPAN', 'content': 'nelson mandela wikipedia'}, {'tag': 'SPAN', 'content': 'nelson mandela movies'}] 80 | """ 81 | changes_detected = json.loads(changes_detected.replace("\t", "").replace("\n", "")) 82 | if len(changes_detected) > 0: 83 | # Emit the event to all subscribed callbacks 84 | for callback in DOM_change_callback: 85 | # If the callback is a coroutine function 86 | if asyncio.iscoroutinefunction(callback): 87 | await callback(changes_detected) 88 | # If the callback is a regular function 89 | else: 90 | callback(changes_detected) 91 | -------------------------------------------------------------------------------- /sentient/utils/extract_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from sentient.utils.logger import logger 5 | 6 | 7 | def extract_json(message: str) -> Dict[str, Any]: 8 | """ 9 | Parse the response from the browser agent and return the response as a dictionary. 10 | """ 11 | json_response = {} 12 | # Remove Markdown code block delimiters if present 13 | message = message.strip() 14 | if message.startswith("```"): 15 | message = message.split("\n", 1)[1] # Remove the first line 16 | if message.endswith("```"): 17 | message = message.rsplit("\n", 1)[0] # Remove the last line 18 | 19 | # Remove any leading "json" tag 20 | if message.lstrip().startswith("json"): 21 | message = message.lstrip()[4:].lstrip() 22 | 23 | try: 24 | return json.loads(message) 25 | except json.JSONDecodeError as e: 26 | logger.warn( 27 | f"LLM response was not properly formed JSON. Error: {e}. " 28 | f'LLM response: "{message}"' 29 | ) 30 | message = message.replace("\\n", "\n") 31 | message = message.replace("\n", " ") # type: ignore 32 | if "plan" in message and "next_step" in message: 33 | start = message.index("plan") + len("plan") 34 | end = message.index("next_step") 35 | json_response["plan"] = message[start:end].replace('"', "").strip() 36 | if "next_step" in message and "terminate" in message: 37 | start = message.index("next_step") + len("next_step") 38 | end = message.index("terminate") 39 | json_response["next_step"] = message[start:end].replace('"', "").strip() 40 | if "terminate" in message and "final_response" in message: 41 | start = message.index("terminate") + len("terminate") 42 | end = message.index("final_response") 43 | matched_string = message[start:end].replace('"', "").strip() 44 | if "yes" in matched_string: 45 | json_response["terminate"] = "yes" 46 | else: 47 | json_response["terminate"] = "no" 48 | 49 | start = message.index("final_response") + len("final_response") 50 | end = len(message) - 1 51 | json_response["final_response"] = ( 52 | message[start:end].replace('"', "").strip() 53 | ) 54 | 55 | elif "terminate" in message: 56 | start = message.index("terminate") + len("terminate") 57 | end = len(message) - 1 58 | matched_string = message[start:end].replace('"', "").strip() 59 | if "yes" in matched_string: 60 | json_response["terminate"] = "yes" 61 | else: 62 | json_response["terminate"] = "no" 63 | 64 | return json_response 65 | -------------------------------------------------------------------------------- /sentient/utils/function_utils.py: -------------------------------------------------------------------------------- 1 | # import inspect 2 | # from typing import Any, Callable, Dict, List, Union 3 | 4 | # from typing_extensions import Annotated, get_args, get_origin 5 | 6 | 7 | # def get_type_name(type_hint: Any) -> str: 8 | # if hasattr(type_hint, "__name__"): 9 | # return type_hint.__name__ 10 | # if hasattr(type_hint, "_name"): 11 | # return type_hint._name 12 | # return str(type_hint).replace("typing.", "") 13 | 14 | 15 | # def get_parameter_schema( 16 | # name: str, param: inspect.Parameter, type_hint: Any 17 | # ) -> Dict[str, Any]: 18 | # schema = {"type": get_type_name(type_hint)} 19 | 20 | # if get_origin(type_hint) is Annotated: 21 | # type_hint, description = get_args(type_hint) 22 | # schema["description"] = description 23 | # else: 24 | # schema["description"] = name 25 | 26 | # if get_origin(type_hint) is Union: 27 | # schema["type"] = [get_type_name(arg) for arg in get_args(type_hint)] 28 | # elif get_origin(type_hint) is List: 29 | # item_type = get_args(type_hint)[0] 30 | # if get_origin(item_type) is Dict: 31 | # key_type, value_type = get_args(item_type) 32 | # schema["type"] = "array" 33 | # schema["items"] = { 34 | # "type": "object", 35 | # "additionalProperties": {"type": get_type_name(value_type)}, 36 | # } 37 | # else: 38 | # schema["type"] = "array" 39 | # schema["items"] = {"type": get_type_name(item_type)} 40 | 41 | # if param.default != inspect.Parameter.empty: 42 | # schema["default"] = param.default 43 | # return schema 44 | 45 | 46 | # def generate_tool_from_function( 47 | # func: Callable[..., Any], tool_description: str 48 | # ) -> Dict[str, Any]: 49 | # signature = inspect.signature(func) 50 | # type_hints = func.__annotations__ 51 | 52 | # parameters = {} 53 | # for name, param in signature.parameters.items(): 54 | # type_hint = type_hints.get(name, Any) 55 | # parameters[name] = get_parameter_schema(name, param, type_hint) 56 | 57 | # return { 58 | # "type": "function", 59 | # "function": { 60 | # "name": func.__name__, 61 | # "description": tool_description, 62 | # "parameters": { 63 | # "type": "object", 64 | # "properties": parameters, 65 | # "required": [ 66 | # name 67 | # for name, param in signature.parameters.items() 68 | # if param.default == inspect.Parameter.empty 69 | # ], 70 | # }, 71 | # }, 72 | # } 73 | 74 | 75 | import functools 76 | import inspect 77 | import json 78 | from logging import getLogger 79 | from typing import ( 80 | Any, 81 | Callable, 82 | Dict, 83 | ForwardRef, 84 | List, 85 | Optional, 86 | Set, 87 | Tuple, 88 | Type, 89 | TypeVar, 90 | Union, 91 | ) 92 | 93 | from pydantic import BaseModel, Field 94 | from typing_extensions import Annotated, Literal, get_args, get_origin 95 | 96 | from ._pydantic import ( 97 | JsonSchemaValue, 98 | evaluate_forwardref, 99 | model_dump, 100 | model_dump_json, 101 | type2schema, 102 | ) 103 | 104 | logger = getLogger(__name__) 105 | 106 | T = TypeVar("T") 107 | 108 | 109 | def get_typed_annotation(annotation: Any, globalns: Dict[str, Any]) -> Any: 110 | """Get the type annotation of a parameter. 111 | 112 | Args: 113 | annotation: The annotation of the parameter 114 | globalns: The global namespace of the function 115 | 116 | Returns: 117 | The type annotation of the parameter 118 | """ 119 | if isinstance(annotation, str): 120 | annotation = ForwardRef(annotation) 121 | annotation = evaluate_forwardref(annotation, globalns, globalns) 122 | return annotation 123 | 124 | 125 | def get_typed_signature(call: Callable[..., Any]) -> inspect.Signature: 126 | """Get the signature of a function with type annotations. 127 | 128 | Args: 129 | call: The function to get the signature for 130 | 131 | Returns: 132 | The signature of the function with type annotations 133 | """ 134 | signature = inspect.signature(call) 135 | globalns = getattr(call, "__globals__", {}) 136 | typed_params = [ 137 | inspect.Parameter( 138 | name=param.name, 139 | kind=param.kind, 140 | default=param.default, 141 | annotation=get_typed_annotation(param.annotation, globalns), 142 | ) 143 | for param in signature.parameters.values() 144 | ] 145 | typed_signature = inspect.Signature(typed_params) 146 | return typed_signature 147 | 148 | 149 | def get_typed_return_annotation(call: Callable[..., Any]) -> Any: 150 | """Get the return annotation of a function. 151 | 152 | Args: 153 | call: The function to get the return annotation for 154 | 155 | Returns: 156 | The return annotation of the function 157 | """ 158 | signature = inspect.signature(call) 159 | annotation = signature.return_annotation 160 | 161 | if annotation is inspect.Signature.empty: 162 | return None 163 | 164 | globalns = getattr(call, "__globals__", {}) 165 | return get_typed_annotation(annotation, globalns) 166 | 167 | 168 | def get_param_annotations( 169 | typed_signature: inspect.Signature, 170 | ) -> Dict[str, Union[Annotated[Type[Any], str], Type[Any]]]: 171 | """Get the type annotations of the parameters of a function 172 | 173 | Args: 174 | typed_signature: The signature of the function with type annotations 175 | 176 | Returns: 177 | A dictionary of the type annotations of the parameters of the function 178 | """ 179 | return { 180 | k: v.annotation 181 | for k, v in typed_signature.parameters.items() 182 | if v.annotation is not inspect.Signature.empty 183 | } 184 | 185 | 186 | class Parameters(BaseModel): 187 | """Parameters of a function as defined by the OpenAI API""" 188 | 189 | type: Literal["object"] = "object" 190 | properties: Dict[str, JsonSchemaValue] 191 | required: List[str] 192 | additionalProperties: bool 193 | additionalProperties: bool 194 | 195 | 196 | class Function(BaseModel): 197 | """A function as defined by the OpenAI API""" 198 | 199 | description: Annotated[str, Field(description="Description of the function")] 200 | name: Annotated[str, Field(description="Name of the function")] 201 | parameters: Annotated[Parameters, Field(description="Parameters of the function")] 202 | strict: bool 203 | 204 | 205 | class ToolFunction(BaseModel): 206 | """A function under tool as defined by the OpenAI API.""" 207 | 208 | type: Literal["function"] = "function" 209 | function: Annotated[Function, Field(description="Function under tool")] 210 | 211 | 212 | def get_parameter_json_schema( 213 | k: str, v: Any, default_values: Dict[str, Any] 214 | ) -> JsonSchemaValue: 215 | def type2description(k: str, v: Union[Annotated[Type[Any], str], Type[Any]]) -> str: 216 | if get_origin(v) is Annotated: 217 | args = get_args(v) 218 | if len(args) > 1 and isinstance(args[1], str): 219 | return args[1] 220 | return k 221 | 222 | schema = type2schema(v) 223 | schema["description"] = type2description(k, v) 224 | 225 | if schema["type"] == "object": 226 | schema["additionalProperties"] = False 227 | if "properties" not in schema: 228 | schema["properties"] = {} 229 | 230 | if schema["type"] == "array": 231 | if "items" not in schema: 232 | schema["items"] = { 233 | "type": "object", 234 | "properties": {}, 235 | "additionalProperties": False, 236 | } 237 | elif schema["items"].get("type") == "object": 238 | if "properties" not in schema["items"]: 239 | schema["items"]["properties"] = {} 240 | schema["items"]["additionalProperties"] = False 241 | 242 | return schema 243 | 244 | 245 | def get_required_params(typed_signature: inspect.Signature) -> List[str]: 246 | """Get the required parameters of a function 247 | 248 | Args: 249 | signature: The signature of the function as returned by inspect.signature 250 | 251 | Returns: 252 | A list of the required parameters of the function 253 | """ 254 | return [ 255 | k 256 | for k, v in typed_signature.parameters.items() 257 | if v.default == inspect.Signature.empty 258 | ] 259 | 260 | 261 | def get_default_values(typed_signature: inspect.Signature) -> Dict[str, Any]: 262 | """Get default values of parameters of a function 263 | 264 | Args: 265 | signature: The signature of the function as returned by inspect.signature 266 | 267 | Returns: 268 | A dictionary of the default values of the parameters of the function 269 | """ 270 | return { 271 | k: v.default 272 | for k, v in typed_signature.parameters.items() 273 | if v.default != inspect.Signature.empty 274 | } 275 | 276 | 277 | def get_parameters( 278 | required: List[str], 279 | param_annotations: Dict[str, Union[Annotated[Type[Any], str], Type[Any]]], 280 | default_values: Dict[str, Any], 281 | ) -> Parameters: 282 | properties = {} 283 | for k, v in param_annotations.items(): 284 | if v is not inspect.Signature.empty: 285 | if get_origin(v) is Annotated: 286 | v_type = get_args(v)[0] 287 | v_desc = get_args(v)[1] if len(get_args(v)) > 1 else k 288 | else: 289 | v_type = v 290 | v_desc = k 291 | 292 | if get_origin(v_type) is List: 293 | item_type = get_args(v_type)[0] 294 | properties[k] = { 295 | "type": "array", 296 | "items": get_parameter_json_schema(k, item_type, default_values), 297 | "description": v_desc, 298 | } 299 | else: 300 | properties[k] = get_parameter_json_schema(k, v_type, default_values) 301 | properties[k]["description"] = v_desc 302 | 303 | return Parameters( 304 | properties=properties, 305 | required=list(properties.keys()), # All properties are required 306 | additionalProperties=False, 307 | ) 308 | 309 | 310 | def get_missing_annotations( 311 | typed_signature: inspect.Signature, required: List[str] 312 | ) -> Tuple[Set[str], Set[str]]: 313 | """Get the missing annotations of a function 314 | 315 | Ignores the parameters with default values as they are not required to be annotated, but logs a warning. 316 | Args: 317 | typed_signature: The signature of the function with type annotations 318 | required: The required parameters of the function 319 | 320 | Returns: 321 | A set of the missing annotations of the function 322 | """ 323 | all_missing = { 324 | k 325 | for k, v in typed_signature.parameters.items() 326 | if v.annotation is inspect.Signature.empty 327 | } 328 | missing = all_missing.intersection(set(required)) 329 | unannotated_with_default = all_missing.difference(missing) 330 | return missing, unannotated_with_default 331 | 332 | 333 | def get_function_schema( 334 | f: Callable[..., Any], *, name: Optional[str] = None, description: str 335 | ) -> Dict[str, Any]: 336 | """Get a JSON schema for a function as defined by the OpenAI API 337 | 338 | Args: 339 | f: The function to get the JSON schema for 340 | name: The name of the function 341 | description: The description of the function 342 | 343 | Returns: 344 | A JSON schema for the function 345 | 346 | Raises: 347 | TypeError: If the function is not annotated 348 | 349 | Examples: 350 | 351 | ```python 352 | def f(a: Annotated[str, "Parameter a"], b: int = 2, c: Annotated[float, "Parameter c"] = 0.1) -> None: 353 | pass 354 | 355 | get_function_schema(f, description="function f") 356 | 357 | # {'type': 'function', 358 | # 'function': {'description': 'function f', 359 | # 'name': 'f', 360 | # 'parameters': {'type': 'object', 361 | # 'properties': {'a': {'type': 'str', 'description': 'Parameter a'}, 362 | # 'b': {'type': 'int', 'description': 'b'}, 363 | # 'c': {'type': 'float', 'description': 'Parameter c'}}, 364 | # 'required': ['a']}}} 365 | ``` 366 | 367 | """ 368 | typed_signature = get_typed_signature(f) 369 | required = get_required_params(typed_signature) 370 | default_values = get_default_values(typed_signature) 371 | param_annotations = get_param_annotations(typed_signature) 372 | return_annotation = get_typed_return_annotation(f) 373 | missing, unannotated_with_default = get_missing_annotations( 374 | typed_signature, required 375 | ) 376 | 377 | if return_annotation is None: 378 | logger.warning( 379 | f"The return type of the function '{f.__name__}' is not annotated. Although annotating it is " 380 | + "optional, the function should return either a string, a subclass of 'pydantic.BaseModel'." 381 | ) 382 | 383 | if unannotated_with_default != set(): 384 | unannotated_with_default_s = [ 385 | f"'{k}'" for k in sorted(unannotated_with_default) 386 | ] 387 | logger.warning( 388 | f"The following parameters of the function '{f.__name__}' with default values are not annotated: " 389 | + f"{', '.join(unannotated_with_default_s)}." 390 | ) 391 | 392 | if missing != set(): 393 | missing_s = [f"'{k}'" for k in sorted(missing)] 394 | raise TypeError( 395 | f"All parameters of the function '{f.__name__}' without default values must be annotated. " 396 | + f"The annotations are missing for the following parameters: {', '.join(missing_s)}" 397 | ) 398 | 399 | fname = name if name else f.__name__ 400 | 401 | parameters = get_parameters( 402 | required, param_annotations, default_values=default_values 403 | ) 404 | 405 | function = ToolFunction( 406 | function=Function( 407 | description=description, 408 | name=fname, 409 | parameters=parameters, 410 | strict=True, 411 | ) 412 | ) 413 | 414 | schema = model_dump(function) 415 | 416 | return schema 417 | 418 | 419 | def get_load_param_if_needed_function( 420 | t: Any, 421 | ) -> Optional[Callable[[Dict[str, Any], Type[BaseModel]], BaseModel]]: 422 | """Get a function to load a parameter if it is a Pydantic model 423 | 424 | Args: 425 | t: The type annotation of the parameter 426 | 427 | Returns: 428 | A function to load the parameter if it is a Pydantic model, otherwise None 429 | 430 | """ 431 | if get_origin(t) is Annotated: 432 | return get_load_param_if_needed_function(get_args(t)[0]) 433 | 434 | def load_base_model(v: Dict[str, Any], t: Type[BaseModel]) -> BaseModel: 435 | return t(**v) 436 | 437 | return load_base_model if isinstance(t, type) and issubclass(t, BaseModel) else None 438 | 439 | 440 | def load_basemodels_if_needed(func: Callable[..., Any]) -> Callable[..., Any]: 441 | """A decorator to load the parameters of a function if they are Pydantic models 442 | 443 | Args: 444 | func: The function with annotated parameters 445 | 446 | Returns: 447 | A function that loads the parameters before calling the original function 448 | 449 | """ 450 | # get the type annotations of the parameters 451 | typed_signature = get_typed_signature(func) 452 | param_annotations = get_param_annotations(typed_signature) 453 | 454 | # get functions for loading BaseModels when needed based on the type annotations 455 | kwargs_mapping_with_nones = { 456 | k: get_load_param_if_needed_function(t) for k, t in param_annotations.items() 457 | } 458 | 459 | # remove the None values 460 | kwargs_mapping = { 461 | k: f for k, f in kwargs_mapping_with_nones.items() if f is not None 462 | } 463 | 464 | # a function that loads the parameters before calling the original function 465 | @functools.wraps(func) 466 | def _load_parameters_if_needed(*args: Any, **kwargs: Any) -> Any: 467 | # load the BaseModels if needed 468 | for k, f in kwargs_mapping.items(): 469 | kwargs[k] = f(kwargs[k], param_annotations[k]) 470 | 471 | # call the original function 472 | return func(*args, **kwargs) 473 | 474 | @functools.wraps(func) 475 | async def _a_load_parameters_if_needed(*args: Any, **kwargs: Any) -> Any: 476 | # load the BaseModels if needed 477 | for k, f in kwargs_mapping.items(): 478 | kwargs[k] = f(kwargs[k], param_annotations[k]) 479 | 480 | # call the original function 481 | return await func(*args, **kwargs) 482 | 483 | if inspect.iscoroutinefunction(func): 484 | return _a_load_parameters_if_needed 485 | else: 486 | return _load_parameters_if_needed 487 | 488 | 489 | def serialize_to_str(x: Any) -> str: 490 | if isinstance(x, str): 491 | return x 492 | elif isinstance(x, BaseModel): 493 | return model_dump_json(x) 494 | else: 495 | return json.dumps(x) 496 | -------------------------------------------------------------------------------- /sentient/utils/get_detailed_accessibility_tree.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import traceback 5 | from typing import Dict, List, Optional 6 | 7 | from playwright.async_api import Page 8 | from typing_extensions import Annotated, Any 9 | 10 | from sentient.config.config import SOURCE_LOG_FOLDER_PATH 11 | from sentient.core.web_driver.playwright import PlaywrightManager 12 | from sentient.utils.logger import logger 13 | 14 | space_delimited_mmid = re.compile(r"^[\d ]+$") 15 | 16 | 17 | def is_space_delimited_mmid(s: str) -> bool: 18 | """ 19 | Check if the given string matches the the mmid pattern of number space repeated. 20 | 21 | Parameters: 22 | - s (str): The string to check against the pattern. 23 | 24 | Returns: 25 | - bool: True if the string matches the pattern, False otherwise. 26 | """ 27 | # Use fullmatch() to ensure the entire string matches the pattern 28 | return bool(space_delimited_mmid.fullmatch(s)) 29 | 30 | 31 | async def __inject_attributes(page: Page): 32 | """ 33 | Injects 'mmid' and 'aria-keyshortcuts' into all DOM elements. If an element already has an 'aria-keyshortcuts', 34 | it renames it to 'orig-aria-keyshortcuts' before injecting the new 'aria-keyshortcuts' 35 | This will be captured in the accessibility tree and thus make it easier to reconcile the tree with the DOM. 36 | 'aria-keyshortcuts' is choosen because it is not widely used aria attribute. 37 | """ 38 | 39 | last_mmid = await page.evaluate("""() => { 40 | const allElements = document.querySelectorAll('*'); 41 | let id = 0; 42 | allElements.forEach(element => { 43 | const origAriaAttribute = element.getAttribute('aria-keyshortcuts'); 44 | const mmid = `${++id}`; 45 | element.setAttribute('mmid', mmid); 46 | element.setAttribute('aria-keyshortcuts', mmid); 47 | //console.log(`Injected 'mmid'into element with tag: ${element.tagName} and mmid: ${mmid}`); 48 | if (origAriaAttribute) { 49 | element.setAttribute('orig-aria-keyshortcuts', origAriaAttribute); 50 | } 51 | }); 52 | return id; 53 | }""") 54 | logger.debug(f"Added MMID into {last_mmid} elements") 55 | 56 | 57 | async def __fetch_dom_info( 58 | page: Page, accessibility_tree: Dict[str, Any], only_input_fields: bool 59 | ): 60 | """ 61 | Iterates over the accessibility tree, fetching additional information from the DOM based on 'mmid', 62 | and constructs a new JSON structure with detailed information. 63 | 64 | Args: 65 | page (Page): The page object representing the web page. 66 | accessibility_tree (Dict[str, Any]): The accessibility tree JSON structure. 67 | only_input_fields (bool): Flag indicating whether to include only input fields in the new JSON structure. 68 | 69 | Returns: 70 | Dict[str, Any]: The pruned tree with detailed information from the DOM. 71 | """ 72 | 73 | logger.debug("Reconciling the Accessibility Tree with the DOM") 74 | # Define the attributes to fetch for each element 75 | attributes = [ 76 | "name", 77 | "aria-label", 78 | "placeholder", 79 | "mmid", 80 | "id", 81 | "for", 82 | "data-testid", 83 | ] 84 | backup_attributes = [] # if the attributes are not found, then try to get these attributes 85 | tags_to_ignore = [ 86 | "head", 87 | "style", 88 | "script", 89 | "link", 90 | "meta", 91 | "noscript", 92 | "template", 93 | "iframe", 94 | "g", 95 | "main", 96 | "c-wiz", 97 | "svg", 98 | "path", 99 | ] 100 | attributes_to_delete = ["level", "multiline", "haspopup", "id", "for"] 101 | ids_to_ignore = ["agentDriveAutoOverlay"] 102 | 103 | # Recursive function to process each node in the accessibility tree 104 | async def process_node(node: Dict[str, Any]): 105 | if "children" in node: 106 | for child in node["children"]: 107 | await process_node(child) 108 | 109 | # Use 'name' attribute from the accessibility node as 'mmid' 110 | mmid_temp: str = node.get("keyshortcuts") # type: ignore 111 | 112 | # If the name has multiple mmids, take the last one 113 | if mmid_temp and is_space_delimited_mmid(mmid_temp): 114 | # TODO: consider if we should grab each of the mmids and process them separately as seperate nodes copying this node's attributes 115 | mmid_temp = mmid_temp.split(" ")[-1] 116 | 117 | # focusing on nodes with mmid, which is the attribute we inject 118 | try: 119 | mmid = int(mmid_temp) 120 | except (ValueError, TypeError): 121 | # logger.error(f"'name attribute contains \"{node.get('name')}\", which is not a valid numeric mmid. Adding node as is: {node}") 122 | return node.get("name") 123 | 124 | if node["role"] == "menuitem": 125 | return node.get("name") 126 | 127 | if node.get("role") == "dialog" and node.get("modal") == True: # noqa: E712 128 | node["important information"] = ( 129 | "This is a modal dialog. Please interact with this dialog and close it to be able to interact with the full page (e.g. by pressing the close button or selecting an option)." 130 | ) 131 | 132 | if mmid: 133 | # Determine if we need to fetch 'innerText' based on the absence of 'children' in the accessibility node 134 | should_fetch_inner_text = "children" not in node 135 | 136 | js_code = """ 137 | (input_params) => { 138 | const should_fetch_inner_text = input_params.should_fetch_inner_text; 139 | const mmid = input_params.mmid; 140 | const attributes = input_params.attributes; 141 | const tags_to_ignore = input_params.tags_to_ignore; 142 | const ids_to_ignore = input_params.ids_to_ignore; 143 | 144 | const element = document.querySelector(`[mmid="${mmid}"]`); 145 | 146 | if (!element) { 147 | console.log(`No element found with mmid: ${mmid}`); 148 | return null; 149 | } 150 | 151 | if (ids_to_ignore.includes(element.id)) { 152 | console.log(`Ignoring element with id: ${element.id}`, element); 153 | return null; 154 | } 155 | //Ignore "option" because it would have been processed with the select element 156 | if (tags_to_ignore.includes(element.tagName.toLowerCase()) || element.tagName.toLowerCase() === "option") return null; 157 | 158 | let attributes_to_values = { 159 | 'tag': element.tagName.toLowerCase() // Always include the tag name 160 | }; 161 | 162 | // If the element is an input, include its type as well 163 | if (element.tagName.toLowerCase() === 'input') { 164 | attributes_to_values['tag_type'] = element.type; // This will capture 'checkbox', 'radio', etc. 165 | } 166 | else if (element.tagName.toLowerCase() === 'select') { 167 | attributes_to_values["mmid"] = element.getAttribute('mmid'); 168 | attributes_to_values["role"] = "combobox"; 169 | attributes_to_values["options"] = []; 170 | 171 | for (const option of element.options) { 172 | let option_attributes_to_values = { 173 | "mmid": option.getAttribute('mmid'), 174 | "text": option.text, 175 | "value": option.value, 176 | "selected": option.selected 177 | }; 178 | attributes_to_values["options"].push(option_attributes_to_values); 179 | } 180 | return attributes_to_values; 181 | } 182 | 183 | for (const attribute of attributes) { 184 | let value = element.getAttribute(attribute); 185 | 186 | if(value){ 187 | /* 188 | if(attribute === 'href'){ 189 | value = value.split('?')[0] 190 | } 191 | */ 192 | attributes_to_values[attribute] = value; 193 | } 194 | } 195 | 196 | if (should_fetch_inner_text && element.innerText) { 197 | attributes_to_values['description'] = element.innerText; 198 | } 199 | 200 | let role = element.getAttribute('role'); 201 | if(role==='listbox' || element.tagName.toLowerCase()=== 'ul'){ 202 | let children=element.children; 203 | let filtered_children = Array.from(children).filter(child => child.getAttribute('role') === 'option'); 204 | console.log("Listbox or ul found: ", filtered_children); 205 | let attributes_to_include = ['mmid', 'role', 'aria-label','value']; 206 | attributes_to_values["additional_info"]=[] 207 | for (const child of children) { 208 | let children_attributes_to_values = {}; 209 | 210 | for (let attr of child.attributes) { 211 | // If the attribute is not in the predefined list, add it to children_attributes_to_values 212 | if (attributes_to_include.includes(attr.name)) { 213 | children_attributes_to_values[attr.name] = attr.value; 214 | } 215 | } 216 | 217 | attributes_to_values["additional_info"].push(children_attributes_to_values); 218 | } 219 | } 220 | // Check if attributes_to_values contains more than just 'name', 'role', and 'mmid' 221 | const keys = Object.keys(attributes_to_values); 222 | const minimalKeys = ['tag', 'mmid']; 223 | const hasMoreThanMinimalKeys = keys.length > minimalKeys.length || keys.some(key => !minimalKeys.includes(key)); 224 | 225 | if (!hasMoreThanMinimalKeys) { 226 | //If there were no attributes found, then try to get the backup attributes 227 | for (const backupAttribute of input_params.backup_attributes) { 228 | let value = element.getAttribute(backupAttribute); 229 | if(value){ 230 | attributes_to_values[backupAttribute] = value; 231 | } 232 | } 233 | 234 | //if even the backup attributes are not found, then return null, which will cause this element to be skipped 235 | if(Object.keys(attributes_to_values).length <= minimalKeys.length) { 236 | if (element.tagName.toLowerCase() === 'button') { 237 | attributes_to_values["mmid"] = element.getAttribute('mmid'); 238 | attributes_to_values["role"] = "button"; 239 | attributes_to_values["additional_info"] = []; 240 | let children=element.children; 241 | let attributes_to_exclude = ['width', 'height', 'path', 'class', 'viewBox', 'mmid'] 242 | 243 | // Check if the button has no text and no attributes 244 | if (element.innerText.trim() === '') { 245 | 246 | for (const child of children) { 247 | let children_attributes_to_values = {}; 248 | 249 | for (let attr of child.attributes) { 250 | // If the attribute is not in the predefined list, add it to children_attributes_to_values 251 | if (!attributes_to_exclude.includes(attr.name)) { 252 | children_attributes_to_values[attr.name] = attr.value; 253 | } 254 | } 255 | 256 | attributes_to_values["additional_info"].push(children_attributes_to_values); 257 | } 258 | console.log("Button with no text and no attributes: ", attributes_to_values); 259 | return attributes_to_values; 260 | } 261 | } 262 | 263 | return null; // Return null if only minimal keys are present 264 | } 265 | } 266 | return attributes_to_values; 267 | } 268 | """ 269 | 270 | # Fetch attributes and possibly 'innerText' from the DOM element by 'mmid' 271 | element_attributes = await page.evaluate( 272 | js_code, 273 | { 274 | "mmid": mmid, 275 | "attributes": attributes, 276 | "backup_attributes": backup_attributes, 277 | "should_fetch_inner_text": should_fetch_inner_text, 278 | "tags_to_ignore": tags_to_ignore, 279 | "ids_to_ignore": ids_to_ignore, 280 | }, 281 | ) 282 | 283 | if "keyshortcuts" in node: 284 | del node["keyshortcuts"] # remove keyshortcuts since it is not needed 285 | 286 | node["mmid"] = mmid 287 | 288 | # Update the node with fetched information 289 | if element_attributes: 290 | node.update(element_attributes) 291 | 292 | # check if 'name' and 'mmid' are the same 293 | if ( 294 | node.get("name") == node.get("mmid") 295 | and node.get("role") != "textbox" 296 | ): 297 | del node["name"] # Remove 'name' from the node 298 | 299 | if ( 300 | "name" in node 301 | and "description" in node 302 | and ( 303 | node["name"] == node["description"] 304 | or node["name"] == node["description"].replace("\n", " ") 305 | or node["description"].replace("\n", "") in node["name"] 306 | ) 307 | ): 308 | del node[ 309 | "description" 310 | ] # if the name is same as description, then remove the description to avoid duplication 311 | 312 | if ( 313 | "name" in node 314 | and "aria-label" in node 315 | and node["aria-label"] in node["name"] 316 | ): 317 | del node[ 318 | "aria-label" 319 | ] # if the name is same as the aria-label, then remove the aria-label to avoid duplication 320 | 321 | if "name" in node and "text" in node and node["name"] == node["text"]: 322 | del node[ 323 | "text" 324 | ] # if the name is same as the text, then remove the text to avoid duplication 325 | 326 | if ( 327 | node.get("tag") == "select" 328 | ): # children are not needed for select menus since "options" attriburte is already added 329 | node.pop("children", None) 330 | node.pop("role", None) 331 | node.pop("description", None) 332 | 333 | # role and tag can have the same info. Get rid of role if it is the same as tag 334 | if node.get("role") == node.get("tag"): 335 | del node["role"] 336 | 337 | # avoid duplicate aria-label 338 | if ( 339 | node.get("aria-label") 340 | and node.get("placeholder") 341 | and node.get("aria-label") == node.get("placeholder") 342 | ): 343 | del node["aria-label"] 344 | 345 | if node.get("role") == "link": 346 | del node["role"] 347 | if node.get("description"): 348 | node["text"] = node["description"] 349 | del node["description"] 350 | 351 | # textbox just means a text input and that is expressed well enough with the rest of the attributes returned 352 | # if node.get('role') == "textbox": 353 | # del node['role'] 354 | 355 | if node.get("role") == "textbox": 356 | # get the id attribute of this field from the DOM 357 | if "id" in element_attributes and element_attributes["id"]: 358 | # find if there is an element in the DOM that has this id in aria-labelledby. 359 | js_code = """ 360 | (inputParams) => { 361 | let referencingElements = []; 362 | const referencedElement = document.querySelector(`[aria-labelledby="${inputParams.aria_labelled_by_query_value}"]`); 363 | if(referencedElement) { 364 | const mmid = referencedElement.getAttribute('mmid'); 365 | if (mmid) { 366 | return {"mmid": mmid, "tag": referencedElement.tagName.toLowerCase()}; 367 | } 368 | } 369 | return null; 370 | } 371 | """ 372 | # textbox just means a text input and that is expressed well enough with the rest of the attributes returned 373 | # del node['role'] 374 | 375 | # remove attributes that are not needed once processing of a node is complete 376 | for attribute_to_delete in attributes_to_delete: 377 | if attribute_to_delete in node: 378 | node.pop(attribute_to_delete, None) 379 | else: 380 | logger.debug(f"No element found with mmid: {mmid}, deleting node: {node}") 381 | node["marked_for_deletion_by_mm"] = True 382 | 383 | # Process each node in the tree starting from the root 384 | await process_node(accessibility_tree) 385 | 386 | pruned_tree = __prune_tree(accessibility_tree, only_input_fields) 387 | 388 | logger.debug("Reconciliation complete") 389 | return pruned_tree 390 | 391 | 392 | async def __cleanup_dom(page: Page): 393 | """ 394 | Cleans up the DOM by removing injected 'aria-description' attributes and restoring any original 'aria-keyshortcuts' 395 | from 'orig-aria-keyshortcuts'. 396 | """ 397 | logger.debug("Cleaning up the DOM's previous injections") 398 | await page.evaluate("""() => { 399 | const allElements = document.querySelectorAll('*[mmid]'); 400 | allElements.forEach(element => { 401 | element.removeAttribute('aria-keyshortcuts'); 402 | const origAriaLabel = element.getAttribute('orig-aria-keyshortcuts'); 403 | if (origAriaLabel) { 404 | element.setAttribute('aria-keyshortcuts', origAriaLabel); 405 | element.removeAttribute('orig-aria-keyshortcuts'); 406 | } 407 | }); 408 | }""") 409 | logger.debug("DOM cleanup complete") 410 | 411 | 412 | def __prune_tree( 413 | node: Dict[str, Any], only_input_fields: bool 414 | ) -> Optional[Dict[str, Any]]: 415 | """ 416 | Recursively prunes a tree starting from `node`, based on pruning conditions and handling of 'unraveling'. 417 | 418 | The function has two main jobs: 419 | 1. Pruning: Remove nodes that don't meet certain conditions, like being marked for deletion. 420 | 2. Unraveling: For nodes marked with 'marked_for_unravel_children', we replace them with their children, 421 | effectively removing the node and lifting its children up a level in the tree. 422 | 423 | This happens in place, meaning we modify the tree as we go, which is efficient but means you should 424 | be cautious about modifying the tree outside this function during a prune operation. 425 | 426 | Args: 427 | - node (Dict[str, Any]): The node we're currently looking at. We'll check this node, its children, 428 | and so on, recursively down the tree. 429 | - only_input_fields (bool): If True, we're only interested in pruning input-related nodes (like form fields). 430 | This lets you narrow the focus if, for example, you're only interested in cleaning up form-related parts 431 | of a larger tree. 432 | 433 | Returns: 434 | - Dict[str, Any] | None: The pruned version of `node`, or None if `node` was pruned away. When we 'unravel' 435 | a node, we directly replace it with its children in the parent's list of children, so the return value 436 | will be the parent, updated in place. 437 | 438 | Notes: 439 | - 'marked_for_deletion_by_mm' is our flag for nodes that should definitely be removed. 440 | - Unraveling is neat for flattening the tree when a node is just a wrapper without semantic meaning. 441 | - We use a while loop with manual index management to safely modify the list of children as we iterate over it. 442 | """ 443 | if "marked_for_deletion_by_mm" in node: 444 | return None 445 | 446 | if "children" in node: 447 | i = 0 448 | while i < len(node["children"]): 449 | child = node["children"][i] 450 | if "marked_for_unravel_children" in child: 451 | # Replace the current child with its children 452 | if "children" in child: 453 | node["children"] = ( 454 | node["children"][:i] 455 | + child["children"] 456 | + node["children"][i + 1 :] 457 | ) 458 | i += ( 459 | len(child["children"]) - 1 460 | ) # Adjust the index for the new children 461 | else: 462 | # If the node marked for unraveling has no children, remove it 463 | node["children"].pop(i) 464 | i -= 1 # Adjust the index since we removed an element 465 | else: 466 | # Recursively prune the child if it's not marked for unraveling 467 | pruned_child = __prune_tree(child, only_input_fields) 468 | if pruned_child is None: 469 | # If the child is pruned, remove it from the children list 470 | node["children"].pop(i) 471 | i -= 1 # Adjust the index since we removed an element 472 | else: 473 | # Update the child with the pruned version 474 | node["children"][i] = pruned_child 475 | i += 1 # Move to the next child 476 | 477 | # After processing all children, if the children array is empty, remove it 478 | if not node["children"]: 479 | del node["children"] 480 | 481 | # Apply existing conditions to decide if the current node should be pruned 482 | return None if __should_prune_node(node, only_input_fields) else node 483 | 484 | 485 | def __should_prune_node(node: Dict[str, Any], only_input_fields: bool): 486 | """ 487 | Determines if a node should be pruned based on its 'role' and 'element_attributes'. 488 | 489 | Args: 490 | node (Dict[str, Any]): The node to be evaluated. 491 | only_input_fields (bool): Flag indicating whether only input fields should be considered. 492 | 493 | Returns: 494 | bool: True if the node should be pruned, False otherwise. 495 | """ 496 | # If the request is for only input fields and this is not an input field, then mark the node for prunning 497 | if ( 498 | node.get("role") != "WebArea" 499 | and only_input_fields 500 | and not ( 501 | node.get("tag") in ("input", "button", "textarea") 502 | or node.get("role") == "button" 503 | ) 504 | ): 505 | return True 506 | 507 | if ( 508 | node.get("role") == "generic" 509 | and "children" not in node 510 | and not ("name" in node and node.get("name")) 511 | ): # The presence of 'children' is checked after potentially deleting it above 512 | return True 513 | 514 | if node.get("role") in ["separator", "LineBreak"]: 515 | return True 516 | processed_name = "" 517 | if "name" in node: 518 | processed_name: str = node.get("name") # type: ignore 519 | processed_name = processed_name.replace(",", "") 520 | processed_name = processed_name.replace(":", "") 521 | processed_name = processed_name.replace("\n", "") 522 | processed_name = processed_name.strip() 523 | if len(processed_name) < 3: 524 | processed_name = "" 525 | 526 | # check if the node only have name and role, then delete that node 527 | if ( 528 | len(node) == 2 529 | and "name" in node 530 | and "role" in node 531 | and not (node.get("role") == "text" and processed_name != "") 532 | ): 533 | return True 534 | return False 535 | 536 | 537 | async def get_node_dom_element(page: Page, mmid: str): 538 | return await page.evaluate( 539 | """ 540 | (mmid) => { 541 | return document.querySelector(`[mmid="${mmid}"]`); 542 | } 543 | """, 544 | mmid, 545 | ) 546 | 547 | 548 | async def get_element_attributes(page: Page, mmid: str, attributes: List[str]): 549 | return await page.evaluate( 550 | """ 551 | (inputParams) => { 552 | const mmid = inputParams.mmid; 553 | const attributes = inputParams.attributes; 554 | const element = document.querySelector(`[mmid="${mmid}"]`); 555 | if (!element) return null; // Return null if element is not found 556 | 557 | let attrs = {}; 558 | for (let attr of attributes) { 559 | attrs[attr] = element.getAttribute(attr); 560 | } 561 | return attrs; 562 | } 563 | """, 564 | {"mmid": mmid, "attributes": attributes}, 565 | ) 566 | 567 | 568 | async def get_dom_with_accessibility_info() -> ( 569 | Annotated[ 570 | Optional[Dict[str, Any]], 571 | "A minified representation of the HTML DOM for the current webpage", 572 | ] 573 | ): 574 | """ 575 | Retrieves, processes, and minifies the Accessibility tree of the active page in a browser instance. 576 | Strictly follow the name and role tag for any interaction with the nodes. 577 | 578 | Returns: 579 | - The minified JSON content of the browser's active page. 580 | """ 581 | logger.debug("Executing Get Accessibility Tree Command") 582 | # Create and use the PlaywrightManager 583 | browser_manager = PlaywrightManager(browser_type="chromium", headless=False) 584 | page = await browser_manager.get_current_page() 585 | if page is None: # type: ignore 586 | raise ValueError("No active page found") 587 | 588 | return await do_get_accessibility_info(page) 589 | 590 | 591 | async def do_get_accessibility_info(page: Page, only_input_fields: bool = False): 592 | """ 593 | Retrieves the accessibility information of a web page and saves it as JSON files. 594 | 595 | Args: 596 | page (Page): The page object representing the web page. 597 | only_input_fields (bool, optional): If True, only retrieves accessibility information for input fields. 598 | Defaults to False. 599 | 600 | Returns: 601 | Dict[str, Any] or None: The enhanced accessibility tree as a dictionary, or None if an error occurred. 602 | """ 603 | await __inject_attributes(page) 604 | accessibility_tree: Dict[str, Any] = await page.accessibility.snapshot( 605 | interesting_only=True 606 | ) # type: ignore 607 | 608 | with open( 609 | os.path.join(SOURCE_LOG_FOLDER_PATH, "json_accessibility_dom.json"), 610 | "w", 611 | encoding="utf-8", 612 | ) as f: 613 | f.write(json.dumps(accessibility_tree, indent=2)) 614 | logger.debug("json_accessibility_dom.json saved") 615 | 616 | await __cleanup_dom(page) 617 | try: 618 | enhanced_tree = await __fetch_dom_info( 619 | page, accessibility_tree, only_input_fields 620 | ) 621 | 622 | logger.debug("Enhanced Accessibility Tree ready") 623 | 624 | with open( 625 | os.path.join( 626 | SOURCE_LOG_FOLDER_PATH, "json_accessibility_dom_enriched.json" 627 | ), 628 | "w", 629 | encoding="utf-8", 630 | ) as f: 631 | f.write(json.dumps(enhanced_tree, indent=2)) 632 | logger.debug("json_accessibility_dom_enriched.json saved") 633 | 634 | return enhanced_tree 635 | except Exception as e: 636 | logger.error(f"Error while fetching DOM info: {e}") 637 | traceback.print_exc() 638 | return None 639 | -------------------------------------------------------------------------------- /sentient/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Union 4 | from sentient.config.config import SOURCE_LOG_FOLDER_PATH 5 | 6 | # Configure the root logger 7 | logging.basicConfig( 8 | level=logging.DEBUG, 9 | format="[%(asctime)s] %(levelname)s {%(filename)s:%(lineno)d} - %(message)s", 10 | ) 11 | 12 | # Remove all handlers from the root logger 13 | for handler in logging.root.handlers[:]: 14 | logging.root.removeHandler(handler) 15 | 16 | logger = logging.getLogger(__name__) 17 | logger.addHandler(logging.FileHandler(os.path.join(SOURCE_LOG_FOLDER_PATH, "sentient.log"))) 18 | logger.setLevel(logging.INFO) 19 | 20 | # logging.getLogger("httpcore").setLevel(logging.WARNING) 21 | # logging.getLogger("httpx").setLevel(logging.WARNING) 22 | # logging.getLogger("matplotlib.pyplot").setLevel(logging.WARNING) 23 | # logging.getLogger("PIL.PngImagePlugin").setLevel(logging.WARNING) 24 | # logging.getLogger("PIL.Image").setLevel(logging.WARNING) 25 | 26 | 27 | def set_log_level(level: Union[str, int]) -> None: 28 | """ 29 | Set the log level for the logger. 30 | 31 | Parameters: 32 | - level (Union[str, int]): A string or logging level such as 'debug', 'info', 'warning', 'error', or 'critical', or the corresponding logging constants like logging.DEBUG, logging.INFO, etc. 33 | """ 34 | if isinstance(level, str): 35 | level = level.upper() 36 | numeric_level = getattr(logging, level, None) 37 | if not isinstance(numeric_level, int): 38 | raise ValueError(f"Invalid log level: {level}") 39 | logger.setLevel(numeric_level) 40 | else: 41 | logger.setLevel(level) 42 | -------------------------------------------------------------------------------- /sentient/utils/message_type.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class MessageType(Enum): 5 | PLAN = "plan" 6 | STEP = "step" 7 | ACTION = "action" 8 | ANSWER = "answer" 9 | QUESTION = "question" 10 | INFO = "info" 11 | FINAL = "final" 12 | DONE = "transaction_done" 13 | ERROR = "error" 14 | -------------------------------------------------------------------------------- /sentient/utils/providers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, Any 3 | import os 4 | 5 | 6 | class LLMProvider(ABC): 7 | @abstractmethod 8 | def get_client_config(self) -> Dict[str, str]: 9 | pass 10 | 11 | @abstractmethod 12 | def get_provider_name(self) -> str: 13 | pass 14 | 15 | class OpenAIProvider(LLMProvider): 16 | def get_client_config(self) -> Dict[str, str]: 17 | return { 18 | "api_key": os.environ.get("OPENAI_API_KEY"), 19 | "base_url": "https://api.openai.com/v1", 20 | } 21 | def get_provider_name(self) -> str: 22 | return "openai" 23 | 24 | class TogetherAIProvider(LLMProvider): 25 | def get_client_config(self) -> Dict[str, str]: 26 | return { 27 | "api_key": os.environ.get("TOGETHER_API_KEY"), 28 | "base_url": "https://api.together.xyz/v1", 29 | } 30 | def get_provider_name(self) -> str: 31 | return "together" 32 | 33 | class OllamaProvider(LLMProvider): 34 | def get_client_config(self) -> Dict[str, str]: 35 | return { 36 | "api_key": "ollama", 37 | "base_url": "http://localhost:11434/v1/", 38 | } 39 | def get_provider_name(self) -> str: 40 | return "ollama" 41 | 42 | class GroqProvider(LLMProvider): 43 | def get_client_config(self) -> Dict[str, str]: 44 | return { 45 | "api_key": os.environ.get("GROQ_API_KEY"), 46 | } 47 | def get_provider_name(self) -> str: 48 | return "groq" 49 | 50 | class AnthropicProvider(LLMProvider): 51 | def get_client_config(self) -> Dict[str, str]: 52 | return { 53 | "api_key": os.environ.get("ANTHROPIC_API_KEY"), 54 | } 55 | def get_provider_name(self) -> str: 56 | return "anthropic" 57 | 58 | class CustomProvider(LLMProvider): 59 | def __init__(self, base_url: str): 60 | self.base_url = base_url 61 | 62 | def get_client_config(self) -> Dict[str, Any]: 63 | return { 64 | "api_key": os.environ.get("CUSTOM_API_KEY"), 65 | "base_url": self.base_url, 66 | } 67 | 68 | def get_provider_name(self) -> str: 69 | return "custom" 70 | 71 | class OpenRouterProvider(LLMProvider): 72 | def get_client_config(self) -> Dict[str, str]: 73 | pass 74 | 75 | def get_provider_name(self) -> Dict[str, Any]: 76 | return "openrouter" 77 | 78 | # class GoogleProvider(LLMProvider): 79 | # def get_client_config(self) -> Dict[str, str]: 80 | # api_key = os.environ.get("GOOGLE_API_KEY") 81 | # os.environ['API_KEY'] = api_key 82 | # return { 83 | # "api_key": os.environ.get("GOOGLE_API_KEY"), 84 | # } 85 | # def get_provider_name(self) -> str: 86 | # return "google" 87 | 88 | PROVIDER_MAP = { 89 | "openai": OpenAIProvider(), 90 | "together": TogetherAIProvider(), 91 | "ollama": OllamaProvider(), 92 | "groq": GroqProvider(), 93 | "anthropic": AnthropicProvider(), 94 | "openrouter": OpenRouterProvider() 95 | # "google": GoogleProvider(), 96 | } 97 | 98 | def get_provider(provider_name: str, custom_base_url: str = None) -> LLMProvider: 99 | if provider_name.lower() == "custom": 100 | if not custom_base_url: 101 | raise ValueError("Custom provider requires a base_url") 102 | return CustomProvider(custom_base_url) 103 | else: 104 | provider = PROVIDER_MAP.get(provider_name.lower()) 105 | if not provider: 106 | raise ValueError(f"Unsupported provider: {provider_name}. Choose one of the supported providers: {', '.join(PROVIDER_MAP.keys())}") 107 | return provider -------------------------------------------------------------------------------- /sentient/utils/ui_messagetype.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class MessageType(Enum): 5 | PLAN = "plan" 6 | STEP = "step" 7 | ACTION = "action" 8 | ANSWER = "answer" 9 | QUESTION = "question" 10 | INFO = "info" 11 | FINAL = "final" 12 | DONE = "transaction_done" 13 | ERROR = "error" 14 | --------------------------------------------------------------------------------