├── .gitignore
├── LICENSE
├── README.md
├── cookbook.py
├── poetry.lock
├── pyproject.toml
├── requirements.txt
└── sentient
    ├── __init__.py
    ├── __main__.py
    ├── config
        ├── __init__.py
        └── config.py
    ├── core
        ├── agent
        │   ├── __init__.py
        │   ├── agent.py
        │   └── base.py
        ├── memory
        │   ├── __init__.py
        │   └── ltm.py
        ├── models
        │   ├── __init__.py
        │   └── models.py
        ├── orchestrator
        │   └── orchestrator.py
        ├── prompts
        │   ├── __init__.py
        │   └── prompts.py
        ├── skills
        │   ├── __init__.py
        │   ├── click_using_selector.py
        │   ├── enter_text_and_click.py
        │   ├── enter_text_using_selector.py
        │   ├── get_dom_with_content_type.py
        │   ├── get_screenshot.py
        │   ├── get_url.py
        │   ├── get_user_input.py
        │   ├── open_url.py
        │   ├── pdf_text_extractor.py
        │   ├── press_key_combination.py
        │   └── upload_file.py
        └── web_driver
        │   ├── __init__.py
        │   └── playwright.py
    ├── task_instructions
        └── task_instructions.txt
    └── utils
        ├── __init__.py
        ├── _pydantic.py
        ├── cli_helper.py
        ├── dom_helper.py
        ├── dom_mutation_observer.py
        ├── extract_json.py
        ├── function_utils.py
        ├── get_detailed_accessibility_tree.py
        ├── logger.py
        ├── message_type.py
        ├── providers.py
        └── ui_messagetype.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | .env
 2 | .venv/
 3 | __pycache__
 4 | log_files/
 5 | logs/
 6 | .DS_STORE
 7 | results/
 8 | dist/
 9 | test.py
10 | test_instructor.py


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Sentient Engineering
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # sentient - browser controlling agents in 3 lines of code
  2 | 
  3 | [beta]
  4 | 
  5 | ```python
  6 | from sentient import sentient
  7 | import asyncio
  8 | result = asyncio.run(sentient.invoke(goal="play shape of you on youtube"))
  9 | ```
 10 | 
 11 | ### setup
 12 | 
 13 | 1. install sentient `pip install sentient`
 14 | 
 15 | 2. currently, you need to start chrome in dev mode - in a separate terminal on the port 9222. use the below commands to start the chrome instance and do necessary logins if needed
 16 | 
 17 | for mac, use command -
 18 | 
 19 | ```bash
 20 | sudo /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
 21 | ```
 22 | 
 23 | to run brave browser (mac) -
 24 | 
 25 | ```bash
 26 | sudo /Applications/Brave\ Browser.app/Contents/MacOS/Brave\ Browser --remote-debugging-port=9222 --guest
 27 | ```
 28 | 
 29 | for linux -
 30 | 
 31 | ```bash
 32 | google-chrome --remote-debugging-port=9222
 33 | ```
 34 | 
 35 | for windows -
 36 | 
 37 | ```bash
 38 | "C:\Program Files\Google\Chrome\Application\chrome.exe" --remote-debugging-port=9222
 39 | ```
 40 | 
 41 | 4. setup open ai api key in a .env file or `export OPENAI_API_KEY="sk-proj-"`
 42 | 
 43 | 5. run the agent
 44 | 
 45 | ```python
 46 | from sentient import sentient
 47 | import asyncio
 48 | 
 49 | # if you wanna run in a jupyter notebook, uncomment the following two lines :
 50 | #import nest_asyncio
 51 | #nest_asyncio.apply()
 52 | 
 53 | result = asyncio.run(sentient.invoke("play shape of you on youtube"))
 54 | ```
 55 | 
 56 | 6. note - by default we use `gpt-4o-2024-08-06` from `openai` to run sentient as it is the best performing model. you can also use other models like `gpt-4o` or `gpt-4o-mini` but the reliability may take some hit.
 57 | 
 58 | ---
 59 | 
 60 | ### setting custom task specific instructions
 61 | 
 62 | you can customise the agent's behaviour by providing a natural language description of how it should navigate or what it should keep in mind while executing a particular task.
 63 | this is helpful in improving the accuracy and reliability of the agent on your specific task.
 64 | 
 65 | ```python
 66 | from sentient import sentient
 67 | import asyncio
 68 | 
 69 | custom_instructions = """
 70 | 1. Directly go to youtube.com rather than searching for the song on google!
 71 | """
 72 | 
 73 | #use with open ai
 74 | result = asyncio.run(sentient.invoke(
 75 |     goal="play shape of you on youtube",
 76 |     task_instructions=custom_instructions,
 77 |     provider="openai",
 78 |     model="gpt-4o-2024-08-06"))
 79 | ```
 80 | 
 81 | ---
 82 | 
 83 | ### using providers other than open ai
 84 | 
 85 | we currently support a few providers. if you wish to have others included, please create a new issue. you can pass custom instructions in a similar fashion as shown above. you can also refer to the [cookbook](cookbook.py) to see examples of using sentient with various providers.
 86 | 
 87 | > **Note** - the reliability of the agent depends on whether the model is able to produce reliable json. we recommend using open ai's latest gpt-4o models for most tasks. claude 3.5 sonnet and some other instruction tuned models are also good. small local models might not produce reliable json - thus leading to failures more often.
 88 | 
 89 | #### using anthropic
 90 | 
 91 | 1. set API key - `export ANTHROPIC_API_KEY="sk-ant..."`
 92 | 
 93 | 2. pass provider and model options to the invoke command.
 94 | 
 95 | ```python
 96 | #using with anthropic
 97 | result = asyncio.run(sentient.invoke(
 98 |     goal="play shape of you on youtube",
 99 |     provider="anthropic",
100 |     model="claude-3-5-sonnet-20240620"))
101 | ```
102 | 
103 | #### using ollama
104 | 
105 | 1. ensure the ollama server is on. you just need to pass the name of the model.
106 | 
107 | ```python
108 | #use with ollama
109 | result = asyncio.run(sentient.invoke(
110 |     goal="play shape of you on youtube",
111 |     provider="ollama",
112 |     model="llama3"))
113 | ```
114 | 
115 | #### using groq
116 | 
117 | 1. set groq API key - `export GROQ_API_KEY="gsk..."`
118 | 
119 | 2. pass provider and model options to the invoke command. NOTE: only llama-3.1-70b-versatile has context window large enough to support the agent. also, the model does not produce reliable outputs. we recommend using groq only for testing purposes.
120 | 
121 | ```python
122 | # use with groq models
123 | result = asyncio.run(sentient.invoke(
124 |     goal="play shape of you on youtube",
125 |     provider="groq",
126 |     model="llama-3.1-70b-versatile"))
127 | ```
128 | 
129 | #### using together ai
130 | 
131 | 1. set API key for Together AI - `export TOGETHER_API_KEY="your-api-key"`
132 | 
133 | 2. pass provider and model options to the invoke command.
134 | 
135 | ```python
136 | #use with together ai
137 | result = asyncio.run(sentient.invoke(
138 |     goal="play shape of you on youtube",
139 |     provider="together",
140 |     model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"))
141 | ```
142 | 
143 | #### using a custom open ai compatible server
144 | 
145 | 1. you can use this to use any open ai api compatible server (like vllm / ollama running on a different machine, etc.)
146 | 
147 | 2. set API key for your custom server - `export CUSTOM_API_KEY="your-api-key"`. fill in any random value if there is no api key needed.
148 | 
149 | 3. pass in the custom base url and model name to the invoke command.
150 | 
151 | ```python
152 | #use with custom server
153 | result = asyncio.run(sentient.invoke(
154 |     goal="play shape of you on youtube",
155 |     provider="custom",
156 |     custom_base_url="http://localhost:8080/v1",
157 |     model="model_name"))
158 | ```
159 | 
160 | #### using open-router
161 | 
162 | 1. set API key for open router - `export OPENROUTER_API_KEY="your-api-key"`
163 | 
164 | 2. we use litellm to call openrouter. so if you want to disable litellm logging - `export LITELLM_LOG="ERROR"`
165 | 
166 | 3. pass provider and model options to the invoke command. model name should be passed as openrouter/your-model-name
167 | 
168 | ```python
169 | # use with open-router
170 | result = asyncio.run(sentient.invoke(
171 |     goal="play shape of you on youtube",
172 |     provider="openrouter",
173 |     model="openrouter/anthropic/claude-3.5-sonnet"))
174 | ```
175 | 


--------------------------------------------------------------------------------
/cookbook.py:
--------------------------------------------------------------------------------
 1 | from sentient import sentient
 2 | import asyncio
 3 | 
# Extra natural-language guidance for the agent; passed to every example
# below through the ``task_instructions`` argument.
custom_instructions = """
1. Directly go to youtube.com rather than searching for the song on google!
"""

# Each example below runs the same goal against a different provider/model
# pair. ``result`` is overwritten by every call, so the final ``print`` shows
# only the outcome of the last example.

# Use with OpenAI.
result = asyncio.run(sentient.invoke(
    goal="play shape of you on youtube", 
    task_instructions=custom_instructions,
    provider="openai",
    model="gpt-4o-2024-08-06"))

# Use with Together AI.
result = asyncio.run(sentient.invoke(
    goal="play shape of you on youtube", 
    task_instructions=custom_instructions, 
    provider="together",
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"))

# Use with Ollama.
result = asyncio.run(sentient.invoke(
    goal="play shape of you on youtube", 
    task_instructions=custom_instructions, 
    provider="ollama",
    model="llama3"))

# Use with Anthropic.
result = asyncio.run(sentient.invoke(
    goal="play shape of you on youtube", 
    task_instructions=custom_instructions, 
    provider="anthropic",
    model="claude-3-5-sonnet-20240620"))

# Use with Groq models.
result = asyncio.run(sentient.invoke(
    goal="play shape of you on youtube", 
    task_instructions=custom_instructions, 
    provider="groq",
    model="llama-3.1-70b-versatile"))

# Use with a custom endpoint (like remotely hosted vLLM / Ollama servers).
# The endpoint must be OpenAI compatible.
result = asyncio.run(sentient.invoke(
    goal="play shape of you on youtube", 
    task_instructions=custom_instructions,
    provider="custom",
    custom_base_url="http://localhost:8080/v1",
    model="llama3.1"))

# Show the result of the last invocation above.
print(result)


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "sentient"
 3 | version = "0.1.10"
 4 | description = ""
 5 | authors = ["nischalj10 <nischalj10@gmail.com>", "thebhulawat <namanbhulawat@gmail.com>"]
 6 | readme = "README.md"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = ">=3.9,<4.0"
10 | pydantic = "^2.8.2"
11 | pytest-playwright = "^0.5.1"
12 | pdfplumber = "0.11.2"
13 | typing-extensions = "^4.12.2"
14 | ruff = "^0.5.6"
15 | setuptools = "^72.1.0"
16 | openai = "^1.40.1"
17 | boto3 = "^1.34.157"
18 | python-json-logger = "^2.0.7"
19 | aiohttp = "^3.10.2"
20 | colorama = "^0.4.6"
21 | tiktoken = "^0.7.0"
22 | termcolor = "^2.4.0"
23 | tabulate = "^0.9.0"
24 | langsmith = "^0.1.104"
25 | instructor = "1.4.2"
26 | python-dotenv = "^1.0.1"
27 | google-generativeai = "^0.8.1"
28 | groq = "^0.11.0"
29 | jsonref = "^1.1.0"
30 | eval-type-backport = "^0.2.0"
31 | anthropic = "^0.34.2"
32 | litellm = "^1.48.8"
33 | 
34 | 
35 | [build-system]
36 | requires = ["poetry-core"]
37 | build-backend = "poetry.core.masonry.api"
38 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | agentops==0.3.10 ; python_version >= "3.10" and python_version < "4.0"
 2 | aiohappyeyeballs==2.4.0 ; python_version >= "3.10" and python_version < "4.0"
 3 | aiohttp==3.10.5 ; python_version >= "3.10" and python_version < "4.0"
 4 | aiosignal==1.3.1 ; python_version >= "3.10" and python_version < "4.0"
 5 | annotated-types==0.7.0 ; python_version >= "3.10" and python_version < "4.0"
 6 | anyio==4.4.0 ; python_version >= "3.10" and python_version < "4.0"
 7 | async-timeout==4.0.3 ; python_version >= "3.10" and python_version < "3.11"
 8 | attrs==24.2.0 ; python_version >= "3.10" and python_version < "4.0"
 9 | boto3==1.35.1 ; python_version >= "3.10" and python_version < "4.0"
10 | botocore==1.35.1 ; python_version >= "3.10" and python_version < "4.0"
11 | certifi==2024.7.4 ; python_version >= "3.10" and python_version < "4.0"
12 | cffi==1.17.0 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy"
13 | charset-normalizer==3.3.2 ; python_version >= "3.10" and python_version < "4.0"
14 | click==8.1.7 ; python_version >= "3.10" and python_version < "4.0"
15 | colorama==0.4.6 ; python_version >= "3.10" and python_version < "4.0"
16 | cryptography==43.0.0 ; python_version >= "3.10" and python_version < "4.0"
17 | distro==1.9.0 ; python_version >= "3.10" and python_version < "4.0"
18 | docstring-parser==0.16 ; python_version >= "3.10" and python_version < "4.0"
19 | exceptiongroup==1.2.2 ; python_version >= "3.10" and python_version < "3.11"
20 | filelock==3.15.4 ; python_version >= "3.10" and python_version < "4.0"
21 | frozenlist==1.4.1 ; python_version >= "3.10" and python_version < "4.0"
22 | fsspec==2024.6.1 ; python_version >= "3.10" and python_version < "4.0"
23 | greenlet==3.0.3 ; python_version >= "3.10" and python_version < "4.0"
24 | h11==0.14.0 ; python_version >= "3.10" and python_version < "4.0"
25 | httpcore==1.0.5 ; python_version >= "3.10" and python_version < "4.0"
26 | httpx==0.27.0 ; python_version >= "3.10" and python_version < "4.0"
27 | huggingface-hub==0.24.6 ; python_version >= "3.10" and python_version < "4.0"
28 | idna==3.7 ; python_version >= "3.10" and python_version < "4.0"
29 | importlib-metadata==8.3.0 ; python_version >= "3.10" and python_version < "4.0"
30 | iniconfig==2.0.0 ; python_version >= "3.10" and python_version < "4.0"
31 | instructor==1.4.0 ; python_version >= "3.10" and python_version < "4.0"
32 | jinja2==3.1.4 ; python_version >= "3.10" and python_version < "4.0"
33 | jiter==0.4.2 ; python_version >= "3.10" and python_version < "4.0"
34 | jmespath==1.0.1 ; python_version >= "3.10" and python_version < "4.0"
35 | joblib==1.4.2 ; python_version >= "3.10" and python_version < "4.0"
36 | jsonschema-specifications==2023.12.1 ; python_version >= "3.10" and python_version < "4.0"
37 | jsonschema==4.23.0 ; python_version >= "3.10" and python_version < "4.0"
38 | langsmith==0.1.104 ; python_version >= "3.10" and python_version < "4.0"
39 | litellm==1.43.18 ; python_version >= "3.10" and python_version < "4.0"
40 | markdown-it-py==3.0.0 ; python_version >= "3.10" and python_version < "4.0"
41 | markupsafe==2.1.5 ; python_version >= "3.10" and python_version < "4.0"
42 | mdurl==0.1.2 ; python_version >= "3.10" and python_version < "4.0"
43 | multidict==6.0.5 ; python_version >= "3.10" and python_version < "4.0"
44 | nltk==3.9.1 ; python_version >= "3.10" and python_version < "4.0"
45 | openai==1.41.1 ; python_version >= "3.10" and python_version < "4.0"
46 | orjson==3.10.7 ; python_version >= "3.10" and python_version < "4.0"
47 | packaging==23.2 ; python_version >= "3.10" and python_version < "4.0"
48 | pdfminer-six==20231228 ; python_version >= "3.10" and python_version < "4.0"
49 | pdfplumber==0.11.2 ; python_version >= "3.10" and python_version < "4.0"
50 | pillow==10.4.0 ; python_version >= "3.10" and python_version < "4.0"
51 | playwright-stealth==1.0.6 ; python_version >= "3.10" and python_version < "4.0"
52 | playwright==1.46.0 ; python_version >= "3.10" and python_version < "4.0"
53 | pluggy==1.5.0 ; python_version >= "3.10" and python_version < "4.0"
54 | psutil==5.9.8 ; python_version >= "3.10" and python_version < "4.0"
55 | pycparser==2.22 ; python_version >= "3.10" and python_version < "4.0" and platform_python_implementation != "PyPy"
56 | pydantic-core==2.20.1 ; python_version >= "3.10" and python_version < "4.0"
57 | pydantic==2.8.2 ; python_version >= "3.10" and python_version < "4.0"
58 | pyee==11.1.0 ; python_version >= "3.10" and python_version < "4.0"
59 | pygments==2.18.0 ; python_version >= "3.10" and python_version < "4.0"
60 | pypdfium2==4.30.0 ; python_version >= "3.10" and python_version < "4.0"
61 | pytest-base-url==2.1.0 ; python_version >= "3.10" and python_version < "4.0"
62 | pytest-playwright==0.5.1 ; python_version >= "3.10" and python_version < "4.0"
63 | pytest==8.3.2 ; python_version >= "3.10" and python_version < "4.0"
64 | python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "4.0"
65 | python-dotenv==1.0.1 ; python_version >= "3.10" and python_version < "4.0"
66 | python-json-logger==2.0.7 ; python_version >= "3.10" and python_version < "4.0"
67 | python-slugify==8.0.4 ; python_version >= "3.10" and python_version < "4.0"
68 | pyyaml==6.0.1 ; python_version >= "3.10" and python_version < "4.0"
69 | referencing==0.35.1 ; python_version >= "3.10" and python_version < "4.0"
70 | regex==2024.7.24 ; python_version >= "3.10" and python_version < "4.0"
71 | requests==2.31.0 ; python_version >= "3.10" and python_version < "4.0"
72 | rich==13.8.0 ; python_version >= "3.10" and python_version < "4.0"
73 | rpds-py==0.20.0 ; python_version >= "3.10" and python_version < "4.0"
74 | ruff==0.5.7 ; python_version >= "3.10" and python_version < "4.0"
75 | s3transfer==0.10.2 ; python_version >= "3.10" and python_version < "4.0"
76 | setuptools==72.2.0 ; python_version >= "3.10" and python_version < "4.0"
77 | shellingham==1.5.4 ; python_version >= "3.10" and python_version < "4.0"
78 | six==1.16.0 ; python_version >= "3.10" and python_version < "4.0"
79 | sniffio==1.3.1 ; python_version >= "3.10" and python_version < "4.0"
80 | tabulate==0.9.0 ; python_version >= "3.10" and python_version < "4.0"
81 | tenacity==8.5.0 ; python_version >= "3.10" and python_version < "4.0"
82 | termcolor==2.4.0 ; python_version >= "3.10" and python_version < "4.0"
83 | text-unidecode==1.3 ; python_version >= "3.10" and python_version < "4.0"
84 | tiktoken==0.7.0 ; python_version >= "3.10" and python_version < "4.0"
85 | tokenizers==0.20.0 ; python_version >= "3.10" and python_version < "4.0"
86 | tomli==2.0.1 ; python_version >= "3.10" and python_version < "3.11"
87 | tqdm==4.66.5 ; python_version >= "3.10" and python_version < "4.0"
88 | typer==0.12.5 ; python_version >= "3.10" and python_version < "4.0"
89 | typing-extensions==4.12.2 ; python_version >= "3.10" and python_version < "4.0"
90 | urllib3==2.2.2 ; python_version >= "3.10" and python_version < "4.0"
91 | yarl==1.9.4 ; python_version >= "3.10" and python_version < "4.0"
92 | zipp==3.20.0 ; python_version >= "3.10" and python_version < "4.0"
93 | 


--------------------------------------------------------------------------------
/sentient/__init__.py:
--------------------------------------------------------------------------------
 1 | from sentient.core.orchestrator.orchestrator import Orchestrator
 2 | from sentient.core.agent.agent import Agent
 3 | from sentient.core.models.models import State
 4 | from sentient.core.memory import ltm
 5 | from sentient.utils.providers import get_provider
 6 | 
 7 | class Sentient:
 8 |     def __init__(self):
 9 |         self.orchestrator = None
10 |     
11 |     def _create_state_to_agent_map(self, provider: str, model: str, custom_base_url: str = None):
12 |         provider_instance = get_provider(provider, custom_base_url)
13 |         return {
14 |             State.BASE_AGENT: Agent(provider=provider_instance, model_name=model),
15 |         }
16 | 
17 |     async def _initialize(self, provider: str, model: str, custom_base_url: str = None):
18 |         if not self.orchestrator:
19 |             state_to_agent_map = self._create_state_to_agent_map(provider, model, custom_base_url)
20 |             self.orchestrator = Orchestrator(state_to_agent_map=state_to_agent_map)
21 |             await self.orchestrator.start()
22 | 
23 |     async def invoke(
24 |             self, 
25 |             goal: str, 
26 |             provider: str = "openai", 
27 |             model: str = "gpt-4o-2024-08-06", 
28 |             task_instructions: str = None, 
29 |             custom_base_url: str = None
30 |             ):
31 |         if task_instructions:
32 |             ltm.set_task_instructions(task_instructions)
33 |         await self._initialize(provider, model, custom_base_url)
34 |         result = await self.orchestrator.execute_command(goal)
35 |         return result
36 | 
37 |     async def shutdown(self):
38 |         if self.orchestrator:
39 |             await self.orchestrator.shutdown()
40 | 
41 | sentient = Sentient()


--------------------------------------------------------------------------------
/sentient/__main__.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | 
 3 | from sentient.core.agent.agent import Agent
 4 | from sentient.core.models.models import State
 5 | from sentient.core.orchestrator.orchestrator import Orchestrator
 6 | 
 7 | 
 8 | async def main():
 9 |     # Define state machine
10 |     state_to_agent_map = {
11 |         State.BASE_AGENT: Agent(),
12 |     }
13 | 
14 |     orchestrator = Orchestrator(state_to_agent_map=state_to_agent_map)
15 |     await orchestrator.start()
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     asyncio.run(main())
20 | 


--------------------------------------------------------------------------------
/sentient/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/config/__init__.py


--------------------------------------------------------------------------------
/sentient/config/config.py:
--------------------------------------------------------------------------------
 1 | # config.py at the project source code root
 2 | import os
 3 | 
 4 | # Get the absolute path of the current file (config.py)
 5 | CURRENT_FILE_PATH = os.path.abspath(__file__)
 6 | 
 7 | # Get the project root directory (two levels up from config.py)
 8 | PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(CURRENT_FILE_PATH)))
 9 | 
10 | # Define other paths relative to the project root
11 | PROJECT_SOURCE_ROOT = os.path.join(PROJECT_ROOT, "sentient")
12 | SOURCE_LOG_FOLDER_PATH = os.path.join(PROJECT_SOURCE_ROOT, "log_files")
13 | PROJECT_TEMP_PATH = os.path.join(PROJECT_SOURCE_ROOT, "temp")
14 | TASK_INSTRUCTION_PATH = os.path.join(PROJECT_SOURCE_ROOT, "task_instructions")
15 | PROJECT_TEST_ROOT = os.path.join(PROJECT_SOURCE_ROOT, "test")
16 | 
17 | # Check if the log folder exists, and if not, create it
18 | if not os.path.exists(SOURCE_LOG_FOLDER_PATH):
19 |     os.makedirs(SOURCE_LOG_FOLDER_PATH)
20 |     print(f"Created log folder at: {SOURCE_LOG_FOLDER_PATH}")
21 | 
22 | # create user prefernces folder if it does not exist
23 | if not os.path.exists(TASK_INSTRUCTION_PATH):
24 |     os.makedirs(TASK_INSTRUCTION_PATH)
25 |     print(f"Created task instruction folder at: {TASK_INSTRUCTION_PATH}")
26 | 
27 | if not os.path.exists(PROJECT_TEMP_PATH):
28 |     os.makedirs(PROJECT_TEMP_PATH)
29 |     print(f"Created temp folder at: {PROJECT_TEMP_PATH}")
30 | 


--------------------------------------------------------------------------------
/sentient/core/agent/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/agent/__init__.py


--------------------------------------------------------------------------------
/sentient/core/agent/agent.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | from string import Template
 3 | 
 4 | from sentient.core.agent.base import BaseAgent
 5 | from sentient.core.memory import ltm
 6 | from sentient.core.models.models import AgentInput, AgentOutput
 7 | from sentient.core.prompts.prompts import LLM_PROMPTS
 8 | from sentient.utils.providers import LLMProvider
 9 | 
10 | 
class Agent(BaseAgent):
    """The "sentient" agent: a ``BaseAgent`` with the base prompt pre-filled.

    On construction it reads the stored task instructions, substitutes them
    into ``LLM_PROMPTS["BASE_AGENT_PROMPT"]``, appends the current date and
    weekday, and hands the resulting system prompt to ``BaseAgent``.
    """

    def __init__(self, provider:LLMProvider, model_name: str):
        self.name = "sentient"
        self.ltm = self.__get_ltm()
        self.system_prompt = self.__modify_system_prompt(self.ltm)
        super().__init__(
            name=self.name,
            system_prompt=self.system_prompt,
            input_format=AgentInput,
            output_format=AgentOutput,
            keep_message_history=False,
            provider=provider,
            model_name=model_name,
        )

    @staticmethod
    def __get_ltm():
        """Return the task instructions currently held by the ``ltm`` module."""
        return ltm.get_task_instructions()

    def __modify_system_prompt(self, ltm):
        """Build the system prompt from the base template and ``ltm``.

        ``ltm`` fills the ``task_information`` placeholder (an empty string is
        used when it is ``None``). ``safe_substitute`` leaves any other
        placeholder untouched instead of raising ``KeyError``. Today's date
        (``dd/mm/YYYY``) and weekday name are appended as two extra lines.
        """
        template = Template(LLM_PROMPTS["BASE_AGENT_PROMPT"])
        task_information = "" if ltm is None else ltm
        filled_prompt: str = template.safe_substitute(
            {"task_information": task_information}
        )

        now = datetime.now()
        today_date = now.strftime("%d/%m/%Y")
        weekday = now.strftime("%A")
        return "".join(
            (
                filled_prompt,
                f"\nToday's date is: {today_date}",
                f"\nCurrent weekday is: {weekday}",
            )
        )
49 | 


--------------------------------------------------------------------------------
/sentient/core/agent/base.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from typing import Callable, List, Optional, Tuple, Type
  3 | 
  4 | import instructor
  5 | import instructor.patch
  6 | import openai
  7 | from instructor import Mode
  8 | from instructor.exceptions import InstructorRetryException
  9 | from pydantic import BaseModel
 10 | from groq import Groq
 11 | from anthropic import Anthropic
 12 | from litellm import completion
 13 | 
 14 | from sentient.utils.function_utils import get_function_schema
 15 | from sentient.utils.logger import logger
 16 | from sentient.utils.providers import LLMProvider
 17 | 
 18 | class BaseAgent:
 19 |     def __init__(
 20 |         self,
 21 |         name: str,
 22 |         system_prompt: str,
 23 |         input_format: Type[BaseModel],
 24 |         output_format: Type[BaseModel],
 25 |         tools: Optional[List[Tuple[Callable, str]]] = None,
 26 |         keep_message_history: bool = True,
 27 |         provider: LLMProvider = None,
 28 |         model_name: str = None,
 29 |     ):
 30 |         # Metdata
 31 |         self.agent_name = name
 32 | 
 33 |         # Messages
 34 |         self.system_prompt = system_prompt
 35 |         if self.system_prompt:
 36 |             self._initialize_messages()
 37 |         self.keep_message_history = keep_message_history
 38 | 
 39 |         # Input-output format
 40 |         self.input_format = input_format
 41 |         self.output_format = output_format
 42 | 
 43 |         # Llm client
 44 |         self.provider_name = provider.get_provider_name()
 45 |         self.provider = provider
 46 |         client_config = self.provider.get_client_config()
 47 | 
 48 |         # if self.provider_name == "google":
 49 |         #     self.client = instructor.from_gemini(
 50 |         #         client=genai.GenerativeModel(
 51 |         #             model_name=model_name, 
 52 |         #         )
 53 |         #     )
 54 |         if self.provider_name == "groq":
 55 |             self.client = Groq(**client_config)
 56 |             self.client = instructor.from_groq(self.client, mode=Mode.TOOLS)
 57 |         elif self.provider_name == "anthropic":
 58 |             self.client = instructor.from_anthropic(Anthropic())
 59 |         elif self.provider_name == "openrouter": 
 60 |             # use litellm for openrouter as instructor currently does not seem to have support for openrouter
 61 |             self.client = instructor.from_litellm(completion=completion)
 62 |         elif self.provider_name == "together":
 63 |             self.client = openai.Client(**client_config)
 64 |             self.client = instructor.from_openai(self.client, mode=Mode.JSON)
 65 |         else:
 66 |             self.client = openai.Client(**client_config)
 67 |             self.client = instructor.from_openai(self.client, mode=Mode.TOOLS)
 68 |         
 69 |         # Set model name
 70 |         self.model_name = model_name
 71 | 
 72 |         # Tools
 73 |         self.tools_list = []
 74 |         self.executable_functions_list = {}
 75 |         if tools:
 76 |             self._initialize_tools(tools)
 77 | 
 78 |     def _initialize_tools(self, tools: List[Tuple[Callable, str]]):
 79 |         for func, func_desc in tools:
 80 |             self.tools_list.append(get_function_schema(func, description=func_desc))
 81 |             self.executable_functions_list[func.__name__] = func
 82 | 
 83 |     def _initialize_messages(self):
 84 |         self.messages = [{"role": "user", "content": self.system_prompt}]
 85 |         self.messages.append(
 86 |                 {
 87 |                     "role": "assistant",
 88 |                     "content": "Understood. I will properly follow the instructions given. Can you provide me with the objective and other details in JSON format?",
 89 |                 }
 90 |             )
 91 | 
 92 |     # @traceable(run_type="chain", name="agent_run")
 93 |     async def run(
 94 |         self, input_data: BaseModel, screenshot: str = None
 95 |     ) -> BaseModel:
 96 |         if not isinstance(input_data, self.input_format):
 97 |             raise ValueError(f"Input data must be of type {self.input_format.__name__}")
 98 | 
 99 |         # Handle message history.
100 |         if not self.keep_message_history:
101 |             self._initialize_messages()
102 | 
103 |         if screenshot:
104 |             self.messages.append(
105 |                 {
106 |                     "role": "user",
107 |                     "content": [
108 |                         {
109 |                             "type": "text",
110 |                             "text": input_data.model_dump_json(
111 |                                 exclude={"current_page_dom", "current_page_url"}
112 |                             ),
113 |                         },
114 |                         {"type": "image_url", "image_url": {"url": screenshot}},
115 |                     ],
116 |                 }
117 |             )
118 |         else:
119 |             self.messages.append(
120 |                 {
121 |                     "role": "user",
122 |                     "content": input_data.model_dump_json(
123 |                         exclude={"current_page_dom", "current_page_url"}
124 |                     ),
125 |                 }
126 |             )
127 | 
128 |         self.messages.append(
129 |                 {
130 |                     "role": "assistant",
131 |                     "content": "Understood. I will properly follow the instructions given. Can you provide me with the current page DOM and URL please?",
132 |                 }
133 |             )
134 |         
135 |         # input dom and current page url in a separate message so that the LLM can pay attention to completed tasks better. *based on personal vibe check*
136 |         if hasattr(input_data, "current_page_dom") and hasattr(
137 |             input_data, "current_page_url"
138 |         ):
139 |             self.messages.append(
140 |                 {
141 |                     "role": "user",
142 |                     "content": f"Current page URL:\n{input_data.current_page_url}\n\n Current page DOM:\n{input_data.current_page_dom}",
143 |                 }
144 |             )
145 | 
146 |         while True:
147 |             # TODO:
148 |             # 1. better exeception handling and messages while calling the client
149 |             # 2. remove the else block as JSON mode in instrutor won't allow us to pass in tools.
150 |             # 3. add a max_turn here to prevent a inifinite fallout
151 |             try:
152 |                 response = None
153 |                 if len(self.tools_list) == 0:
154 |                     try: 
155 |                         response: self.output_format = self.client.chat.completions.create(
156 |                         model=self.model_name,
157 |                         messages=self.messages,
158 |                         response_model=self.output_format,
159 |                         max_retries=3,
160 |                         max_tokens=1000 if self.provider_name == "anthropic" else None,
161 |                         )
162 |                     except InstructorRetryException as e:
163 |                         print(f"InstructorRetryException: client - {self.provider_name} model - {self.model_name}")
164 |                         print(f"Error: {str(e)}")
165 |                         print(f"Error details: {e.__dict__}")
166 |                     except Exception as e:
167 |                         print("Error in output", e)
168 |                 else:
169 |                     response = self.client.chat.completions.create(
170 |                         model=self.model_name,
171 |                         messages=self.messages,
172 |                         response_model=self.output_format,
173 |                         tool_choice="auto",
174 |                         tools=self.tools_list,
175 |                     )
176 |                 
177 |                 assert isinstance(response, self.output_format)
178 |                 return response
179 | 
180 |                 # instructor directly outputs response.choices[0].message. so we will do response_message = response
181 |                 # response_message = response.choices[0].message
182 | 
183 |                 # instructor does not support funciton in JSON mode
184 |                 # if response_message.tool_calls:
185 |                 #     tool_calls = response_message.tool_calls
186 | 
187 |                 # if tool_calls:
188 |                 #     self.messages.append(response_message)
189 |                 #     for tool_call in tool_calls:
190 |                 #         await self._append_tool_response(tool_call)
191 |                 #     continue
192 | 
193 |                 # parsed_response_content: self.output_format = response_message.parsed
194 |                 
195 |             except AssertionError:
196 |                     raise TypeError(
197 |                         f"Expected response_message to be of type {self.output_format.__name__}, but got {type(response).__name__}")
198 |             except Exception as e:
199 |                 logger.error(f"Unexpected error: {str(e)}")
200 |                 raise
201 | 
202 |             
203 | 
204 |     async def _append_tool_response(self, tool_call):
205 |         function_name = tool_call.function.name
206 |         function_to_call = self.executable_functions_list[function_name]
207 |         function_args = json.loads(tool_call.function.arguments)
208 |         try:
209 |             function_response = await function_to_call(**function_args)
210 |             # print(function_response)
211 |             self.messages.append(
212 |                 {
213 |                     "tool_call_id": tool_call.id,
214 |                     "role": "tool",
215 |                     "name": function_name,
216 |                     "content": str(function_response),
217 |                 }
218 |             )
219 |         except Exception as e:
220 |             logger.error(f"Error occurred calling the tool {function_name}: {str(e)}")
221 |             self.messages.append(
222 |                 {
223 |                     "tool_call_id": tool_call.id,
224 |                     "role": "tool",
225 |                     "name": function_name,
226 |                     "content": str(
227 |                         "The tool responded with an error, please try again with a different tool or modify the parameters of the tool",
228 |                         function_response,
229 |                     ),
230 |                 }
231 |             )
232 | 


--------------------------------------------------------------------------------
/sentient/core/memory/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/memory/__init__.py


--------------------------------------------------------------------------------
/sentient/core/memory/ltm.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from sentient.config.config import TASK_INSTRUCTION_PATH
 4 | from sentient.utils.logger import logger
 5 | 
 6 | task_instruction_file_name = "task_instructions.txt"
 7 | task_instruction_file = os.path.join(
 8 |         TASK_INSTRUCTION_PATH, task_instruction_file_name
 9 |     )
10 | 
11 | def get_task_instructions():
12 |     try:
13 |         with open(task_instruction_file) as file:
14 |             user_pref = file.read()
15 |         logger.info(f"Task instructions loaded from: {task_instruction_file}")
16 |         return user_pref
17 |     except FileNotFoundError:
18 |         logger.warning(f"Task instruction file not found: {task_instruction_file}")
19 | 
20 |     return None
21 | 
22 | def set_task_instructions(instructions: str):
23 |     try:
24 |         # clear and write new instructions
25 |         with open(task_instruction_file, 'w') as file:
26 |             file.write(instructions)
27 |         logger.info(f"Task instructions updated in: {task_instruction_file}")
28 |     except IOError:
29 |         logger.error(f"Failed to write task instructions to: {task_instruction_file}")


--------------------------------------------------------------------------------
/sentient/core/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/models/__init__.py


--------------------------------------------------------------------------------
/sentient/core/models/models.py:
--------------------------------------------------------------------------------
  1 | from enum import Enum
  2 | from typing import List, Literal, Optional, Union
  3 | 
  4 | from pydantic import BaseModel
  5 | from pydantic.fields import Field
  6 | 
  7 | #Global
class State(str, Enum):
    # States stored in Memory.current_state and used as keys of the
    # orchestrator's state_to_agent_map.
    # COMPLETED ends the orchestrator's command loop.
    COMPLETED = "completed"
    # BASE_AGENT is the state in which the orchestrator runs the base agent.
    BASE_AGENT = "agentq_base"
 11 | 
 12 | 
class ActionType(str, Enum):
    # Kinds of browser action; each member is the Literal value of the `type`
    # field on the corresponding *Action model below.
    CLICK = "CLICK"
    TYPE = "TYPE"
    GOTO_URL = "GOTO_URL"
    ENTER_TEXT_AND_CLICK = "ENTER_TEXT_AND_CLICK"
 18 | 
 19 | 
 20 | class ClickAction(BaseModel):
 21 |     type: Literal[ActionType.CLICK] = Field(
 22 |         description="""Executes a click action on the element matching the given mmid attribute value. MMID is always a number. Returns Success if click was successful or appropriate error message if the element could not be clicked."""
 23 |     )
 24 |     mmid: int = Field(
 25 |         description="The mmid number of the element that needs to be clicked e.g. 114. mmid will always be a number"
 26 |     )
 27 |     wait_before_execution: Optional[float] = Field(
 28 |         description="Optional wait time in seconds before executing the click event logic"
 29 |     )
 30 | 
 31 | 
 32 | class TypeAction(BaseModel):
 33 |     type: Literal[ActionType.TYPE] = Field(
 34 |         description="""Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else.
 35 |    Returns Success if text entry was successful or appropriate error message if text could not be entered."""
 36 |     )
 37 |     mmid: int = Field(
 38 |         description="The mmid number of the element that needs to be clicked e.g. 114. mmid will always be a number"
 39 |     )
 40 |     content: str = Field(
 41 |         description="The text to enter in the element identified by the query_selector."
 42 |     )
 43 | 
 44 | 
 45 | class GotoAction(BaseModel):
 46 |     type: Literal[ActionType.GOTO_URL] = Field(
 47 |         description="Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened."
 48 |     )
 49 |     website: str = Field(
 50 |         description="The URL to navigate to. Value must include the protocol (http:// or https://)."
 51 |     )
 52 |     timeout: Optional[float] = Field(
 53 |         description="Additional wait time in seconds after initial load."
 54 |     )
 55 | 
 56 | class EnterTextAndClickAction(BaseModel):
 57 |     type: Literal[ActionType.ENTER_TEXT_AND_CLICK] = Field(
 58 |         description="""Enters text into a specified element and clicks another element, both identified by their mmid. Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands. Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered."""
 59 |     )
 60 |     text_element_mmid: int = Field(
 61 |         description="The mmid number of the element where the text will be entered"
 62 |     )
 63 |     text_to_enter: str = Field(
 64 |         description="The text that will be entered into the element specified by text_element_mmid"
 65 |     )
 66 |     click_element_mmid: int = Field(
 67 |         description="The mmid number of the element that will be clicked after text entry."
 68 |     )
 69 |     wait_before_click_execution: Optional[float] = Field(
 70 |         description="Optional wait time in seconds before executing the click event logic"
 71 |     )
 72 | 
 73 | Action = Union[
 74 |     ClickAction,
 75 |     TypeAction,
 76 |     GotoAction,
 77 |     EnterTextAndClickAction,
 78 | ]
 79 | 
 80 | 
 81 | class Task(BaseModel):
 82 |     id: int
 83 |     description: str
 84 |     url: Optional[str] = Field(default=None, description="Optional URL of the page on which task will happen")
 85 |     result: Optional[str] = Field(default=None, description="Optional result of the task")
 86 | 
 87 | 
 88 | class TaskWithActions(BaseModel):
 89 |     id: int
 90 |     description: str
 91 |     actions_to_be_performed: Optional[List[Action]] = Field(default=None)
 92 |     result: Optional[str] = Field(default=None)
 93 | 
 94 | 
 95 | class Memory(BaseModel):
 96 |     objective: str
 97 |     current_state: State
 98 |     plan: Optional[Union[List[Task], List[TaskWithActions]]] = Field(default=None)
 99 |     thought: str
100 |     completed_tasks: Optional[Union[List[Task], List[TaskWithActions]]] = Field(default=None)
101 |     current_task: Optional[Union[Task, TaskWithActions]] = Field(default=None)
102 |     final_response: Optional[str] = Field(default=None)
103 | 
104 |     class Config:
105 |         use_enum_values = True
106 | 
107 | 
108 | 
109 | # Agent
class AgentInput(BaseModel):
    # What the agent receives each turn: the objective, the tasks finished so
    # far, and the current page's URL and DOM as strings.
    objective: str
    completed_tasks: Optional[List[Task]] = None
    current_page_url: str
    current_page_dom: str
115 | 
class AgentOutput(BaseModel):
    # What the agent returns each turn: its reasoning, the full plan, the next
    # task with its actions, and the completion flag with the final answer.
    thought: str
    plan: List[Task]
    next_task: Optional[Task] = Field(
        description="The next task to be executed",
        default=None,
    )
    next_task_actions: Optional[List[Action]] = Field(
        description="List of actions for the next task",
        default=None,
    )
    is_complete: bool
    final_response: Optional[str] = Field(
        description="Final response of the agent",
        default=None,
    )


--------------------------------------------------------------------------------
/sentient/core/orchestrator/orchestrator.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import textwrap
  3 | from typing import Dict, List
  4 | 
  5 | from colorama import Fore, init
  6 | from dotenv import load_dotenv
  7 | from langsmith import traceable
  8 | 
  9 | from sentient.core.agent.base import BaseAgent
 10 | from sentient.core.models.models import (
 11 |     Action,
 12 |     ActionType,
 13 |     AgentInput,
 14 |     AgentOutput,
 15 |     Memory,
 16 |     State,
 17 |     Task,
 18 | )
 19 | from sentient.core.skills.click_using_selector import click
 20 | from sentient.core.skills.enter_text_using_selector import EnterTextEntry, entertext
 21 | from sentient.core.skills.get_dom_with_content_type import get_dom_with_content_type
 22 | from sentient.core.skills.get_url import geturl
 23 | from sentient.core.skills.open_url import openurl
 24 | from sentient.core.skills.enter_text_and_click import enter_text_and_click
 25 | from sentient.core.web_driver.playwright import PlaywrightManager
 26 | 
# Initialise colorama with autoreset so colour codes do not carry over from one
# print call to the next.
init(autoreset=True)
 28 | 
 29 | 
 30 | class Orchestrator:
 31 |     def __init__(
 32 |         self, state_to_agent_map: Dict[State, BaseAgent], eval_mode: bool = False
 33 |     ):
 34 |         load_dotenv()
 35 |         self.state_to_agent_map = state_to_agent_map
 36 |         self.playwright_manager = PlaywrightManager()
 37 |         self.eval_mode = eval_mode
 38 |         self.shutdown_event = asyncio.Event()
 39 |         # self.session_id = str(uuid.uuid4())
 40 | 
 41 |     async def start(self):
 42 |         print("Starting orchestrator")
 43 |         await self.playwright_manager.async_initialize(eval_mode=self.eval_mode)
 44 |         print("Browser started and ready")
 45 | 
 46 |         # if not self.eval_mode:
 47 |         #     await self._command_loop()
 48 |     
 49 |     @classmethod
 50 |     async def invoke(cls, command: str):
 51 |         orchestrator = cls()
 52 |         await orchestrator.start()
 53 |         result = await orchestrator.execute_command(command)
 54 |         return result
 55 | 
 56 |     async def _command_loop(self):
 57 |         while not self.shutdown_event.is_set():
 58 |             try:
 59 |                 command = await self._get_user_input()
 60 |                 if command.strip().lower() == "exit":
 61 |                     await self.shutdown()
 62 |                 else:
 63 |                     await self.execute_command(command)
 64 |             except asyncio.CancelledError:
 65 |                 break
 66 |             except Exception as e:
 67 |                 print(f"An error occurred: {e}")
 68 | 
 69 |     async def _get_user_input(self):
 70 |         return await asyncio.get_event_loop().run_in_executor(
 71 |             None, input, "Enter your command (or type 'exit' to quit) "
 72 |         )
 73 | 
 74 |     # @traceable(run_type="chain", name="execute_command")
 75 |     async def execute_command(self, command: str):
 76 |         try:
 77 |             # Create initial memory
 78 |             self.memory = Memory(
 79 |                 objective=command,
 80 |                 current_state=State.BASE_AGENT,
 81 |                 plan=[],
 82 |                 thought="",
 83 |                 completed_tasks=[],
 84 |                 current_task=None,
 85 |                 final_response=None,
 86 |             )
 87 |             print(f"Executing command {self.memory.objective}")
 88 |             while self.memory.current_state != State.COMPLETED:
 89 |                 await self._handle_state()
 90 |             self._print_final_response()
 91 |             return self.memory.final_response
 92 |         except Exception as e:
 93 |             print(f"Error executing the command {self.memory.objective}: {e}")
 94 | 
 95 |     def run(self) -> Memory:
 96 |         while self.memory.current_state != State.COMPLETED:
 97 |             self._handle_state()
 98 | 
 99 |         self._print_final_response()
100 |         return self.memory
101 | 
102 |     async def _handle_state(self):
103 |         current_state = self.memory.current_state
104 | 
105 |         if current_state not in self.state_to_agent_map:
106 |             raise ValueError(f"Unhandled state! No agent for {current_state}")
107 |         
108 |         if current_state == State.BASE_AGENT:
109 |             await self._handle_agent()
110 |         else:
111 |             raise ValueError(f"Unhandled state: {current_state}")
112 | 
113 | 
114 |     async def _handle_agent(self):
115 |         agent = self.state_to_agent_map[State.BASE_AGENT]
116 |         self._print_memory_and_agent(agent.name)
117 | 
118 |         # repesenting state with dom representation
119 |         dom = await get_dom_with_content_type(content_type="all_fields")
120 |         url = await geturl()
121 | 
122 |         input_data = AgentInput(
123 |             objective=self.memory.objective,
124 |             completed_tasks=self.memory.completed_tasks,
125 |             current_page_url=str(url),
126 |             current_page_dom=str(dom),
127 |         )
128 |         
129 |         try:
130 |             output: AgentOutput = await agent.run(input_data)
131 |             await self._update_memory_from_agent(output)
132 |             print(f"{Fore.MAGENTA}Base Agent Q has updated the memory.")
133 |         except Exception as e:
134 |             print(f"{Fore.RED}Unexpected Error in Agent Execution:")
135 |             print(str(e))
136 | 
137 | 
138 |     async def _update_memory_from_agent(self, agentq_output: AgentOutput):
139 |         if agentq_output.is_complete:
140 |             self.memory.current_state = State.COMPLETED
141 |             self.memory.final_response = agentq_output.final_response
142 |         elif agentq_output.next_task:
143 |             self.memory.current_state = State.BASE_AGENT
144 |             if agentq_output.next_task_actions:
145 |                 action_results = await self.handle_agent_actions(
146 |                     agentq_output.next_task_actions
147 |                 )
148 |                 print("Action results:", action_results)
149 |                 flattened_results = "; ".join(action_results)
150 |                 agentq_output.next_task.result = flattened_results
151 | 
152 |             self.memory.completed_tasks.append(agentq_output.next_task)
153 |             self.memory.plan = agentq_output.plan
154 |             self.memory.thought = agentq_output.thought
155 |             current_task_id = len(self.memory.completed_tasks) + 1
156 |             self.memory.current_task = Task(
157 |                 id=current_task_id,
158 |                 description=agentq_output.next_task.description,
159 |                 url=None,
160 |                 result=None,
161 |             )
162 |         else:
163 |             raise ValueError("Planner did not provide next task or completion status")
164 | 
165 |     async def handle_agent_actions(self, actions: List[Action]):
166 |         results = []
167 |         for action in actions:
168 |             if action.type == ActionType.GOTO_URL:
169 |                 result = await openurl(url=action.website, timeout=action.timeout or 1)
170 |                 print("Action - GOTO")
171 |             elif action.type == ActionType.TYPE:
172 |                 entry = EnterTextEntry(
173 |                     query_selector=f"[mmid='{action.mmid}']", text=action.content
174 |                 )
175 |                 result = await entertext(entry)
176 |                 print("Action - TYPE")
177 |             elif action.type == ActionType.CLICK:
178 |                 result = await click(
179 |                     selector=f"[mmid='{action.mmid}']",
180 |                     wait_before_execution=action.wait_before_execution or 1,
181 |                 )
182 |                 print("Action - CLICK")
183 |             elif action.type == ActionType.ENTER_TEXT_AND_CLICK:
184 |                 result = await enter_text_and_click(
185 |                     text_selector=f"[mmid='{action.text_element_mmid}']",
186 |                     text_to_enter=action.text_to_enter,
187 |                     click_selector=f"[mmid='{action.click_element_mmid}']",
188 |                     wait_before_click_execution=action.wait_before_click_execution
189 |                     or 1.5,
190 |                 )
191 |                 print("Action - ENTER TEXT AND CLICK")
192 |             else:
193 |                 result = f"Unsupported action type: {action.type}"
194 | 
195 |             results.append(result)
196 | 
197 |         return results
198 | 
199 |     async def shutdown(self):
200 |         print("Shutting down orchestrator!")
201 |         self.shutdown_event.set()
202 |         await self.playwright_manager.stop_playwright()
203 | 
204 |     def _print_memory_and_agent(self, agent_type: str):
205 |         print(f"{Fore.CYAN}{'='*50}")
206 |         print(f"{Fore.YELLOW}Current State: {Fore.GREEN}{self.memory.current_state}")
207 |         print(f"{Fore.YELLOW}Agent: {Fore.GREEN}{agent_type}")
208 |         print(f"{Fore.YELLOW}Current Thought: {Fore.GREEN}{self.memory.thought}")
209 |         if len(self.memory.plan) == 0:
210 |             print(f"{Fore.YELLOW}Plan:{Fore.GREEN} none")
211 |         else:
212 |             print(f"{Fore.YELLOW}Plan:")
213 |             for task in self.memory.plan:
214 |                 print(f"{Fore.GREEN} {task.id}. {task.description}")
215 |         if self.memory.current_task:
216 |             print(
217 |                 f"{Fore.YELLOW}Current Task: {Fore.GREEN}{self.memory.current_task.description}"
218 |             )
219 |         if len(self.memory.completed_tasks) == 0:
220 |             print(f"{Fore.YELLOW}Completed Tasks:{Fore.GREEN} none")
221 |         else:
222 |             print(f"{Fore.YELLOW}Completed Tasks:")
223 |             for task in self.memory.completed_tasks:
224 |                 status = "✓" if task.result else " "
225 |                 print(f"{Fore.GREEN}  [{status}] {task.id}. {task.description}")
226 |         print(f"{Fore.CYAN}{'='*50}")
227 | 
228 |     def _print_task_result(self, task: Task):
229 |         print(f"{Fore.CYAN}{'='*50}")
230 |         print(f"{Fore.YELLOW}Task Completed: {Fore.GREEN}{task.description}")
231 |         print(f"{Fore.YELLOW}Result:")
232 |         wrapped_result = textwrap.wrap(task.result, width=80)
233 |         for line in wrapped_result:
234 |             print(f"{Fore.WHITE}{line}")
235 |         print(f"{Fore.CYAN}{'='*50}")
236 | 
237 |     def _print_final_response(self):
238 |         print(f"\n{Fore.GREEN}{'='*50}")
239 |         print(f"{Fore.GREEN}Objective Completed!")
240 |         print(f"{Fore.GREEN}{'='*50}")
241 |         print(f"{Fore.YELLOW}Final Response:")
242 |         wrapped_response = textwrap.wrap(self.memory.final_response, width=80)
243 |         for line in wrapped_response:
244 |             print(f"{Fore.WHITE}{line}")
245 |         print(f"{Fore.GREEN}{'='*50}")
246 | 


--------------------------------------------------------------------------------
/sentient/core/prompts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/prompts/__init__.py


--------------------------------------------------------------------------------
/sentient/core/prompts/prompts.py:
--------------------------------------------------------------------------------
  1 | LLM_PROMPTS = {
  2 |     "BASE_AGENT_PROMPT": """
  3 | You are a web automation planner. Your role is to receive an objective from the user and plan the next steps to complete the overall objective. You are part of an overall larger system where the actions you output are completed by a browser actuation system.
  4 | 
  5 |  ## Execution Flow Guidelines: ##
  6 | 1. You will look at the tasks that have been done till now, their successes/ failures. If no tasks have been completed till now, that means you have to start from scratch. 
  7 | 2. Once you have carefully observed the completed tasks and their results, then think step by step and break down the objective into a sequence of simple tasks and come up with a plan needed to complete the overall objective.
  8 | 3. Identify the next overall task and the actions that are needed to be taken on the browser to complete the next task. These actions will be given to a browser actuation system which will actually perform these actions and provide you with the result of these actions.
  9 | 
 10 | Your input and output will strictly be a well-formatted JSON with attributes as mentioned below.
 11 | 
 12 |  Input:
 13 |  - objective: Mandatory string representing the main objective to be achieved via web automation
 14 |  - completed_tasks: Optional list of all tasks that have been completed so far in order to complete the objective. This also has the result of each of the task/action that was done previously. The result can be successful or unsuccessful. In either cases, CAREFULLY OBSERVE this array of tasks and update plan accordingly to meet the objective.
 15 |  - current_page_url: Mandatory string containing the URL of the current web page.
 16 |  - current_page_dom : Mandatory string containing a DOM represntation of the current web page. It has mmid attached to all the elements which would be helpful for you to find elements for performing actions for the next task.
 17 | 
 18 | Output:
 19 |  - thought - A Mandatory string specifying your thoughts on how did you come up with the plan to solve the objective. How did you come up with the next task and why did you choose particular actions to achieve the next task. reiterate the objective here so that you can always remember what's your eventual aim. Reason deeply and think step by step to illustrate your thoughts here.
 20 |  - plan: Mandaory List of tasks that need be performed to achieve the objective. Think step by step. Update this based on the overall objective, tasks completed till now and their results and the current state of the webpage. You will also be provided with a DOM representation of the browser page to plan better.
 21 |  - next_task: Optional String representing detailed next task to be executed. Next task is consistent with the plan. This needs to be present for every response except when objective has been achieved. SEND THE next_task from the OVERALL plan. MAKE SURE to look at the provided DOM representation to adjust the appropriate next task.
 22 |  - next_task_actions - You have to output here a list of strings indicating the actions that need to be done in order to complete the above next task.
 23 |  - is_complete: Mandatory boolean indicating whether the entire objective has been achieved. Return True when the exact objective is complete without any compromises or you are absolutely convinced that the objective cannot be completed, no otherwise. This is mandatory for every response.
 24 |  - final_response: Optional string representing the summary of the completed work. This is to be returned only if the objective is COMPLETE. This is the final answer string that will be returned to the user. Use the plan and result to come with final response for the objective provided by the user.
 25 | 
 26 |  Format of task object:
 27 |  - id: Mandatory Integer representing the id of the task
 28 |  - description: Mandatory string representing the description of the task
 29 |  - url: String representing the URL on which task has been performed
 30 |  - result: String representing the result of the task. It should be a short summary of the actions you performed to accomplish the task, and what worked and what did not.
 31 | 
 32 | Actions available and their description - 
 33 | 1. CLICK[MMID, WAIT_BEFORE_EXECUTION] - Executes a click action on the element matching the given mmid attribute value. MMID is always a number. Returns Success if click was successful or appropriate error message if the element could not be clicked.
 34 | 2. TYPE[MMID, CONTENT] - Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else. Returns Success if text entry was successful or appropriate error message if text could not be entered.
 35 | 3. GOTO_URL[URL, TIMEOUT] - Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened.
 36 | 4. ENTER_TEXT_AND_CLICK[TEXT_ELEMENT_MMID, TEXT_TO_ENTER, CLICK_ELEMENT_MMID, WAIT_BEFORE_CLICK_EXECUTION] - This action enters text into a specified element and clicks another element, both identified by their mmid. Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands. Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered. Always prefer this dual-action skill for tasks that combine text input and element clicking to leverage its streamlined operation.
 37 | 
 38 |  ## Planning Guidelines: ##
 39 |  1. If you know the direct URL, use it directly instead of searching for it (e.g. go to www.espn.com). Optimise the plan to avoid unnecessary steps.
 40 |  2. Do not combine multiple tasks into one. A task should be strictly as simple as interacting with a single element or navigating to a page. If you need to interact with multiple elements or perform multiple actions, you will break it down into multiple tasks. 
 41 |  3. ## VERY IMPORTANT ## - Add verification as part of the plan, after each step and specifically before terminating to ensure that the task is completed successfully. Use the provided DOM or get the webpage DOM by calling an action to verify that the task at hand is completing successfully. If not, modify the plan accordingly.
 42 |  4. If the task requires multiple informations, all of them are equally important and should be gathered before terminating the task. You will strive to meet all the requirements of the task.
 43 |  5. If one plan fails, you MUST revise the plan and try a different approach. You will NOT terminate a task untill you are absolutely convinced that the task is impossible to accomplish.
 44 |  6. Think critically if the task has been actually been achieved before doing the final termination.
 45 |  7. Make sure to take into account task sepcific information.
 46 | 
 47 |  ## Web Navigation guidelines ##
 48 |  1. Based on the actions you output, web navigation will be done, which may include logging into websites and interacting with any web content
 49 |  2. Use the provided DOM representation for element location or text summarization.
 50 |  3. Interact with pages using only the "mmid" attribute in DOM elements. mmid will always be a number.
 51 |  4. Execute Actions sequentially to avoid navigation timing issues.
 52 |  5. The given actions are NOT parallelizable. They are intended for sequential execution.
 53 |  6. When inputing information, remember to follow the format of the input field. For example, if the input field is a date field, you will enter the date in the correct format (e.g. YYYY-MM-DD), you may get clues from the placeholder text in the input field.
 54 |  7. Individual function will reply with action success and if any changes were observed as a consequence. Adjust your approach based on this feedback.
 55 |  8. Ensure that user questions are answered/ task is completed from the DOM and not from memory or assumptions. 
 56 |  9. Do not repeat the same action multiple times if it fails. Instead, if something did not work after a few attempts, terminate the task.
 57 |  10. When being asked to play a song/ video/ some other content - it is essential to know that lot of  websites like youtube autoplay the content. In such cases, you should not unncessarily click play/ pause repeatedly.  
 58 |  11. The only way you can extract information from a webpage is by looking at the DOM already provided to you. Do NOT call any actions to try and extract information. Extract XYZ info from the webpage is NOT a valid next task or action.
 59 | 
 60 |  ## Complexities of web navigation: ##
 61 |  1. Many forms have mandatory fields that need to be filled up before they can be submitted. Have a look at what fields look mandatory.
 62 |  2. In many websites, there are multiple options to filter or sort results. First try to list elements on the page which will help the task (e.g. any links or interactive elements that may lead me to the support page?).
 63 |  3. Always keep in mind complexities such as filtering, advanced search, sorting, and other features that may be present on the website. Use them when the task requires it.
 64 |  4. Very often list of items such as, search results, list of products, list of reviews, list of people etc. may be divided into multiple pages. If you need complete information, it is critical to explicitly go through all the pages.
 65 |  5. Sometimes search capabilities available on the page will not yield the optimal results. Revise the search query to either more specific or more generic.
 66 |  6. When a page refreshes or navigates to a new page, information entered in the previous page may be lost. Check that the information needs to be re-entered (e.g. what are the values in source and destination on the page?).
 67 |  7. Sometimes some elements may not be visible or be disabled until some other action is performed. Check if there are any other fields that may need to be interacted for elements to appear or be enabled.
 68 |  8. Be extra careful with elements like date and time selectors, dropdowns, etc. because they might be made differently and dom might update differently. so make sure that once you call a function to select a date, re verify if it has actually been selected. if not, retry in another way.
 69 | 
 70 | Example 1:
 71 |  Input: {
 72 |  "objective": "Find the cheapest premium economy flights from Helsinki to Stockholm on 15 March on Skyscanner.",
 73 |  "completed_tasks": [],
 74 |  "current_page_dom" : "{'role': 'WebArea', 'name': 'Google', 'children': [{'name': 'About', 'mmid': '26', 'tag': 'a'}, {'name': 'Store', 'mmid': '27', 'tag': 'a'}, {'name': 'Gmail ', 'mmid': '36', 'tag': 'a'}, {'name': 'Search for Images ', 'mmid': '38', 'tag': 'a'}, {'role': 'button', 'name': 'Search Labs', 'mmid': '43', 'tag': 'a'}, {'role': 'button', 'name': 'Google apps', 'mmid': '48', 'tag': 'a'}, {'role': 'button', 'name': 'Google Account: Nischal (nischalj10@gmail.com)', 'mmid': '54', 'tag': 'a', 'aria-label': 'Google Account: Nischal \\n(nischalj10@gmail.com)'}, {'role': 'link', 'name': 'Paris Games August Most Searched Playground', 'mmid': 79}, {'name': 'Share', 'mmid': '85', 'tag': 'button', 'additional_info': [{}]}, {'role': 'combobox', 'name': 'q', 'description': 'Search', 'focused': True, 'autocomplete': 'both', 'mmid': '142', 'tag': 'textarea', 'aria-label': 'Search'}, {'role': 'button', 'name': 'Search by voice', 'mmid': '154', 'tag': 'div'}, {'role': 'button', 'name': 'Search by image', 'mmid': '161', 'tag': 'div'}, {'role': 'button', 'name': 'btnK', 'description': 'Google Search', 'mmid': '303', 'tag': 'input', 'tag_type': 'submit', 'aria-label': 'Google Search'}, {'role': 'button', 'name': 'btnI', 'description': \"I'm Feeling Lucky\", 'mmid': '304', 'tag': 'input', 'tag_type': 'submit', 'aria-label': \"I'm Feeling Lucky\"}, {'role': 'text', 'name': 'Google offered in: '}, {'name': 'हिन्दी', 'mmid': '320', 'tag': 'a'}, {'name': 'বাংলা', 'mmid': '321', 'tag': 'a'}, {'name': 'తెలుగు', 'mmid': '322', 'tag': 'a'}, {'name': 'मराठी', 'mmid': '323', 'tag': 'a'}, {'name': 'தமிழ்', 'mmid': '324', 'tag': 'a'}, {'name': 'ગુજરાતી', 'mmid': '325', 'tag': 'a'}, {'name': 'ಕನ್ನಡ', 'mmid': '326', 'tag': 'a'}, {'name': 'മലയാളം', 'mmid': '327', 'tag': 'a'}, {'name': 'ਪੰਜਾਬੀ', 'mmid': '328', 'tag': 'a'}, {'role': 'text', 'name': 'India'}, {'name': 'Advertising', 'mmid': '336', 'tag': 'a'}, {'name': 'Business', 'mmid': '337', 'tag': 'a'}, {'name': 'How Search 
works', 'mmid': '338', 'tag': 'a'}, {'name': 'Privacy', 'mmid': '340', 'tag': 'a'}, {'name': 'Terms', 'mmid': '341', 'tag': 'a'}, {'role': 'button', 'name': 'Settings', 'mmid': '347', 'tag': 'div'}]}"
 75 |  }
 76 | 
 77 | Output  -
 78 |  {
 79 |  "thought" : "I see it look like the google homepage in the provided DOM representation. In order to book flight, I should go to a website like skyscanner and carry my searches over there. 
 80 | Once I am there, I should correctly set the origin city, destination city, day of travel, number of passengers, journey type (one way/ round trip), and seat type (premium economy) in the shown filters based on the objective. 
 81 | If I do not see some filters, I will try to search for them in the next step once some results are shown from initial filters. Maybe the UI of website does not provide all the filters in on go for better user experience. 
 82 | Post that I should see some results from skyscanner. I should also probably apply a price low to high filter if the flights are shown in a different order. If I am able to do all this, I should be able to complete the objective fairly easily. 
 83 | I will start with naviagting to skyscanner home page",
 84 |  "plan": [
 85 |  {"id": 1, "description": "Go to www.skyscanner.com", "url": "https://www.skyscanner.com"},
 86 |  {"id": 2, "description": "List the interaction options available on skyscanner page relevant for flight reservation along with their default values"},
 87 |  {"id": 3, "description": "Select the journey option to one-way (if not default)"},
 88 |  {"id": 4, "description": "Set number of passengers to 1 (if not default)"},
 89 |  {"id": 5, "description": "Set the departure date to 15 March 2025"},
 90 |  {"id": 6, "description": "Set ticket type to Economy Premium"},
 91 |  {"id": 7, "description": "Set from airport to 'Helsinki'"},
 92 |  {"id": 8, "description": "Set destination airport to Stockholm"},
 93 |  {"id": 9, "description": "Confirm that current values in the source airport, destination airport and departure date fields are Helsinki, Stockholm and 15 March 2025 respectively"},
 94 |  {"id": 10, "description": "Click on the search button to get the search results"},
 95 |  {"id": 11, "description": "Confirm that you are on the search results page"},
 96 |  {"id": 12, "description": "Extract the price of the cheapest flight from Helsinki to Stockholm from the search results"}
 97 |  ],
 98 |  "next_task" : {"id": 1, "url": null, "description": "Go to www.skyscanner.com", "result": null},
 99 |  "next_task_actions" : [{"type":"GOTO_URL","website":"https://www.skyscanner.com", "timeout":"2"}],
100 |  "is_complete": False,
101 |  }
102 | 
103 | Notice above how there is confirmation after each step and how interaction (e.g. setting source and destination) with each element is a separate step. Follow same pattern.
104 | 
105 | Some task sepcific information that you MUST take into account: \n $task_information
106 | 
107 |  ## SOME VERY IMPORTANT POINTS TO ALWAYS REMEMBER ##
108 |  1. NEVER ASK WHAT TO DO NEXT or HOW would you like to proceed to the user.
109 |  2. ONLY do one task at a time.
110 | """,
111 |     "OPEN_URL_PROMPT": """Opens a specified URL in the web browser instance. Returns url of the new page if successful or appropriate error message if the page could not be opened.""",
112 |     "ENTER_TEXT_AND_CLICK_PROMPT": """
113 |      This skill enters text into a specified element and clicks another element, both identified by their DOM selector queries.
114 |      Ideal for seamless actions like submitting search queries, this integrated approach ensures superior performance over separate text entry and click commands.
115 |      Successfully completes when both actions are executed without errors, returning True; otherwise, it provides False or an explanatory message of any failure encountered.
116 |      Always prefer this dual-action skill for tasks that combine text input and element clicking to leverage its streamlined operation.
117 |     """,
118 |     "GET_DOM_WITH_CONTENT_TYPE_PROMPT": """
119 |      Retrieves the DOM of the current web site based on the given content type.
120 |      The DOM representation returned contains items ordered in the same way they appear on the page. Keep this in mind when executing user requests that contain ordinals or numbered items.
121 |      text_only - returns plain text representing all the text in the web site. Use this for any information retrieval task. This will contain the most complete textual information.
122 |      input_fields - returns a JSON string containing a list of objects representing text input html elements with mmid attribute. Use this strictly for interaction purposes with text input fields.
123 |      all_fields - returns a JSON string containing a list of objects representing all interactive elements and their attributes with mmid attribute. Use this strictly to identify and interact with any type of elements on page.
124 |      If information is not available in one content type, you must try another content_type.
125 |     """,
126 |     "CLICK_PROMPT": """Executes a click action on the element matching the given mmid attribute value. It is best to use mmid attribute as the selector.
127 |     Returns Success if click was successful or appropriate error message if the element could not be clicked.
128 |     """,
129 |     "GET_URL_PROMPT": """Get the full URL of the current web page/site. If the user command seems to imply an action that would be suitable for an already open website in their browser, use this to fetch current website URL.""",
130 |     "ENTER_TEXT_PROMPT": """Single enter given text in the DOM element matching the given mmid attribute value. This will only enter the text and not press enter or anything else.
131 |      Returns Success if text entry was successful or appropriate error message if text could not be entered.
132 |      """,
133 |     "BULK_ENTER_TEXT_PROMPT": """Bulk enter text in multiple DOM fields. To be used when there are multiple fields to be filled on the same page. Typically use this when you see a form to fill with multiple inputs. Make sure to have mmid from a get DOM tool before hand.
134 |      Enters text in the DOM elements matching the given mmid attribute value.
135 |      The input will receive a list of objects containing the DOM query selector and the text to enter.
136 |      This will only enter the text and not press enter or anything else.
137 |      Returns each selector and the result for attempting to enter text.
138 |      """,
139 |     "PRESS_KEY_COMBINATION_PROMPT": """Presses the given key on the current web page.
140 |     This is useful for pressing the enter button to submit a search query, PageDown to scroll, ArrowDown to change selection in a focussed list etc.
141 |     """,
142 |     "EXTRACT_TEXT_FROM_PDF_PROMPT": """Extracts text from a PDF file hosted at the given URL.""",
143 |     "UPLOAD_FILE_PROMPT": """This skill uploads a file on the page opened by the web browser instance""",
144 | }


--------------------------------------------------------------------------------
/sentient/core/skills/__init__.py:
--------------------------------------------------------------------------------
 1 | from sentient.core.skills.click_using_selector import (
 2 |     click,
 3 |     do_click,
 4 |     is_element_present,
 5 |     perform_javascript_click,
 6 |     perform_playwright_click,
 7 | )
 8 | from sentient.core.skills.enter_text_and_click import enter_text_and_click
 9 | from sentient.core.skills.enter_text_using_selector import (
10 |     bulk_enter_text,
11 |     custom_fill_element,
12 |     do_entertext,
13 | )
14 | from sentient.core.skills.get_dom_with_content_type import get_dom_with_content_type
15 | from sentient.core.skills.get_url import geturl
16 | from sentient.core.skills.get_user_input import get_user_input
17 | from sentient.core.skills.open_url import openurl
18 | from sentient.core.skills.press_key_combination import press_key_combination
19 | 
20 | __all__ = (
21 |     click,
22 |     do_click,
23 |     is_element_present,
24 |     perform_javascript_click,
25 |     perform_playwright_click,
26 |     enter_text_and_click,
27 |     bulk_enter_text,
28 |     custom_fill_element,
29 |     do_entertext,
30 |     get_dom_with_content_type,
31 |     geturl,
32 |     get_user_input,
33 |     openurl,
34 |     press_key_combination,
35 | )
36 | 


--------------------------------------------------------------------------------
/sentient/core/skills/click_using_selector.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | import traceback
  4 | from typing import Dict
  5 | 
  6 | from playwright.async_api import ElementHandle, Page
  7 | from playwright.async_api import TimeoutError as PlaywrightTimeoutError
  8 | from typing_extensions import Annotated
  9 | 
 10 | from sentient.core.web_driver.playwright import PlaywrightManager
 11 | from sentient.utils.dom_mutation_observer import (
 12 |     subscribe,  # type: ignore
 13 |     unsubscribe,  # type: ignore
 14 | )
 15 | from sentient.utils.logger import logger
 16 | 
 17 | async def click(
 18 |     selector: Annotated[
 19 |         str,
 20 |         "The properly formed query selector string to identify the element for the click action (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector. selector mmid will always be a number",
 21 |     ],
 22 |     wait_before_execution: Annotated[
 23 |         float,
 24 |         "Optional wait time in seconds before executing the click event logic.",
 25 |         float,
 26 |     ],
 27 | ) -> Annotated[str, "A message indicating success or failure of the click."]:
 28 |     """
 29 |     Executes a click action on the element matching the given query selector string within the currently open web page.
 30 |     If there is no page open, it will raise a ValueError. An optional wait time can be specified before executing the click logic. Use this to wait for the page to load especially when the last action caused the DOM/Page to load.
 31 | 
 32 |     Parameters:
 33 |     - selector: The query selector string to identify the element for the click action.
 34 |     - wait_before_execution: Optional wait time in seconds before executing the click event logic. Defaults to 0.0 seconds.
 35 | 
 36 |     Returns:
 37 |     - Success if the click was successful, Appropriate error message otherwise.
 38 |     """
 39 |     logger.info(f'Executing ClickElement with "{selector}" as the selector')
 40 | 
 41 |     # Initialize PlaywrightManager and get the active browser page
 42 |     browser_manager = PlaywrightManager()
 43 |     page = await browser_manager.get_current_page()
 44 | 
 45 |     if page is None:
 46 |         raise ValueError("No active page found. OpenURL command opens a new page.")
 47 | 
 48 |     function_name = inspect.currentframe().f_code.co_name
 49 | 
 50 |     await browser_manager.take_screenshots(f"{function_name}_start", page)
 51 | 
 52 |     await browser_manager.highlight_element(selector, True)
 53 | 
 54 |     dom_changes_detected = None
 55 | 
 56 |     def detect_dom_changes(changes: str):
 57 |         nonlocal dom_changes_detected
 58 |         dom_changes_detected = changes
 59 | 
 60 |     subscribe(detect_dom_changes)
 61 | 
 62 |     # Wrap the click action and subsequent operations in a try-except block
 63 |     try:
 64 |         # Set up navigation expectation with a shorter timeout
 65 |         async with page.expect_navigation(wait_until="domcontentloaded", timeout=10000):
 66 |             result = await do_click(page, selector, wait_before_execution)
 67 | 
 68 |         # Wait for a short time to ensure the page has settled
 69 |         await asyncio.sleep(1)
 70 |     except PlaywrightTimeoutError:
 71 |         # If navigation times out, it might be a single-page app or a slow-loading page
 72 |         logger.warning("Navigation timeout occurred, but the click might have been successful.")
 73 |         result = {
 74 |             "summary_message": "Click executed, but no full page navigation detected",
 75 |             "detailed_message": "Click executed successfully, but no full page navigation was detected. This might be normal for single-page applications or slow-loading pages.",
 76 |         }
 77 |     except Exception as e:
 78 |         logger.error(f"Error during click operation: {e}")
 79 |         result = {
 80 |             "summary_message": "Click executed, but encountered an error",
 81 |             "detailed_message": f"Click executed, but encountered an error: {str(e)}",
 82 |         }
 83 | 
 84 |     await asyncio.sleep(0.1)  # sleep for 100ms to allow the mutation observer to detect changes
 85 |     unsubscribe(detect_dom_changes)
 86 |     await browser_manager.take_screenshots(f"{function_name}_end", page)
 87 | 
 88 |     if dom_changes_detected:
 89 |         return f"Success: {result['summary_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action to click {selector} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
 90 |     return result["detailed_message"]
 91 | 
 92 | async def do_click(
 93 |     page: Page, selector: str, wait_before_execution: float
 94 | ) -> Dict[str, str]:
 95 |     """
 96 |     Executes the click action on the element with the given selector within the provided page.
 97 | 
 98 |     Parameters:
 99 |     - page: The Playwright page instance.
100 |     - selector: The query selector string to identify the element for the click action.
101 |     - wait_before_execution: Optional wait time in seconds before executing the click event logic.
102 | 
103 |     Returns:
104 |     Dict[str,str] - Explanation of the outcome of this operation represented as a dictionary with 'summary_message' and 'detailed_message'.
105 |     """
106 |     logger.info(
107 |         f'Executing ClickElement with "{selector}" as the selector. Wait time before execution: {wait_before_execution} seconds.'
108 |     )
109 | 
110 |     # Wait before execution if specified
111 |     if wait_before_execution > 0:
112 |         await asyncio.sleep(wait_before_execution)
113 | 
114 |     # Wait for the selector to be present and ensure it's attached and visible. If timeout, try javascript click
115 |     try:
116 |         logger.info(
117 |             f'Executing ClickElement with "{selector}" as the selector. Waiting for the element to be attached and visible.'
118 |         )
119 | 
120 |         element = await asyncio.wait_for(
121 |             page.wait_for_selector(selector, state="attached", timeout=2000),
122 |             timeout=2000,
123 |         )
124 |         if element is None:
125 |             raise ValueError(f'Element with selector: "{selector}" not found')
126 | 
127 |         logger.info(
128 |             f'Element with selector: "{selector}" is attached. scrolling it into view if needed.'
129 |         )
130 |         try:
131 |             await element.scroll_into_view_if_needed(timeout=200)
132 |             logger.info(
133 |                 f'Element with selector: "{selector}" is attached and scrolled into view. Waiting for the element to be visible.'
134 |             )
135 |         except Exception:
136 |             # If scrollIntoView fails, just move on, not a big deal
137 |             pass
138 | 
139 |         try:
140 |             await element.wait_for_element_state("visible", timeout=200)
141 |             logger.info(
142 |                 f'Executing ClickElement with "{selector}" as the selector. Element is attached and visible. Clicking the element.'
143 |             )
144 |         except Exception:
145 |             # If the element is not visible, try to click it anyway
146 |             pass
147 | 
148 |         element_tag_name = await element.evaluate(
149 |             "element => element.tagName.toLowerCase()"
150 |         )
151 | 
152 |         if element_tag_name == "option":
153 |             element_value = await element.get_attribute(
154 |                 "value"
155 |             )  # get the text that is in the value of the option
156 |             parent_element = await element.evaluate_handle(
157 |                 "element => element.parentNode"
158 |             )
159 |             await parent_element.select_option(value=element_value)  # type: ignore
160 | 
161 |             logger.info(f'Select menu option "{element_value}" selected')
162 | 
163 |             return {
164 |                 "summary_message": f'Select menu option "{element_value}" selected',
165 |                 "detailed_message": f'Select menu option "{element_value}" selected.',
166 |             }
167 | 
168 |         msg = await perform_javascript_click(page, selector)
169 |         return {
170 |             "summary_message": msg,
171 |             "detailed_message": f"{msg} Click action completed, page may have navigated.",
172 |         }
173 |     except Exception as e:
174 |         logger.error(f'Unable to click element with selector: "{selector}". Error: {e}')
175 |         traceback.print_exc()
176 |         msg = f'Unable to click element with selector: "{selector}" since the selector is invalid.'
177 |         return {"summary_message": msg, "detailed_message": f"{msg}. Error: {e}"}
178 | 
179 | 
180 | async def is_element_present(page: Page, selector: str) -> bool:
181 |     """
182 |     Checks if an element is present on the page.
183 | 
184 |     Parameters:
185 |     - page: The Playwright page instance.
186 |     - selector: The query selector string to identify the element.
187 | 
188 |     Returns:
189 |     - True if the element is present, False otherwise.
190 |     """
191 |     element = await page.query_selector(selector)
192 |     return element is not None
193 | 
194 | 
195 | async def perform_playwright_click(element: ElementHandle, selector: str):
196 |     """
197 |     Performs a click action on the element using Playwright's click method.
198 | 
199 |     Parameters:
200 |     - element: The Playwright ElementHandle instance representing the element to be clicked.
201 |     - selector: The query selector string of the element.
202 | 
203 |     Returns:
204 |     - None
205 |     """
206 |     logger.info(
207 |         f"Performing first Step: Playwright Click on element with selector: {selector}"
208 |     )
209 |     await element.click(force=False, timeout=200)
210 | 
211 | 
212 | async def perform_javascript_click(page: Page, selector: str):
213 |     """
214 |     Performs a click action on the element using JavaScript.
215 | 
216 |     Parameters:
217 |     - page: The Playwright page instance.
218 |     - selector: The query selector string of the element.
219 | 
220 |     Returns:
221 |     - A string describing the result of the click action.
222 |     """
223 |     js_code = """(selector) => {
224 |         let element = document.querySelector(selector);
225 | 
226 |         if (!element) {
227 |             console.log(`perform_javascript_click: Element with selector ${selector} not found`);
228 |             return `perform_javascript_click: Element with selector ${selector} not found`;
229 |         }
230 | 
231 |         if (element.tagName.toLowerCase() === "option") {
232 |             let value = element.text;
233 |             let parent = element.parentElement;
234 | 
235 |             parent.value = element.value; // Directly set the value if possible
236 |             // Trigger change event if necessary
237 |             let event = new Event('change', { bubbles: true });
238 |             parent.dispatchEvent(event);
239 | 
240 |             console.log("Select menu option", value, "selected");
241 |             return "Select menu option: "+ value+ " selected";
242 |         }
243 |         else {
244 |             console.log("About to click selector", selector);
245 |             // If the element is a link, make it open in the same tab
246 |             if (element.tagName.toLowerCase() === "a") {
247 |                 element.target = "_self";
248 |                 // #TODO: Consider removing this in the future if it causes issues with intended new tab behavior
249 |                 element.removeAttribute('target');
250 |                 element.removeAttribute('rel');
251 |             }
252 |             let ariaExpandedBeforeClick = element.getAttribute('aria-expanded');
253 |             element.click();
254 |             let ariaExpandedAfterClick = element.getAttribute('aria-expanded');
255 |             if (ariaExpandedBeforeClick === 'false' && ariaExpandedAfterClick === 'true') {
256 |                 return "Executed JavaScript Click on element with selector: "+selector +". Very important: As a consequence a menu has appeared where you may need to make further selection. Very important: Get all_fields DOM to complete the action.";
257 |             }
258 |             return "Executed JavaScript Click on element with selector: "+selector;
259 |         }
260 |     }"""
261 |     try:
262 |         logger.info(f"Executing JavaScript click on element with selector: {selector}")
263 |         result: str = await page.evaluate(js_code, selector)
264 |         logger.debug(f"Executed JavaScript Click on element with selector: {selector}")
265 |         return result
266 |     except Exception as e:
267 |         logger.error(
268 |             f"Error executing JavaScript click on element with selector: {selector}. Error: {e}"
269 |         )
270 |         traceback.print_exc()
271 |         return f"Error executing JavaScript click: {str(e)}"
272 | 


--------------------------------------------------------------------------------
/sentient/core/skills/enter_text_and_click.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | 
  4 | from typing_extensions import Annotated
  5 | 
  6 | from sentient.core.web_driver.playwright import PlaywrightManager
  7 | from sentient.core.skills.click_using_selector import do_click
  8 | from sentient.core.skills.enter_text_using_selector import do_entertext
  9 | from sentient.core.skills.press_key_combination import do_press_key_combination
 10 | from sentient.utils.logger import logger
 11 | 
 12 | 
 13 | async def enter_text_and_click(
 14 |     text_selector: Annotated[
 15 |         str,
 16 |         "The properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use mmid attribute. mmid will always be a number",
 17 |     ],
 18 |     text_to_enter: Annotated[
 19 |         str,
 20 |         "The text that will be entered into the element specified by text_selector.",
 21 |     ],
 22 |     click_selector: Annotated[
 23 |         str,
 24 |         "The properly formatted DOM selector query, for example [mmid='1234'], for the element that will be clicked after text entry. mmid will always be a number",
 25 |     ],
 26 |     wait_before_click_execution: Annotated[
 27 |         float, "Optional wait time in seconds before executing the click.", float
 28 |     ],
 29 | ) -> Annotated[
 30 |     str, "A message indicating success or failure of the text entry and click."
 31 | ]:
 32 |     """
 33 |     Enters text into an element and then clicks on another element.
 34 | 
 35 |     Parameters:
 36 |     - text_selector: The selector for the element to enter text into. It should be a properly formatted DOM selector query, for example [mmid='1234'], where the text will be entered. Use the mmid attribute.
 37 |     - text_to_enter: The text to enter into the element specified by text_selector.
 38 |     - click_selector: The selector for the element to click. It should be a properly formatted DOM selector query, for example [mmid='1234'].
 39 |     - wait_before_click_execution: Optional wait time in seconds before executing the click action. Default is 0.0.
 40 | 
 41 |     Returns:
 42 |     - A message indicating the success or failure of the text entry and click.
 43 | 
 44 |     Raises:
 45 |     - ValueError: If no active page is found. The OpenURL command opens a new page.
 46 | 
 47 |     Example usage:
 48 |     ```
 49 |     await enter_text_and_click("[mmid='1234']", "Hello, World!", "[mmid='5678']", wait_before_click_execution=1.5)
 50 |     ```
 51 |     """
 52 |     logger.info(
 53 |         f"Entering text '{text_to_enter}' into element with selector '{text_selector}' and then clicking element with selector '{click_selector}'."
 54 |     )
 55 | 
 56 |     # Initialize PlaywrightManager and get the active browser page
 57 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 58 |     page = await browser_manager.get_current_page()
 59 |     if page is None:  # type: ignore
 60 |         logger.error("No active page found")
 61 |         raise ValueError("No active page found. OpenURL command opens a new page.")
 62 | 
 63 |     await browser_manager.highlight_element(text_selector, True)
 64 | 
 65 |     function_name = inspect.currentframe().f_code.co_name  # type: ignore
 66 |     await browser_manager.take_screenshots(f"{function_name}_start", page)
 67 | 
 68 |     text_entry_result = await do_entertext(
 69 |         page, text_selector, text_to_enter, use_keyboard_fill=True
 70 |     )
 71 | 
 72 |     # await browser_manager.notify_user(text_entry_result["summary_message"])
 73 |     if not text_entry_result["summary_message"].startswith("Success"):
 74 |         await browser_manager.take_screenshots(f"{function_name}_end", page)
 75 |         return f"Failed to enter text '{text_to_enter}' into element with selector '{text_selector}'. Check that the selctor is valid."
 76 | 
 77 |     result = text_entry_result
 78 | 
 79 |     # if the text_selector is the same as the click_selector, press the Enter key instead of clicking
 80 |     try:
 81 |         if text_selector == click_selector:
 82 |             do_press_key_combination_result = await do_press_key_combination(
 83 |                 browser_manager, page, "Enter"
 84 |             )
 85 |             if do_press_key_combination_result:
 86 |                 result["detailed_message"] += (
 87 |                     f' Instead of click, pressed the Enter key successfully on element: "{click_selector}".'
 88 |                 )
 89 |                 # await browser_manager.notify_user(
 90 |                 #     f'Pressed the Enter key successfully on element: "{click_selector}".',
 91 |                 #     message_type=MessageType.ACTION,
 92 |                 # )
 93 |             else:
 94 |                 result["detailed_message"] += (
 95 |                     f' Clicking the same element after entering text in it, is of no value. Tried pressing the Enter key on element "{click_selector}" instead of click and failed.'
 96 |                 )
 97 |                 # await browser_manager.notify_user(
 98 |                 #     'Failed to press the Enter key on element "{click_selector}".',
 99 |                 #     message_type=MessageType.ACTION,
100 |                 # )
101 |         else:
102 |             await browser_manager.highlight_element(click_selector, True)
103 | 
104 |             do_click_result = await do_click(
105 |                 page, click_selector, wait_before_click_execution
106 |             )
107 |             result["detailed_message"] += f' {do_click_result["detailed_message"]}'
108 |             
109 |         await page.wait_for_load_state("domcontentloaded") # Wait for DOM content to be loaded after the action
110 |         # await browser_manager.notify_user(do_click_result["summary_message"])
111 |         await asyncio.sleep(0.5)  # sleep for 1 sec to allow the mutation observer to detect changes
112 |         await browser_manager.take_screenshots(f"{function_name}_end", page)
113 |         return result["detailed_message"]
114 |     except Exception as e:
115 |         error_message = f"An error occurred during the click action: {str(e)}. This may be due to page navigation."
116 |         logger.error(error_message)
117 |         return error_message
118 | 


--------------------------------------------------------------------------------
/sentient/core/skills/enter_text_using_selector.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | import traceback
  4 | from dataclasses import dataclass
  5 | from typing import (
  6 |     Dict,
  7 |     List,  # noqa: UP035
  8 | )
  9 | 
 10 | from playwright.async_api import Page
 11 | from typing_extensions import Annotated
 12 | 
 13 | from sentient.core.web_driver.playwright import PlaywrightManager
 14 | from sentient.core.skills.press_key_combination import press_key_combination
 15 | from sentient.utils.dom_helper import get_element_outer_html
 16 | from sentient.utils.dom_mutation_observer import subscribe, unsubscribe
 17 | from sentient.utils.logger import logger
 18 | 
 19 | 
 20 | @dataclass
 21 | class EnterTextEntry:
 22 |     """
 23 |     Represents an entry for text input.
 24 | 
 25 |     Attributes:
 26 |         query_selector (str): A valid DOM selector query. Use the mmid attribute.
 27 |         text (str): The text to enter in the element identified by the query_selector.
 28 |     """
 29 | 
 30 |     query_selector: str
 31 |     text: str
 32 | 
 33 |     def __getitem__(self, key: str) -> str:
 34 |         if key == "query_selector":
 35 |             return self.query_selector
 36 |         elif key == "text":
 37 |             return self.text
 38 |         else:
 39 |             raise KeyError(f"{key} is not a valid key")
 40 | 
 41 | 
 42 | async def custom_fill_element(page: Page, selector: str, text_to_enter: str):
 43 |     """
 44 |     Sets the value of a DOM element to a specified text without triggering keyboard input events.
 45 | 
 46 |     This function directly sets the 'value' property of a DOM element identified by the given CSS selector,
 47 |     effectively changing its current value to the specified text. This approach bypasses the need for
 48 |     simulating keyboard typing, providing a more efficient and reliable way to fill in text fields,
 49 |     especially in automated testing scenarios where speed and accuracy are paramount.
 50 | 
 51 |     Args:
 52 |         page (Page): The Playwright Page object representing the browser tab in which the operation will be performed.
 53 |         selector (str): The CSS selector string used to locate the target DOM element. The function will apply the
 54 |                         text change to the first element that matches this selector.
 55 |         text_to_enter (str): The text value to be set in the target element. Existing content will be overwritten.
 56 | 
 57 |     Example:
 58 |         await custom_fill_element(page, '#username', 'test_user')
 59 | 
 60 |     Note:
 61 |         This function does not trigger input-related events (like 'input' or 'change'). If application logic
 62 |         relies on these events being fired, additional steps may be needed to simulate them.
 63 |     """
 64 |     selector = f"{selector}"  # Ensures the selector is treated as a string
 65 |     try:
 66 |         result = await page.evaluate(
 67 |             """(inputParams) => {
 68 |             const selector = inputParams.selector;
 69 |             let text_to_enter = inputParams.text_to_enter;
 70 |             text_to_enter = text_to_enter.trim();
 71 |             const element = document.querySelector(selector);
 72 |             if (!element) {
 73 |                 throw new Error(`Element not found: ${selector}`);
 74 |             }
 75 |             element.value = text_to_enter;
 76 |             return `Value set for ${selector}`;
 77 |         }""",
 78 |             {"selector": selector, "text_to_enter": text_to_enter},
 79 |         )
 80 |         logger.debug(f"custom_fill_element result: {result}")
 81 |     except Exception as e:
 82 |         logger.error(f"Error in custom_fill_element: {str(e)}")
 83 |         logger.error(f"Selector: {selector}, Text: {text_to_enter}")
 84 |         raise
 85 | 
 86 | 
 87 | async def entertext(
 88 |     entry: Annotated[
 89 |         EnterTextEntry,
 90 |         "An object containing 'query_selector' (DOM selector query using mmid attribute e.g. [mmid='114']) and 'text' (text to enter on the element). mmid will always be a number",
 91 |     ],
 92 | ) -> Annotated[str, "Explanation of the outcome of this operation."]:
 93 |     """
 94 |     Enters text into a DOM element identified by a CSS selector.
 95 | 
 96 |     This function enters the specified text into a DOM element identified by the given CSS selector.
 97 |     It uses the Playwright library to interact with the browser and perform the text entry operation.
 98 |     The function supports both direct setting of the 'value' property and simulating keyboard typing.
 99 | 
100 |     Args:
101 |         entry (EnterTextEntry): An object containing 'query_selector' (DOM selector query using mmid attribute)
102 |                                 and 'text' (text to enter on the element).
103 | 
104 |     Returns:
105 |         str: Explanation of the outcome of this operation.
106 | 
107 |     Example:
108 |         entry = EnterTextEntry(query_selector='#username', text='test_user')
109 |         result = await entertext(entry)
110 | 
111 |     Note:
112 |         - The 'query_selector' should be a valid CSS selector that uniquely identifies the target element.
113 |         - The 'text' parameter specifies the text to be entered into the element.
114 |         - The function uses the PlaywrightManager to manage the browser instance.
115 |         - If no active page is found, an error message is returned.
116 |         - The function internally calls the 'do_entertext' function to perform the text entry operation.
117 |         - The 'do_entertext' function applies a pulsating border effect to the target element during the operation.
118 |         - The function first clears any existing text in the input field before entering the new text.
119 |         - The 'use_keyboard_fill' parameter in 'do_entertext' determines whether to simulate keyboard typing or not.
120 |         - If 'use_keyboard_fill' is set to True, the function uses the 'page.keyboard.type' method to enter the text.
121 |         - If 'use_keyboard_fill' is set to False, the function uses the 'custom_fill_element' method to enter the text.
122 |     """
123 |     logger.info(f"Entering text: {entry}")
124 | 
125 |     if isinstance(entry, Dict):
126 |         query_selector: str = entry["query_selector"]
127 |         text_to_enter: str = entry["text"]
128 |     elif isinstance(entry, EnterTextEntry):
129 |         query_selector: str = entry.query_selector
130 |         text_to_enter: str = entry.text
131 |     else:
132 |         raise ValueError(
133 |             "Invalid input type for 'entry'. Expected EnterTextEntry or dict."
134 |         )
135 | 
136 |     if not isinstance(query_selector, str) or not isinstance(text_to_enter, str):
137 |         raise ValueError("query_selector and text must be strings")
138 | 
139 |     # logger.info(
140 |     #     f"######### Debug: query_selector={query_selector}, text_to_enter={text_to_enter}"
141 |     # )
142 | 
143 |     # Create and use the PlaywrightManager
144 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
145 |     page = await browser_manager.get_current_page()
146 |     if page is None:  # type: ignore
147 |         return "Error: No active page found. OpenURL command opens a new page."
148 | 
149 |     function_name = inspect.currentframe().f_code.co_name  # type: ignore
150 | 
151 |     await browser_manager.take_screenshots(f"{function_name}_start", page)
152 | 
153 |     await browser_manager.highlight_element(query_selector, True)
154 | 
155 |     dom_changes_detected = None
156 | 
157 |     def detect_dom_changes(changes: str):  # type: ignore
158 |         nonlocal dom_changes_detected
159 |         dom_changes_detected = changes  # type: ignore
160 | 
161 |     subscribe(detect_dom_changes)
162 | 
163 |     # Clear existing text before entering new text
164 |     # await page.evaluate(f"document.querySelector('{query_selector}').value = '';")
165 |     # logger.info(
166 |     #     f"######### About to page.evaluate: selector={query_selector}, text={text_to_enter}"
167 |     # )
168 |     await page.evaluate(
169 |         """
170 |         (selector) => {
171 |             const element = document.querySelector(selector);
172 |             if (element) {
173 |                 element.value = '';
174 |             } else {
175 |                 console.error('Element not found:', selector);
176 |             }
177 |         }
178 |         """,
179 |         query_selector,
180 |     )
181 |     # logger.info(
182 |     #     f"######### About to call do_entertext with: selector={query_selector}, text={text_to_enter}"
183 |     # )
184 |     result = await do_entertext(page, query_selector, text_to_enter)
185 |     # logger.info(f"#########do_entertext returned: {result}")
186 |     await asyncio.sleep(
187 |         0.1
188 |     )  # sleep for 100ms to allow the mutation observer to detect changes
189 |     unsubscribe(detect_dom_changes)
190 | 
191 |     await browser_manager.take_screenshots(f"{function_name}_end", page)
192 | 
193 |     if dom_changes_detected:
194 |         return f"{result['detailed_message']}.\n As a consequence of this action, new elements have appeared in view: {dom_changes_detected}. This means that the action of entering text {text_to_enter} is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
195 |     return result["detailed_message"]
196 | 
197 | 
async def do_entertext(
    page: Page, selector: str, text_to_enter: str, use_keyboard_fill: bool = True
):
    """
    Locate an element and put text into it, reporting the outcome as a dict.

    Two entry strategies are supported. With keyboard fill the element is focused,
    its content is selected and deleted through key presses, and the text is typed
    with ``page.keyboard.type``. Without it, ``custom_fill_element`` assigns the
    value directly. Either way the element is focused again afterwards.

    Args:
        page (Page): The Playwright Page object representing the browser tab in which the operation will be performed.
        selector (str): The CSS selector string used to locate the target DOM element.
        text_to_enter (str): The text to place in the target element.
        use_keyboard_fill (bool, optional): True simulates keyboard typing; False sets the
                                            value directly. Defaults to True.

    Returns:
        Dict[str, str]: A dictionary with 'summary_message' and 'detailed_message'.
        On success both start with "Success" and the detailed message also carries the
        element's outer HTML. A missing selector or any exception yields an error
        message in both fields instead of raising.

    Example:
        result = await do_entertext(page, '#username', 'test_user')
    """
    try:
        target = await page.query_selector(selector)

        if target is None:
            not_found = f"Error: Selector {selector} not found. Unable to continue."
            return {"summary_message": not_found, "detailed_message": not_found}

        # Captured before typing so the report describes the element that was targeted.
        outer_html = await get_element_outer_html(target, page)

        if not use_keyboard_fill:
            await custom_fill_element(page, selector, text_to_enter)
        else:
            # Select-all + Backspace empties the field; short pauses separate the steps.
            await target.focus()
            await asyncio.sleep(0.1)
            await press_key_combination("Control+A")
            await asyncio.sleep(0.1)
            await press_key_combination("Backspace")
            await asyncio.sleep(0.1)
            logger.debug(f"Focused element with selector {selector} to enter text")
            # delay is the pause between key presses, in milliseconds
            await page.keyboard.type(text_to_enter, delay=1)

        await target.focus()
        success_msg = f'Success. Text "{text_to_enter}" set successfully in the element with selector {selector}'
        logger.info(success_msg)
        return {
            "summary_message": success_msg,
            "detailed_message": f"{success_msg} and outer HTML: {outer_html}.",
        }

    except Exception as e:
        # Failures are reported through the return value rather than propagated.
        traceback.print_exc()
        failure = f"Error entering text in selector {selector}."
        return {"summary_message": failure, "detailed_message": f"{failure} Error: {e}"}
263 | 
264 | 
async def bulk_enter_text(
    entries: Annotated[
        List[Dict[str, str]],
        "List of objects, each containing 'query_selector' and 'text'.",
    ],  # noqa: UP006
) -> Annotated[
    List[Dict[str, str]],
    "List of dictionaries, each containing 'query_selector' and the result of the operation.",
]:  # noqa: UP006
    """
    Enter text into several DOM elements, one after another.

    Each item of ``entries`` is handed to ``entertext`` in list order; the next
    item is not started until the previous one has finished.

    Args:
        entries: List of objects, each containing 'query_selector' and 'text'.

    Returns:
        List of dictionaries, one per input entry and in the same order, each with
        'query_selector' and 'result' (the message returned by ``entertext``).

    Example:
        entries = [
            {"query_selector": "#username", "text": "test_user"},
            {"query_selector": "#password", "text": "test_password"}
        ]
        results = await bulk_enter_text(entries)
    """

    logger.info("Executing bulk Enter Text Command")
    outcomes: List[Dict[str, str]] = []  # noqa: UP006
    for item in entries:
        selector, text = item["query_selector"], item["text"]
        logger.info(f"Entering text: {text} in element with selector: {selector}")
        outcome = await entertext(EnterTextEntry(query_selector=selector, text=text))
        outcomes.append({"query_selector": selector, "result": outcome})

    return outcomes
314 | 


--------------------------------------------------------------------------------
/sentient/core/skills/get_dom_with_content_type.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | from typing import Any, Union, Dict
  4 | 
  5 | from playwright.async_api import Page
  6 | from typing_extensions import Annotated
  7 | 
  8 | from sentient.config.config import SOURCE_LOG_FOLDER_PATH
  9 | from sentient.core.web_driver.playwright import PlaywrightManager
 10 | from sentient.utils.dom_helper import wait_for_non_loading_dom_state
 11 | from sentient.utils.get_detailed_accessibility_tree import do_get_accessibility_info
 12 | from sentient.utils.logger import logger
 13 | 
 14 | 
 15 | async def get_dom_with_content_type(
 16 |     content_type: Annotated[
 17 |         str,
 18 |         "The type of content to extract: 'text_only': Extracts the innerText of the highest element in the document and responds with text, or 'input_fields': Extracts the text input and button elements in the dom.",
 19 |     ],
 20 | ) -> Annotated[
 21 |     Union[Dict[str, Any], str, None],
 22 |     "The output based on the specified content type.",
 23 | ]:
 24 |     """
 25 |     Retrieves and processes the DOM of the active page in a browser instance based on the specified content type.
 26 | 
 27 |     Parameters
 28 |     ----------
 29 |     content_type : str
 30 |         The type of content to extract. Possible values are:
 31 |         - 'text_only': Extracts the innerText of the highest element in the document and responds with text.
 32 |         - 'input_fields': Extracts the text input and button elements in the DOM and responds with a JSON object.
 33 |         - 'all_fields': Extracts all the fields in the DOM and responds with a JSON object.
 34 | 
 35 |     Returns
 36 |     -------
 37 |     Dict[str, Any] | str | None
 38 |         The processed content based on the specified content type. This could be:
 39 |         - A JSON object for 'input_fields' with just inputs.
 40 |         - Plain text for 'text_only'.
 41 |         - A minified DOM represented as a JSON object for 'all_fields'.
 42 | 
 43 |     Raises
 44 |     ------
 45 |     ValueError
 46 |         If an unsupported content_type is provided.
 47 |     """
 48 | 
 49 |     logger.info(f"Executing Get DOM Command based on content_type: {content_type}")
 50 |     start_time = time.time()
 51 |     # Create and use the PlaywrightManager
 52 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 53 |     page = await browser_manager.get_current_page()
 54 |     if page is None:  # type: ignore
 55 |         raise ValueError("No active page found. OpenURL command opens a new page.")
 56 | 
 57 |     extracted_data = None
 58 |     await wait_for_non_loading_dom_state(
 59 |         page, 2000
 60 |     )  # wait for the DOM to be ready, non loading means external resources do not need to be loaded
 61 |     user_success_message = ""
 62 |     if content_type == "all_fields":
 63 |         user_success_message = "Fetched all the fields in the DOM"
 64 |         extracted_data = await do_get_accessibility_info(page, only_input_fields=False)
 65 |     elif content_type == "input_fields":
 66 |         logger.debug("Fetching DOM for input_fields")
 67 |         extracted_data = await do_get_accessibility_info(page, only_input_fields=True)
 68 |         if extracted_data is None:
 69 |             return "Could not fetch input fields. Please consider trying with content_type all_fields."
 70 |         user_success_message = "Fetched only input fields in the DOM"
 71 |     elif content_type == "text_only":
 72 |         # Extract text from the body or the highest-level element
 73 |         logger.debug("Fetching DOM for text_only")
 74 |         text_content = await get_filtered_text_content(page)
 75 |         with open(
 76 |             os.path.join(SOURCE_LOG_FOLDER_PATH, "text_only_dom.txt"),
 77 |             "w",
 78 |             encoding="utf-8",
 79 |         ) as f:
 80 |             f.write(text_content)
 81 |         extracted_data = text_content
 82 |         user_success_message = "Fetched the text content of the DOM"
 83 |     else:
 84 |         raise ValueError(f"Unsupported content_type: {content_type}")
 85 | 
 86 |     elapsed_time = time.time() - start_time
 87 |     logger.info(f"Get DOM Command executed in {elapsed_time} seconds")
 88 |     # await browser_manager.notify_user(
 89 |     #     user_success_message, message_type=MessageType.ACTION
 90 |     # )
 91 |     return extracted_data  # type: ignore
 92 | 
 93 | 
 94 | async def get_filtered_text_content(page: Page) -> str:
 95 |     text_content = await page.evaluate("""
 96 |         () => {
 97 |             // Array of query selectors to filter out
 98 |             const selectorsToFilter = ['#agente-overlay'];
 99 | 
100 |             // Store the original visibility values to revert later
101 |             const originalStyles = [];
102 | 
103 |             // Hide the elements matching the query selectors
104 |             selectorsToFilter.forEach(selector => {
105 |                 const elements = document.querySelectorAll(selector);
106 |                 elements.forEach(element => {
107 |                     originalStyles.push({ element: element, originalStyle: element.style.visibility });
108 |                     element.style.visibility = 'hidden';
109 |                 });
110 |             });
111 | 
112 |             // Get the text content of the page
113 |             let textContent = document?.body?.innerText || document?.documentElement?.innerText || "";
114 | 
115 |             // Get all the alt text from images on the page
116 |             let altTexts = Array.from(document.querySelectorAll('img')).map(img => img.alt);
117 |             altTexts="Other Alt Texts in the page: " + altTexts.join(' ');
118 | 
119 |             // Revert the visibility changes
120 |             originalStyles.forEach(entry => {
121 |                 entry.element.style.visibility = entry.originalStyle;
122 |             });
123 |             textContent=textContent+" "+altTexts;
124 |             return textContent;
125 |         }
126 |     """)
127 |     return text_content
128 | 


--------------------------------------------------------------------------------
/sentient/core/skills/get_screenshot.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | 
 3 | from typing_extensions import Annotated
 4 | 
 5 | from sentient.core.web_driver.playwright import PlaywrightManager
 6 | from sentient.utils.logger import logger
 7 | 
 8 | 
 9 | async def get_screenshot() -> (
10 |     Annotated[
11 |         str, "Returns a base64 encoded screenshot of the current active web page."
12 |     ]
13 | ):
14 |     """
15 |     Captures and returns a base64 encoded screenshot of the current page (only the visible viewport and not the full page)
16 | 
17 |     Returns:
18 |     - Base64 encoded string of the screenshot image.
19 |     """
20 | 
21 |     try:
22 |         # Create and use the PlaywrightManager
23 |         browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
24 |         page = await browser_manager.get_current_page()
25 |         logger.info("page {page}")
26 | 
27 |         if not page:
28 |             logger.info("No active page found. OpenURL command opens a new page.")
29 |             raise ValueError("No active page found. OpenURL command opens a new page.")
30 | 
31 |         await page.wait_for_load_state("domcontentloaded")
32 | 
33 |         # Capture the screenshot
34 |         logger.info("about to capture")
35 |         screenshot_bytes = await page.screenshot(full_page=False)
36 | 
37 |         # Encode the screenshot as base64
38 |         base64_screenshot = base64.b64encode(screenshot_bytes).decode("utf-8")
39 | 
40 |         return f"data:image/png;base64,{base64_screenshot}"
41 | 
42 |     except Exception as e:
43 |         raise ValueError(
44 |             "Failed to capture screenshot. Make sure a page is open and accessible."
45 |         ) from e
46 | 


--------------------------------------------------------------------------------
/sentient/core/skills/get_url.py:
--------------------------------------------------------------------------------
 1 | from typing_extensions import Annotated
 2 | 
 3 | from sentient.core.web_driver.playwright import PlaywrightManager
 4 | 
 5 | 
 6 | async def geturl() -> (
 7 |     Annotated[str, "Returns the full URL of the current active web site/page."]
 8 | ):
 9 |     """
10 |     Returns the full URL of the current page
11 | 
12 |     Parameters:
13 | 
14 |     Returns:
15 |     - Full URL the browser's active page.
16 |     """
17 | 
18 |     try:
19 |         # Create and use the PlaywrightManager
20 |         browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
21 |         page = await browser_manager.get_current_page()
22 | 
23 |         if not page:
24 |             raise ValueError("No active page found. OpenURL command opens a new page.")
25 | 
26 |         await page.wait_for_load_state("domcontentloaded")
27 | 
28 |         # Get the URL of the current page
29 |         try:
30 |             title = await page.title()
31 |             current_url = page.url
32 |             if len(current_url) > 250:
33 |                 current_url = current_url[:250] + "..."
34 |             return f"Current Page: {current_url}, Title: {title}"  # type: ignore
35 |         except:  # noqa: E722
36 |             current_url = page.url
37 |             return f"Current Page: {current_url}"
38 | 
39 |     except Exception as e:
40 |         raise ValueError(
41 |             "No active page found. OpenURL command opens a new page."
42 |         ) from e
43 | 


--------------------------------------------------------------------------------
/sentient/core/skills/get_user_input.py:
--------------------------------------------------------------------------------
 1 | from typing import (
 2 |     Dict,
 3 |     List,  # noqa: UP035,
 4 | )
 5 | 
 6 | from typing_extensions import Annotated
 7 | 
 8 | from sentient.core.web_driver.playwright import PlaywrightManager
 9 | from sentient.utils.cli_helper import answer_questions_over_cli
10 | 
11 | 
async def get_user_input(
    questions: Annotated[
        List[str], "List of questions to ask the user each one represented as a string"
    ],
) -> Dict[str, str]:  # noqa: UP006
    """
    Put a list of questions to the user and collect the answers.

    When the PlaywrightManager has a UI manager, each question is asked in turn
    through ``prompt_user``. Otherwise the whole list is handed to
    ``answer_questions_over_cli``.

    Parameters:
    - questions: A list of questions to ask the user ["What is Username?", "What is your password?"].

    Returns:
    - A dictionary of answers. On the UI path each key is the question text and each
      value is the user's reply; on the CLI path it is whatever
      ``answer_questions_over_cli`` returns.
    """

    browser_manager = PlaywrightManager(browser_type="chromium", headless=False)

    # No in-browser UI available: fall back to asking on the command line.
    if not browser_manager.ui_manager:
        return await answer_questions_over_cli(questions)

    collected: Dict[str, str] = {}
    for question in questions:
        reply = await browser_manager.prompt_user(f"Question: {question}")
        collected[question] = reply
    return collected
37 | 


--------------------------------------------------------------------------------
/sentient/core/skills/open_url.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import inspect
 3 | 
 4 | from playwright.async_api import TimeoutError as PlaywrightTimeoutError
 5 | from typing_extensions import Annotated
 6 | 
 7 | from sentient.core.web_driver.playwright import PlaywrightManager
 8 | from sentient.utils.logger import logger
 9 | 
10 | 
async def openurl(
    url: Annotated[
        str,
        "The URL to navigate to. Value must include the protocol (http:// or https://).",
    ],
    timeout: Annotated[int, "Additional wait time in seconds after initial load."],
    max_retries: Annotated[int, "Maximum number of retry attempts"] = 3,
) -> Annotated[str, "Returns the result of this request in text form"]:
    """
    Opens a specified URL in the active browser instance and waits for the
    'domcontentloaded' event, retrying when the navigation times out.

    Parameters:
    - url: The URL to navigate to. It is passed through ensure_protocol first.
    - timeout: Time in seconds allowed for the navigation. The effective limit is never
      below 30 seconds.
    - max_retries: Maximum number of attempts (default: 3). Values below 1 are treated
      as 1 so that the page is always actually requested.

    Returns:
    - "Page loaded: <final url>, Title: <title>" on success, otherwise a
      "Failed to load page: ..." message describing the timeout or error. Failures are
      reported through the return value, not raised.
    """
    logger.info(f"Opening URL: {url}")
    browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
    await browser_manager.get_browser_context()
    page = await browser_manager.get_current_page()
    function_name = inspect.currentframe().f_code.co_name  # type: ignore
    url = ensure_protocol(url)

    # Never skip navigation entirely: a non-positive max_retries still gets one try.
    attempts = max(1, max_retries)
    # Playwright timeouts are in milliseconds; enforce a 30 s floor.
    navigation_timeout_ms = max(30000, timeout * 1000)

    for attempt in range(attempts):
        try:
            await browser_manager.take_screenshots(f"{function_name}_start", page)

            await page.goto(
                url, timeout=navigation_timeout_ms, wait_until="domcontentloaded"
            )

            # Wait until the DOM content of the page is loaded
            await page.wait_for_load_state(
                "domcontentloaded", timeout=navigation_timeout_ms
            )

            await browser_manager.take_screenshots(f"{function_name}_end", page)

            title = await page.title()
            final_url = page.url
            logger.info(f"Successfully loaded page: {final_url}")
            return f"Page loaded: {final_url}, Title: {title}"

        except PlaywrightTimeoutError as e:
            logger.warning(f"Timeout error on attempt {attempt + 1}: {e}")
            if attempt < attempts - 1:
                await asyncio.sleep(2)  # Wait before retrying

        except Exception as e:
            # Errors other than timeouts are not retried.
            logger.error(f"Error navigating to {url}: {e}")
            return f"Failed to load page: {url}. Error: {str(e)}"

    # Reached only when every attempt timed out.
    logger.error(f"Failed to load {url} after {attempts} attempts")
    return f"Failed to load page: {url}. Error: Timeout after {attempts} attempts"
80 | 
81 | 
def ensure_protocol(url: str) -> str:
    """
    Ensures that a URL has a protocol (http:// or https://). If it doesn't have one,
    https:// is added by default.

    Surrounding whitespace is removed first, and the protocol check is
    case-insensitive, so "HTTP://example.com" is returned unchanged instead of
    being turned into "https://HTTP://example.com".

    Parameters:
    - url: The URL to check and modify if necessary.

    Returns:
    - A URL string with a protocol.
    """
    url = url.strip()
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url  # Default to https if no protocol is specified
        logger.info(
            f"Added 'https://' protocol to URL because it was missing. New URL is: {url}"
        )
    return url
99 | 


--------------------------------------------------------------------------------
/sentient/core/skills/pdf_text_extractor.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import httpx
  4 | import pdfplumber
  5 | from typing_extensions import Annotated
  6 | 
  7 | from sentient.config.config import PROJECT_TEMP_PATH
  8 | from sentient.core.web_driver.playwright import PlaywrightManager
  9 | from sentient.utils.logger import logger
 10 | from sentient.utils.message_type import MessageType
 11 | 
 12 | 
 13 | async def extract_text_from_pdf(
 14 |     pdf_url: Annotated[str, "The URL of the PDF file to extract text from."],
 15 | ) -> Annotated[str, "All the text found in the PDF file."]:
 16 |     """
 17 |     Extract text from a PDF file.
 18 |     pdf_url: str - The URL of the PDF file to extract text from.
 19 |     returns: str - All the text found in the PDF.
 20 |     """
 21 |     file_path = os.path.join(
 22 |         PROJECT_TEMP_PATH, "downloaded_file.pdf"
 23 |     )  # fixed file path for downloading the PDF
 24 | 
 25 |     try:
 26 |         # Create and use the PlaywrightManager
 27 |         browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
 28 | 
 29 |         # Download the PDF
 30 |         download_result = await download_pdf(pdf_url, file_path)
 31 |         if not os.path.exists(download_result):
 32 |             return download_result  # Return error message if download failed
 33 | 
 34 |         # Open the PDF using pdfplumber and extract text
 35 |         text = ""
 36 |         with pdfplumber.open(download_result) as pdf:
 37 |             for page in pdf.pages:
 38 |                 page_text = page.extract_text()
 39 |                 if page_text:
 40 |                     text += page_text + "\n"
 41 |         extracted_text = text.strip()
 42 |         word_count = len(extracted_text.split())
 43 |         await browser_manager.notify_user(
 44 |             f"Extracted text from the PDF successfully. Found {word_count} words.",
 45 |             message_type=MessageType.ACTION,
 46 |         )
 47 |         return "Text found in the PDF:\n" + extracted_text
 48 |     except httpx.HTTPStatusError as e:
 49 |         logger.error(
 50 |             f"An error occurred while downloading the PDF from {pdf_url}: {str(e)}"
 51 |         )
 52 |         return f"An error occurred while downloading the PDF: {str(e)}"
 53 |     except Exception as e:
 54 |         logger.error(
 55 |             f"An error occurred while extracting text from the PDF that was downloaded from {pdf_url}: {str(e)}"
 56 |         )
 57 |         return f"An error occurred while extracting text: {str(e)}"
 58 |     finally:
 59 |         # Cleanup: Ensure the downloaded file is removed
 60 |         cleanup_temp_files(file_path)
 61 | 
 62 | 
 63 | def cleanup_temp_files(*file_paths: str) -> None:
 64 |     """
 65 |     Remove the specified temporary files.
 66 | 
 67 |     *file_paths: str - One or more file paths to be removed.
 68 |     """
 69 |     for file_path in file_paths:
 70 |         if os.path.exists(file_path):
 71 |             try:
 72 |                 os.remove(file_path)
 73 |                 logger.debug(f"Cleaned file from the filesystem: {file_path}")
 74 |             except Exception as e:
 75 |                 logger.error(f"Failed to remove {file_path}: {str(e)}")
 76 |         else:
 77 |             logger.debug(
 78 |                 f"File not found. Unable to clean it from the filesystem: {file_path}"
 79 |             )
 80 | 
 81 | 
 82 | async def download_pdf(pdf_url: str, file_path: str) -> str:
 83 |     """
 84 |     Download the PDF file from the given URL and save it to the specified path.
 85 | 
 86 |     pdf_url: str - The URL of the PDF file to download.
 87 |     file_path: str - The local path to save the downloaded PDF.
 88 | 
 89 |     returns: str - The file path of the downloaded PDF if successful, otherwise an error message.
 90 |     raises: Exception - If an error occurs during the download process.
 91 |     """
 92 |     try:
 93 |         logger.info(f"Downloading PDF from: {pdf_url} to: {file_path}")
 94 |         async with httpx.AsyncClient() as client:
 95 |             response = await client.get(pdf_url)
 96 |             response.raise_for_status()  # Ensure the request was successful
 97 |         with open(file_path, "wb") as pdf_file:
 98 |             pdf_file.write(response.content)
 99 |         return file_path
100 |     # except httpx.HTTPStatusError as e:
101 |     #     raise e
102 |     except Exception as e:
103 |         raise e
104 | 


--------------------------------------------------------------------------------
/sentient/core/skills/press_key_combination.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import inspect
  3 | 
  4 | from playwright.async_api import Page  # type: ignore
  5 | from typing_extensions import Annotated
  6 | 
  7 | from sentient.core.web_driver.playwright import PlaywrightManager
  8 | from sentient.utils.dom_mutation_observer import (
  9 |     subscribe,  # type: ignore
 10 |     unsubscribe,  # type: ignore
 11 | )
 12 | from sentient.utils.logger import logger
 13 | 
 14 | 
 15 | async def press_key_combination(
 16 |     key_combination: Annotated[str, "The key to press, e.g., Enter, PageDown etc"],
 17 | ) -> str:
 18 |     """
 19 |     Presses a key combination on the current active page managed by PlaywrightManager.
 20 | 
 21 |     This function simulates the pressing of a key or a combination of keys on the current active web page.
 22 |     The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination.
 23 |     For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows.
 24 | 
 25 |     Parameters:
 26 |     - key_combination (Annotated[str, "The key combination to press, e.g., 'Control+C'."]): The key combination to press, represented as a string. For combinations, use '+' as a separator.
 27 | 
 28 |     Raises:
 29 |     - ValueError: If no active page is found.
 30 | 
 31 |     Returns:
 32 |     str: status of the operation expressed as a string
 33 |     """
 34 | 
 35 |     logger.info(f"Executing press_key_combination with key combo: {key_combination}")
 36 |     # Create and use the PlaywrightManager
 37 |     browser_manager = PlaywrightManager()
 38 |     page = await browser_manager.get_current_page()
 39 | 
 40 |     if page is None:  # type: ignore
 41 |         raise ValueError("No active page found. OpenURL command opens a new page.")
 42 | 
 43 |     # Split the key combination if it's a combination of keys
 44 |     keys = key_combination.split("+")
 45 | 
 46 |     dom_changes_detected = None
 47 | 
 48 |     def detect_dom_changes(changes: str):  # type: ignore
 49 |         nonlocal dom_changes_detected
 50 |         dom_changes_detected = changes  # type: ignore
 51 | 
 52 |     subscribe(detect_dom_changes)
 53 |     # If it's a combination, hold down the modifier keys
 54 |     for key in keys[:-1]:  # All keys except the last one are considered modifier keys
 55 |         await page.keyboard.down(key)
 56 | 
 57 |     # Press the last key in the combination
 58 |     await page.keyboard.press(keys[-1])
 59 | 
 60 |     # Release the modifier keys
 61 |     for key in keys[:-1]:
 62 |         await page.keyboard.up(key)
 63 |     await asyncio.sleep(
 64 |         0.1
 65 |     )  # sleep for 100ms to allow the mutation observer to detect changes
 66 |     unsubscribe(detect_dom_changes)
 67 | 
 68 |     if dom_changes_detected:
 69 |         return f"Key {key_combination} executed successfully.\n As a consequence of this action, new elements have appeared in view:{dom_changes_detected}. This means that the action is not yet executed and needs further interaction. Get all_fields DOM to complete the interaction."
 70 | 
 71 |     # await browser_manager.notify_user(
 72 |     #     f"Key {key_combination} executed successfully", message_type=MessageType.ACTION
 73 |     # )
 74 |     return f"Key {key_combination} executed successfully"
 75 | 
 76 | 
 77 | async def do_press_key_combination(
 78 |     browser_manager: PlaywrightManager, page: Page, key_combination: str
 79 | ) -> bool:
 80 |     """
 81 |     Presses a key combination on the provided page.
 82 | 
 83 |     This function simulates the pressing of a key or a combination of keys on a web page.
 84 |     The `key_combination` should be a string that represents the keys to be pressed, separated by '+' if it's a combination.
 85 |     For example, 'Control+C' to copy or 'Alt+F4' to close a window on Windows.
 86 | 
 87 |     Parameters:
 88 |     - browser_manager (PlaywrightManager): The PlaywrightManager instance.
 89 |     - page (Page): The Playwright page instance.
 90 |     - key_combination (str): The key combination to press, represented as a string. For combinations, use '+' as a separator.
 91 | 
 92 |     Returns:
 93 |     bool: True if success and False if failed
 94 |     """
 95 | 
 96 |     logger.info(f"Executing press_key_combination with key combo: {key_combination}")
 97 |     try:
 98 |         function_name = inspect.currentframe().f_code.co_name  # type: ignore
 99 |         await browser_manager.take_screenshots(f"{function_name}_start", page)
100 |         # Split the key combination if it's a combination of keys
101 |         keys = key_combination.split("+")
102 | 
103 |         # If it's a combination, hold down the modifier keys
104 |         for key in keys[
105 |             :-1
106 |         ]:  # All keys except the last one are considered modifier keys
107 |             await page.keyboard.down(key)
108 | 
109 |         # Press the last key in the combination
110 |         await page.keyboard.press(keys[-1])
111 | 
112 |         # Release the modifier keys
113 |         for key in keys[:-1]:
114 |             await page.keyboard.up(key)
115 | 
116 |     except Exception as e:
117 |         logger.error(f'Error executing press_key_combination "{key_combination}": {e}')
118 |         return False
119 | 
120 |     await browser_manager.take_screenshots(f"{function_name}_end", page)
121 | 
122 |     return True
123 | 


--------------------------------------------------------------------------------
/sentient/core/skills/upload_file.py:
--------------------------------------------------------------------------------
 1 | from typing_extensions import Annotated
 2 | 
 3 | from sentient.core.web_driver.playwright import PlaywrightManager
 4 | from sentient.utils.logger import logger
 5 | 
 6 | 
 7 | async def upload_file(
 8 |     # label: Annotated[str, "Label for the element on which upload should happen"],
 9 |     selector: Annotated[
10 |         str,
11 |         "The properly formed query selector string to identify the file input element (e.g. [mmid='114']). When \"mmid\" attribute is present, use it for the query selector. mmid will always be a number",
12 |     ],
13 |     file_path: Annotated[str, "Path on the local system for the file to be uploaded"],
14 | ) -> Annotated[str, "A meesage indicating if the file uplaod was successful"]:
15 |     """
16 |     Uploads a file.
17 | 
18 |     Parameters:
19 |     - file_path: Path of the file that needs to be uploaded.
20 | 
21 |     Returns:
22 |     - A message indicating the success or failure of the file upload
23 |     """
24 |     logger.info(
25 |         f"Uploading file onto the page from {file_path} using selector {selector}"
26 |     )
27 |     print("naman-selector")
28 |     # print(label)
29 |     # label = "Add File"
30 |     browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
31 |     page = await browser_manager.get_current_page()
32 | 
33 |     if not page:
34 |         raise ValueError("No active page found. OpenURL command opens a new page")
35 | 
36 |     await page.wait_for_load_state("domcontentloaded")
37 | 
38 |     try:
39 |         await page.locator(selector).set_input_files(file_path)
40 |         # await page.get_by_label(label).set_input_files(file_path)
41 |         logger.info(
42 |             "File upload was successful. I can confirm it. Please proceed ahead with next step."
43 |         )
44 |     except Exception as e:
45 |         logger.error(f"Failed to upload file: {e}")
46 |         return f"File upload failed {e}"
47 | 


--------------------------------------------------------------------------------
/sentient/core/web_driver/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/core/web_driver/__init__.py


--------------------------------------------------------------------------------
/sentient/core/web_driver/playwright.py:
--------------------------------------------------------------------------------
  1 | import tempfile
  2 | import time
  3 | from typing import List, Union
  4 | 
  5 | from playwright.async_api import BrowserContext, Page, Playwright
  6 | from playwright.async_api import async_playwright as playwright
  7 | 
  8 | from sentient.utils.dom_mutation_observer import (
  9 |     dom_mutation_change_detected,
 10 |     handle_navigation_for_mutation_observer,
 11 | )
 12 | from sentient.utils.logger import logger
 13 | from sentient.utils.ui_messagetype import MessageType
 14 | 
 15 | # TODO - Create a wrapper browser manager class that either starts a playwright manager (our solution) or a hosted browser manager like browserbase
 16 | 
 17 | 
 18 | class PlaywrightManager:
 19 |     _homepage = "https://google.com"
 20 |     _playwright = None
 21 |     _browser_context = None
 22 |     __async_initialize_done = False
 23 |     _instance = None
 24 |     _take_screenshots = False
 25 |     _screenshots_dir = None
 26 | 
 27 |     def __new__(cls, *args, **kwargs):  # type: ignore
 28 |         """
 29 |         Ensures that only one instance of PlaywrightManager is created (singleton pattern).
 30 |         """
 31 |         if cls._instance is None:
 32 |             cls._instance = super().__new__(cls)
 33 |             cls._instance.__initialized = False
 34 |             logger.debug("Browser instance created..")
 35 |         return cls._instance
 36 | 
 37 |     def __init__(
 38 |         self,
 39 |         browser_type: str = "chromium",
 40 |         headless: bool = False,
 41 |         gui_input_mode: bool = True,
 42 |         screenshots_dir: str = "",
 43 |         take_screenshots: bool = False,
 44 |     ):
 45 |         """
 46 |         Initializes the PlaywrightManager with the specified browser type and headless mode.
 47 |         Initialization occurs only once due to the singleton pattern.
 48 | 
 49 |         Args:
 50 |             browser_type (str, optional): The type of browser to use. Defaults to "chromium".
 51 |             headless (bool, optional): Flag to launch the browser in headless mode or not. Defaults to False (non-headless).
 52 |         """
 53 |         if self.__initialized:
 54 |             return
 55 |         self.browser_type = browser_type
 56 |         self.isheadless = headless
 57 |         self.__initialized = True
 58 |         # self.notification_manager = NotificationManager()
 59 |         # self.user_response_event = asyncio.Event()
 60 |         # if gui_input_mode:
 61 |         #     self.ui_manager: UIManager = UIManager()
 62 |         self.set_take_screenshots(take_screenshots)
 63 |         self.set_screenshots_dir(screenshots_dir)
 64 | 
 65 |     async def async_initialize(self, eval_mode: bool = False):
 66 |         """
 67 |         Asynchronously initialize necessary components and handlers for the browser context.
 68 |         """
 69 |         if self.__async_initialize_done:
 70 |             return
 71 | 
 72 |         # Step 1: Ensure Playwright is started and browser context is created
 73 |         await self.start_playwright()
 74 |         self.eval_mode = eval_mode
 75 |         await self.ensure_browser_context()
 76 | 
 77 |         # Step 2: Deferred setup of handlers
 78 |         # await self.setup_handlers()
 79 | 
 80 |         # Step 3: Navigate to homepage
 81 |         await self.go_to_homepage()
 82 | 
 83 |         self.__async_initialize_done = True
 84 | 
 85 |     async def ensure_browser_context(self):
 86 |         """
 87 |         Ensure that a browser context exists, creating it if necessary.
 88 |         """
 89 |         if self._browser_context is None:
 90 |             await self.create_browser_context()
 91 | 
 92 |     # async def setup_handlers(self):
 93 |     #     """
 94 |     #     Setup various handlers after the browser context has been ensured.
 95 |     #     """
 96 |     #     await self.set_overlay_state_handler()
 97 |     #     await self.set_user_response_handler()
 98 |     #     await self.set_navigation_handler()
 99 | 
100 |     async def start_playwright(self):
101 |         """
102 |         Starts the Playwright instance if it hasn't been started yet. This method is idempotent.
103 |         """
104 |         if not PlaywrightManager._playwright:
105 |             PlaywrightManager._playwright: Playwright = await playwright().start()
106 | 
107 |     async def stop_playwright(self):
108 |         """
109 |         Stops the Playwright instance and resets it to None. This method should be called to clean up resources.
110 |         """
111 |         # Close the browser context if it's initialized
112 |         if PlaywrightManager._browser_context is not None:
113 |             await PlaywrightManager._browser_context.close()
114 |             PlaywrightManager._browser_context = None
115 | 
116 |         # Stop the Playwright instance if it's initialized
117 |         if PlaywrightManager._playwright is not None:  # type: ignore
118 |             await PlaywrightManager._playwright.stop()
119 |             PlaywrightManager._playwright = None  # type: ignore
120 | 
121 |     async def create_browser_context(self):
122 |         # load_dotenv()
123 |         # user_data_dir: str = os.environ["BROWSER_USER_DATA_DIR"]
124 |         # profile_directory: str = os.environ["BROWSER_PROFILE"]
125 |         # print("Browser profile", user_data_dir)
126 |         # logger.info("Browser Profile - " + user_data_dir + profile_directory)
127 |         try:
128 |             # PlaywrightManager._browser_context = (
129 |             #     await PlaywrightManager._playwright.chromium.launch_persistent_context(
130 |             #         user_data_dir=user_data_dir,
131 |             #         channel="chrome",
132 |             #         headless=self.isheadless,
133 |             #         args=[
134 |             #             f"--profile-directory={profile_directory}",
135 |             #             "--disable-session-crashed-bubble",
136 |             #             "--disable-infobars",
137 |             #             "--no-default-browser-check",
138 |             #             "--no-first-run",
139 |             #             "--disable-popup-blocking",
140 |             #             "--disable-notifications",
141 |             #             "--disable-features=ChromeWhatsNewUI",
142 |             #             "--disable-blink-features=AutomationControlled",
143 |             #             "--disable-gpu",
144 |             #             "--no-sandbox",
145 |             #             "--disable-dev-shm-usage",
146 |             #             "--no-first-run",
147 |             #             "--no-zygote",
148 |             #             "--ignore-certificate-errors",
149 |             #             "--disable-popup-blocking",
150 |             #             "--remote-debugging-port=9222",
151 |             #             "--restore-last-session",
152 |             #         ],
153 |             #         ignore_default_args=["--enable-automation", "--bwsi"],
154 |             #         no_viewport=True,
155 |             #     )
156 |             # )
157 | 
158 |             # await PlaywrightManager._playwright.chromium.launch_persistent_context(
159 |             #     user_data_dir=user_data_dir,
160 |             #     channel="chrome",
161 |             #     headless=False,
162 |             #     args=[
163 |             #         f"--profile-directory={profile_directory}",
164 |             #         "--remote-debugging-port=9224",
165 |             #     ],
166 |             #     no_viewport=True,
167 |             # )
168 | 
169 |             # in eval mode - start a temp browser.
170 |             if self.eval_mode:
171 |                 print("Starting in eval mode", self.eval_mode)
172 |                 new_user_dir = tempfile.mkdtemp()
173 |                 logger.info(
174 |                     f"Starting a temporary browser instance. trying to launch with a new user dir {new_user_dir}"
175 |                 )
176 |                 PlaywrightManager._browser_context = await PlaywrightManager._playwright.chromium.launch_persistent_context(
177 |                     new_user_dir,
178 |                     channel="chrome",
179 |                     headless=self.isheadless,
180 |                     args=[
181 |                         "--disable-blink-features=AutomationControlled",
182 |                         "--disable-session-crashed-bubble",  # disable the restore session bubble
183 |                         "--disable-infobars",  # disable informational popups,
184 |                     ],
185 |                     no_viewport=True,
186 |                 )
187 |             else:
188 |                 browser = await PlaywrightManager._playwright.chromium.connect_over_cdp(
189 |                     "http://localhost:9222"
190 |                 )
191 |                 PlaywrightManager._browser_context = browser.contexts[0]
192 | 
193 |             # Additional step to modify the navigator.webdriver property
194 |             pages = PlaywrightManager._browser_context.pages
195 |             for page in pages:
196 |                 # await stealth_async(page)  # Apply stealth to each page
197 |                 await page.add_init_script("""
198 |                     Object.defineProperty(navigator, 'webdriver', {
199 |                         get: () => undefined
200 |                     })
201 |                 """)
202 | 
203 |         except Exception as e:
204 |             if "Target page, context or browser has been closed" in str(e):
205 |                 new_user_dir = tempfile.mkdtemp()
206 |                 # logger.error(
207 |                 #     f"Failed to launch persistent context with user data dir {user_data_dir}: {e} Trying to launch with a new user dir {new_user_dir}"
208 |                 # )
209 |                 logger.error(
210 |                     f"Failed to launch persistent context with provided user data dir: {e} Trying to launch with a new user dir {new_user_dir}"
211 |                 )
212 |                 PlaywrightManager._browser_context = await PlaywrightManager._playwright.chromium.launch_persistent_context(
213 |                     new_user_dir,
214 |                     channel="chrome",
215 |                     headless=self.isheadless,
216 |                     args=[
217 |                         "--disable-blink-features=AutomationControlled",
218 |                         "--disable-session-crashed-bubble",  # disable the restore session bubble
219 |                         "--disable-infobars",  # disable informational popups,
220 |                     ],
221 |                     no_viewport=True,
222 |                 )
223 |                 # # Apply stealth to the new context
224 |                 # for page in PlaywrightManager._browser_context.pages:
225 |                 #     await stealth_async(page)
226 |             elif "Chromium distribution 'chrome' is not found " in str(e):
227 |                 raise ValueError(
228 |                     "Chrome is not installed on this device. Install Google Chrome or install playwright using 'playwright install chrome'. Refer to the readme for more information."
229 |                 ) from None
230 |             else:
231 |                 raise e from None
232 | 
233 |     async def get_browser_context(self):
234 |         """
235 |         Returns the existing browser context, or creates a new one if it doesn't exist.
236 |         """
237 |         await self.ensure_browser_context()
238 |         return self._browser_context
239 | 
240 |     async def get_current_url(self) -> Union[str, None]:
241 |         """
242 |         Get the current URL of current page
243 | 
244 |         Returns:
245 |             str | None: The current URL if any.
246 |         """
247 |         try:
248 |             current_page: Page = await self.get_current_page()
249 |             return current_page.url
250 |         except Exception:
251 |             pass
252 |         return None
253 | 
254 |     async def get_current_page(self) -> Page:
255 |         """
256 |         Get the current page of the browser
257 | 
258 |         Returns:
259 |             Page: The current page if any.
260 |         """
261 |         try:
262 |             browser: BrowserContext = await self.get_browser_context()  # type: ignore
263 |             # Filter out closed pages
264 |             pages: List[Page] = [page for page in browser.pages if not page.is_closed()]
265 |             page: Union[Page, None] = pages[-1] if pages else None
266 |             logger.debug(f"Current page: {page.url if page else None}")
267 |             if page is not None:
268 |                 return page
269 |             else:
270 |                 page: Page = await browser.new_page()  # type: ignore
271 |                 # await stealth_async(page)  # Apply stealth to the new page
272 |                 return page
273 |         except Exception as e:
274 |             logger.warn(f"Browser context was closed. Creating a new one. {e}")
275 |         except Exception as e:
276 |             logger.warn(f"Browser context was closed. Creating a new one. {e}")
277 |             PlaywrightManager._browser_context = None
278 |             _browser: BrowserContext = await self.get_browser_context()  # type: ignore
279 |             page: Union[Page, None] = await self.get_current_page()
280 |             return page
281 | 
282 |     async def close_all_tabs(self, keep_first_tab: bool = True):
283 |         """
284 |         Closes all tabs in the browser context, except for the first tab if `keep_first_tab` is set to True.
285 | 
286 |         Args:
287 |             keep_first_tab (bool, optional): Whether to keep the first tab open. Defaults to True.
288 |         """
289 |         browser_context = await self.get_browser_context()
290 |         pages: List[Page] = browser_context.pages  # type: ignore
291 |         pages_to_close: List[Page] = pages[1:] if keep_first_tab else pages  # type: ignore
292 |         for page in pages_to_close:  # type: ignore
293 |             await page.close()  # type: ignore
294 | 
295 |     async def close_except_specified_tab(self, page_to_keep: Page):
296 |         """
297 |         Closes all tabs in the browser context, except for the specified tab.
298 | 
299 |         Args:
300 |             page_to_keep (Page): The Playwright page object representing the tab that should remain open.
301 |         """
302 |         browser_context = await self.get_browser_context()
303 |         for page in browser_context.pages:  # type: ignore
304 |             if page != page_to_keep:  # Check if the current page is not the one to keep
305 |                 await page.close()  # type: ignore
306 | 
307 |     async def go_to_homepage(self):
308 |         page: Page = await PlaywrightManager.get_current_page(self)
309 |         try:
310 |             await page.goto(self._homepage, timeout=10000)  # 10 seconds timeout
311 |         except Exception as e:
312 |             logger.error(f"Failed to navigate to homepage: {e}")
313 |             # implement a retry mechanism here
314 |         try:
315 |             await page.goto(self._homepage, timeout=10000)  # 10 seconds timeout
316 |         except Exception as e:
317 |             logger.error(f"Failed to navigate to homepage: {e}")
318 |             # implement a retry mechanism here
319 | 
320 |     async def set_navigation_handler(self):
321 |         page: Page = await PlaywrightManager.get_current_page(self)
322 |         page.on("domcontentloaded", self.ui_manager.handle_navigation)  # type: ignore
323 |         page.on("domcontentloaded", handle_navigation_for_mutation_observer)  # type: ignore
324 |         await page.expose_function(
325 |             "dom_mutation_change_detected", dom_mutation_change_detected
326 |         )  # type: ignore
327 | 
328 |     async def set_overlay_state_handler(self):
329 |         logger.debug("Setting overlay state handler")
330 |         context = await self.get_browser_context()
331 |         await context.expose_function(
332 |             "overlay_state_changed", self.overlay_state_handler
333 |         )  # type: ignore
334 |         await context.expose_function(
335 |             "show_steps_state_changed", self.show_steps_state_handler
336 |         )  # type: ignore
337 | 
338 |     async def overlay_state_handler(self, is_collapsed: bool):
339 |         page = await self.get_current_page()
340 |         self.ui_manager.update_overlay_state(is_collapsed)
341 |         if not is_collapsed:
342 |             await self.ui_manager.update_overlay_chat_history(page)
343 | 
344 |     async def show_steps_state_handler(self, show_details: bool):
345 |         page = await self.get_current_page()
346 |         await self.ui_manager.update_overlay_show_details(show_details, page)
347 | 
348 |     async def set_user_response_handler(self):
349 |         context = await self.get_browser_context()
350 |         await context.expose_function("user_response", self.receive_user_response)  # type: ignore
351 | 
352 |     # async def notify_user(
353 |     #     self, message: str, message_type: MessageType = MessageType.STEP
354 |     # ):
355 |     #     """
356 |     #     Notify the user with a message.
357 | 
358 |     #     Args:
359 |     #         message (str): The message to notify the user with.
360 |     #         message_type (enum, optional): Values can be 'PLAN', 'QUESTION', 'ANSWER', 'INFO', 'STEP'. Defaults to 'STEP'.
361 |     #         To Do: Convert to Enum.
362 |     #     """
363 | 
364 |     #     if message.startswith(":"):
365 |     #         message = message[1:]
366 | 
367 |     #     if message.endswith(","):
368 |     #         message = message[:-1]
369 | 
370 |     #     if message_type == MessageType.PLAN:
371 |     #         message = beautify_plan_message(message)
372 |     #         message = "Plan:\n" + message
373 |     #     elif message_type == MessageType.STEP:
374 |     #         if "confirm" in message.lower():
375 |     #             message = "Verify: " + message
376 |     #         else:
377 |     #             message = "Next step: " + message
378 |     #     elif message_type == MessageType.QUESTION:
379 |     #         message = "Question: " + message
380 |     #     elif message_type == MessageType.ANSWER:
381 |     #         message = "Response: " + message
382 | 
383 |     #     safe_message = escape_js_message(message)
384 |     #     self.ui_manager.new_system_message(safe_message, message_type)
385 | 
386 |     #     if self.ui_manager.overlay_show_details == False:  # noqa: E712
387 |     #         if message_type not in (
388 |     #             MessageType.PLAN,
389 |     #             MessageType.QUESTION,
390 |     #             MessageType.ANSWER,
391 |     #             MessageType.INFO,
392 |     #         ):
393 |     #             return
394 | 
395 |     #     if self.ui_manager.overlay_show_details == True:  # noqa: E712
396 |     #         if message_type not in (
397 |     #             MessageType.PLAN,
398 |     #             MessageType.QUESTION,
399 |     #             MessageType.ANSWER,
400 |     #             MessageType.INFO,
401 |     #             MessageType.STEP,
402 |     #         ):
403 |     #             return
404 | 
405 |     #     safe_message_type = escape_js_message(message_type.value)
406 |     #     try:
407 |     #         js_code = f"addSystemMessage({safe_message}, is_awaiting_user_response=false, message_type={safe_message_type});"
408 |     #         page = await self.get_current_page()
409 |     #         await page.evaluate(js_code)
410 |     #     except Exception as e:
411 |     #         logger.error(
412 |     #             f'Failed to notify user with message "{message}". However, most likey this will work itself out after the page loads: {e}'
413 |     #         )
414 | 
415 |     #     self.notification_manager.notify(message, message_type.value)
416 | 
417 |     async def highlight_element(self, selector: str, add_highlight: bool):
418 |         try:
419 |             page: Page = await self.get_current_page()
420 |             if add_highlight:
421 |                 # Add the 'agente-ui-automation-highlight' class to the element. This class is used to apply the fading border.
422 |                 await page.eval_on_selector(
423 |                     selector,
424 |                     """e => {
425 |                             let originalBorderStyle = e.style.border;
426 |                             e.classList.add('agente-ui-automation-highlight');
427 |                             e.addEventListener('animationend', () => {
428 |                                 e.classList.remove('agente-ui-automation-highlight')
429 |                             });}""",
430 |                 )
431 |                 logger.debug(
432 |                     f"Applied pulsating border to element with selector {selector} to indicate text entry operation"
433 |                 )
434 |             else:
435 |                 # Remove the 'agente-ui-automation-highlight' class from the element.
436 |                 await page.eval_on_selector(
437 |                     selector,
438 |                     "e => e.classList.remove('agente-ui-automation-highlight')",
439 |                 )
440 |                 logger.debug(
441 |                     f"Removed pulsating border from element with selector {selector} after text entry operation"
442 |                 )
443 |         except Exception:
444 |             # This is not significant enough to fail the operation
445 |             pass
446 | 
447 |     # async def receive_user_response(self, response: str):
448 |     #     self.user_response = response  # Store the response for later use.
449 |     #     logger.debug(f"Received user response to system prompt: {response}")
450 |     #     # Notify event loop that the user's response has been received.
451 |     #     self.user_response_event.set()
452 | 
453 |     # async def prompt_user(self, message: str) -> str:
454 |     #     """
455 |     #     Prompt the user with a message and wait for a response.
456 | 
457 |     #     Args:
458 |     #         message (str): The message to prompt the user with.
459 | 
460 |     #     Returns:
461 |     #         str: The user's response.
462 |     #     """
463 |     #     logger.debug(f'Prompting user with message: "{message}"')
464 |     #     # self.ui_manager.new_system_message(message)
465 | 
466 |     #     page = await self.get_current_page()
467 | 
468 |     #     await self.ui_manager.show_overlay(page)
469 |     #     self.log_system_message(
470 |     #         message, MessageType.QUESTION
471 |     #     )  # add the message to history after the overlay is opened to avoid double adding it. add_system_message below will add it
472 | 
473 |     #     safe_message = escape_js_message(message)
474 | 
475 |     #     js_code = f"addSystemMessage({safe_message}, is_awaiting_user_response=true, message_type='question');"
476 |     #     await page.evaluate(js_code)
477 | 
478 |     #     await self.user_response_event.wait()
479 |     #     result = self.user_response
480 |     #     logger.info(f'User prompt reponse to "{message}": {result}')
481 |     #     self.user_response_event.clear()
482 |     #     self.user_response = ""
483 |     #     self.ui_manager.new_user_message(result)
484 |     #     return result
485 | 
    def set_take_screenshots(self, take_screenshots: bool):
        """Enable or disable screenshot capture.

        Args:
            take_screenshots (bool): Flag stored on the instance; while it is
                falsy, ``take_screenshots()`` returns without capturing.
        """
        self._take_screenshots = take_screenshots
488 | 
    def get_take_screenshots(self):
        """Return the flag that controls whether screenshots are captured."""
        return self._take_screenshots
491 | 
    def set_screenshots_dir(self, screenshots_dir: str):
        """Set the directory that screenshots are written to.

        Args:
            screenshots_dir (str): Directory path; ``take_screenshots()``
                joins it with the file name using a "/" separator.
        """
        self._screenshots_dir = screenshots_dir
494 | 
    def get_screenshots_dir(self):
        """Return the directory that screenshots are written to."""
        return self._screenshots_dir
497 | 
498 |     async def take_screenshots(
499 |         self,
500 |         name: str,
501 |         page: Union[Page, None],
502 |         full_page: bool = True,
503 |         include_timestamp: bool = True,
504 |         load_state: str = "domcontentloaded",
505 |         take_snapshot_timeout: int = 5 * 1000,
506 |     ):
507 |         if not self._take_screenshots:
508 |             return
509 |         if page is None:
510 |             page = await self.get_current_page()
511 | 
512 |         screenshot_name = name
513 | 
514 |         if include_timestamp:
515 |             screenshot_name = f"{int(time.time_ns())}_{screenshot_name}"
516 |         screenshot_name += ".png"
517 |         screenshot_path = f"{self.get_screenshots_dir()}/{screenshot_name}"
518 |         try:
519 |             await page.wait_for_load_state(
520 |                 state=load_state, timeout=take_snapshot_timeout
521 |             )  # type: ignore
522 |             await page.screenshot(
523 |                 path=screenshot_path,
524 |                 full_page=full_page,
525 |                 timeout=take_snapshot_timeout,
526 |                 caret="initial",
527 |                 scale="device",
528 |             )
529 |             logger.debug(f"Screen shot saved to: {screenshot_path}")
530 |         except Exception as e:
531 |             logger.error(
532 |                 f'Failed to take screenshot and save to "{screenshot_path}". Error: {e}'
533 |             )
534 | 
    def log_user_message(self, message: str):
        """
        Log the user's message by handing it to the UI manager.

        Args:
            message (str): The user's message to log.
        """
        self.ui_manager.new_user_message(message)
543 | 
    def log_system_message(self, message: str, type: MessageType = MessageType.STEP):
        """
        Log a system message by handing it to the UI manager.

        Args:
            message (str): The system message to log.
            type (MessageType): Category of the message, forwarded unchanged.
                Defaults to MessageType.STEP.
        """
        self.ui_manager.new_system_message(message, type)
552 | 
553 |     async def update_processing_state(self, processing_state: str):
554 |         """
555 |         Update the processing state of the overlay.
556 | 
557 |         Args:
558 |             is_processing (str): "init", "processing", "done"
559 |         """
560 |         page = await self.get_current_page()
561 | 
562 |         await self.ui_manager.update_processing_state(processing_state, page)
563 | 
564 |     async def command_completed(
565 |         self, command: str, elapsed_time: Union[float, None] = None
566 |     ):
567 |         """
568 |         Notify the overlay that the command has been completed.
569 |         """
570 |         logger.debug(
571 |             f'Command "{command}" has been completed. Focusing on the overlay input if it is open.'
572 |         )
573 |         page = await self.get_current_page()
574 |         await self.ui_manager.command_completed(page, command, elapsed_time)
575 | 


--------------------------------------------------------------------------------
/sentient/task_instructions/task_instructions.txt:
--------------------------------------------------------------------------------
1 | 
2 | 1. Directly go to youtube.com rather than searching for the song on google!
3 | 


--------------------------------------------------------------------------------
/sentient/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sentient-engineering/sentient/43dc0b1259ecca3f2560572704878322b02bdf66/sentient/utils/__init__.py


--------------------------------------------------------------------------------
/sentient/utils/_pydantic.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Dict, Tuple, Union, get_args
  2 | 
  3 | from pydantic import BaseModel
  4 | from pydantic.version import VERSION as PYDANTIC_VERSION
  5 | from typing_extensions import get_origin
  6 | 
  7 | __all__ = (
  8 |     "JsonSchemaValue",
  9 |     "model_dump",
 10 |     "model_dump_json",
 11 |     "type2schema",
 12 |     "evaluate_forwardref",
 13 | )
 14 | 
 15 | PYDANTIC_V1 = PYDANTIC_VERSION.startswith("1.")
 16 | 
 17 | if not PYDANTIC_V1:
 18 |     from pydantic import TypeAdapter
 19 |     from pydantic._internal._typing_extra import (
 20 |         eval_type_lenient as evaluate_forwardref,
 21 |     )
 22 |     from pydantic.json_schema import JsonSchemaValue
 23 | 
 24 |     def type2schema(t: Any) -> JsonSchemaValue:
 25 |         """Convert a type to a JSON schema
 26 | 
 27 |         Args:
 28 |             t (Type): The type to convert
 29 | 
 30 |         Returns:
 31 |             JsonSchemaValue: The JSON schema
 32 |         """
 33 |         return TypeAdapter(t).json_schema()
 34 | 
 35 |     def model_dump(model: BaseModel) -> Dict[str, Any]:
 36 |         """Convert a pydantic model to a dict
 37 | 
 38 |         Args:
 39 |             model (BaseModel): The model to convert
 40 | 
 41 |         Returns:
 42 |             Dict[str, Any]: The dict representation of the model
 43 | 
 44 |         """
 45 |         return model.model_dump()
 46 | 
 47 |     def model_dump_json(model: BaseModel) -> str:
 48 |         """Convert a pydantic model to a JSON string
 49 | 
 50 |         Args:
 51 |             model (BaseModel): The model to convert
 52 | 
 53 |         Returns:
 54 |             str: The JSON string representation of the model
 55 |         """
 56 |         return model.model_dump_json()
 57 | 
 58 | 
 59 | # Remove this once we drop support for pydantic 1.x
 60 | else:  # pragma: no cover
 61 |     from pydantic import TypeAdapter
 62 |     from pydantic.typing import (
 63 |         evaluate_forwardref as evaluate_forwardref,  # type: ignore[no-redef]
 64 |     )
 65 | 
 66 |     JsonSchemaValue = Dict[str, Any]  # type: ignore[misc]
 67 | 
 68 |     def type2schema(t: Any) -> JsonSchemaValue:
 69 |         """Convert a type to a JSON schema
 70 | 
 71 |         Args:
 72 |             t (Type): The type to convert
 73 | 
 74 |         Returns:
 75 |             JsonSchemaValue: The JSON schema
 76 |         """
 77 |         if PYDANTIC_V1:
 78 |             if t is None:
 79 |                 return {"type": "null"}
 80 |             elif get_origin(t) is Union:
 81 |                 return {"anyOf": [type2schema(tt) for tt in get_args(t)]}
 82 |             elif get_origin(t) in [Tuple, tuple]:
 83 |                 prefixItems = [type2schema(tt) for tt in get_args(t)]
 84 |                 return {
 85 |                     "maxItems": len(prefixItems),
 86 |                     "minItems": len(prefixItems),
 87 |                     "prefixItems": prefixItems,
 88 |                     "type": "array",
 89 |                 }
 90 | 
 91 |         d = TypeAdapter.json_schema(t)
 92 |         if "title" in d:
 93 |             d.pop("title")
 94 |         if "description" in d:
 95 |             d.pop("description")
 96 | 
 97 |         return d
 98 | 
 99 |     def model_dump(model: BaseModel) -> Dict[str, Any]:
100 |         """Convert a pydantic model to a dict
101 | 
102 |         Args:
103 |             model (BaseModel): The model to convert
104 | 
105 |         Returns:
106 |             Dict[str, Any]: The dict representation of the model
107 | 
108 |         """
109 |         return model.dict()
110 | 
111 |     def model_dump_json(model: BaseModel) -> str:
112 |         """Convert a pydantic model to a JSON string
113 | 
114 |         Args:
115 |             model (BaseModel): The model to convert
116 | 
117 |         Returns:
118 |             str: The JSON string representation of the model
119 |         """
120 |         return model.json()
121 | 


--------------------------------------------------------------------------------
/sentient/utils/cli_helper.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from asyncio import Future
 3 | from typing import Dict, List
 4 | 
 5 | 
 6 | def async_input(prompt: str) -> Future:  # type: ignore
 7 |     """
 8 |     Display a prompt to the user and wait for input in an asynchronous manner.
 9 | 
10 |     Parameters:
11 |     - prompt: The message to display to the user.
12 | 
13 |     Returns:
14 |     - A Future object that will be fulfilled with the user's input.
15 |     """
16 |     loop = asyncio.get_event_loop()
17 |     return loop.run_in_executor(None, input, prompt)
18 | 
19 | 
async def answer_questions_over_cli(questions: List[str]) -> Dict[str, str]:
    """
    Asks each question over the command line, one at a time, and collects the user's responses.

    A separator line is printed before the first question and after the last answer.

    Parameters:
    - questions: A list of questions to ask the user, e.g., ["What is your favorite site?", "What do you want to search for?"].

    Returns:
    - A dictionary where each key is a question and each value is the user's response.
    """
    separator = "*********************************"
    print(separator)
    # Awaiting inside the comprehension keeps the prompts strictly sequential.
    answers: Dict[str, str] = {
        question: await async_input("Question: " + str(question) + " : ")
        for question in questions
    }
    print(separator)
    return answers
36 | 


--------------------------------------------------------------------------------
/sentient/utils/dom_helper.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from typing import List, Optional
 3 | 
 4 | from playwright.async_api import ElementHandle, Page
 5 | 
 6 | from sentient.utils.logger import logger
 7 | 
 8 | 
 9 | async def wait_for_non_loading_dom_state(page: Page, max_wait_millis: int):
10 |     max_wait_seconds = max_wait_millis / 1000
11 |     end_time = asyncio.get_event_loop().time() + max_wait_seconds
12 |     while asyncio.get_event_loop().time() < end_time:
13 |         dom_state = await page.evaluate("document.readyState")
14 |         if dom_state != "loading":
15 |             logger.debug(f"DOM state is not 'loading': {dom_state}")
16 |             break  # Exit the loop if the DOM state is not 'loading'
17 | 
18 |         await asyncio.sleep(0.05)
19 | 
20 | 
async def get_element_outer_html(
    element: ElementHandle, page: Page, element_tag_name: Optional[str] = None
) -> str:
    """
    Build the opening tag of an HTML element from a fixed set of attributes.

    Despite the function's name, only the opening tag is produced, not the
    element's full outer HTML. Attributes that are missing or empty are left
    out; attribute values are inserted as-is.

    Args:
        element (ElementHandle): The element to retrieve the opening tag for.
        page (Page): The page object associated with the element.
        element_tag_name (str, optional): The tag name of the element. Defaults to None. If not passed, it will be retrieved from the element.

    Returns:
        str: The opening tag of the HTML element, including a select set of attributes.
    """
    if element_tag_name:
        tag_name: str = element_tag_name
    else:
        tag_name = await page.evaluate(
            "element => element.tagName.toLowerCase()", element
        )

    attributes_of_interest: List[str] = [
        "id",
        "name",
        "aria-label",
        "placeholder",
        "href",
        "src",
        "aria-autocomplete",
        "role",
        "type",
        "data-testid",
        "value",
        "selected",
        "aria-labelledby",
        "aria-describedby",
        "aria-haspopup",
    ]

    pieces: List[str] = [f"<{tag_name}"]
    for attr_name in attributes_of_interest:
        attr_value: str = await element.get_attribute(attr_name)  # type: ignore
        if not attr_value:
            continue
        pieces.append(f' {attr_name}="{attr_value}"')
    pieces.append(">")

    return "".join(pieces)
67 | 


--------------------------------------------------------------------------------
/sentient/utils/dom_mutation_observer.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import json
 3 | from typing import Callable, List  # noqa: UP035
 4 | 
 5 | from playwright.async_api import Page
 6 | 
# Fetch the event loop once, at import time.
# NOTE(review): `loop` is not referenced anywhere in the visible part of this
# module; confirm whether anything still relies on it.
loop = asyncio.get_event_loop()

# Registry of callbacks that dom_mutation_change_detected() invokes for every
# batch of detected DOM changes; managed through subscribe()/unsubscribe().
DOM_change_callback: List[Callable[[str], None]] = []
11 | 
12 | 
def subscribe(callback: Callable[[str], None]) -> None:
    """Register ``callback`` to be invoked for every batch of detected DOM changes.

    The callback is appended to the module-level registry; subscribing the
    same callback twice registers it twice.
    """
    DOM_change_callback.append(callback)
15 | 
16 | 
def unsubscribe(callback: Callable[[str], None]) -> None:
    """Remove the first registration of ``callback`` from the registry.

    Raises:
        ValueError: If ``callback`` is not currently subscribed.
    """
    DOM_change_callback.remove(callback)
19 | 
20 | 
async def add_mutation_observer(page: Page):
    """
    Adds a mutation observer to the page to detect changes in the DOM.
    When changes are detected, the observer calls the dom_mutation_change_detected function in the browser context.
    These changes can be received by individual skills by subscribing a callback via subscribe().

    The observer reports two kinds of change, each as a {tag, content} entry:
    - nodes added to the DOM that have non-empty text (SCRIPT/NOSCRIPT/STYLE nodes and anything inside
      #agentDriveAutoOverlay are ignored);
    - text (characterData) changes whose parent element is not styled with display: none.

    Attribute changes are not observed, so a change in the style or class of an existing node
    (e.g. toggling visibility of a hidden node) is not detected.

    Added elements that expose no innerText (for example SVG elements) are treated as having no text;
    without that guard the observer callback would throw and drop the whole batch of changes.
    """

    await page.evaluate("""
        console.log('Adding a mutation observer for DOM changes');
        new MutationObserver((mutationsList, observer) => {
            let changes_detected = [];
            for(let mutation of mutationsList) {
                if (mutation.type === 'childList') {
                    let allAddedNodes=mutation.addedNodes;
                    for(let node of allAddedNodes) {
                        if(node.tagName && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.tagName) && !node.closest('#agentDriveAutoOverlay')) {
                            let visibility=true;
                            let content = (node.innerText || '').trim();
                            if(visibility && content){
                                changes_detected.push({tag: node.tagName, content: content});
                            }
                        }
                    }
                } else if (mutation.type === 'characterData') {
                    let node = mutation.target;
                    if(node.parentNode && !['SCRIPT', 'NOSCRIPT', 'STYLE'].includes(node.parentNode.tagName) && !node.parentNode.closest('#agentDriveAutoOverlay')) {
                        let visibility=true;
                        let content = node.data.trim();
                        if(visibility && content && window.getComputedStyle(node.parentNode).display !== 'none'){
                            if(content && !changes_detected.some(change => change.content.includes(content))) {
                                changes_detected.push({tag: node.parentNode.tagName, content: content});
                            }
                        }
                    }
                }
            }
            if(changes_detected.length > 0) {
                window.dom_mutation_change_detected(JSON.stringify(changes_detected));
            }
        }).observe(document, {subtree: true, childList: true, characterData: true});
        """)
67 | 
68 | 
async def handle_navigation_for_mutation_observer(page: Page):
    """Install the DOM mutation observer on ``page``.

    Thin wrapper around add_mutation_observer().
    """
    await add_mutation_observer(page)
71 | 
72 | 
async def dom_mutation_change_detected(changes_detected: str):
    """
    Receives DOM changes reported by the page and emits them to all subscribed callbacks.
    The changes_detected is a string in JSON format containing the tag and content of each change.
    Callbacks receive the parsed value (a list of dicts), not the raw string.

    e.g.  The following will be detected when autocomplete recommendations show up when one types Nelson Mandela on google search
    [{'tag': 'SPAN', 'content': 'nelson mandela wikipedia'}, {'tag': 'SPAN', 'content': 'nelson mandela movies'}]

    Coroutine-function callbacks are awaited one after another; regular functions are called directly.
    An empty list of changes is ignored.
    """
    # Raw tab/newline characters are stripped before parsing, as before.
    changes = json.loads(changes_detected.replace("\t", "").replace("\n", ""))
    if not changes:
        return

    # Iterate over a snapshot: a callback (or another task running while we
    # await one) may subscribe/unsubscribe, and mutating the live list during
    # iteration would make the loop skip subscribers.
    for callback in tuple(DOM_change_callback):
        # If the callback is a coroutine function
        if asyncio.iscoroutinefunction(callback):
            await callback(changes)
        # If the callback is a regular function
        else:
            callback(changes)
91 | 


--------------------------------------------------------------------------------
/sentient/utils/extract_json.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from typing import Any, Dict
 3 | 
 4 | from sentient.utils.logger import logger
 5 | 
 6 | 
 7 | def extract_json(message: str) -> Dict[str, Any]:
 8 |     """
 9 |     Parse the response from the browser agent and return the response as a dictionary.
10 |     """
11 |     json_response = {}
12 |     # Remove Markdown code block delimiters if present
13 |     message = message.strip()
14 |     if message.startswith("```"):
15 |         message = message.split("\n", 1)[1]  # Remove the first line
16 |     if message.endswith("```"):
17 |         message = message.rsplit("\n", 1)[0]  # Remove the last line
18 | 
19 |     # Remove any leading "json" tag
20 |     if message.lstrip().startswith("json"):
21 |         message = message.lstrip()[4:].lstrip()
22 | 
23 |     try:
24 |         return json.loads(message)
25 |     except json.JSONDecodeError as e:
26 |         logger.warn(
27 |             f"LLM response was not properly formed JSON. Error: {e}. "
28 |             f'LLM response: "{message}"'
29 |         )
30 |         message = message.replace("\\n", "\n")
31 |         message = message.replace("\n", " ")  # type: ignore
32 |         if "plan" in message and "next_step" in message:
33 |             start = message.index("plan") + len("plan")
34 |             end = message.index("next_step")
35 |             json_response["plan"] = message[start:end].replace('"', "").strip()
36 |         if "next_step" in message and "terminate" in message:
37 |             start = message.index("next_step") + len("next_step")
38 |             end = message.index("terminate")
39 |             json_response["next_step"] = message[start:end].replace('"', "").strip()
40 |         if "terminate" in message and "final_response" in message:
41 |             start = message.index("terminate") + len("terminate")
42 |             end = message.index("final_response")
43 |             matched_string = message[start:end].replace('"', "").strip()
44 |             if "yes" in matched_string:
45 |                 json_response["terminate"] = "yes"
46 |             else:
47 |                 json_response["terminate"] = "no"
48 | 
49 |             start = message.index("final_response") + len("final_response")
50 |             end = len(message) - 1
51 |             json_response["final_response"] = (
52 |                 message[start:end].replace('"', "").strip()
53 |             )
54 | 
55 |         elif "terminate" in message:
56 |             start = message.index("terminate") + len("terminate")
57 |             end = len(message) - 1
58 |             matched_string = message[start:end].replace('"', "").strip()
59 |             if "yes" in matched_string:
60 |                 json_response["terminate"] = "yes"
61 |             else:
62 |                 json_response["terminate"] = "no"
63 | 
64 |     return json_response
65 | 


--------------------------------------------------------------------------------
/sentient/utils/function_utils.py:
--------------------------------------------------------------------------------
  1 | # import inspect
  2 | # from typing import Any, Callable, Dict, List, Union
  3 | 
  4 | # from typing_extensions import Annotated, get_args, get_origin
  5 | 
  6 | 
  7 | # def get_type_name(type_hint: Any) -> str:
  8 | #     if hasattr(type_hint, "__name__"):
  9 | #         return type_hint.__name__
 10 | #     if hasattr(type_hint, "_name"):
 11 | #         return type_hint._name
 12 | #     return str(type_hint).replace("typing.", "")
 13 | 
 14 | 
 15 | # def get_parameter_schema(
 16 | #     name: str, param: inspect.Parameter, type_hint: Any
 17 | # ) -> Dict[str, Any]:
 18 | #     schema = {"type": get_type_name(type_hint)}
 19 | 
 20 | #     if get_origin(type_hint) is Annotated:
 21 | #         type_hint, description = get_args(type_hint)
 22 | #         schema["description"] = description
 23 | #     else:
 24 | #         schema["description"] = name
 25 | 
 26 | #     if get_origin(type_hint) is Union:
 27 | #         schema["type"] = [get_type_name(arg) for arg in get_args(type_hint)]
 28 | #     elif get_origin(type_hint) is List:
 29 | #         item_type = get_args(type_hint)[0]
 30 | #         if get_origin(item_type) is Dict:
 31 | #             key_type, value_type = get_args(item_type)
 32 | #             schema["type"] = "array"
 33 | #             schema["items"] = {
 34 | #                 "type": "object",
 35 | #                 "additionalProperties": {"type": get_type_name(value_type)},
 36 | #             }
 37 | #         else:
 38 | #             schema["type"] = "array"
 39 | #             schema["items"] = {"type": get_type_name(item_type)}
 40 | 
 41 | #     if param.default != inspect.Parameter.empty:
 42 | #         schema["default"] = param.default
 43 | #     return schema
 44 | 
 45 | 
 46 | # def generate_tool_from_function(
 47 | #     func: Callable[..., Any], tool_description: str
 48 | # ) -> Dict[str, Any]:
 49 | #     signature = inspect.signature(func)
 50 | #     type_hints = func.__annotations__
 51 | 
 52 | #     parameters = {}
 53 | #     for name, param in signature.parameters.items():
 54 | #         type_hint = type_hints.get(name, Any)
 55 | #         parameters[name] = get_parameter_schema(name, param, type_hint)
 56 | 
 57 | #     return {
 58 | #         "type": "function",
 59 | #         "function": {
 60 | #             "name": func.__name__,
 61 | #             "description": tool_description,
 62 | #             "parameters": {
 63 | #                 "type": "object",
 64 | #                 "properties": parameters,
 65 | #                 "required": [
 66 | #                     name
 67 | #                     for name, param in signature.parameters.items()
 68 | #                     if param.default == inspect.Parameter.empty
 69 | #                 ],
 70 | #             },
 71 | #         },
 72 | #     }
 73 | 
 74 | 
 75 | import functools
 76 | import inspect
 77 | import json
 78 | from logging import getLogger
 79 | from typing import (
 80 |     Any,
 81 |     Callable,
 82 |     Dict,
 83 |     ForwardRef,
 84 |     List,
 85 |     Optional,
 86 |     Set,
 87 |     Tuple,
 88 |     Type,
 89 |     TypeVar,
 90 |     Union,
 91 | )
 92 | 
 93 | from pydantic import BaseModel, Field
 94 | from typing_extensions import Annotated, Literal, get_args, get_origin
 95 | 
 96 | from ._pydantic import (
 97 |     JsonSchemaValue,
 98 |     evaluate_forwardref,
 99 |     model_dump,
100 |     model_dump_json,
101 |     type2schema,
102 | )
103 | 
104 | logger = getLogger(__name__)
105 | 
106 | T = TypeVar("T")
107 | 
108 | 
def get_typed_annotation(annotation: Any, globalns: Dict[str, Any]) -> Any:
    """Resolve a parameter annotation to a concrete type.

    String annotations are wrapped in a ``ForwardRef`` and evaluated with
    ``globalns`` as both the global and local namespace; anything else is
    returned untouched.

    Args:
        annotation: The annotation of the parameter
        globalns: The global namespace of the function

    Returns:
        The type annotation of the parameter
    """
    if not isinstance(annotation, str):
        return annotation
    forward_ref = ForwardRef(annotation)
    return evaluate_forwardref(forward_ref, globalns, globalns)
123 | 
124 | 
def get_typed_signature(call: Callable[..., Any]) -> inspect.Signature:
    """Build a signature for ``call`` whose parameter annotations are resolved.

    Each parameter keeps its name, kind and default; only its annotation is
    passed through ``get_typed_annotation``. The return annotation is not
    carried over to the new signature.

    Args:
        call: The function to get the signature for

    Returns:
        The signature of the function with type annotations
    """
    raw_signature = inspect.signature(call)
    namespace = getattr(call, "__globals__", {})

    resolved_params = []
    for original in raw_signature.parameters.values():
        resolved_params.append(
            inspect.Parameter(
                name=original.name,
                kind=original.kind,
                default=original.default,
                annotation=get_typed_annotation(original.annotation, namespace),
            )
        )
    return inspect.Signature(resolved_params)
147 | 
148 | 
def get_typed_return_annotation(call: Callable[..., Any]) -> Any:
    """Resolve the return annotation of a callable.

    Args:
        call: The function to get the return annotation for

    Returns:
        The resolved return annotation, or None when the callable declares none
    """
    declared = inspect.signature(call).return_annotation

    if declared is not inspect.Signature.empty:
        # String annotations are evaluated against the callable's globals.
        return get_typed_annotation(declared, getattr(call, "__globals__", {}))

    return None
166 | 
167 | 
def get_param_annotations(
    typed_signature: inspect.Signature,
) -> Dict[str, Union[Annotated[Type[Any], str], Type[Any]]]:
    """Collect the annotations of the annotated parameters of a function

    Args:
        typed_signature: The signature of the function with type annotations

    Returns:
        A dictionary mapping parameter names to their annotations, in signature
        order; parameters without an annotation are left out
    """
    annotated: Dict[str, Any] = {}
    for param_name, param in typed_signature.parameters.items():
        if param.annotation is inspect.Signature.empty:
            continue
        annotated[param_name] = param.annotation
    return annotated
184 | 
185 | 
class Parameters(BaseModel):
    """Parameters of a function as defined by the OpenAI API"""

    # Schema type of the parameter container; fixed to "object".
    type: Literal["object"] = "object"
    # Maps each parameter name to its JSON schema.
    properties: Dict[str, JsonSchemaValue]
    # Names of the parameters that must be supplied.
    required: List[str]
    # Emitted as the schema's "additionalProperties" flag (declared once; the
    # previous duplicate declaration of this field was redundant).
    additionalProperties: bool
194 | 
195 | 
class Function(BaseModel):
    """A function as defined by the OpenAI API

    Holds the description, name and parameter schema of one callable, plus the
    `strict` flag.
    """

    # The descriptions below are attached to the fields through pydantic's Field.
    description: Annotated[str, Field(description="Description of the function")]
    name: Annotated[str, Field(description="Name of the function")]
    parameters: Annotated[Parameters, Field(description="Parameters of the function")]
    # Required flag with no Field metadata; get_function_schema passes True.
    strict: bool
203 | 
204 | 
class ToolFunction(BaseModel):
    """A function under tool as defined by the OpenAI API.

    Wraps a `Function` together with a fixed `type` discriminator.
    """

    # Tool kind; the only accepted (and default) value is "function".
    type: Literal["function"] = "function"
    # The wrapped function description.
    function: Annotated[Function, Field(description="Function under tool")]
210 | 
211 | 
def get_parameter_json_schema(
    k: str, v: Any, default_values: Dict[str, Any]
) -> JsonSchemaValue:
    """Build the JSON schema for a single parameter.

    The schema produced by ``type2schema`` is extended with a description and,
    for object and array schemas, with the ``properties`` /
    ``additionalProperties`` entries set below.

    Args:
        k: The name of the parameter; used as the description fallback
        v: The type annotation of the parameter
        default_values: The default values of the function's parameters
            (accepted for interface compatibility; not used here)

    Returns:
        The JSON schema of the parameter
    """
    schema = type2schema(v)

    # Use the string metadata of an Annotated type as description when present,
    # otherwise fall back to the parameter name.
    description = k
    if get_origin(v) is Annotated:
        annotated_args = get_args(v)
        if len(annotated_args) > 1 and isinstance(annotated_args[1], str):
            description = annotated_args[1]
    schema["description"] = description

    # A JSON schema is not guaranteed to carry a top-level "type" (for example a
    # schema made only of "anyOf"), so look it up defensively instead of
    # raising KeyError on such schemas.
    schema_type = schema.get("type")

    if schema_type == "object":
        schema["additionalProperties"] = False
        if "properties" not in schema:
            schema["properties"] = {}
    elif schema_type == "array":
        if "items" not in schema:
            # No item schema given: describe the items as closed, empty objects.
            schema["items"] = {
                "type": "object",
                "properties": {},
                "additionalProperties": False,
            }
        elif schema["items"].get("type") == "object":
            if "properties" not in schema["items"]:
                schema["items"]["properties"] = {}
            schema["items"]["additionalProperties"] = False

    return schema
243 | 
244 | 
def get_required_params(typed_signature: inspect.Signature) -> List[str]:
    """Get the required parameters of a function

    Args:
        typed_signature: The signature of the function as returned by inspect.signature

    Returns:
        A list of the names of the parameters that have no default value, in
        signature order
    """
    # The "no default" marker is a sentinel, so compare by identity: an equality
    # test would call the default value's own __eq__, which may return anything.
    return [
        name
        for name, param in typed_signature.parameters.items()
        if param.default is inspect.Signature.empty
    ]
259 | 
260 | 
def get_default_values(typed_signature: inspect.Signature) -> Dict[str, Any]:
    """Get default values of parameters of a function

    Args:
        typed_signature: The signature of the function as returned by inspect.signature

    Returns:
        A dictionary mapping the name of every parameter that has a default
        value to that value
    """
    # The "no default" marker is a sentinel, so compare by identity: an
    # inequality test would call the default value's own __ne__/__eq__.
    return {
        name: param.default
        for name, param in typed_signature.parameters.items()
        if param.default is not inspect.Signature.empty
    }
275 | 
276 | 
def get_parameters(
    required: List[str],
    param_annotations: Dict[str, Union[Annotated[Type[Any], str], Type[Any]]],
    default_values: Dict[str, Any],
) -> Parameters:
    """Build the ``Parameters`` schema object for a function.

    Args:
        required: The required parameters of the function (accepted for
            interface compatibility; every emitted property is marked required)
        param_annotations: The type annotations of the function's parameters
        default_values: The default values of the function's parameters; passed
            through to ``get_parameter_json_schema``

    Returns:
        A ``Parameters`` model listing one property per annotated parameter, all
        of them required, with ``additionalProperties`` set to False
    """
    properties: Dict[str, Any] = {}
    for k, v in param_annotations.items():
        if v is inspect.Signature.empty:
            continue

        # Split Annotated[T, meta, ...] into the bare type and its description.
        if get_origin(v) is Annotated:
            annotated_args = get_args(v)
            v_type = annotated_args[0]
            v_desc = annotated_args[1] if len(annotated_args) > 1 else k
        else:
            v_type, v_desc = v, k

        # get_origin() reports the runtime class, i.e. `list` for `List[int]`,
        # never `typing.List`; comparing against `typing.List` made this branch
        # unreachable. A bare, unparameterized list has no item type and is
        # handled by the generic branch.
        item_args = get_args(v_type) if get_origin(v_type) is list else ()

        if item_args:
            properties[k] = {
                "type": "array",
                "items": get_parameter_json_schema(k, item_args[0], default_values),
                "description": v_desc,
            }
        else:
            properties[k] = get_parameter_json_schema(k, v_type, default_values)
            properties[k]["description"] = v_desc

    return Parameters(
        properties=properties,
        required=list(properties),  # All properties are required
        additionalProperties=False,
    )
308 | 
309 | 
def get_missing_annotations(
    typed_signature: inspect.Signature, required: List[str]
) -> Tuple[Set[str], Set[str]]:
    """Find the parameters of a function that have no annotation

    The unannotated parameters are split by whether they appear in `required`.

    Args:
        typed_signature: The signature of the function with type annotations
        required: The required parameters of the function

    Returns:
        A pair of sets: the unannotated parameters that are listed in
        `required`, and the unannotated parameters that are not
    """
    unannotated = set()
    for param_name, param in typed_signature.parameters.items():
        if param.annotation is inspect.Signature.empty:
            unannotated.add(param_name)

    required_names = set(required)
    missing = unannotated & required_names
    unannotated_with_default = unannotated - required_names
    return missing, unannotated_with_default
331 | 
332 | 
def get_function_schema(
    f: Callable[..., Any], *, name: Optional[str] = None, description: str
) -> Dict[str, Any]:
    """Get a JSON schema for a function as defined by the OpenAI API

    Args:
        f: The function to get the JSON schema for
        name: The name to publish for the function; `f.__name__` is used when
            this is None or empty
        description: The description of the function

    Returns:
        The result of dumping a `ToolFunction` model whose function part carries
        the description, the name, the parameter schema and `strict=True`

    Raises:
        TypeError: If a parameter without a default value is not annotated

    Notes:
        A warning is logged when the return type is not annotated, and another
        one when parameters with default values are not annotated.
    """
    typed_signature = get_typed_signature(f)
    required = get_required_params(typed_signature)
    default_values = get_default_values(typed_signature)
    param_annotations = get_param_annotations(typed_signature)
    return_annotation = get_typed_return_annotation(f)
    missing, unannotated_with_default = get_missing_annotations(
        typed_signature, required
    )

    if return_annotation is None:
        logger.warning(
            f"The return type of the function '{f.__name__}' is not annotated. Although annotating it is "
            "optional, the function should return either a string, a subclass of 'pydantic.BaseModel'."
        )

    if unannotated_with_default:
        quoted_defaults = ", ".join(
            f"'{k}'" for k in sorted(unannotated_with_default)
        )
        logger.warning(
            f"The following parameters of the function '{f.__name__}' with default values are not annotated: "
            f"{quoted_defaults}."
        )

    # Unannotated parameters without defaults cannot be described in the schema.
    if missing:
        quoted_missing = ", ".join(f"'{k}'" for k in sorted(missing))
        raise TypeError(
            f"All parameters of the function '{f.__name__}' without default values must be annotated. "
            f"The annotations are missing for the following parameters: {quoted_missing}"
        )

    tool = ToolFunction(
        function=Function(
            description=description,
            name=name or f.__name__,
            parameters=get_parameters(
                required, param_annotations, default_values=default_values
            ),
            strict=True,
        )
    )

    return model_dump(tool)
417 | 
418 | 
def get_load_param_if_needed_function(
    t: Any,
) -> Optional[Callable[[Dict[str, Any], Type[BaseModel]], BaseModel]]:
    """Get a loader for a parameter whose type is a Pydantic model

    Args:
        t: The type annotation of the parameter; `Annotated[...]` wrappers are
            looked through

    Returns:
        A function `(values, model_type) -> model_type(**values)` when the
        annotation is a subclass of `BaseModel`, otherwise None

    """
    # Strip Annotated[...] layers until the underlying type is reached.
    while get_origin(t) is Annotated:
        t = get_args(t)[0]

    if not (isinstance(t, type) and issubclass(t, BaseModel)):
        return None

    def load_base_model(v: Dict[str, Any], t: Type[BaseModel]) -> BaseModel:
        return t(**v)

    return load_base_model
438 | 
439 | 
def load_basemodels_if_needed(func: Callable[..., Any]) -> Callable[..., Any]:
    """A decorator to load the parameters of a function if they are Pydantic models

    Keyword arguments whose annotation is a Pydantic model are passed through the
    loader returned by `get_load_param_if_needed_function` before the wrapped
    function is called. Positional arguments are forwarded untouched.

    Args:
        func: The function with annotated parameters

    Returns:
        A function that loads the parameters before calling the original
        function; it is a coroutine function when `func` is one

    """
    # get the type annotations of the parameters
    typed_signature = get_typed_signature(func)
    param_annotations = get_param_annotations(typed_signature)

    # keep only the parameters that actually need a loader
    loaders = {}
    for param_name, annotation in param_annotations.items():
        loader = get_load_param_if_needed_function(annotation)
        if loader is not None:
            loaders[param_name] = loader

    def _load_kwargs(kwargs: Dict[str, Any]) -> None:
        """Replace, in place, the model-typed keyword arguments that were passed."""
        for param_name, loader in loaders.items():
            # A model-typed parameter may be passed positionally or left to its
            # default; in that case there is nothing to convert (indexing
            # kwargs unconditionally raised KeyError here).
            if param_name in kwargs:
                kwargs[param_name] = loader(
                    kwargs[param_name], param_annotations[param_name]
                )

    @functools.wraps(func)
    def _load_parameters_if_needed(*args: Any, **kwargs: Any) -> Any:
        _load_kwargs(kwargs)
        return func(*args, **kwargs)

    @functools.wraps(func)
    async def _a_load_parameters_if_needed(*args: Any, **kwargs: Any) -> Any:
        _load_kwargs(kwargs)
        return await func(*args, **kwargs)

    if inspect.iscoroutinefunction(func):
        return _a_load_parameters_if_needed
    return _load_parameters_if_needed
487 | 
488 | 
def serialize_to_str(x: Any) -> str:
    """Convert a value to a string.

    Args:
        x: The value to serialize

    Returns:
        `x` itself when it is already a string, the result of `model_dump_json`
        for a Pydantic model, and `json.dumps(x)` for anything else
    """
    if isinstance(x, str):
        return x
    if isinstance(x, BaseModel):
        return model_dump_json(x)
    return json.dumps(x)
496 | 


--------------------------------------------------------------------------------
/sentient/utils/get_detailed_accessibility_tree.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import re
  4 | import traceback
  5 | from typing import Dict, List, Optional
  6 | 
  7 | from playwright.async_api import Page
  8 | from typing_extensions import Annotated, Any
  9 | 
 10 | from sentient.config.config import SOURCE_LOG_FOLDER_PATH
 11 | from sentient.core.web_driver.playwright import PlaywrightManager
 12 | from sentient.utils.logger import logger
 13 | 
 14 | space_delimited_mmid = re.compile(r"^[\d ]+$")
 15 | 
 16 | 
 17 | def is_space_delimited_mmid(s: str) -> bool:
 18 |     """
 19 |     Check if the given string matches the the mmid pattern of number space repeated.
 20 | 
 21 |     Parameters:
 22 |     - s (str): The string to check against the pattern.
 23 | 
 24 |     Returns:
 25 |     - bool: True if the string matches the pattern, False otherwise.
 26 |     """
 27 |     # Use fullmatch() to ensure the entire string matches the pattern
 28 |     return bool(space_delimited_mmid.fullmatch(s))
 29 | 
 30 | 
 31 | async def __inject_attributes(page: Page):
 32 |     """
 33 |     Injects 'mmid' and 'aria-keyshortcuts' into all DOM elements. If an element already has an 'aria-keyshortcuts',
 34 |     it renames it to 'orig-aria-keyshortcuts' before injecting the new 'aria-keyshortcuts'
 35 |     This will be captured in the accessibility tree and thus make it easier to reconcile the tree with the DOM.
 36 |     'aria-keyshortcuts' is choosen because it is not widely used aria attribute.
 37 |     """
 38 | 
 39 |     last_mmid = await page.evaluate("""() => {
 40 |         const allElements = document.querySelectorAll('*');
 41 |         let id = 0;
 42 |         allElements.forEach(element => {
 43 |             const origAriaAttribute = element.getAttribute('aria-keyshortcuts');
 44 |             const mmid = `${++id}`;
 45 |             element.setAttribute('mmid', mmid);
 46 |             element.setAttribute('aria-keyshortcuts', mmid);
 47 |             //console.log(`Injected 'mmid'into element with tag: ${element.tagName} and mmid: ${mmid}`);
 48 |             if (origAriaAttribute) {
 49 |                 element.setAttribute('orig-aria-keyshortcuts', origAriaAttribute);
 50 |             }
 51 |         });
 52 |         return id;
 53 |     }""")
 54 |     logger.debug(f"Added MMID into {last_mmid} elements")
 55 | 
 56 | 
async def __fetch_dom_info(
    page: Page, accessibility_tree: Dict[str, Any], only_input_fields: bool
):
    """
    Iterates over the accessibility tree, fetching additional information from the DOM based on 'mmid',
    and constructs a new JSON structure with detailed information.

    The nodes of `accessibility_tree` are updated in place (children before their parent) and the
    tree is then handed to `__prune_tree`, whose result is returned.

    Args:
        page (Page): The page object representing the web page.
        accessibility_tree (Dict[str, Any]): The accessibility tree JSON structure.
        only_input_fields (bool): Flag indicating whether to include only input fields in the new JSON structure.

    Returns:
        Dict[str, Any]: The pruned tree with detailed information from the DOM.
    """

    logger.debug("Reconciling the Accessibility Tree with the DOM")
    # Define the attributes to fetch for each element
    attributes = [
        "name",
        "aria-label",
        "placeholder",
        "mmid",
        "id",
        "for",
        "data-testid",
    ]
    backup_attributes = []  # if the attributes are not found, then try to get these attributes
    # Elements with these tag names are skipped by the in-page script (it returns null for them)
    tags_to_ignore = [
        "head",
        "style",
        "script",
        "link",
        "meta",
        "noscript",
        "template",
        "iframe",
        "g",
        "main",
        "c-wiz",
        "svg",
        "path",
    ]
    # Keys removed from every processed node once its reconciliation is done
    attributes_to_delete = ["level", "multiline", "haspopup", "id", "for"]
    # Elements with these DOM ids are skipped by the in-page script
    ids_to_ignore = ["agentDriveAutoOverlay"]

    # Recursive function to process each node in the accessibility tree
    async def process_node(node: Dict[str, Any]):
        # Children are processed first; whatever process_node returns for them is discarded,
        # only the in-place changes to the child dictionaries matter.
        if "children" in node:
            for child in node["children"]:
                await process_node(child)

        # Use the 'keyshortcuts' value of the accessibility node as 'mmid'
        mmid_temp: str = node.get("keyshortcuts")  # type: ignore

        # If the value has multiple mmids, take the last one
        if mmid_temp and is_space_delimited_mmid(mmid_temp):
            # TODO: consider if we should grab each of the mmids and process them separately as separate nodes copying this node's attributes
            mmid_temp = mmid_temp.split(" ")[-1]

        # focusing on nodes with mmid, which is the attribute we inject
        try:
            mmid = int(mmid_temp)
        except (ValueError, TypeError):
            # No usable numeric mmid (missing or non-numeric value): leave the node untouched
            # logger.error(f"'name attribute contains \"{node.get('name')}\", which is not a valid numeric mmid. Adding node as is: {node}")
            return node.get("name")

        # Menu items are left as they are; note that this lookup requires the 'role' key to exist
        if node["role"] == "menuitem":
            return node.get("name")

        if node.get("role") == "dialog" and node.get("modal") == True:  # noqa: E712
            node["important information"] = (
                "This is a modal dialog. Please interact with this dialog and close it to be able to interact with the full page (e.g. by pressing the close button or selecting an option)."
            )

        # mmid is an int at this point, so the else branch below is only taken when it is 0
        if mmid:
            # Determine if we need to fetch 'innerText' based on the absence of 'children' in the accessibility node
            should_fetch_inner_text = "children" not in node

            # In-page script: looks the element up by its mmid and returns a dictionary of selected
            # attributes, or null when the element is missing, ignored, or has nothing worth reporting
            js_code = """
            (input_params) => {
                const should_fetch_inner_text = input_params.should_fetch_inner_text;
                const mmid = input_params.mmid;
                const attributes = input_params.attributes;
                const tags_to_ignore = input_params.tags_to_ignore;
                const ids_to_ignore = input_params.ids_to_ignore;

                const element = document.querySelector(`[mmid="${mmid}"]`);

                if (!element) {
                    console.log(`No element found with mmid: ${mmid}`);
                    return null;
                }

                if (ids_to_ignore.includes(element.id)) {
                    console.log(`Ignoring element with id: ${element.id}`, element);
                    return null;
                }
                //Ignore "option" because it would have been processed with the select element
                if (tags_to_ignore.includes(element.tagName.toLowerCase()) || element.tagName.toLowerCase() === "option") return null;

                let attributes_to_values = {
                    'tag': element.tagName.toLowerCase() // Always include the tag name
                };

                // If the element is an input, include its type as well
                if (element.tagName.toLowerCase() === 'input') {
                    attributes_to_values['tag_type'] = element.type; // This will capture 'checkbox', 'radio', etc.
                }
                else if (element.tagName.toLowerCase() === 'select') {
                    attributes_to_values["mmid"] = element.getAttribute('mmid');
                    attributes_to_values["role"] = "combobox";
                    attributes_to_values["options"] = [];

                    for (const option of element.options) {
                        let option_attributes_to_values = {
                            "mmid": option.getAttribute('mmid'),
                            "text": option.text,
                            "value": option.value,
                            "selected": option.selected
                        };
                        attributes_to_values["options"].push(option_attributes_to_values);
                    }
                    return attributes_to_values;
                }

                for (const attribute of attributes) {
                    let value = element.getAttribute(attribute);

                    if(value){
                        /*
                        if(attribute === 'href'){
                            value = value.split('?')[0]
                        }
                        */
                        attributes_to_values[attribute] = value;
                    }
                }

                if (should_fetch_inner_text && element.innerText) {
                    attributes_to_values['description'] = element.innerText;
                }

                let role = element.getAttribute('role');
                if(role==='listbox' || element.tagName.toLowerCase()=== 'ul'){
                    let children=element.children;
                    let filtered_children = Array.from(children).filter(child => child.getAttribute('role') === 'option');
                    console.log("Listbox or ul found: ", filtered_children);
                    let attributes_to_include = ['mmid', 'role', 'aria-label','value'];
                    attributes_to_values["additional_info"]=[]
                    for (const child of children) {
                        let children_attributes_to_values = {};

                        for (let attr of child.attributes) {
                            // If the attribute is not in the predefined list, add it to children_attributes_to_values
                            if (attributes_to_include.includes(attr.name)) {
                                children_attributes_to_values[attr.name] = attr.value;
                            }
                        }

                        attributes_to_values["additional_info"].push(children_attributes_to_values);
                    }
                }
                // Check if attributes_to_values contains more than just 'name', 'role', and 'mmid'
                const keys = Object.keys(attributes_to_values);
                const minimalKeys = ['tag', 'mmid'];
                const hasMoreThanMinimalKeys = keys.length > minimalKeys.length || keys.some(key => !minimalKeys.includes(key));

                if (!hasMoreThanMinimalKeys) {
                    //If there were no attributes found, then try to get the backup attributes
                    for (const backupAttribute of input_params.backup_attributes) {
                        let value = element.getAttribute(backupAttribute);
                        if(value){
                            attributes_to_values[backupAttribute] = value;
                        }
                    }

                    //if even the backup attributes are not found, then return null, which will cause this element to be skipped
                    if(Object.keys(attributes_to_values).length <= minimalKeys.length) {
                        if (element.tagName.toLowerCase() === 'button') {
                                attributes_to_values["mmid"] = element.getAttribute('mmid');
                                attributes_to_values["role"] = "button";
                                attributes_to_values["additional_info"] = [];
                                let children=element.children;
                                let attributes_to_exclude = ['width', 'height', 'path', 'class', 'viewBox', 'mmid']

                                // Check if the button has no text and no attributes
                                if (element.innerText.trim() === '') {

                                    for (const child of children) {
                                        let children_attributes_to_values = {};

                                        for (let attr of child.attributes) {
                                            // If the attribute is not in the predefined list, add it to children_attributes_to_values
                                            if (!attributes_to_exclude.includes(attr.name)) {
                                                children_attributes_to_values[attr.name] = attr.value;
                                            }
                                        }

                                        attributes_to_values["additional_info"].push(children_attributes_to_values);
                                    }
                                    console.log("Button with no text and no attributes: ", attributes_to_values);
                                    return attributes_to_values;
                                }
                        }

                        return null; // Return null if only minimal keys are present
                    }
                }
                return attributes_to_values;
            }
            """

            # Fetch attributes and possibly 'innerText' from the DOM element by 'mmid'
            element_attributes = await page.evaluate(
                js_code,
                {
                    "mmid": mmid,
                    "attributes": attributes,
                    "backup_attributes": backup_attributes,
                    "should_fetch_inner_text": should_fetch_inner_text,
                    "tags_to_ignore": tags_to_ignore,
                    "ids_to_ignore": ids_to_ignore,
                },
            )

            if "keyshortcuts" in node:
                del node["keyshortcuts"]  # remove keyshortcuts since it is not needed

            # Stored as an int here; node.update() below replaces it with the DOM attribute
            # string whenever the script result contains an 'mmid' entry
            node["mmid"] = mmid

            # Update the node with fetched information
            if element_attributes:
                node.update(element_attributes)

                # check if 'name' and 'mmid' are the same
                if (
                    node.get("name") == node.get("mmid")
                    and node.get("role") != "textbox"
                ):
                    del node["name"]  # Remove 'name' from the node

                if (
                    "name" in node
                    and "description" in node
                    and (
                        node["name"] == node["description"]
                        or node["name"] == node["description"].replace("\n", " ")
                        or node["description"].replace("\n", "") in node["name"]
                    )
                ):
                    del node[
                        "description"
                    ]  # if the name is same as description, then remove the description to avoid duplication

                if (
                    "name" in node
                    and "aria-label" in node
                    and node["aria-label"] in node["name"]
                ):
                    del node[
                        "aria-label"
                    ]  # if the name is same as the aria-label, then remove the aria-label to avoid duplication

                if "name" in node and "text" in node and node["name"] == node["text"]:
                    del node[
                        "text"
                    ]  # if the name is same as the text, then remove the text to avoid duplication

                if (
                    node.get("tag") == "select"
                ):  # children are not needed for select menus since "options" attribute is already added
                    node.pop("children", None)
                    node.pop("role", None)
                    node.pop("description", None)

                # role and tag can have the same info. Get rid of role if it is the same as tag
                if node.get("role") == node.get("tag"):
                    del node["role"]

                # avoid duplicate aria-label
                if (
                    node.get("aria-label")
                    and node.get("placeholder")
                    and node.get("aria-label") == node.get("placeholder")
                ):
                    del node["aria-label"]

                # For links the role is dropped and the description is carried over as 'text'
                if node.get("role") == "link":
                    del node["role"]
                    if node.get("description"):
                        node["text"] = node["description"]
                        del node["description"]

                # textbox just means a text input and that is expressed well enough with the rest of the attributes returned
                # if node.get('role') == "textbox":
                #    del node['role']

                if node.get("role") == "textbox":
                    # get the id attribute of this field from the DOM
                    if "id" in element_attributes and element_attributes["id"]:
                        # find if there is an element in the DOM that has this id in aria-labelledby.
                        # NOTE(review): this script is only assigned; nothing in this function
                        # evaluates it, so the lookup currently has no effect
                        js_code = """
                        (inputParams) => {
                            let referencingElements = [];
                            const referencedElement = document.querySelector(`[aria-labelledby="${inputParams.aria_labelled_by_query_value}"]`);
                            if(referencedElement) {
                                const mmid = referencedElement.getAttribute('mmid');
                                if (mmid) {
                                    return {"mmid": mmid, "tag": referencedElement.tagName.toLowerCase()};
                                }
                            }
                            return null;
                        }
                        """
                    # textbox just means a text input and that is expressed well enough with the rest of the attributes returned
                    # del node['role']

            # remove attributes that are not needed once processing of a node is complete
            for attribute_to_delete in attributes_to_delete:
                if attribute_to_delete in node:
                    node.pop(attribute_to_delete, None)
        else:
            # Flag the node; presumably __prune_tree removes flagged nodes — TODO confirm
            logger.debug(f"No element found with mmid: {mmid}, deleting node: {node}")
            node["marked_for_deletion_by_mm"] = True

    # Process each node in the tree starting from the root
    await process_node(accessibility_tree)

    pruned_tree = __prune_tree(accessibility_tree, only_input_fields)

    logger.debug("Reconciliation complete")
    return pruned_tree
390 | 
391 | 
async def __cleanup_dom(page: Page):
    """
    Undo the 'aria-keyshortcuts' injection on every element that carries an 'mmid' attribute.

    For each such element the current 'aria-keyshortcuts' attribute is removed. When the element
    also has a non-empty 'orig-aria-keyshortcuts' attribute, that value is written back to
    'aria-keyshortcuts' and the 'orig-aria-keyshortcuts' attribute is dropped.

    Args:
        page (Page): The page whose DOM is cleaned up.
    """
    # The script runs inside the page; it is kept verbatim because it is runtime text.
    restore_script = """() => {
        const allElements = document.querySelectorAll('*[mmid]');
        allElements.forEach(element => {
            element.removeAttribute('aria-keyshortcuts');
            const origAriaLabel = element.getAttribute('orig-aria-keyshortcuts');
            if (origAriaLabel) {
                element.setAttribute('aria-keyshortcuts', origAriaLabel);
                element.removeAttribute('orig-aria-keyshortcuts');
            }
        });
    }"""

    logger.debug("Cleaning up the DOM's previous injections")
    await page.evaluate(restore_script)
    logger.debug("DOM cleanup complete")
410 | 
411 | 
def __prune_tree(
    node: Dict[str, Any], only_input_fields: bool
) -> Optional[Dict[str, Any]]:
    """
    Recursively prunes a tree starting from `node`, based on pruning conditions and handling of 'unraveling'.

    The function has two main jobs:
    1. Pruning: Remove nodes that are flagged with 'marked_for_deletion_by_mm' or that
       `__should_prune_node` rejects.
    2. Unraveling: A child flagged with 'marked_for_unravel_children' is replaced, in its parent's
       list of children, by its own children (or simply removed when it has none).

    Children lifted by an unravel are examined exactly like any other child of the parent: they are
    pruned recursively and may themselves be unraveled. (Previously the index skipped over them, so
    lifted nodes flagged for deletion or for unraveling leaked into the result untouched.)

    The tree is modified in place.

    Args:
    - node (Dict[str, Any]): The node to prune, together with everything below it.
    - only_input_fields (bool): Forwarded to `__should_prune_node`; when True, nodes that are not
      input-like (and are not the 'WebArea' root) are pruned.

    Returns:
    - Dict[str, Any] | None: `node` itself (updated in place), or None if `node` was pruned away.
    """
    if "marked_for_deletion_by_mm" in node:
        return None

    if "children" in node:
        children = node["children"]
        i = 0
        # Manual index management: the list is edited while it is being walked.
        while i < len(children):
            child = children[i]

            if "marked_for_unravel_children" in child:
                # Splice the grandchildren into the slot held by the wrapper. The index is NOT
                # advanced, so the first lifted node (or the next sibling, when nothing was
                # lifted) is processed on the following iteration.
                children[i : i + 1] = child.get("children") or []
                continue

            pruned_child = __prune_tree(child, only_input_fields)
            if pruned_child is None:
                # Removing the entry shifts the next sibling into position i.
                children.pop(i)
            else:
                children[i] = pruned_child
                i += 1

        # After processing all children, if the children array is empty, remove it
        if not children:
            del node["children"]

    # Apply the node-level conditions last, so they see the already-pruned children
    return None if __should_prune_node(node, only_input_fields) else node


def __should_prune_node(node: Dict[str, Any], only_input_fields: bool) -> bool:
    """
    Determines if a node should be pruned based on its 'role', 'tag', 'name' and remaining keys.

    A node is pruned when any of the following holds:
    - `only_input_fields` is True, the role is not 'WebArea', and the node is neither an
      'input'/'button'/'textarea' tag nor a node with role 'button';
    - the role is 'generic' and the node has no 'children' key and no non-empty 'name';
    - the role is 'separator' or 'LineBreak';
    - the node consists of exactly 'name' and 'role', unless the role is 'text' and the name still
      has at least 3 characters after removing commas, colons, newlines and surrounding whitespace.

    Args:
        node (Dict[str, Any]): The node to be evaluated.
        only_input_fields (bool): Flag indicating whether only input fields should be kept.

    Returns:
        bool: True if the node should be pruned, False otherwise.
    """
    role = node.get("role")

    # If the request is for only input fields and this is not an input field, prune the node
    if only_input_fields and role != "WebArea":
        is_input_like = (
            node.get("tag") in ("input", "button", "textarea") or role == "button"
        )
        if not is_input_like:
            return True

    # 'children' has already been deleted by the caller when it ended up empty
    if role == "generic" and "children" not in node and not node.get("name"):
        return True

    if role in ("separator", "LineBreak"):
        return True

    processed_name = ""
    raw_name = node.get("name")
    # A non-string name (e.g. None) is treated like an empty name instead of raising.
    if isinstance(raw_name, str):
        processed_name = (
            raw_name.replace(",", "").replace(":", "").replace("\n", "").strip()
        )
        if len(processed_name) < 3:
            processed_name = ""

    # A node holding nothing but name and role is dropped, except meaningful text nodes
    return (
        len(node) == 2
        and "name" in node
        and "role" in node
        and not (role == "text" and processed_name != "")
    )
535 | 
536 | 
async def get_node_dom_element(page: Page, mmid: str):
    """
    Look up, inside the page, the first element whose 'mmid' attribute equals `mmid`.

    Args:
        page (Page): The page in which the lookup script is evaluated.
        mmid (str): The value of the 'mmid' attribute to search for.

    Returns:
        Whatever `page.evaluate` yields for the result of `document.querySelector`.
        NOTE(review): how a DOM element is serialised back to Python is decided by the driver — confirm
        before relying on the returned value.
    """
    # Kept verbatim: this text is executed in the browser.
    selector_script = """
        (mmid) => {
            return document.querySelector(`[mmid="${mmid}"]`);
        }
    """
    return await page.evaluate(selector_script, mmid)
546 | 
547 | 
async def get_element_attributes(page: Page, mmid: str, attributes: List[str]):
    """
    Read a set of HTML attributes from the element whose 'mmid' attribute equals `mmid`.

    Args:
        page (Page): The page in which the lookup script is evaluated.
        mmid (str): The value of the 'mmid' attribute identifying the element.
        attributes (List[str]): Names of the attributes to read via `getAttribute`.

    Returns:
        The result of `page.evaluate`: the script returns null when no element matches, otherwise an
        object mapping every requested attribute name to its `getAttribute` result.
    """
    # Kept verbatim: this text is executed in the browser.
    read_attributes_script = """
        (inputParams) => {
            const mmid = inputParams.mmid;
            const attributes = inputParams.attributes;
            const element = document.querySelector(`[mmid="${mmid}"]`);
            if (!element) return null;  // Return null if element is not found

            let attrs = {};
            for (let attr of attributes) {
                attrs[attr] = element.getAttribute(attr);
            }
            return attrs;
        }
    """
    # Both inputs travel as one argument object, matching the script's `inputParams`.
    script_args = {"mmid": mmid, "attributes": attributes}
    return await page.evaluate(read_attributes_script, script_args)
566 | 
567 | 
async def get_dom_with_accessibility_info() -> (
    Annotated[
        Optional[Dict[str, Any]],
        "A minified representation of the HTML DOM for the current webpage",
    ]
):
    """
    Retrieves, processes, and minifies the Accessibility tree of the active page in a browser instance.
    Strictly follow the name and role tag for any interaction with the nodes.

    Returns:
    - The minified JSON content of the browser's active page.
    """
    # NOTE(review): the docstring and the Annotated description above read like tool-facing prompt
    # text; presumably they are consumed at runtime — confirm before rewording either of them.
    logger.debug("Executing Get Accessibility Tree Command")
    # Create and use the PlaywrightManager
    # NOTE(review): presumably this hands back an already-running browser rather than launching a
    # new one on every call — verify against PlaywrightManager.
    browser_manager = PlaywrightManager(browser_type="chromium", headless=False)
    page = await browser_manager.get_current_page()
    if page is None:  # type: ignore
        # Without a page there is nothing to snapshot; surface this to the caller.
        raise ValueError("No active page found")

    # Delegates with the default only_input_fields=False. The callee returns None when enriching
    # the tree fails, hence the Optional in the return annotation.
    return await do_get_accessibility_info(page)
589 | 
590 | 
async def do_get_accessibility_info(page: Page, only_input_fields: bool = False):
    """
    Build the enriched accessibility tree of `page`, saving the raw and enriched trees as JSON files.

    Steps, in order: inject the helper attributes into the DOM, take the accessibility snapshot,
    write it to 'json_accessibility_dom.json', clean the DOM up again, enrich the snapshot with DOM
    information and write the result to 'json_accessibility_dom_enriched.json'. Both files are
    placed in SOURCE_LOG_FOLDER_PATH.

    Args:
        page (Page): The page object representing the web page.
        only_input_fields (bool, optional): Forwarded to the enrichment step so that only input
            fields are kept. Defaults to False.

    Returns:
        Dict[str, Any] or None: The enriched accessibility tree, or None if enriching the tree or
        saving the enriched file raised an exception. Failures in the earlier steps are not caught.
    """

    def _dump_json(file_name: str, payload: Any) -> None:
        # Serialise `payload` into the log folder; the debug line is emitted while the file is open.
        target = os.path.join(SOURCE_LOG_FOLDER_PATH, file_name)
        with open(target, "w", encoding="utf-8") as out_file:
            out_file.write(json.dumps(payload, indent=2))
            logger.debug(f"{file_name} saved")

    await __inject_attributes(page)
    accessibility_tree: Dict[str, Any] = await page.accessibility.snapshot(
        interesting_only=True
    )  # type: ignore
    _dump_json("json_accessibility_dom.json", accessibility_tree)

    # The injected attributes are only needed for the snapshot; restore the DOM before enriching.
    await __cleanup_dom(page)

    try:
        enhanced_tree = await __fetch_dom_info(
            page, accessibility_tree, only_input_fields
        )
        logger.debug("Enhanced Accessibility Tree ready")
        _dump_json("json_accessibility_dom_enriched.json", enhanced_tree)
        return enhanced_tree
    except Exception as e:
        # Best effort: report the failure and signal it to the caller with None.
        logger.error(f"Error while fetching DOM info: {e}")
        traceback.print_exc()
        return None
639 | 


--------------------------------------------------------------------------------
/sentient/utils/logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | from typing import Union
 4 | from sentient.config.config import SOURCE_LOG_FOLDER_PATH
 5 | 
 6 | # Configure the root logger
 7 | logging.basicConfig(
 8 |     level=logging.DEBUG,
 9 |     format="[%(asctime)s] %(levelname)s {%(filename)s:%(lineno)d} - %(message)s",
10 | )
11 | 
12 | # Remove all handlers from the root logger
13 | for handler in logging.root.handlers[:]:
14 |     logging.root.removeHandler(handler)
15 | 
16 | logger = logging.getLogger(__name__)
17 | logger.addHandler(logging.FileHandler(os.path.join(SOURCE_LOG_FOLDER_PATH, "sentient.log")))
18 | logger.setLevel(logging.INFO)
19 | 
20 | # logging.getLogger("httpcore").setLevel(logging.WARNING)
21 | # logging.getLogger("httpx").setLevel(logging.WARNING)
22 | # logging.getLogger("matplotlib.pyplot").setLevel(logging.WARNING)
23 | # logging.getLogger("PIL.PngImagePlugin").setLevel(logging.WARNING)
24 | # logging.getLogger("PIL.Image").setLevel(logging.WARNING)
25 | 
26 | 
def set_log_level(level: Union[str, int]) -> None:
    """
    Change the level of this module's logger.

    Parameters:
    - level (Union[str, int]): Either a level name such as 'debug', 'info', 'warning', 'error' or
      'critical' (case-insensitive), or a numeric level such as logging.DEBUG or logging.INFO.

    Raises:
    - ValueError: If `level` is a string that does not name an integer attribute of `logging`.
    """
    # Numeric levels are handed to the logger unchanged.
    if not isinstance(level, str):
        logger.setLevel(level)
        return

    # Level names are resolved through the constants exposed by the logging module.
    level = level.upper()
    resolved = getattr(logging, level, None)
    if not isinstance(resolved, int):
        raise ValueError(f"Invalid log level: {level}")
    logger.setLevel(resolved)
42 | 


--------------------------------------------------------------------------------
/sentient/utils/message_type.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
class MessageType(Enum):
    """Closed set of message categories; each member's value is its string tag."""

    # NOTE(review): sentient/utils/ui_messagetype.py declares an enum with the same name and the
    # same members — confirm whether both definitions are needed.
    PLAN = "plan"
    STEP = "step"
    ACTION = "action"
    ANSWER = "answer"
    QUESTION = "question"
    INFO = "info"
    FINAL = "final"
    DONE = "transaction_done"  # the only member whose value is not its lowercased name
    ERROR = "error"
14 | 


--------------------------------------------------------------------------------
/sentient/utils/providers.py:
--------------------------------------------------------------------------------
  1 | from abc import ABC, abstractmethod
  2 | from typing import Dict, Any
  3 | import os
  4 | 
  5 | 
  6 | class LLMProvider(ABC):
  7 |     @abstractmethod
  8 |     def get_client_config(self) -> Dict[str, str]:
  9 |         pass
 10 | 
 11 |     @abstractmethod
 12 |     def get_provider_name(self) -> str:
 13 |         pass
 14 | 
 15 | class OpenAIProvider(LLMProvider):
 16 |     def get_client_config(self) -> Dict[str, str]:
 17 |         return {
 18 |             "api_key": os.environ.get("OPENAI_API_KEY"),
 19 |             "base_url": "https://api.openai.com/v1",
 20 |         }
 21 |     def get_provider_name(self) -> str:
 22 |         return "openai"
 23 | 
 24 | class TogetherAIProvider(LLMProvider):
 25 |     def get_client_config(self) -> Dict[str, str]:
 26 |         return {
 27 |             "api_key": os.environ.get("TOGETHER_API_KEY"),
 28 |             "base_url": "https://api.together.xyz/v1",
 29 |         } 
 30 |     def get_provider_name(self) -> str:
 31 |         return "together"
 32 | 
 33 | class OllamaProvider(LLMProvider):
 34 |     def get_client_config(self) -> Dict[str, str]:
 35 |         return {
 36 |             "api_key": "ollama",
 37 |             "base_url": "http://localhost:11434/v1/",
 38 |         }
 39 |     def get_provider_name(self) -> str:
 40 |         return "ollama"
 41 | 
 42 | class GroqProvider(LLMProvider):
 43 |     def get_client_config(self) -> Dict[str, str]:
 44 |         return {
 45 |             "api_key": os.environ.get("GROQ_API_KEY"),
 46 |         }
 47 |     def get_provider_name(self) -> str:
 48 |         return "groq"
 49 |     
 50 | class AnthropicProvider(LLMProvider):
 51 |     def get_client_config(self) -> Dict[str, str]:
 52 |         return {
 53 |             "api_key": os.environ.get("ANTHROPIC_API_KEY"),
 54 |         }
 55 |     def get_provider_name(self) -> str:
 56 |             return "anthropic"
 57 | 
 58 | class CustomProvider(LLMProvider):
 59 |     def __init__(self, base_url: str):
 60 |         self.base_url = base_url
 61 | 
 62 |     def get_client_config(self) -> Dict[str, Any]:
 63 |         return {
 64 |             "api_key": os.environ.get("CUSTOM_API_KEY"),
 65 |             "base_url": self.base_url,
 66 |         }
 67 |     
 68 |     def get_provider_name(self) -> str:
 69 |             return "custom"
 70 |     
 71 | class OpenRouterProvider(LLMProvider): 
 72 |     def get_client_config(self) -> Dict[str, str]:
 73 |         pass
 74 |     
 75 |     def get_provider_name(self) -> Dict[str, Any]: 
 76 |         return "openrouter"
 77 | 
 78 | # class GoogleProvider(LLMProvider):
 79 | #     def get_client_config(self) -> Dict[str, str]:
 80 | #         api_key = os.environ.get("GOOGLE_API_KEY")
 81 | #         os.environ['API_KEY'] = api_key
 82 | #         return {
 83 | #             "api_key": os.environ.get("GOOGLE_API_KEY"),
 84 | #         }
 85 | #     def get_provider_name(self) -> str:
 86 | #         return "google"
 87 | 
# Registry of the built-in providers, keyed by the lowercase name accepted by get_provider().
# Each value is a single instance created at import time, so every lookup of a given name
# returns the same object. The "custom" provider is absent on purpose: it needs a base URL
# and is constructed per call by get_provider().
PROVIDER_MAP = {
    "openai": OpenAIProvider(),
    "together": TogetherAIProvider(),
    "ollama": OllamaProvider(),
    "groq": GroqProvider(),
    "anthropic": AnthropicProvider(),
    "openrouter": OpenRouterProvider()
    # "google": GoogleProvider(),
}
 97 | 
 98 | def get_provider(provider_name: str, custom_base_url: str = None) -> LLMProvider:
 99 |     if provider_name.lower() == "custom":
100 |         if not custom_base_url:
101 |             raise ValueError("Custom provider requires a base_url")
102 |         return CustomProvider(custom_base_url)
103 |     else:
104 |         provider = PROVIDER_MAP.get(provider_name.lower())
105 |         if not provider:
106 |             raise ValueError(f"Unsupported provider: {provider_name}. Choose one of the supported providers: {', '.join(PROVIDER_MAP.keys())}")
107 |         return provider


--------------------------------------------------------------------------------
/sentient/utils/ui_messagetype.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
class MessageType(Enum):
    """Closed set of message categories; each member's value is its string tag."""

    # NOTE(review): sentient/utils/message_type.py declares an enum with the same name and the
    # same members — confirm whether both definitions are needed.
    PLAN = "plan"
    STEP = "step"
    ACTION = "action"
    ANSWER = "answer"
    QUESTION = "question"
    INFO = "info"
    FINAL = "final"
    DONE = "transaction_done"  # the only member whose value is not its lowercased name
    ERROR = "error"
14 | 


--------------------------------------------------------------------------------