├── .github └── workflows │ └── publish.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── evals ├── eval_generate_json.py └── eval_output_model.py ├── index ├── __init__.py ├── agent │ ├── agent.py │ ├── demo_images │ │ ├── complex_layout_highlight.png │ │ ├── complex_layout_small_elements.png │ │ ├── loading.png │ │ └── scroll.png │ ├── message_manager.py │ ├── models.py │ ├── prompts.py │ └── utils.py ├── browser │ ├── browser.py │ ├── detector.py │ ├── findVisibleInteractiveElements.js │ ├── fonts │ │ └── OpenSans-Medium.ttf │ ├── models.py │ └── utils.py ├── cli.py ├── controller │ ├── controller.py │ └── default_actions.py └── llm │ ├── llm.py │ └── providers │ ├── __init__.py │ ├── anthropic.py │ ├── anthropic_bedrock.py │ ├── gemini.py │ ├── gemini_vertex.py │ ├── groq.py │ └── openai.py ├── pyproject.toml ├── static ├── logo_dark.png ├── logo_light.png └── traces.png ├── tests └── agent │ └── test_utils.py └── uv.lock /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | publish: 13 | runs-on: ubuntu-latest 14 | environment: 15 | name: pypi 16 | url: https://pypi.org/p/lmnr/ 17 | permissions: 18 | id-token: write 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Install uv 22 | uses: astral-sh/setup-uv@v4 23 | - name: Set up Python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: '3.10' 27 | - name: Install the project 28 | run: uv sync --all-extras --dev 29 | - name: Verify tag matches package version 30 | run: | 31 | # Extract version from tag (remove 'v' prefix) 32 | TAG_VERSION=${GITHUB_REF#refs/tags/v} 33 | # Extract version from pyproject.toml 34 | PACKAGE_VERSION=$(grep -oP '(?<=version = ")[^"]+' pyproject.toml) 35 | echo "Tag version: $TAG_VERSION" 36 | echo "Package version: $PACKAGE_VERSION" 37 | # 
Check if versions match 38 | if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then 39 | echo "Error: Tag version ($TAG_VERSION) does not match package version ($PACKAGE_VERSION)" 40 | exit 1 41 | fi 42 | - name: Build package 43 | run: uv build 44 | - name: Publish package 45 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 112 | .pdm.toml 113 | .pdm-python 114 | .pdm-build/ 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.codeActionsOnSave": { 4 | "source.fixAll": "explicit", 5 | "source.organizeImports": "explicit" 6 | }, 7 | "editor.defaultFormatter": "charliermarsh.ruff" 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2025] [LMNR AI, Inc.] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![GitHub stars](https://img.shields.io/github/stars/lmnr-ai/index?style=social) 2 | ![Static Badge](https://img.shields.io/badge/Y%20Combinator-S24-orange) 3 | ![X (formerly Twitter) Follow](https://img.shields.io/twitter/follow/lmnrai) 4 | ![Static Badge](https://img.shields.io/badge/Join_Discord-464646?&logo=discord&logoColor=5865F2) 5 | 6 | 7 | 8 | 9 | Laminar logo 10 | 11 | 12 | # Index 13 | 14 | Index is a state-of-the-art open-source browser agent that autonomously executes complex web tasks. It turns any website into an accessible API and can be seamlessly integrated with just a few lines of code. 15 | 16 | - [x] Powered by reasoning LLMs with vision capabilities. 
17 | - [x] Gemini 2.5 Pro (really fast and accurate) 18 | - [x] Claude 3.7 Sonnet with extended thinking (reliable and accurate) 19 | - [x] OpenAI o4-mini (depending on the reasoning effort, provides good balance between speed, cost and accuracy) 20 | - [x] Gemini 2.5 Flash (really fast, cheap, and good for less complex tasks) 21 | - [x] `pip install lmnr-index` and use it in your project 22 | - [x] `index run` to run the agent in the interactive CLI 23 | - [x] Supports structured output via Pydantic schemas for reliable data extraction. 24 | - [x] Index is also available as a [serverless API.](https://docs.lmnr.ai/index-agent/api/getting-started) 25 | - [x] You can also try out Index via [Chat UI](https://lmnr.ai/chat). 26 | - [x] Supports advanced [browser agent observability](https://docs.lmnr.ai/index-agent/tracing) powered by open-source platform [Laminar](https://github.com/lmnr-ai/lmnr). 27 | 28 | prompt: go to ycombinator.com. summarize first 3 companies in the W25 batch and make new spreadsheet in google sheets. 
29 | 30 | https://github.com/user-attachments/assets/2b46ee20-81b6-4188-92fb-4d97fe0b3d6a 31 | 32 | ## Documentation 33 | 34 | Check out full documentation [here](https://docs.lmnr.ai/index-agent/getting-started) 35 | 36 | ## Quickstart 37 | 38 | ### Install dependencies 39 | ```bash 40 | pip install lmnr-index 'lmnr[all]' 41 | 42 | # Install playwright 43 | playwright install chromium 44 | ``` 45 | 46 | ### Setup model API keys 47 | 48 | Setup your model API keys in `.env` file in your project root: 49 | ``` 50 | GEMINI_API_KEY= 51 | ANTHROPIC_API_KEY= 52 | OPENAI_API_KEY= 53 | # Optional, to trace the agent's actions and record browser session 54 | LMNR_PROJECT_API_KEY= 55 | ``` 56 | 57 | ### Run Index with code 58 | ```python 59 | import asyncio 60 | from index import Agent, GeminiProvider 61 | from pydantic import BaseModel 62 | from lmnr import Laminar 63 | import os 64 | 65 | # to trace the agent's actions and record browser session 66 | Laminar.initialize() 67 | 68 | # Define Pydantic schema for structured output 69 | class NewsSummary(BaseModel): 70 | title: str 71 | summary: str 72 | 73 | async def main(): 74 | 75 | llm = GeminiProvider(model="gemini-2.5-pro-preview-05-06") 76 | agent = Agent(llm=llm) 77 | 78 | # Example of getting structured output 79 | output = await agent.run( 80 | prompt="Navigate to news.ycombinator.com, find a post about AI, extract its title and provide a concise summary.", 81 | output_model=NewsSummary 82 | ) 83 | 84 | summary = NewsSummary.model_validate(output.result.content) 85 | print(f"Title: {summary.title}") 86 | print(f"Summary: {summary.summary}") 87 | 88 | if __name__ == "__main__": 89 | asyncio.run(main()) 90 | ``` 91 | 92 | ### Run Index with CLI 93 | 94 | Index CLI features: 95 | - Browser state persistence between sessions 96 | - Follow-up messages with support for "give human control" action 97 | - Real-time streaming updates 98 | - Beautiful terminal UI using Textual 99 | 100 | You can run Index CLI with the 
following command. 101 | ```bash 102 | index run 103 | ``` 104 | 105 | Output will look like this: 106 | 107 | ``` 108 | Loaded existing browser state 109 | ╭───────────────────── Interactive Mode ─────────────────────╮ 110 | │ Index Browser Agent Interactive Mode │ 111 | │ Type your message and press Enter. The agent will respond. │ 112 | │ Press Ctrl+C to exit. │ 113 | ╰────────────────────────────────────────────────────────────╯ 114 | 115 | Choose an LLM model: 116 | 1. Gemini 2.5 Flash 117 | 2. Claude 3.7 Sonnet 118 | 3. OpenAI o4-mini 119 | Select model [1/2] (1): 3 120 | Using OpenAI model: o4-mini 121 | Loaded existing browser state 122 | 123 | Your message: go to lmnr.ai, summarize pricing page 124 | 125 | Agent is working... 126 | Step 1: Opening lmnr.ai 127 | Step 2: Opening Pricing page 128 | Step 3: Scrolling for more pricing details 129 | Step 4: Scrolling back up to view pricing tiers 130 | Step 5: Provided concise summary of the three pricing tiers 131 | ``` 132 | 133 | ### Running CLI with a personal Chrome instance 134 | 135 | You can use Index with personal Chrome browser instance instead of launching a new browser. Main advantage is that all your existing logged-in sessions will be available. 136 | 137 | ```bash 138 | # Basic usage with default Chrome path 139 | index run --local-chrome 140 | ``` 141 | 142 | ## Use Index via API 143 | 144 | The easiest way to use Index in production is with [serverless API](https://docs.lmnr.ai/index-agent/api/getting-started). Index API manages remote browser sessions, agent infrastructure and [browser observability](https://docs.lmnr.ai/index-agent/api/tracing). To get started, create a project API key in [Laminar](https://lmnr.ai). 
145 | 146 | ### Install Laminar 147 | ```bash 148 | pip install lmnr 149 | ``` 150 | 151 | ### Use Index via API 152 | ```python 153 | from lmnr import Laminar, LaminarClient 154 | # you can also set LMNR_PROJECT_API_KEY environment variable 155 | 156 | # Initialize tracing 157 | Laminar.initialize(project_api_key="your_api_key") 158 | 159 | # Initialize the client 160 | client = LaminarClient(project_api_key="your_api_key") 161 | 162 | for chunk in client.agent.run( 163 | stream=True, 164 | model_provider="gemini", 165 | model="gemini-2.5-pro-preview-05-06", 166 | prompt="Navigate to news.ycombinator.com, find a post about AI, and summarize it" 167 | ): 168 | print(chunk) 169 | 170 | ``` 171 | 172 | 173 | ## Browser agent observability 174 | 175 | Both code run and API run provide advanced browser observability. To trace Index agent's actions and record browser session you simply need to initialize Laminar tracing before running the agent. 176 | 177 | ```python 178 | from lmnr import Laminar 179 | 180 | Laminar.initialize(project_api_key="your_api_key") 181 | ``` 182 | 183 | Then you will get full observability on the agent's actions synced with the browser session in the Laminar platform. Learn more about browser agent observability in the [documentation](https://docs.lmnr.ai/index-agent/tracing). 
184 | 185 | 186 | Index observability 187 | 188 | 189 | --- 190 | 191 | Made with ❤️ by the [Laminar team](https://lmnr.ai) 192 | -------------------------------------------------------------------------------- /evals/eval_generate_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from lmnr import evaluate 5 | 6 | from index import AnthropicProvider 7 | from index.agent.utils import generate_proper_json 8 | 9 | llm = AnthropicProvider(model="claude-3-7-sonnet-20250219", enable_thinking=True, thinking_token_budget=1024) 10 | 11 | async def run_json_correction(data: Dict[str, Any]): 12 | """Execute the JSON correction function.""" 13 | malformed_json = data["malformed_json"] 14 | # We'll need an LLM provider. Let's use GeminiProvider as in the reference. 15 | # In a real scenario, you might want to configure this or pass it differently. 16 | 17 | corrected_json_str = await generate_proper_json(llm=llm, json_str=malformed_json) 18 | 19 | # The function returns a string, let's try to parse it to ensure it's valid JSON for the eval 20 | try: 21 | return json.loads(corrected_json_str) 22 | except json.JSONDecodeError: 23 | # If it's not valid JSON, return the string itself for the evaluator to handle 24 | return corrected_json_str 25 | 26 | 27 | async def eval_json_correction(output: Any, target: Dict[str, Any]): 28 | """Evaluate the JSON correction accuracy.""" 29 | # Assuming target is a Python dict representing the expected JSON 30 | # And output is also a Python dict (if parsing was successful) or a string 31 | 32 | if isinstance(output, str): 33 | # This means the corrected_json_str was not valid JSON 34 | # For this simple eval, we can consider this a failure if the target is a dict 35 | # Or, if the target itself is expected to be a non-JSON string (e.g. an error message) 36 | # For now, let's assume target is always a valid JSON object. 
37 | try: 38 | # Attempt to parse the output string here for comparison 39 | output_dict = json.loads(output) 40 | exact_match = output_dict == target 41 | except json.JSONDecodeError: 42 | exact_match = False # Output was not valid JSON 43 | else: # Output is already a dict 44 | exact_match = output == target 45 | 46 | return exact_match 47 | 48 | test_data = [ 49 | { 50 | "data": { 51 | # Trailing comma, single quotes 52 | "malformed_json": "{'name': 'John Doe', 'age': 30, 'city': 'New York',}", 53 | }, 54 | "target": { 55 | "name": "John Doe", 56 | "age": 30, 57 | "city": "New York" 58 | } 59 | }, 60 | { 61 | "data": { 62 | "malformed_json": '''{ 63 | "item": "Book", 64 | "details": { 65 | "title": "The "Great Gatsby"", 66 | "author": "F. Scott Fitzgerald" 67 | }, 68 | "price": 10.99 69 | }''' 70 | }, 71 | "target": { 72 | "item": "Book", 73 | "details": { 74 | "title": "The \"Great Gatsby\"", 75 | "author": "F. Scott Fitzgerald" 76 | }, 77 | "price": 10.99 78 | } 79 | }, 80 | { 81 | "data": { 82 | # No closing brace 83 | "malformed_json": '''{ 84 | "key1": "value1", 85 | "key2": "value2" 86 | ''' # Corrected: Removed trailing content that looked like a comment inside string 87 | }, 88 | "target": { 89 | "key1": "value1", 90 | "key2": "value2" 91 | } 92 | }, 93 | { 94 | "data": { 95 | # JSON with comments (not standard, should be removed by the fixer) 96 | "malformed_json": '''{ 97 | // This is a comment 98 | "product_id": 123, 99 | "status": "active" 100 | }''' 101 | }, 102 | "target": { 103 | "product_id": 123, 104 | "status": "active" 105 | } 106 | }, 107 | # Example of a more complex malformed JSON 108 | { 109 | "data": { 110 | "malformed_json": "{\"name\": \"incomplete, \"value\": [1, 2, \"unfinished_array\"" # Missing closing bracket and quote 111 | }, 112 | "target": { # Assuming the LLM can make a reasonable guess or fix structure 113 | "name": "incomplete", 114 | "value": [1, 2, "unfinished_array"] 115 | } 116 | }, 117 | { 118 | "data": { 119 | 
"malformed_json": "{'key with space': 'value', 'another key': true, 'numeric_string': '123.45' }" # Single quotes, boolean 120 | }, 121 | "target": { 122 | "key with space": "value", 123 | "another key": True, # Python bool 124 | "numeric_string": "123.45" 125 | } 126 | } 127 | ] 128 | 129 | # Run the evaluation 130 | evaluate( 131 | data=test_data, 132 | executor=run_json_correction, 133 | evaluators={"json_correction_accuracy": eval_json_correction}, 134 | concurrency_limit=10, 135 | group_name="json_correction_eval", 136 | ) 137 | -------------------------------------------------------------------------------- /evals/eval_output_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from lmnr import evaluate 5 | from pydantic import BaseModel 6 | 7 | from index import Agent, GeminiProvider 8 | 9 | 10 | class CountryInfo(BaseModel): 11 | """Model for country information extraction""" 12 | country: str 13 | capital: str 14 | currency: str 15 | 16 | 17 | async def run_agent(data: Dict[str, Any]): 18 | """Execute the agent with data extraction based on output_model""" 19 | prompt = data["prompt"] 20 | output_model = data.get("output_model") 21 | start_url = data.get("start_url") 22 | 23 | llm = GeminiProvider(model="gemini-2.5-pro-preview-03-25") 24 | 25 | agent = Agent(llm=llm) 26 | output = await agent.run( 27 | prompt=prompt, 28 | output_model=output_model, 29 | start_url=start_url 30 | ) 31 | 32 | return output.result.content 33 | 34 | 35 | async def eval_extraction(output: Dict[str, Any], target: Dict[str, Any]): 36 | """Evaluate the extraction accuracy""" 37 | 38 | exact_match = json.dumps(output, sort_keys=True) == json.dumps(target, sort_keys=True) 39 | 40 | return exact_match 41 | 42 | data = [ 43 | { 44 | "data": { 45 | "prompt": "Extract information about France. 
For currency only use text description, such as 'Euro'.", 46 | "output_model": CountryInfo, 47 | "start_url": "https://en.wikipedia.org/wiki/France" 48 | }, 49 | "target": { 50 | "country": "France", 51 | "capital": "Paris", 52 | "currency": "Euro" 53 | } 54 | }, 55 | { 56 | "data": { 57 | "prompt": "Extract information about Japan. For currency only use text description, such as 'Euro'.", 58 | "output_model": CountryInfo, 59 | "start_url": "https://en.wikipedia.org/wiki/Japan" 60 | }, 61 | "target": { 62 | "country": "Japan", 63 | "capital": "Tokyo", 64 | "currency": "Japanese yen" 65 | } 66 | }, 67 | { 68 | "data": { 69 | "prompt": "Extract information about Brazil. For currency only use text description, such as 'Euro'.", 70 | "output_model": CountryInfo, 71 | "start_url": "https://en.wikipedia.org/wiki/Brazil" 72 | }, 73 | "target": { 74 | "country": "Brazil", 75 | "capital": "Brasília", 76 | "currency": "Real" 77 | } 78 | }, 79 | ] 80 | 81 | evaluate( 82 | data=data, 83 | executor=run_agent, 84 | evaluators={"accuracy": eval_extraction}, 85 | concurrency_limit=1, 86 | group_name="country_extraction", 87 | ) 88 | -------------------------------------------------------------------------------- /index/__init__.py: -------------------------------------------------------------------------------- 1 | from index.agent.agent import Agent 2 | from index.agent.models import ActionModel, ActionResult, AgentOutput 3 | from index.browser.browser import Browser, BrowserConfig 4 | from index.browser.detector import Detector 5 | from index.browser.models import InteractiveElement 6 | from index.llm.providers.anthropic import AnthropicProvider 7 | from index.llm.providers.anthropic_bedrock import AnthropicBedrockProvider 8 | from index.llm.providers.gemini import GeminiProvider 9 | from index.llm.providers.gemini_vertex import GeminiVertexProvider 10 | from index.llm.providers.groq import GroqProvider 11 | from index.llm.providers.openai import OpenAIProvider 12 | 13 | __all__ 
= [ 14 | 'Agent', 15 | 'Browser', 16 | 'BrowserConfig', 17 | 'ActionResult', 18 | 'ActionModel', 19 | 'AnthropicProvider', 20 | 'AnthropicBedrockProvider', 21 | 'OpenAIProvider', 22 | 'GeminiProvider', 23 | 'GeminiVertexProvider', 24 | 'GroqProvider', 25 | 'AgentOutput', 26 | 'Detector', 27 | 'InteractiveElement', 28 | ] 29 | -------------------------------------------------------------------------------- /index/agent/agent.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import time 5 | import uuid 6 | from typing import AsyncGenerator, Optional 7 | 8 | from dotenv import load_dotenv 9 | from lmnr import Laminar, LaminarSpanContext, observe, use_span 10 | from pydantic import BaseModel 11 | 12 | from index.agent.message_manager import MessageManager 13 | from index.agent.models import ( 14 | ActionResult, 15 | AgentLLMOutput, 16 | AgentOutput, 17 | AgentState, 18 | AgentStreamChunk, 19 | FinalOutputChunk, 20 | StepChunk, 21 | StepChunkContent, 22 | StepChunkError, 23 | TimeoutChunk, 24 | TimeoutChunkContent, 25 | ) 26 | from index.agent.utils import validate_json 27 | from index.browser.browser import Browser, BrowserConfig 28 | from index.controller.controller import Controller 29 | from index.llm.llm import BaseLLMProvider, Message 30 | 31 | load_dotenv() 32 | logger = logging.getLogger(__name__) 33 | 34 | class Agent: 35 | def __init__( 36 | self, 37 | llm: BaseLLMProvider, 38 | browser_config: BrowserConfig | None = None 39 | ): 40 | self.llm = llm 41 | self.controller = Controller() 42 | 43 | # Initialize browser or use the provided one 44 | self.browser = Browser(config=browser_config if browser_config is not None else BrowserConfig()) 45 | 46 | action_descriptions = self.controller.get_action_descriptions() 47 | 48 | self.message_manager = MessageManager( 49 | action_descriptions=action_descriptions, 50 | ) 51 | 52 | self.state = AgentState( 53 | 
messages=[], 54 | ) 55 | 56 | async def step(self, step: int, previous_result: ActionResult | None = None, step_span_context: Optional[LaminarSpanContext] = None) -> tuple[ActionResult, str]: 57 | """Execute one step of the task""" 58 | 59 | with Laminar.start_as_current_span( 60 | name="agent.step", 61 | parent_span_context=step_span_context, 62 | input={ 63 | "step": step, 64 | }, 65 | ): 66 | state = await self.browser.update_state() 67 | 68 | if previous_result: 69 | self.message_manager.add_current_state_message(state, previous_result) 70 | 71 | input_messages = self.message_manager.get_messages() 72 | 73 | try: 74 | model_output = await self._generate_action(input_messages) 75 | except Exception as e: 76 | # model call failed, remove last state message from history before retrying 77 | self.message_manager.remove_last_message() 78 | raise e 79 | 80 | if previous_result: 81 | # we're removing the state message that we've just added because we want to append it in a different format 82 | self.message_manager.remove_last_message() 83 | 84 | self.message_manager.add_message_from_model_output(step, previous_result, model_output, state.screenshot) 85 | 86 | try: 87 | result: ActionResult = await self.controller.execute_action( 88 | model_output.action, 89 | self.browser 90 | ) 91 | 92 | if result.is_done: 93 | logger.info(f'Result: {result.content}') 94 | self.final_output = result.content 95 | 96 | return result, model_output.summary 97 | 98 | except Exception as e: 99 | raise e 100 | 101 | 102 | @observe(name='agent.generate_action', ignore_input=True) 103 | async def _generate_action(self, input_messages: list[Message]) -> AgentLLMOutput: 104 | """Get next action from LLM based on current state""" 105 | 106 | response = await self.llm.call(input_messages) 107 | 108 | try: 109 | # Pass the raw LLM response content to validate_json 110 | output = await validate_json(response.content, self.llm) 111 | 112 | logger.info(f'💡 Thought: {output.thought}') 113 | 
logger.info(f'💡 Summary: {output.summary}') 114 | logger.info(f'🛠️ Action: {output.action.model_dump_json(exclude_unset=True)}') 115 | 116 | if response.thinking: 117 | output.thinking_block = response.thinking 118 | 119 | return output 120 | except ValueError as e: 121 | # Re-raise the ValueError from validate_json, which now includes detailed context 122 | logger.error(f"Failed to generate and validate action after multiple retries: {e}") 123 | raise e 124 | 125 | async def _setup_messages(self, 126 | prompt: str, 127 | agent_state: str | None = None, 128 | start_url: str | None = None, 129 | output_model: BaseModel | str | None = None 130 | ): 131 | """Set up messages based on state dict or initialize with system message""" 132 | if agent_state: 133 | # assuming that the structure of the state.messages is correct 134 | state = AgentState.model_validate_json(agent_state) 135 | self.message_manager.set_messages(state.messages) 136 | # Update browser_context to browser 137 | browser_state = await self.browser.update_state() 138 | self.message_manager.add_current_state_message(browser_state, user_follow_up_message=prompt) 139 | else: 140 | self.message_manager.add_system_message_and_user_prompt(prompt, output_model) 141 | 142 | if start_url: 143 | await self.browser.goto(start_url) 144 | browser_state = await self.browser.update_state() 145 | self.message_manager.add_current_state_message(browser_state) 146 | 147 | 148 | async def run(self, 149 | prompt: str, 150 | max_steps: int = 100, 151 | agent_state: str | None = None, 152 | parent_span_context: Optional[LaminarSpanContext] = None, 153 | close_context: bool = True, 154 | session_id: str | None = None, 155 | return_agent_state: bool = False, 156 | return_storage_state: bool = False, 157 | start_url: str | None = None, 158 | output_model: BaseModel | str | None = None 159 | ) -> AgentOutput: 160 | """Execute the task with maximum number of steps and return the final result 161 | 162 | Args: 163 | prompt: The 
prompt to execute the task with 164 | max_steps: The maximum number of steps to execute the task with. Defaults to 100. 165 | agent_state: Optional, the state of the agent to execute the task with 166 | parent_span_context: Optional, parent span context in Laminar format to execute the task with 167 | close_context: Whether to close the browser context after the task is executed 168 | session_id: Optional, Agent session id 169 | return_agent_state: Whether to return the agent state with the final output 170 | return_storage_state: Whether to return the storage state with the final output 171 | start_url: Optional, the URL to start the task with 172 | output_model: Optional, the output model to use for the task 173 | """ 174 | 175 | if prompt is None and agent_state is None: 176 | raise ValueError("Either prompt or agent_state must be provided") 177 | 178 | with Laminar.start_as_current_span( 179 | name="agent.run", 180 | parent_span_context=parent_span_context, 181 | input={ 182 | "prompt": prompt, 183 | "max_steps": max_steps, 184 | "stream": False, 185 | }, 186 | ) as span: 187 | if session_id is not None: 188 | span.set_attribute("lmnr.internal.agent_session_id", session_id) 189 | 190 | await self._setup_messages(prompt, agent_state, start_url, output_model) 191 | 192 | step = 0 193 | result = None 194 | is_done = False 195 | 196 | trace_id = str(uuid.UUID(int=span.get_span_context().trace_id)) 197 | 198 | try: 199 | while not is_done and step < max_steps: 200 | logger.info(f'📍 Step {step}') 201 | result, _ = await self.step(step, result) 202 | step += 1 203 | is_done = result.is_done 204 | 205 | if is_done: 206 | logger.info(f'✅ Task completed successfully in {step} steps') 207 | break 208 | 209 | if not is_done: 210 | logger.info('❌ Maximum number of steps reached') 211 | 212 | except Exception as e: 213 | logger.info(f'❌ Error in run: {e}') 214 | raise e 215 | finally: 216 | storage_state = await self.browser.get_storage_state() 217 | 218 | if close_context: 
219 | # Update to close the browser directly 220 | await self.browser.close() 221 | 222 | span.set_attribute("lmnr.span.output", result.model_dump_json()) 223 | 224 | return AgentOutput( 225 | agent_state=self.get_state() if return_agent_state else None, 226 | result=result, 227 | storage_state=storage_state if return_storage_state else None, 228 | step_count=step, 229 | trace_id=trace_id, 230 | ) 231 | 232 | async def run_stream(self, 233 | prompt: str, 234 | max_steps: int = 100, 235 | agent_state: str | None = None, 236 | parent_span_context: Optional[LaminarSpanContext] = None, 237 | close_context: bool = True, 238 | timeout: Optional[int] = None, 239 | session_id: str | None = None, 240 | return_screenshots: bool = False, 241 | return_agent_state: bool = False, 242 | return_storage_state: bool = False, 243 | start_url: str | None = None, 244 | output_model: BaseModel | str | None = None 245 | ) -> AsyncGenerator[AgentStreamChunk, None]: 246 | """Execute the task with maximum number of steps and stream step chunks as they happen 247 | 248 | Args: 249 | prompt: The prompt to execute the task with 250 | max_steps: The maximum number of steps to execute the task with 251 | agent_state: The state of the agent to execute the task with 252 | parent_span_context: Parent span context in Laminar format to execute the task with 253 | close_context: Whether to close the browser context after the task is executed 254 | timeout: The timeout for the task 255 | session_id: Agent session id 256 | return_screenshots: Whether to return screenshots with the step chunks 257 | return_agent_state: Whether to return the agent state with the final output chunk 258 | return_storage_state: Whether to return the storage state with the final output chunk 259 | start_url: Optional, the URL to start the task with 260 | output_model: Optional, the output model to use for the task 261 | """ 262 | 263 | # Create a span for the streaming execution 264 | span = Laminar.start_span( 265 | 
name="agent.run_stream", 266 | parent_span_context=parent_span_context, 267 | input={ 268 | "prompt": prompt, 269 | "max_steps": max_steps, 270 | "stream": True, 271 | }, 272 | ) 273 | 274 | trace_id = str(uuid.UUID(int=span.get_span_context().trace_id)) 275 | 276 | if session_id is not None: 277 | span.set_attribute("lmnr.internal.agent_session_id", session_id) 278 | 279 | with use_span(span): 280 | await self._setup_messages(prompt, agent_state, start_url, output_model) 281 | 282 | step = 0 283 | result = None 284 | is_done = False 285 | 286 | if timeout is not None: 287 | start_time = time.time() 288 | 289 | try: 290 | # Execute steps and yield results 291 | while not is_done and step < max_steps: 292 | logger.info(f'📍 Step {step}') 293 | 294 | with use_span(span): 295 | result, summary = await self.step(step, result) 296 | 297 | step += 1 298 | is_done = result.is_done 299 | 300 | screenshot = None 301 | if return_screenshots: 302 | state = self.browser.get_state() 303 | screenshot = state.screenshot 304 | 305 | if timeout is not None and time.time() - start_time > timeout: 306 | 307 | yield TimeoutChunk( 308 | content=TimeoutChunkContent( 309 | action_result=result, 310 | summary=summary, 311 | step=step, 312 | agent_state=self.get_state() if return_agent_state else None, 313 | screenshot=screenshot, 314 | trace_id=trace_id 315 | ) 316 | ) 317 | return 318 | 319 | yield StepChunk( 320 | content=StepChunkContent( 321 | action_result=result, 322 | summary=summary, 323 | trace_id=trace_id, 324 | screenshot=screenshot 325 | ) 326 | ) 327 | 328 | if is_done: 329 | logger.info(f'✅ Task completed successfully in {step} steps') 330 | 331 | storage_state = await self.browser.get_storage_state() 332 | 333 | # Yield the final output as a chunk 334 | final_output = AgentOutput( 335 | agent_state=self.get_state() if return_agent_state else None, 336 | result=result, 337 | storage_state=storage_state if return_storage_state else None, 338 | step_count=step, 339 | 
trace_id=trace_id, 340 | ) 341 | 342 | span.set_attribute("lmnr.span.output", result.model_dump_json()) 343 | yield FinalOutputChunk(content=final_output) 344 | 345 | break 346 | 347 | if not is_done: 348 | logger.info('❌ Maximum number of steps reached') 349 | yield StepChunkError(content=f'Maximum number of steps reached: {max_steps}') 350 | 351 | except Exception as e: 352 | logger.info(f'❌ Error in run: {e}') 353 | span.record_exception(e) 354 | 355 | yield StepChunkError(content=f'Error in run stream: {e}') 356 | finally: 357 | # Clean up resources 358 | if close_context: 359 | # Update to close the browser directly 360 | await self.browser.close() 361 | 362 | span.end() 363 | logger.info('Stream complete, span closed') 364 | 365 | def get_state(self) -> AgentState: 366 | 367 | self.state.messages = self.message_manager.get_messages() 368 | 369 | return self.state 370 | -------------------------------------------------------------------------------- /index/agent/demo_images/complex_layout_highlight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/complex_layout_highlight.png -------------------------------------------------------------------------------- /index/agent/demo_images/complex_layout_small_elements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/complex_layout_small_elements.png -------------------------------------------------------------------------------- /index/agent/demo_images/loading.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/loading.png 
-------------------------------------------------------------------------------- /index/agent/demo_images/scroll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/scroll.png -------------------------------------------------------------------------------- /index/agent/message_manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import logging 5 | from datetime import datetime 6 | from typing import List, Optional, Type 7 | 8 | from pydantic import BaseModel 9 | 10 | from index.agent.models import ActionResult, AgentLLMOutput 11 | from index.agent.prompts import system_message 12 | from index.agent.utils import load_demo_image_as_b64, pydantic_to_custom_jtd 13 | from index.browser.models import BrowserState 14 | from index.browser.utils import scale_b64_image 15 | from index.llm.llm import ImageContent, Message, TextContent 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class MessageManager: 21 | def __init__( 22 | self, 23 | action_descriptions: str, 24 | ): 25 | self._messages: List[Message] = [] 26 | self.action_descriptions = action_descriptions 27 | 28 | 29 | def add_system_message_and_user_prompt(self, prompt: str, output_model: Type[BaseModel] | str | None = None) -> None: 30 | 31 | complex_layout_highlight = load_demo_image_as_b64('complex_layout_highlight.png') 32 | complex_layout_small_elements = load_demo_image_as_b64('complex_layout_small_elements.png') 33 | still_loading = load_demo_image_as_b64('loading.png') 34 | scroll_over_element_example = load_demo_image_as_b64('scroll.png') 35 | system_msg = Message( 36 | role="system", 37 | content=[ 38 | TextContent(text=system_message(self.action_descriptions), cache_control=True), 39 | ], 40 | ) 41 | 42 | self._messages.append(system_msg) 43 | 
output_model_str = '' 44 | if output_model: 45 | output_format = '' 46 | if isinstance(output_model, type) and issubclass(output_model, BaseModel): 47 | output_format = json.dumps(pydantic_to_custom_jtd(output_model), indent=2) 48 | elif isinstance(output_model, str): 49 | output_format = output_model 50 | 51 | output_model_str = f""" 52 | 53 | When you are ready to complete the task use `done_with_structured_output` action. Strictly provide output in the following JSON format and infer which fields best match the information you have gathered: 54 | 55 | 56 | {output_format} 57 | 58 | """ 59 | 60 | self._messages.append(Message( 61 | role="user", 62 | content=[ 63 | TextContent(text=''), 64 | TextContent(text="Here's an example of a complex layout. As an example, if you want to select a 'Roster' section for Colorado Rockies. Then you need to click on element with index 121."), 65 | ImageContent(image_b64=complex_layout_highlight), 66 | TextContent(text=''), 67 | TextContent(text=''), 68 | TextContent(text="Here's an example of small elements on the page and their functions. Element 7, represented by 'x' icon, is a 'clear text' button. Element 8 is a 'submit' button, represented by '=' icon. This clarification should help you better understand similar layouts."), 69 | ImageContent(image_b64=complex_layout_small_elements), 70 | TextContent(text=''), 71 | TextContent(text=''), 72 | TextContent(text="Here is an example of a loading page. If the main content on the page is empty or if there are loading elements, such as 'skeleton' screens or loading indicators, page is still loading. Then, you HAVE to perform `wait_for_page_to_load` action because you can't interact with the page until it is fully loaded."), 73 | ImageContent(image_b64=still_loading), 74 | TextContent(text=''), 75 | TextContent(text=''), 76 | TextContent(text="In some cases, to reveal more content, you need to scroll in scrollable areas of the webpage. 
Scrollable areas have VERTICAL scrollbars very clearly visible on their right side. In the screenshot below, you can clearly see a scrollbar on the right side of the list of search items. This indicates that the list is scrollable. To scroll over this area, you need to identify any element within the scrollable area and use its index with `scroll_down_over_element` action to scroll over it. In this example, appropriate element is with index 15."), 77 | ImageContent(image_b64=scroll_over_element_example), 78 | TextContent(text='', cache_control=True), 79 | TextContent(text=f"""Here is the task you need to complete: 80 | 81 | 82 | {prompt} 83 | 84 | 85 | Today's date and time is: {datetime.now().strftime('%B %d, %Y, %I:%M%p')} - keep this date and time in mind when planning your actions.{output_model_str}"""), 86 | ] 87 | )) 88 | 89 | def get_messages_as_state(self) -> List[Message]: 90 | """Get messages as state messages""" 91 | return [msg for msg in self._messages if msg.is_state_message] 92 | 93 | 94 | def remove_last_message(self) -> None: 95 | """Remove last message from history""" 96 | if len(self._messages) > 1: 97 | self._messages.pop() 98 | 99 | def add_current_state_message( 100 | self, 101 | state: BrowserState, 102 | previous_result: ActionResult | None = None, 103 | user_follow_up_message: str | None = None, 104 | ) -> None: 105 | """Add browser state as a user message""" 106 | 107 | if state.interactive_elements: 108 | highlighted_elements = '' 109 | for element in state.interactive_elements.values(): 110 | 111 | # exclude sheets elements 112 | if element.browser_agent_id.startswith("row_") or element.browser_agent_id.startswith("column_"): 113 | continue 114 | 115 | start_tag = f"[{element.index}]<{element.tag_name}" 116 | 117 | if element.input_type: 118 | start_tag += f" type=\"{element.input_type}\"" 119 | 120 | start_tag += ">" 121 | element_text = element.text.replace('\n', ' ') 122 | highlighted_elements += f"{start_tag}{element_text}\n" 123 | 
else: 124 | highlighted_elements = '' 125 | 126 | scroll_distance_above_viewport = state.viewport.scroll_distance_above_viewport or 0 127 | scroll_distance_below_viewport = state.viewport.scroll_distance_below_viewport or 0 128 | 129 | if scroll_distance_above_viewport > 0: 130 | elements_text = f'{scroll_distance_above_viewport}px scroll distance above current viewport\n' 131 | else: 132 | elements_text = '[Start of page]\n' 133 | 134 | if highlighted_elements != '': 135 | elements_text += f'\nHighlighted elements:\n{highlighted_elements}' 136 | 137 | if scroll_distance_below_viewport > 0: 138 | elements_text += f'\n{scroll_distance_below_viewport}px scroll distance below current viewport\n' 139 | else: 140 | elements_text += '\n[End of page]' 141 | 142 | previous_action_output = '' 143 | if previous_result: 144 | previous_action_output = f'\n{previous_result.content}\n\n\n' if previous_result.content else '' 145 | 146 | if previous_result.error: 147 | previous_action_output += f'\n{previous_result.error}\n\n\n' 148 | 149 | if user_follow_up_message: 150 | user_follow_up_message = f'\n{user_follow_up_message}\n\n\n' 151 | else: 152 | user_follow_up_message = '' 153 | 154 | state_description = f"""{previous_action_output}{user_follow_up_message} 155 | 156 | Current URL: {state.url} 157 | 158 | Open tabs: 159 | {state.tabs} 160 | 161 | Current viewport information: 162 | {elements_text} 163 | """ 164 | 165 | state_msg = Message( 166 | role='user', 167 | content=[ 168 | TextContent(text=state_description), 169 | TextContent(text=''), 170 | ImageContent(image_b64=state.screenshot), 171 | TextContent(text=''), 172 | TextContent(text=''), 173 | ImageContent(image_b64=state.screenshot_with_highlights), 174 | TextContent(text=''), 175 | ] 176 | ) 177 | 178 | self._messages.append(state_msg) 179 | 180 | def add_message_from_model_output(self, step: int, previous_result: ActionResult | None, model_output: AgentLLMOutput, screenshot: Optional[str] = None) -> None: 181 | 
"""Add model output as AI message""" 182 | 183 | previous_action_output = '' 184 | 185 | for msg in self._messages: 186 | if msg.is_state_message: 187 | msg.content = [msg.content[0]] 188 | 189 | if previous_result and screenshot: 190 | previous_action_output = f'\n{previous_result.content}\n' if previous_result.content else '' 191 | 192 | if previous_result.error: 193 | previous_action_output += f'\n{previous_result.error}\n' 194 | 195 | usr_msg = Message( 196 | role='user', 197 | content=[ 198 | TextContent(text=previous_action_output, cache_control=True), 199 | TextContent(text=f""), 200 | ImageContent(image_b64=scale_b64_image(screenshot, 0.75)), 201 | TextContent(text=f""), 202 | ], 203 | is_state_message=True, 204 | ) 205 | self._messages.append(usr_msg) 206 | 207 | assistant_content = [ 208 | TextContent(text=f""" 209 | {model_output.model_dump_json(indent=2, include={"thought", "action", "summary"}).strip()} 210 | """), 211 | ] 212 | 213 | if model_output.thinking_block: 214 | assistant_content = [ 215 | model_output.thinking_block, 216 | ] + assistant_content 217 | 218 | msg = Message( 219 | role='assistant', 220 | content=assistant_content, 221 | ) 222 | 223 | self._messages.append(msg) 224 | 225 | def get_messages(self) -> List[Message]: 226 | 227 | found_first_cache_control = False 228 | 229 | # clear all past cache control except the latest one 230 | for msg in self._messages[::-1]: 231 | 232 | # ignore system messages 233 | if msg.role == 'system': 234 | continue 235 | 236 | if found_first_cache_control: 237 | msg.remove_cache_control() 238 | 239 | if msg.has_cache_control(): 240 | found_first_cache_control = True 241 | 242 | 243 | return self._messages 244 | 245 | def set_messages(self, messages: List[Message]) -> None: 246 | """Set messages""" 247 | self._messages = messages 248 | -------------------------------------------------------------------------------- /index/agent/models.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Dict, Literal, Optional 4 | 5 | from playwright.async_api import StorageState 6 | from pydantic import BaseModel 7 | 8 | from index.llm.llm import Message, ThinkingBlock 9 | 10 | 11 | class AgentState(BaseModel): 12 | """State of the agent""" 13 | 14 | messages: list[Message] 15 | 16 | class ActionResult(BaseModel): 17 | """Result of executing an action""" 18 | 19 | is_done: Optional[bool] = False 20 | content: Optional[str | Dict[str, Any]] = None 21 | error: Optional[str] = None 22 | give_control: Optional[bool] = False 23 | 24 | class ActionModel(BaseModel): 25 | """Model for an action""" 26 | 27 | name: str 28 | params: Dict[str, Any] 29 | 30 | class AgentLLMOutput(BaseModel): 31 | """Output model for agent""" 32 | 33 | action: ActionModel 34 | thought: Optional[str] = None 35 | summary: Optional[str] = None 36 | thinking_block: Optional[ThinkingBlock] = None 37 | 38 | class AgentOutput(BaseModel): 39 | """Output model for agent""" 40 | 41 | agent_state: Optional[AgentState] = None 42 | result: ActionResult 43 | step_count: int = 0 44 | storage_state: Optional[StorageState] = None 45 | trace_id: str | None = None 46 | 47 | class AgentStreamChunk(BaseModel): 48 | """Base class for chunks in the agent stream""" 49 | type: str 50 | 51 | class StepChunkContent(BaseModel): 52 | action_result: ActionResult 53 | summary: str 54 | trace_id: str | None = None 55 | screenshot: Optional[str] = None 56 | 57 | class StepChunk(AgentStreamChunk): 58 | """Chunk containing a step result""" 59 | type: Literal["step"] = "step" 60 | content: StepChunkContent 61 | 62 | class TimeoutChunkContent(BaseModel): 63 | action_result: ActionResult 64 | summary: str 65 | step: int 66 | agent_state: AgentState | None = None 67 | trace_id: str | None = None 68 | screenshot: Optional[str] = None 69 | 70 | class TimeoutChunk(AgentStreamChunk): 71 
| """Chunk containing a timeout""" 72 | type: Literal["step_timeout"] = "step_timeout" 73 | content: TimeoutChunkContent 74 | 75 | class StepChunkError(AgentStreamChunk): 76 | """Chunk containing an error""" 77 | type: Literal["step_error"] = "step_error" 78 | content: str 79 | 80 | class FinalOutputChunk(AgentStreamChunk): 81 | """Chunk containing the final output""" 82 | type: Literal["final_output"] = "final_output" 83 | content: AgentOutput 84 | -------------------------------------------------------------------------------- /index/agent/prompts.py: -------------------------------------------------------------------------------- 1 | def system_message(action_descriptions: str) -> str: 2 | return f"""You are an advanced AI assistant designed to interact with a web browser and complete user tasks. Your capabilities include analyzing web page screenshots, interacting with page elements, and navigating through websites to accomplish various objectives. 3 | 4 | First, let's review the available actions you can perform: 5 | 6 | 7 | {action_descriptions} 8 | 9 | 10 | Your goal is to complete the user's task by carefully analyzing the current state of the web page, planning your actions, reflecting on the outcomes of the previous actions, and avoiding repetition of unsuccessful approaches. Follow the guidelines below: 11 | 12 | 1. Element Identification: 13 | - Interactable elements on the page are enclosed in uniquely colored bounding boxes with numbered labels. 14 | - Label corresponding to its bounding box is placed at the top right corner of the bounding box, and has exact same color as the bounding box. If the label is larger than the bounding box, the label is placed right outside and tangent to the bounding box. 15 | - Carefully match labels to their corresponding bounding boxes based on the color and position of the label, as labels might slightly overlap with unrelated bounding boxes. 
16 | - If bounding box doesn't enclose any element, simply ignore it (most likely the bounding box was incorrectly detected). 17 | - Screenshot enclosed in tag contains clean screenshot of a current browser window. 18 | - Screenshot enclosed in tag has bounding boxes with labels drawn around interactable elements. 19 | - Carefully analyze both screenshots to understand the layout of the page and accurately map bounding boxes to their corresponding elements. 20 | - Remember: each bounding box and corresponding label have the same unique color. 21 | 22 | 2. Element Interaction: 23 | - Infer role and function of elements based on their appearance, text/icon inside the element, and location on the page. 24 | - Interact only with visible elements on the screen. 25 | - Before entering a text into an input area, make sure that you have clicked on the target input area first. 26 | - Scroll or interact with elements to reveal more content if necessary information is not visible. 27 | - To scroll within areas with scrollbars, first identify any element inside the scrollable area and use its index with `scroll_down_over_element` or `scroll_up_over_element` actions instead of scrolling the entire page. Pay attention to the scrollbar position and direction to identify the correct element. 28 | - Some pages have navigation menu on the left, which might contain useful information, such as filters, categories, navigation, etc. Pay close attention to whether the side menu has scrollbars. If it does, scroll over it using an element within the side menu. 29 | - For clicking on a cell in a spreadsheet, first identify the correct column and row that corresponds to the cell you want to click on. Then, strictly use the `click_on_spreadsheet_cell` action to click on the cell. Don't use `click_element` action for interacting with a spreadsheet cells. 30 | 31 | 3. 
Task Execution: 32 | - After you perform an action, analyze the state screenshot to verify that the intended result was achieved (filter was applied, correct date range was selected, text was entered, etc.). If the result was not achieved, identify the problem and fix it. Be creative and persistent in your approach and don't repeat the same actions that failed. 33 | - Break down multi-step tasks into sub-tasks and complete each sub-task one by one. 34 | - Thoroughly explore all possible approaches before declaring the task complete. 35 | - If you encounter obstacles, consider alternative approaches such as returning to a previous page, initiating a new search, or opening a new tab. 36 | - Understand elements on the page and infer the most relevant ones for the current step of the task. 37 | - Ensure that your final output fully addresses all aspects of the user's request. 38 | - Include ALL requested information in the "done" action. Include markdown-formatted links where relevant and useful. 39 | - Important: For research tasks, be persistent and explore multiple results (at least 5-10) before giving up. 40 | - Be persistent and creative in your approach, e.g., using site-specific Google searches to find precise information. 41 | 42 | 4. Special Situations: 43 | - Cookie popups: Click "I accept" if present. If it persists after clicking, ignore it. 44 | - CAPTCHA: Attempt to solve logically. If unsuccessful, open a new tab and continue the task. 45 | 46 | 5. Returning control to human: 47 | - For steps that require user information to proceed, such as providing first name, last name, email, phone number, booking information, login, password, credit card information, credentials, etc., unless this information was provided in the initial prompt, you must use `give_human_control` action to give human control of the browser. 
48 | - If you can't solve the CAPTCHA, use the `give_human_control` action to give human control of the browser to aid you in solving the CAPTCHA. 49 | - Control is guaranteed to be returned to you after the human has entered the information or solved the CAPTCHA, so you should plan your next actions accordingly. 50 | 51 | 6. Source citations: 52 | - When you perform research tasks, include links to the websites that you found the information in your final output. 53 | - In general, include links to the websites that you found the information in your final output. 54 | - Strictly use markdown format for the links, because the final output will be rendered as markdown. 55 | 56 | 7. Spreadsheet interaction: 57 | - To click on a cell in a spreadsheet, use the `click_on_spreadsheet_cell` action to click on a specific cell. DON'T use `click_element` action for interacting with a spreadsheet cells or other elements when the goal is to click on a specific cell. 58 | - To input text into a spreadsheet cell, first click on the cell using the `click_on_spreadsheet_cell` action, then use the `enter_text` action to input text. 59 | 60 | Your response must always be in the following JSON format, enclosed in tags: 61 | 62 | 63 | {{ 64 | "thought": "EITHER a very short summary of your thinking process with key points OR exact information that you need to remember for the future (in case of research tasks).", 65 | "action": {{ 66 | "name": "action_name", 67 | "params": {{ 68 | "param1": "value1", 69 | "param2": "value2" 70 | }} 71 | }}, 72 | "summary": "Extremely brief summary of what you are doing to display to the user to help them understand what you are doing" 73 | }} 74 | 75 | 76 | Remember: 77 | - Think concisely. 78 | - Output only a single action per response. 79 | - You will be prompted again after each action. 80 | - Always provide an output in the specified JSON format, enclosed in tags. 
81 | - Reflect on the outcomes of the past actions to avoid repeating unsuccessful approaches. 82 | - Be creative and persistent in trying different strategies within the boundaries of the website. 83 | - Break down multi-step tasks into sub-tasks and complete each sub-task one by one. 84 | - For research tasks, be thorough and explore multiple results before concluding that the desired information is unavailable. 85 | 86 | Continue this process until you are absolutely certain that you have completed the user's task fully and accurately. Be thorough, creative, and persistent in your approach. 87 | 88 | Your final output should consist only of the correctly formatted JSON object enclosed in tags and should not duplicate or rehash any of the work you did in the thinking block.""" -------------------------------------------------------------------------------- /index/agent/utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import enum 3 | import importlib.resources 4 | import json 5 | import logging 6 | import re 7 | from typing import Any, Dict, Type 8 | 9 | from pydantic import BaseModel, ValidationError 10 | 11 | from index.agent.models import AgentLLMOutput 12 | from index.browser.utils import scale_b64_image 13 | from index.llm.llm import BaseLLMProvider, Message 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | def load_demo_image_as_b64(image_name: str) -> str: 18 | """ 19 | Load an image from the demo_images directory and return it as a base64 string. 20 | Works reliably whether the package is used directly or as a library. 
21 | 22 | Args: 23 | image_name: Name of the image file (including extension) 24 | 25 | Returns: 26 | Base64 encoded string of the image 27 | """ 28 | try: 29 | # Using importlib.resources to reliably find package data 30 | with importlib.resources.path('index.agent.demo_images', image_name) as img_path: 31 | with open(img_path, 'rb') as img_file: 32 | b64 = base64.b64encode(img_file.read()).decode('utf-8') 33 | return scale_b64_image(b64, 0.75) 34 | except Exception as e: 35 | logger.error(f"Error loading demo image {image_name}: {e}") 36 | raise 37 | 38 | def pydantic_to_custom_jtd(model_class: Type[BaseModel]) -> Dict[str, Any]: 39 | """ 40 | Convert a Pydantic model class to a custom JSON Typedef-like schema 41 | with proper array and object handling. 42 | """ 43 | def python_type_to_jtd_type(annotation): 44 | if annotation is str: 45 | return {"type": "string"} 46 | elif annotation is int: 47 | return {"type": "int32"} 48 | elif annotation is float: 49 | return {"type": "float64"} 50 | elif annotation is bool: 51 | return {"type": "boolean"} 52 | elif isinstance(annotation, type) and issubclass(annotation, enum.Enum): 53 | values = [e.value for e in annotation] 54 | return {"type": "string", "enum": values} 55 | else: 56 | return {"type": "string"} # fallback 57 | 58 | def process_model(model): 59 | model_schema = { 60 | "type": "object", 61 | "properties": {}, 62 | "required": [], 63 | "additionalProperties": False 64 | } 65 | 66 | for name, field in model.model_fields.items(): 67 | annotation = field.annotation 68 | origin = getattr(annotation, "__origin__", None) 69 | 70 | if origin is list: 71 | inner = annotation.__args__[0] 72 | if isinstance(inner, type) and issubclass(inner, enum.Enum): 73 | item_schema = {"type": "string", "enum": [e.value for e in inner]} 74 | elif hasattr(inner, "mro") and BaseModel in inner.mro(): 75 | item_schema = process_model(inner) 76 | else: 77 | item_schema = python_type_to_jtd_type(inner) 78 | 79 | 
model_schema["properties"][name] = { 80 | "type": "array", 81 | "items": item_schema 82 | } 83 | elif isinstance(annotation, type) and issubclass(annotation, enum.Enum): 84 | model_schema["properties"][name] = { 85 | "type": "string", 86 | "enum": [e.value for e in annotation] 87 | } 88 | elif hasattr(annotation, "mro") and BaseModel in annotation.mro(): 89 | model_schema["properties"][name] = process_model(annotation) 90 | else: 91 | model_schema["properties"][name] = python_type_to_jtd_type(annotation) 92 | 93 | if field.is_required(): 94 | model_schema["required"].append(name) 95 | 96 | return model_schema 97 | 98 | return process_model(model_class) 99 | 100 | 101 | async def generate_proper_json(llm: BaseLLMProvider, json_str: str) -> str: 102 | 103 | prompt = f"""The following JSON string is malformed or has issues. Please correct it while preserving the original structure and content as much as possible. 104 | Return ONLY the corrected JSON string, without any surrounding text, comments, or markdown. Do not add any explanations. 105 | 106 | Problematic JSON string: 107 | {json_str} 108 | """ 109 | 110 | input_messages = [ 111 | Message(role="user", content=prompt) 112 | ] 113 | 114 | response = await llm.call(input_messages) 115 | corrected_json_str = response.content.strip() 116 | if corrected_json_str.startswith("```json"): 117 | corrected_json_str = corrected_json_str[7:] 118 | if corrected_json_str.endswith("```"): 119 | corrected_json_str = corrected_json_str[:-3] 120 | return corrected_json_str.strip() 121 | 122 | 123 | async def validate_json(raw_llm_response_content: str, llm: BaseLLMProvider, max_retries: int = 3) -> AgentLLMOutput: 124 | """ 125 | Extracts, validates, and parses a JSON string from raw LLM output, 126 | attempting to fix it if necessary using retries with cleaning and LLM-based correction. 127 | 128 | Args: 129 | raw_llm_response_content: The raw string content from the LLM response. 
130 | llm: The LLM provider instance for fixing JSON if needed. 131 | max_retries: Maximum number of attempts to parse the JSON. 132 | 133 | Returns: 134 | An AgentLLMOutput object. 135 | 136 | Raises: 137 | ValueError: If the JSON string cannot be parsed or validated after all retries. 138 | """ 139 | # 1. Regex extraction from raw_llm_response_content 140 | pattern = r"]*)>(.*?)]*)>" 141 | match = re.search(pattern, raw_llm_response_content, re.DOTALL) 142 | 143 | current_json_str = "" 144 | if not match: 145 | # if we couldn't find the tags, it most likely means the tag is not present in the response 146 | # remove closing and opening tags just in case 147 | closing_tag_pattern = r"]*)>" 148 | json_str_no_closing = re.sub(closing_tag_pattern, "", raw_llm_response_content).strip() 149 | open_tag_pattern = r"]*)>" 150 | json_str_no_tags = re.sub(open_tag_pattern, "", json_str_no_closing).strip() 151 | # Also remove potential markdown code blocks if not already handled by regex 152 | current_json_str = json_str_no_tags.replace("```json", "").replace("```", "").strip() 153 | else: 154 | current_json_str = match.group(1).strip() 155 | 156 | last_exception = None 157 | 158 | for attempt in range(max_retries): 159 | logger.debug(f"JSON parsing attempt {attempt + 1}/{max_retries}") 160 | 161 | # Stage 1: Try to parse the current_json_str as is 162 | try: 163 | # Remove potential markdown that might have been added by LLM fix 164 | temp_json_str = current_json_str 165 | if temp_json_str.startswith("```json"): 166 | temp_json_str = temp_json_str[7:] 167 | if temp_json_str.endswith("```"): 168 | temp_json_str = temp_json_str[:-3] 169 | temp_json_str = temp_json_str.strip() 170 | 171 | logger.debug(f"Attempting to parse JSON on attempt {attempt + 1}. 
Raw JSON: '{temp_json_str}'") 172 | output = AgentLLMOutput.model_validate_json(temp_json_str) 173 | logger.debug(f"Successfully parsed JSON on attempt {attempt + 1}.") 174 | return output 175 | except (json.JSONDecodeError, ValidationError) as e1: 176 | logger.warning(f"Direct JSON parsing failed on attempt {attempt + 1}: {e1}") 177 | last_exception = e1 178 | 179 | # Stage 2: Try to parse after cleaning common issues 180 | try: 181 | json_str_cleaned = current_json_str # Start with the current_json_str for cleaning 182 | # Removed explicit replacement of \n, \r, \t - rely on JSON parser 183 | # json_str_cleaned = json_str_cleaned.replace('\\\\n', '\n').replace('\\\\r', '\r').replace('\\\\t', '\t') 184 | # Keep control character removal 185 | json_str_cleaned = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', json_str_cleaned) 186 | 187 | if json_str_cleaned.startswith("```json"): 188 | json_str_cleaned = json_str_cleaned[7:] 189 | if json_str_cleaned.endswith("```"): 190 | json_str_cleaned = json_str_cleaned[:-3] 191 | json_str_cleaned = json_str_cleaned.strip() 192 | 193 | logger.debug(f"Attempting to parse cleaned JSON on attempt {attempt + 1}. Cleaned JSON: '{json_str_cleaned[:250]}...'") 194 | output = AgentLLMOutput.model_validate_json(json_str_cleaned) 195 | logger.debug(f"Successfully parsed JSON on attempt {attempt + 1} (after cleaning).") 196 | return output 197 | except (json.JSONDecodeError, ValidationError) as e2: 198 | logger.warning(f"Cleaned JSON parsing failed on attempt {attempt + 1}: {e2}") 199 | last_exception = e2 200 | 201 | if attempt < max_retries - 1: 202 | logger.debug(f"Attempt {attempt + 1} failed. 
Attempting to fix JSON with LLM.") 203 | try: 204 | # Pass the original problematic string (before this attempt's cleaning) to LLM 205 | current_json_str = await generate_proper_json(llm, current_json_str) 206 | logger.debug(f"LLM proposed a new JSON string: '{current_json_str}'") 207 | except Exception as llm_fix_exception: 208 | logger.error(f"LLM call to fix JSON failed during attempt {attempt + 1}: {llm_fix_exception}") 209 | # If LLM fix fails, loop continues with the previous current_json_str, 210 | # and will eventually fail if parsing doesn't succeed. 211 | pass 212 | else: 213 | logger.error(f"All {max_retries} attempts to parse JSON failed. Final attempt was with: '{current_json_str[:250]}...'") 214 | break 215 | 216 | raise ValueError( 217 | f"Could not parse or validate response after {max_retries} attempts. " 218 | f"Last error: {str(last_exception)}\\n" 219 | f"Final problematic JSON string after all attempts: '{current_json_str[:500]}...'" 220 | ) from last_exception 221 | 222 | -------------------------------------------------------------------------------- /index/browser/browser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streamlined Playwright browser implementation. 
3 | """ 4 | 5 | import asyncio 6 | import base64 7 | import io 8 | import logging 9 | from dataclasses import dataclass, field 10 | from importlib import resources 11 | from typing import Any, Optional 12 | 13 | from lmnr import observe 14 | from PIL import Image 15 | from playwright.async_api import ( 16 | Browser as PlaywrightBrowser, 17 | ) 18 | from playwright.async_api import ( 19 | BrowserContext as PlaywrightBrowserContext, 20 | ) 21 | from playwright.async_api import ( 22 | Page, 23 | Playwright, 24 | StorageState, 25 | async_playwright, 26 | ) 27 | from tenacity import ( 28 | retry, 29 | retry_if_exception_type, 30 | stop_after_attempt, 31 | wait_exponential, 32 | ) 33 | from typing_extensions import TypedDict # to account for older python versions 34 | 35 | # Import detector class 36 | from index.browser.detector import Detector 37 | from index.browser.models import ( 38 | BrowserError, 39 | BrowserState, 40 | InteractiveElementsData, 41 | TabInfo, 42 | ) 43 | from index.browser.utils import ( 44 | filter_elements, 45 | put_highlight_elements_on_screenshot, 46 | scale_b64_image, 47 | ) 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | INTERACTIVE_ELEMENTS_JS_CODE = resources.read_text('index.browser', 'findVisibleInteractiveElements.js') 52 | 53 | class ViewportSize(TypedDict): 54 | width: int 55 | height: int 56 | 57 | @dataclass 58 | class BrowserConfig: 59 | """ 60 | Simplified configuration for the Browser. 61 | 62 | Parameters: 63 | cdp_url: Optional[str] = None 64 | Connect to a browser instance via CDP 65 | 66 | viewport_size: ViewportSize = {"width": 1024, "height": 768} 67 | Default browser window size 68 | 69 | storage_state: Optional[StorageState] = None 70 | Storage state to set 71 | 72 | detector: Optional[Detector] = None 73 | Detector instance for CV element detection. If None, CV detection is disabled. 
74 | 75 | """ 76 | cdp_url: Optional[str] = None 77 | viewport_size: ViewportSize = field(default_factory=lambda: {"width": 1024, "height": 768}) 78 | storage_state: Optional[StorageState] = None 79 | detector: Optional[Detector] = None 80 | 81 | class Browser: 82 | """ 83 | Unified Browser responsible for interacting with the browser via Playwright. 84 | """ 85 | 86 | def __init__(self, config: BrowserConfig = BrowserConfig(), close_context: bool = True): 87 | logger.debug('Initializing browser') 88 | self.config = config 89 | self.close_context = close_context 90 | # Playwright-related attributes 91 | self.playwright: Optional[Playwright] = None 92 | self.playwright_browser: Optional[PlaywrightBrowser] = None 93 | self.context: Optional[PlaywrightBrowserContext] = None 94 | 95 | # Page and state management 96 | self.current_page: Optional[Page] = None 97 | self._state: Optional[BrowserState] = None 98 | self._cdp_session = None 99 | 100 | # CV detection-related attributes 101 | self.detector: Optional[Detector] = config.detector 102 | 103 | self.screenshot_scale_factor = None 104 | 105 | # Initialize state 106 | self._init_state() 107 | 108 | async def __aenter__(self): 109 | """Async context manager entry""" 110 | await self._init_browser() 111 | return self 112 | 113 | async def __aexit__(self, exc_type, exc_val, exc_tb): 114 | """Async context manager exit""" 115 | if self.close_context: 116 | await self.close() 117 | 118 | def _init_state(self, url: str = '') -> None: 119 | """Initialize browser state""" 120 | self._state = BrowserState( 121 | url=url, 122 | screenshot_with_highlights=None, 123 | tabs=[], 124 | interactive_elements={}, 125 | ) 126 | 127 | async def _init_browser(self): 128 | """Initialize the browser and context""" 129 | logger.debug('Initializing browser context') 130 | # Start playwright if needed 131 | if self.playwright is None: 132 | self.playwright = await async_playwright().start() 133 | 134 | # Initialize browser if needed 135 | if 
self.playwright_browser is None: 136 | if self.config.cdp_url: 137 | logger.info(f'Connecting to remote browser via CDP {self.config.cdp_url}') 138 | attempts = 0 139 | while True: 140 | try: 141 | self.playwright_browser = await self.playwright.chromium.connect_over_cdp( 142 | self.config.cdp_url, 143 | timeout=2500, 144 | ) 145 | break 146 | except Exception as e: 147 | logger.error(f'Failed to connect to remote browser via CDP {self.config.cdp_url}: {e}. Retrying...') 148 | await asyncio.sleep(1) 149 | attempts += 1 150 | if attempts > 3: 151 | raise e 152 | logger.info(f'Connected to remote browser via CDP {self.config.cdp_url}') 153 | else: 154 | logger.info('Launching new browser instance') 155 | self.playwright_browser = await self.playwright.chromium.launch( 156 | headless=False, 157 | args=[ 158 | '--no-sandbox', 159 | '--disable-blink-features=AutomationControlled', 160 | '--disable-web-security', 161 | '--disable-site-isolation-trials', 162 | '--disable-features=IsolateOrigins,site-per-process', 163 | f'--window-size={self.config.viewport_size["width"]},{self.config.viewport_size["height"]}', 164 | ] 165 | ) 166 | 167 | # Create context if needed 168 | if self.context is None: 169 | 170 | if len(self.playwright_browser.contexts) > 0: 171 | self.context = self.playwright_browser.contexts[0] 172 | else: 173 | self.context = await self.playwright_browser.new_context( 174 | viewport=self.config.viewport_size, 175 | user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36', 176 | java_script_enabled=True, 177 | bypass_csp=True, 178 | ignore_https_errors=True 179 | ) 180 | 181 | # Apply anti-detection scripts 182 | await self._apply_anti_detection_scripts() 183 | 184 | self.context.on('page', self._on_page_change) 185 | 186 | if self.config.storage_state and 'cookies' in self.config.storage_state: 187 | await self.context.add_cookies(self.config.storage_state['cookies']) 188 | 189 | # 
Create page if needed 190 | if self.current_page is None: 191 | if len(self.context.pages) > 0: 192 | self.current_page = self.context.pages[-1] 193 | else: 194 | self.current_page = await self.context.new_page() 195 | 196 | return self 197 | 198 | async def _on_page_change(self, page: Page): 199 | """Handle page change events""" 200 | logger.info(f'Current page changed to {page.url}') 201 | 202 | self._cdp_session = await self.context.new_cdp_session(page) 203 | self.current_page = page 204 | 205 | async def _apply_anti_detection_scripts(self): 206 | """Apply scripts to avoid detection as automation""" 207 | await self.context.add_init_script( 208 | """ 209 | // Webdriver property 210 | Object.defineProperty(navigator, 'webdriver', { 211 | get: () => undefined 212 | }); 213 | 214 | // Languages 215 | Object.defineProperty(navigator, 'languages', { 216 | get: () => ['en-US'] 217 | }); 218 | 219 | // Plugins 220 | Object.defineProperty(navigator, 'plugins', { 221 | get: () => [1, 2, 3, 4, 5] 222 | }); 223 | 224 | // Chrome runtime 225 | window.chrome = { runtime: {} }; 226 | 227 | // Permissions 228 | const originalQuery = window.navigator.permissions.query; 229 | window.navigator.permissions.query = (parameters) => ( 230 | parameters.name === 'notifications' ? 
231 | Promise.resolve({ state: Notification.permission }) : 232 | originalQuery(parameters) 233 | ); 234 | (function () { 235 | const originalAttachShadow = Element.prototype.attachShadow; 236 | Element.prototype.attachShadow = function attachShadow(options) { 237 | return originalAttachShadow.call(this, { ...options, mode: "open" }); 238 | }; 239 | })(); 240 | """ 241 | ) 242 | 243 | async def close(self): 244 | """Close the browser instance and cleanup resources""" 245 | logger.debug('Closing browser') 246 | 247 | try: 248 | 249 | # Close CDP session if exists 250 | self._cdp_session = None 251 | 252 | # Close context 253 | if self.context: 254 | try: 255 | await self.context.close() 256 | except Exception as e: 257 | logger.debug(f'Failed to close context: {e}') 258 | self.context = None 259 | 260 | # Close browser 261 | if self.playwright_browser: 262 | try: 263 | await self.playwright_browser.close() 264 | except Exception as e: 265 | logger.debug(f'Failed to close browser: {e}') 266 | self.playwright_browser = None 267 | 268 | # Stop playwright 269 | if self.playwright: 270 | await self.playwright.stop() 271 | self.playwright = None 272 | except Exception as e: 273 | logger.error(f'Error during browser cleanup: {e}') 274 | finally: 275 | self.context = None 276 | self.current_page = None 277 | self._state = None 278 | self.playwright_browser = None 279 | self.playwright = None 280 | 281 | async def goto(self, url: str): 282 | """Navigate to a URL""" 283 | page = await self.get_current_page() 284 | await page.goto(url, wait_until='domcontentloaded') 285 | await asyncio.sleep(2) 286 | 287 | async def get_tabs_info(self) -> list[TabInfo]: 288 | """Get information about all tabs""" 289 | 290 | tabs_info = [] 291 | for page_id, page in enumerate(self.context.pages): 292 | tab_info = TabInfo(page_id=page_id, url=page.url, title=await page.title()) 293 | tabs_info.append(tab_info) 294 | 295 | return tabs_info 296 | 297 | async def switch_to_tab(self, page_id: int) 
-> None: 298 | """Switch to a specific tab by its page_id""" 299 | if self.context is None: 300 | await self._init_browser() 301 | 302 | pages = self.context.pages 303 | if page_id >= len(pages): 304 | raise BrowserError(f'No tab found with page_id: {page_id}') 305 | 306 | page = pages[page_id] 307 | self.current_page = page 308 | 309 | await page.bring_to_front() 310 | await page.wait_for_load_state('domcontentloaded') 311 | 312 | async def create_new_tab(self, url: str | None = None) -> None: 313 | """Create a new tab and optionally navigate to a URL""" 314 | if self.context is None: 315 | await self._init_browser() 316 | 317 | new_page = await self.context.new_page() 318 | self.current_page = new_page 319 | 320 | await new_page.wait_for_load_state('domcontentloaded') 321 | 322 | if url: 323 | await new_page.goto(url, wait_until='domcontentloaded') 324 | 325 | async def close_current_tab(self): 326 | """Close the current tab""" 327 | if self.current_page is None: 328 | return 329 | 330 | await self.current_page.close() 331 | 332 | # Switch to the first available tab if any exist 333 | if self.context and self.context.pages: 334 | await self.switch_to_tab(0) 335 | 336 | async def get_current_page(self) -> Page: 337 | """Get the current page""" 338 | if self.current_page is None: 339 | await self._init_browser() 340 | return self.current_page 341 | 342 | def get_state(self) -> BrowserState: 343 | """Get the current browser state""" 344 | return self._state 345 | 346 | @observe(name='browser.update_state', ignore_output=True) 347 | async def update_state(self) -> BrowserState: 348 | """Update the browser state with current page information and return it""" 349 | self._state = await self._update_state() 350 | return self._state 351 | 352 | @observe(name='browser._update_state', ignore_output=True) 353 | async def _update_state(self) -> BrowserState: 354 | """Update and return state.""" 355 | @retry( 356 | stop=stop_after_attempt(3), 357 | 
wait=wait_exponential(multiplier=0.5, min=0.5, max=2), 358 | retry=retry_if_exception_type((Exception)), 359 | reraise=True 360 | ) 361 | async def get_stable_state(): 362 | if self.current_page is None: 363 | await self._init_browser() 364 | url = self.current_page.url 365 | 366 | detect_sheets = 'docs.google.com/spreadsheets/d' in url 367 | 368 | screenshot_b64 = await self.fast_screenshot() 369 | 370 | interactive_elements_data = await self.get_interactive_elements(screenshot_b64, detect_sheets) 371 | interactive_elements = {element.index: element for element in interactive_elements_data.elements} 372 | 373 | # Create highlighted version of the screenshot 374 | screenshot_with_highlights = put_highlight_elements_on_screenshot( 375 | interactive_elements, 376 | screenshot_b64 377 | ) 378 | 379 | tabs = await self.get_tabs_info() 380 | 381 | return BrowserState( 382 | url=url, 383 | tabs=tabs, 384 | screenshot_with_highlights=screenshot_with_highlights, 385 | screenshot=screenshot_b64, 386 | viewport=interactive_elements_data.viewport, 387 | interactive_elements=interactive_elements, 388 | ) 389 | 390 | try: 391 | self._state = await get_stable_state() 392 | return self._state 393 | except Exception as e: 394 | logger.error(f'Failed to update state after multiple attempts: {str(e)}') 395 | # Return last known good state if available 396 | if hasattr(self, '_state'): 397 | return self._state 398 | raise 399 | 400 | @observe(name='browser.detect_browser_elements') 401 | async def detect_browser_elements(self) -> InteractiveElementsData: 402 | """Get all interactive elements on the page""" 403 | page = await self.get_current_page() 404 | result = await page.evaluate(INTERACTIVE_ELEMENTS_JS_CODE) 405 | interactive_elements_data = InteractiveElementsData(**result) 406 | 407 | return interactive_elements_data 408 | 409 | @observe(name='browser.get_interactive_elements', ignore_inputs=["screenshot_b64"]) 410 | async def get_interactive_elements(self, screenshot_b64: str, 
detect_sheets: bool = False) -> InteractiveElementsData: 411 | """ 412 | Get interactive elements using combined browser and CV detection. 413 | 414 | Args: 415 | screenshot_b64: Optional base64 encoded screenshot. If None, a new screenshot will be taken. 416 | detect_sheets: Whether to detect sheets elements 417 | Returns: 418 | Combined detection results 419 | """ 420 | 421 | elements = [] 422 | 423 | if self.detector is not None: 424 | browser_elements_data = await self.detect_browser_elements() 425 | 426 | scale_factor = browser_elements_data.viewport.width / 1024 427 | 428 | cv_elements = await self.detector.detect_from_image(screenshot_b64, scale_factor, detect_sheets) 429 | 430 | # Combine and filter detections 431 | elements = filter_elements(browser_elements_data.elements + cv_elements) 432 | else: 433 | browser_elements_data = await self.detect_browser_elements() 434 | elements = browser_elements_data.elements 435 | 436 | # Create new InteractiveElementsData with combined elements 437 | return InteractiveElementsData( 438 | viewport=browser_elements_data.viewport, 439 | elements=elements 440 | ) 441 | 442 | async def get_cdp_session(self): 443 | """Get or create a CDP session for the current page""" 444 | 445 | # Create a new session if we don't have one or the page has changed 446 | if (self._cdp_session is None or 447 | not hasattr(self._cdp_session, '_page') or 448 | self._cdp_session._page != self.current_page): 449 | self._cdp_session = await self.context.new_cdp_session(self.current_page) 450 | # Store reference to the page this session belongs to 451 | self._cdp_session._page = self.current_page 452 | 453 | return self._cdp_session 454 | 455 | @observe(name='browser.take_screenshot', ignore_output=True) 456 | async def fast_screenshot(self) -> str: 457 | """ 458 | Returns a base64 encoded screenshot of the current page. 
459 | 460 | Returns: 461 | Base64 encoded screenshot 462 | """ 463 | # Use cached CDP session instead of creating a new one each time 464 | cdp_session = await self.get_cdp_session() 465 | screenshot_params = { 466 | "format": "png", 467 | "fromSurface": False, 468 | "captureBeyondViewport": False, 469 | } 470 | 471 | # Capture screenshot using CDP Session 472 | screenshot_data = await cdp_session.send("Page.captureScreenshot", screenshot_params) 473 | screenshot_b64 = screenshot_data["data"] 474 | 475 | if self.screenshot_scale_factor is None: 476 | 477 | test_img_data = base64.b64decode(screenshot_b64) 478 | test_img = Image.open(io.BytesIO(test_img_data)) 479 | logger.info(f'Test image size: {test_img.size}') 480 | self.screenshot_scale_factor = 1024 / test_img.size[0] 481 | logger.info(f'Screenshot scale factor: {self.screenshot_scale_factor}') 482 | 483 | screenshot_b64 = scale_b64_image(screenshot_b64, self.screenshot_scale_factor) 484 | return screenshot_b64 485 | 486 | async def get_cookies(self) -> list[dict[str, Any]]: 487 | """Get cookies from the browser""" 488 | if self.context: 489 | cookies = await self.context.cookies() 490 | return cookies 491 | return [] 492 | 493 | async def get_storage_state(self) -> dict[str, Any]: 494 | """Get local storage from the browser""" 495 | 496 | if self.context: 497 | cookies = await self.context.cookies() 498 | 499 | return { 500 | 'cookies': cookies, 501 | } 502 | return {} 503 | -------------------------------------------------------------------------------- /index/browser/detector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Computer vision detector module. 
3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from typing import List 7 | 8 | from index.browser.models import InteractiveElement 9 | 10 | 11 | class Detector(ABC): 12 | """Abstract interface for object detection in browser screenshots.""" 13 | 14 | @abstractmethod 15 | async def detect_from_image(self, image_b64: str, scale_factor: float, detect_sheets: bool = False) -> List[InteractiveElement]: 16 | """ 17 | Detect interactive elements from a base64 encoded image. 18 | 19 | Args: 20 | image_b64: Base64 encoded image screenshot. 21 | scale_factor: Scale factor to scale the coordinates of screenshot to browser viewport coordinates. 22 | detect_sheets: Flag to indicate if specialized sheet detection should be used. 23 | 24 | Returns: 25 | List of detected InteractiveElement objects. 26 | """ 27 | pass -------------------------------------------------------------------------------- /index/browser/fonts/OpenSans-Medium.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/browser/fonts/OpenSans-Medium.ttf -------------------------------------------------------------------------------- /index/browser/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from pydantic import BaseModel, ConfigDict 5 | from pydantic.alias_generators import to_camel 6 | 7 | 8 | # Pydantic 9 | class TabInfo(BaseModel): 10 | """Represents information about a browser tab""" 11 | 12 | page_id: int 13 | url: str 14 | title: str 15 | 16 | class Coordinates(BaseModel): 17 | x: int 18 | y: int 19 | width: Optional[int] = None 20 | height: Optional[int] = None 21 | 22 | class Rect(BaseModel): 23 | left: int 24 | top: int 25 | right: int 26 | bottom: int 27 | width: int 28 | height: int 29 | 30 | class InteractiveElement(BaseModel): 31 | 
"""Represents an interactive element on the page""" 32 | model_config = ConfigDict( 33 | alias_generator=to_camel, 34 | populate_by_name=True, 35 | from_attributes=True, 36 | ) 37 | 38 | index: int 39 | tag_name: str 40 | text: str 41 | attributes: dict[str, str] 42 | viewport: Coordinates 43 | page: Coordinates 44 | center: Coordinates 45 | weight: float 46 | browser_agent_id: str 47 | input_type: Optional[str] = field(default=None) 48 | rect: Rect 49 | z_index: int 50 | 51 | class BrowserError(Exception): 52 | """Base class for all browser errors""" 53 | 54 | 55 | class URLNotAllowedError(BrowserError): 56 | """Error raised when a URL is not allowed""" 57 | 58 | class Viewport(BaseModel): 59 | """Represents the viewport of the browser""" 60 | model_config = ConfigDict( 61 | alias_generator=to_camel, 62 | populate_by_name=True, 63 | from_attributes=True, 64 | ) 65 | 66 | width: int = field(default_factory=lambda: 1024) 67 | height: int = field(default_factory=lambda: 768) 68 | scroll_x: int = field(default_factory=lambda: 0) 69 | scroll_y: int = field(default_factory=lambda: 0) 70 | device_pixel_ratio: float = field(default_factory=lambda: 1) 71 | scroll_distance_above_viewport: int = field(default_factory=lambda: 0) 72 | scroll_distance_below_viewport: int = field(default_factory=lambda: 0) 73 | 74 | class InteractiveElementsData(BaseModel): 75 | """Represents the data returned by the interactive elements script""" 76 | 77 | viewport: Viewport 78 | elements: list[InteractiveElement] 79 | 80 | @dataclass 81 | class BrowserState: 82 | url: str 83 | tabs: list[TabInfo] 84 | viewport: Viewport = field(default_factory=Viewport) 85 | screenshot_with_highlights: Optional[str] = None 86 | screenshot: Optional[str] = None 87 | interactive_elements: dict[int, InteractiveElement] = field(default_factory=dict) 88 | -------------------------------------------------------------------------------- /index/browser/utils.py: 
-------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | from io import BytesIO 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from PIL import Image, ImageDraw, ImageFont 8 | 9 | from index.browser.models import InteractiveElement, Rect 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | def put_highlight_elements_on_screenshot(elements: dict[int, InteractiveElement], screenshot_b64: str) -> str: 14 | """Highlight elements using Pillow instead of OpenCV""" 15 | try: 16 | # Decode base64 to PIL Image 17 | image_data = base64.b64decode(screenshot_b64) 18 | image = Image.open(BytesIO(image_data)) 19 | draw = ImageDraw.Draw(image) 20 | 21 | # Colors (RGB format for PIL) 22 | base_colors = [ 23 | (204, 0, 0), 24 | (0, 136, 0), 25 | (0, 0, 204), 26 | (204, 112, 0), 27 | (102, 0, 102), 28 | (0, 102, 102), 29 | (204, 51, 153), 30 | (44, 0, 102), 31 | (204, 35, 0), 32 | (28, 102, 66), 33 | (170, 0, 0), 34 | (36, 82, 123) 35 | ] 36 | placed_labels = [] 37 | 38 | def generate_unique_color(base_color, element_idx): 39 | """Generate a unique color variation based on element index""" 40 | r, g, b = base_color 41 | # Use prime numbers to create deterministic but non-repeating patterns 42 | offset_r = (element_idx * 17) % 31 - 15 # Range: -15 to 15 43 | offset_g = (element_idx * 23) % 29 - 14 # Range: -14 to 14 44 | offset_b = (element_idx * 13) % 27 - 13 # Range: -13 to 13 45 | 46 | # Ensure RGB values stay within 0-255 range 47 | r = max(0, min(255, r + offset_r)) 48 | g = max(0, min(255, g + offset_g)) 49 | b = max(0, min(255, b + offset_b)) 50 | 51 | return (r, g, b) 52 | 53 | # Load custom font from the package 54 | try: 55 | # Path to your packaged font 56 | font_path = Path(__file__).parent / "fonts" / "OpenSans-Medium.ttf" 57 | font = ImageFont.truetype(str(font_path), 11) 58 | except Exception as e: 59 | logger.warning(f"Could not load custom font: {e}, falling back to default") 60 | 
font = ImageFont.load_default() 61 | 62 | for idx, element in elements.items(): 63 | 64 | # don't draw sheets elements 65 | if element.browser_agent_id.startswith("row_") or element.browser_agent_id.startswith("column_"): 66 | continue 67 | 68 | base_color = base_colors[idx % len(base_colors)] 69 | color = generate_unique_color(base_color, idx) 70 | 71 | rect = element.rect 72 | 73 | # Draw rectangle 74 | draw.rectangle( 75 | [(rect.left, rect.top), (rect.right, rect.bottom)], 76 | outline=color, 77 | width=2 78 | ) 79 | 80 | # Prepare label 81 | text = str(idx) 82 | 83 | # Get precise text dimensions for proper centering 84 | text_bbox = draw.textbbox((0, 0), text, font=font) 85 | text_width = text_bbox[2] - text_bbox[0] 86 | text_height = text_bbox[3] - text_bbox[1] 87 | 88 | # Make label size exactly proportional for better aesthetics 89 | label_width = text_width + 4 90 | label_height = text_height + 4 91 | 92 | # Positioning logic 93 | if label_width > rect.width or label_height > rect.height: 94 | label_x = rect.left + rect.width 95 | label_y = rect.top 96 | else: 97 | label_x = rect.left + rect.width - label_width 98 | label_y = rect.top 99 | 100 | # Check for overlaps with existing labels 101 | label_rect = { 102 | 'left': label_x, 'top': label_y, 103 | 'right': label_x + label_width, 'bottom': label_y + label_height 104 | } 105 | 106 | for existing in placed_labels: 107 | if not (label_rect['right'] < existing['left'] or 108 | label_rect['left'] > existing['right'] or 109 | label_rect['bottom'] < existing['top'] or 110 | label_rect['top'] > existing['bottom']): 111 | label_y = existing['bottom'] + 2 112 | label_rect['top'] = label_y 113 | label_rect['bottom'] = label_y + label_height 114 | break 115 | 116 | # Ensure label is visible within image boundaries 117 | img_width, img_height = image.size 118 | if label_x < 0: 119 | label_x = 0 120 | elif label_x + label_width >= img_width: 121 | label_x = img_width - label_width - 1 122 | 123 | if label_y < 0: 124 
| label_y = 0 125 | elif label_y + label_height >= img_height: 126 | label_y = img_height - label_height - 1 127 | 128 | # Draw label background 129 | draw.rectangle( 130 | [(label_x, label_y), (label_x + label_width, label_y + label_height)], 131 | fill=color 132 | ) 133 | 134 | # magic numbers to center the text 135 | text_x = label_x + 3 136 | text_y = label_y - 1 137 | 138 | # Draw text 139 | draw.text( 140 | (text_x, text_y), 141 | text, 142 | fill=(255, 255, 255), 143 | font=font 144 | ) 145 | 146 | placed_labels.append(label_rect) 147 | 148 | # Convert back to base64 149 | buffer = BytesIO() 150 | image.save(buffer, format="PNG") 151 | new_image_base64 = base64.b64encode(buffer.getvalue()).decode() 152 | 153 | return new_image_base64 154 | 155 | except Exception as e: 156 | logger.error(f"Failed to add highlights to screenshot: {str(e)}") 157 | return screenshot_b64 158 | 159 | 160 | def scale_b64_image(image_b64: str, scale_factor: float) -> str: 161 | """ 162 | Scale down a base64 encoded image using Pillow. 
163 | 164 | Args: 165 | image_b64: Base64 encoded image string 166 | scale_factor: Factor to scale the image by (0.5 = half size) 167 | 168 | Returns: 169 | Base64 encoded scaled image 170 | """ 171 | try: 172 | # Decode base64 to PIL Image 173 | image_data = base64.b64decode(image_b64) 174 | image = Image.open(BytesIO(image_data)) 175 | 176 | if image is None: 177 | return image_b64 178 | 179 | # Get original dimensions 180 | width, height = image.size 181 | 182 | # Calculate new dimensions 183 | new_width = int(width * scale_factor) 184 | new_height = int(height * scale_factor) 185 | 186 | # Resize the image using high quality resampling 187 | resized_image = image.resize( 188 | (new_width, new_height), 189 | Image.LANCZOS 190 | ) 191 | 192 | # Convert back to base64 193 | buffer = BytesIO() 194 | resized_image.save(buffer, format="PNG") 195 | resized_image_b64 = base64.b64encode(buffer.getvalue()).decode() 196 | 197 | return resized_image_b64 198 | 199 | except Exception: 200 | return image_b64 201 | 202 | 203 | def calculate_iou(rect1: Rect, rect2: Rect) -> float: 204 | """ 205 | Calculate Intersection over Union between two rectangles. 
206 | 207 | Args: 208 | rect1: First rectangle with left, top, right, bottom keys 209 | rect2: Second rectangle with left, top, right, bottom keys 210 | 211 | Returns: 212 | IoU value 213 | """ 214 | # Calculate intersection 215 | intersect_left = max(rect1.left, rect2.left) 216 | intersect_top = max(rect1.top, rect2.top) 217 | intersect_right = min(rect1.right, rect2.right) 218 | intersect_bottom = min(rect1.bottom, rect2.bottom) 219 | 220 | # Check if intersection exists 221 | if intersect_right < intersect_left or intersect_bottom < intersect_top: 222 | return 0.0 # No intersection 223 | 224 | # Calculate area of each rectangle 225 | area1 = (rect1.right - rect1.left) * (rect1.bottom - rect1.top) 226 | area2 = (rect2.right - rect2.left) * (rect2.bottom - rect2.top) 227 | 228 | # Calculate area of intersection 229 | intersection_area = (intersect_right - intersect_left) * (intersect_bottom - intersect_top) 230 | 231 | # Calculate union area 232 | union_area = area1 + area2 - intersection_area 233 | 234 | # Calculate IoU 235 | return intersection_area / union_area if union_area > 0 else 0.0 236 | 237 | 238 | def is_fully_contained(rect1: Rect, rect2: Rect) -> bool: 239 | """ 240 | Check if rect1 is fully contained within rect2. 241 | 242 | Args: 243 | rect1: First rectangle with left, top, right, bottom keys 244 | rect2: Second rectangle with left, top, right, bottom keys 245 | 246 | Returns: 247 | True if rect1 is fully contained within rect2 248 | """ 249 | return (rect1.left >= rect2.left and 250 | rect1.right <= rect2.right and 251 | rect1.top >= rect2.top and 252 | rect1.bottom <= rect2.bottom) 253 | 254 | 255 | def filter_overlapping_elements(elements: List[InteractiveElement], iou_threshold: float = 0.7) -> List[InteractiveElement]: 256 | """ 257 | Filter overlapping elements using weight and IoU. 
258 | 259 | Args: 260 | elements: Elements to filter 261 | iou_threshold: Threshold for considering elements as overlapping 262 | 263 | Returns: 264 | Filtered elements 265 | """ 266 | if not elements: 267 | return [] 268 | 269 | # Sort by area (descending), then by weight (descending) 270 | elements.sort(key=lambda e: ( 271 | -(e.rect.width * e.rect.height), # Negative area for descending sort 272 | -e.weight # Negative weight for descending sort 273 | )) 274 | 275 | filtered_elements: List[InteractiveElement] = [] 276 | 277 | # Add elements one by one, checking against already added elements 278 | for current in elements: 279 | should_add = True 280 | 281 | # For each element already in our filtered list 282 | for existing in filtered_elements: 283 | # Check overlap with IoU 284 | iou = calculate_iou(current.rect, existing.rect) 285 | if iou > iou_threshold: 286 | should_add = False 287 | break 288 | 289 | # Check if current element is fully contained within an existing element with higher weight 290 | if is_fully_contained(current.rect, existing.rect): 291 | if existing.weight >= current.weight and existing.z_index == current.z_index: 292 | should_add = False 293 | break 294 | else: 295 | # If current element has higher weight and is more than 50% of the size of the existing element, remove the existing element 296 | if current.rect.width * current.rect.height >= existing.rect.width * existing.rect.height * 0.5: 297 | filtered_elements.remove(existing) 298 | break 299 | 300 | if should_add: 301 | filtered_elements.append(current) 302 | 303 | return filtered_elements 304 | 305 | 306 | def sort_elements_by_position(elements: List[InteractiveElement]) -> List[InteractiveElement]: 307 | """ 308 | Sort elements by position (top to bottom, left to right). 
309 | 310 | Args: 311 | elements: Elements to sort 312 | 313 | Returns: 314 | Sorted elements 315 | """ 316 | if not elements: 317 | return [] 318 | 319 | # Define what "same row" means 320 | ROW_THRESHOLD = 20 # pixels 321 | 322 | # First, group elements into rows based on Y position 323 | rows = [] 324 | current_row = [] 325 | 326 | # Copy and sort elements by Y position 327 | sorted_by_y = sorted(elements, key=lambda e: e.rect.top) 328 | 329 | # Group into rows 330 | for element in sorted_by_y: 331 | if not current_row: 332 | # Start a new row 333 | current_row.append(element) 334 | else: 335 | # Check if this element is in the same row as the previous ones 336 | last_element = current_row[-1] 337 | if abs(element.rect.top - last_element.rect.top) <= ROW_THRESHOLD: 338 | # Same row 339 | current_row.append(element) 340 | else: 341 | # New row 342 | rows.append(list(current_row)) 343 | current_row = [element] 344 | 345 | # Add the last row if not empty 346 | if current_row: 347 | rows.append(current_row) 348 | 349 | # Sort each row by X position (left to right) 350 | for row in rows: 351 | row.sort(key=lambda e: e.rect.left) 352 | 353 | # Flatten the rows back into a single array 354 | elements = [element for row in rows for element in row] 355 | 356 | for i, element in enumerate(elements): 357 | element.index = i 358 | 359 | return elements 360 | 361 | 362 | def filter_elements( 363 | elements: List[InteractiveElement], 364 | iou_threshold: float = 0.7 365 | ) -> List[InteractiveElement]: 366 | """ 367 | Combine interactive elements from multiple detection methods and filter duplicates. 
368 | 369 | Args: 370 | elements: Interactive elements from multiple detection methods 371 | iou_threshold: Threshold for considering elements as overlapping 372 | 373 | Returns: 374 | Combined and filtered elements 375 | """ 376 | #Filter overlapping elements 377 | filtered = filter_overlapping_elements(elements, iou_threshold) 378 | 379 | # Sort elements by position 380 | sorted_elements = sort_elements_by_position(filtered) 381 | 382 | return sorted_elements -------------------------------------------------------------------------------- /index/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import asyncio 3 | import json 4 | import logging 5 | import os 6 | import subprocess 7 | import time 8 | from typing import Dict, List, Optional 9 | 10 | import requests 11 | import typer 12 | from dotenv import load_dotenv 13 | from rich.console import Console 14 | from rich.logging import RichHandler 15 | from rich.markdown import Markdown 16 | from rich.panel import Panel 17 | from rich.prompt import Prompt 18 | from textual.app import App 19 | from textual.containers import Container, Horizontal, Vertical 20 | from textual.reactive import reactive 21 | from textual.widgets import Button, Footer, Header, Input, Static 22 | 23 | from index.agent.agent import Agent 24 | from index.agent.models import AgentOutput, AgentState 25 | from index.browser.browser import BrowserConfig 26 | from index.llm.llm import BaseLLMProvider 27 | from index.llm.providers.anthropic import AnthropicProvider 28 | from index.llm.providers.gemini import GeminiProvider 29 | from index.llm.providers.openai import OpenAIProvider 30 | 31 | load_dotenv() 32 | 33 | # Create Typer app 34 | app = typer.Typer(help="Index - Browser AI agent CLI") 35 | 36 | # Configuration constants 37 | BROWSER_STATE_FILE = "browser_state.json" 38 | DEFAULT_CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" 39 | 
DEFAULT_DEBUGGING_PORT = 9222 40 | 41 | console = Console() 42 | 43 | def setup_logging(debug: bool = False): 44 | """Configure logging based on debug flag""" 45 | log_level = logging.INFO if debug else logging.WARNING 46 | 47 | # Configure root logger 48 | logging.basicConfig( 49 | level=log_level, 50 | format="%(message)s", 51 | datefmt="[%X]", 52 | handlers=[RichHandler(rich_tracebacks=True, console=console)] 53 | ) 54 | 55 | # Set specific logger levels 56 | logging.getLogger("index").setLevel(log_level) 57 | logging.getLogger("playwright").setLevel(logging.WARNING) # Always keep playwright at WARNING 58 | 59 | if debug: 60 | console.print("[yellow]Debug mode enabled - logging set to INFO level[/]") 61 | 62 | class AgentSession: 63 | """Manages an agent session with state persistence""" 64 | 65 | def __init__(self, llm: Optional[BaseLLMProvider] = None, use_local_chrome: bool = False, chrome_path: str = DEFAULT_CHROME_PATH, debugging_port: int = DEFAULT_DEBUGGING_PORT, debug: bool = False): 66 | self.llm = llm 67 | self.chrome_process = None 68 | self.use_local_chrome = use_local_chrome 69 | self.chrome_path = chrome_path 70 | self.debugging_port = debugging_port 71 | self.logger = logging.getLogger("index.agent_session") 72 | 73 | browser_config = None 74 | 75 | if os.path.exists(BROWSER_STATE_FILE) and not use_local_chrome: 76 | with open(BROWSER_STATE_FILE, "r") as f: 77 | self.storage_state = json.load(f) 78 | console.print("[green]Loaded existing browser state[/green]") 79 | browser_config = BrowserConfig( 80 | storage_state=self.storage_state, 81 | viewport_size={ 82 | "width": 1200, 83 | "height": 800 84 | } 85 | ) 86 | else: 87 | if use_local_chrome: 88 | # Launch Chrome and connect to it 89 | self._launch_local_chrome() 90 | browser_config = BrowserConfig( 91 | cdp_url="http://localhost:" + str(self.debugging_port), 92 | ) 93 | else: 94 | browser_config = BrowserConfig( 95 | viewport_size={ 96 | "width": 1200, 97 | "height": 800 98 | } 99 | ) 100 | 101 
| self.agent = Agent(llm=self.llm, browser_config=browser_config) 102 | self.agent_state: Optional[str] = None 103 | self.step_count: int = 0 104 | self.action_results: List[Dict] = [] 105 | self.is_running: bool = False 106 | self.storage_state: Optional[Dict] = None 107 | 108 | def _launch_local_chrome(self): 109 | """Launch a local Chrome instance with remote debugging enabled""" 110 | # Check if Chrome is already running with the specified debugging port 111 | try: 112 | response = requests.get(f"http://localhost:{self.debugging_port}/json/version", timeout=2) 113 | if response.status_code == 200: 114 | console.print(f"[green]Connected to already running Chrome instance on port {self.debugging_port}[/green]") 115 | self.logger.info(f"Connected to existing Chrome instance on port {self.debugging_port}") 116 | return 117 | except requests.RequestException: 118 | # No running Chrome instance found on the specified port, proceed with launching a new one 119 | pass 120 | 121 | console.print(f"[blue]Launching Chrome from {self.chrome_path} with debugging port {self.debugging_port}[/blue]") 122 | 123 | try: 124 | self.chrome_process = subprocess.Popen( 125 | [self.chrome_path, f"--remote-debugging-port={self.debugging_port}", "--no-first-run", "--no-default-browser-check"], 126 | stdout=subprocess.DEVNULL, 127 | stderr=subprocess.DEVNULL, 128 | ) 129 | console.print("[green]Chrome launched successfully[/green]") 130 | self.logger.info(f"Chrome process started with PID {self.chrome_process.pid}") 131 | # Give Chrome time to start up 132 | time.sleep(2) 133 | except Exception as e: 134 | self.logger.error(f"Failed to launch Chrome: {str(e)}") 135 | console.print(f"[red]Failed to launch Chrome: {str(e)}[/red]") 136 | raise 137 | 138 | def save_state(self, agent_output: AgentOutput): 139 | """Save agent state to file""" 140 | 141 | if agent_output.storage_state: 142 | with open(BROWSER_STATE_FILE, "w") as f: 143 | json.dump(agent_output.storage_state, f) 144 | 145 | 
self.logger.info("Agent state saved to file") 146 | console.print("[green]Saved agent state[/green]") 147 | 148 | async def run_agent(self, prompt: str) -> AgentOutput: 149 | """Run the agent with the given prompt""" 150 | self.is_running = True 151 | self.logger.info(f"Running agent with prompt: {prompt}") 152 | 153 | try: 154 | # Run the agent 155 | if self.agent_state: 156 | result = await self.agent.run( 157 | prompt=prompt, 158 | agent_state=self.agent_state, 159 | close_context=False, 160 | return_storage_state=True, 161 | return_agent_state=True 162 | ) 163 | else: 164 | result = await self.agent.run( 165 | prompt=prompt, 166 | close_context=False, 167 | return_storage_state=True, 168 | return_agent_state=True 169 | ) 170 | 171 | self.step_count = result.step_count 172 | self.agent_state = result.agent_state.model_dump_json() 173 | self.save_state(result) 174 | 175 | return result 176 | finally: 177 | self.is_running = False 178 | 179 | async def stream_run(self, prompt: str): 180 | """Run the agent with streaming output""" 181 | self.is_running = True 182 | self.logger.info(f"Running agent with streaming and prompt: {prompt}") 183 | 184 | try: 185 | # Run the agent with streaming 186 | if self.agent_state: 187 | stream = self.agent.run_stream( 188 | prompt=prompt, 189 | agent_state=self.agent_state, 190 | close_context=False, 191 | max_steps=500, # large number to allow the agent to run for a long time 192 | return_agent_state=True, 193 | return_storage_state=True 194 | ) 195 | else: 196 | stream = self.agent.run_stream( 197 | prompt=prompt, 198 | close_context=False, 199 | max_steps=500, # large number to allow the agent to run for a long time 200 | return_agent_state=True, 201 | return_storage_state=True 202 | ) 203 | 204 | final_output = None 205 | async for chunk in stream: 206 | # Directly yield the raw chunk without any modifications 207 | yield chunk 208 | 209 | # Store final output for state saving 210 | if chunk.type == "final_output": 211 | 
final_output = chunk.content 212 | 213 | if final_output: 214 | self.step_count = final_output.step_count 215 | self.agent_state = final_output.agent_state.model_dump_json() 216 | self.save_state(final_output) 217 | 218 | finally: 219 | self.is_running = False 220 | 221 | def reset(self): 222 | """Reset agent state""" 223 | if os.path.exists(BROWSER_STATE_FILE): 224 | os.remove(BROWSER_STATE_FILE) 225 | self.agent_state = None 226 | self.step_count = 0 227 | self.action_results = [] 228 | self.logger.info("Agent state reset") 229 | console.print("[yellow]Agent state reset[/yellow]") 230 | 231 | async def close(self): 232 | """Close the agent and any associated resources""" 233 | # Close the browser instance 234 | if self.agent and self.agent.browser: 235 | self.logger.info("Closing browser instance") 236 | await self.agent.browser.close() 237 | 238 | # Terminate Chrome process if launched locally 239 | if self.chrome_process: 240 | self.logger.info(f"Terminating Chrome process with PID {self.chrome_process.pid}") 241 | console.print("[yellow]Terminating local Chrome instance...[/yellow]") 242 | self.chrome_process.terminate() 243 | self.chrome_process = None 244 | 245 | 246 | class AgentUI(App): 247 | """Textual-based UI for interacting with the agent""" 248 | 249 | CSS = """ 250 | Header { 251 | background: #3b82f6; 252 | color: white; 253 | text-align: center; 254 | padding: 1; 255 | } 256 | 257 | Footer { 258 | background: #1e3a8a; 259 | color: white; 260 | text-align: center; 261 | padding: 1; 262 | } 263 | 264 | #prompt-input { 265 | padding: 1 2; 266 | border: tall $accent; 267 | margin: 1 1; 268 | height: 3; 269 | } 270 | 271 | #output-container { 272 | height: 1fr; 273 | border: solid #ccc; 274 | background: #f8fafc; 275 | padding: 1; 276 | margin: 0 1; 277 | overflow-y: auto; 278 | } 279 | 280 | #action-results { 281 | height: 15; 282 | border: solid #ccc; 283 | background: #f8fafc; 284 | margin: 0 1 1 1; 285 | overflow-y: auto; 286 | } 287 | 288 | 
.action-result { 289 | border: solid #e5e7eb; 290 | margin: 1 0; 291 | padding: 1; 292 | } 293 | 294 | .action-title { 295 | color: #3b82f6; 296 | text-style: bold; 297 | } 298 | 299 | .action-content { 300 | margin-top: 1; 301 | } 302 | 303 | Button { 304 | margin: 1 1; 305 | } 306 | 307 | #buttons-container { 308 | height: auto; 309 | align: center middle; 310 | } 311 | 312 | .running { 313 | color: #f97316; 314 | text-style: bold; 315 | } 316 | 317 | .completed { 318 | color: #22c55e; 319 | text-style: bold; 320 | } 321 | 322 | .error { 323 | color: #ef4444; 324 | text-style: bold; 325 | } 326 | """ 327 | 328 | TITLE = "Index Browser Agent CLI" 329 | BINDINGS = [ 330 | ("q", "quit", "Quit"), 331 | ("r", "reset", "Reset Agent"), 332 | ("ctrl+s", "send", "Send Message"), 333 | ] 334 | 335 | agent_session = None 336 | status = reactive("Ready") 337 | 338 | def compose(self): 339 | yield Header() 340 | 341 | with Vertical(): 342 | with Container(id="output-container"): 343 | yield Static(id="output", expand=True) 344 | 345 | with Container(id="action-results"): 346 | yield Static(id="results", expand=True) 347 | 348 | with Horizontal(id="buttons-container"): 349 | yield Button("Send", id="send-btn", variant="primary") 350 | yield Button("Reset", id="reset-btn", variant="error") 351 | 352 | yield Input(placeholder="Enter your task or follow-up message...", id="prompt-input") 353 | 354 | yield Footer() 355 | 356 | def update_output(self): 357 | """Update the output display""" 358 | output = "" 359 | 360 | if self.agent_session.agent_state: 361 | state = AgentState.model_validate_json(self.agent_session.agent_state) 362 | 363 | # Get the latest user and assistant messages 364 | user_msgs = [m for m in state.messages if m.role == "user"] 365 | assistant_msgs = [m for m in state.messages if m.role == "assistant"] 366 | 367 | if user_msgs: 368 | latest_user = user_msgs[-1] 369 | output += f"[bold blue]User:[/] {latest_user.content}\n\n" 370 | 371 | if assistant_msgs: 372 
| latest_assistant = assistant_msgs[-1] 373 | output += f"[bold green]Assistant:[/] {latest_assistant.content}\n\n" 374 | 375 | output += f"[dim]Steps completed: {self.agent_session.step_count}[/]\n" 376 | output += f"[dim]Status: {self.status}[/]\n" 377 | else: 378 | output = "[italic]No previous session. Start by sending a task.[/]" 379 | 380 | self.query_one("#output", Static).update(Markdown(output)) 381 | 382 | # Update action results 383 | if self.agent_session.action_results: 384 | results_output = "" 385 | for i, result in enumerate(reversed(self.agent_session.action_results[-5:])): 386 | action_type = result.get("type", "unknown") 387 | content = result.get("content", {}) 388 | 389 | if action_type == "step": 390 | action_result = content.get("action_result", {}) 391 | summary = content.get("summary", "No summary available") 392 | 393 | results_output += f"[bold]Step {i+1}[/]\n" 394 | results_output += f"Summary: {summary}\n" 395 | 396 | if action_result.get("is_done"): 397 | results_output += "[green]Task completed[/]\n" 398 | 399 | if action_result.get("give_control"): 400 | results_output += "[yellow]Agent requested human control[/]\n" 401 | results_output += f"Message: {action_result.get('content', '')}\n" 402 | 403 | results_output += "\n" 404 | 405 | elif action_type == "error": 406 | results_output += "[bold red]Error[/]\n" 407 | results_output += f"{content}\n\n" 408 | 409 | self.query_one("#results", Static).update(Markdown(results_output)) 410 | 411 | async def on_button_pressed(self, event: Button.Pressed): 412 | """Handle button presses""" 413 | if event.button.id == "send-btn": 414 | await self.action_send() 415 | elif event.button.id == "reset-btn": 416 | self.action_reset() 417 | 418 | def action_reset(self): 419 | """Reset the agent state""" 420 | self.agent_session.reset() 421 | self.agent_session.action_results = [] 422 | self.update_output() 423 | 424 | async def action_send(self): 425 | """Send the current prompt to the agent""" 426 | 
prompt = self.query_one("#prompt-input", Input).value 427 | 428 | if not prompt.strip(): 429 | return 430 | 431 | self.status = "Running..." 432 | self.query_one("#prompt-input", Input).value = "" 433 | self.update_output() 434 | 435 | try: 436 | # Stream the results to provide real-time feedback 437 | async for chunk in self.agent_session.stream_run(prompt): 438 | self.agent_session.action_results.append(chunk) 439 | self.update_output() 440 | await asyncio.sleep(0.1) # Small delay to ensure UI updates 441 | 442 | self.status = "Ready" 443 | except Exception as e: 444 | self.status = f"Error: {str(e)}" 445 | finally: 446 | self.update_output() 447 | 448 | async def on_mount(self): 449 | """Called when the app is mounted""" 450 | # Register cleanup handler 451 | self.set_interval(0.1, self._check_exit) 452 | 453 | async def _check_exit(self): 454 | """Check if app is exiting and clean up resources""" 455 | if self.exiting: 456 | if self.agent_session: 457 | await self.agent_session.close() 458 | 459 | def action_quit(self): 460 | """Quit the application""" 461 | self.exit() 462 | 463 | 464 | @app.command() 465 | def run( 466 | prompt: str = typer.Option(None, "--prompt", "-p", help="Initial prompt to send to the agent"), 467 | use_local_chrome: bool = typer.Option(False, "--local-chrome", help="Use local Chrome instance instead of launching a new browser"), 468 | chrome_path: str = typer.Option(DEFAULT_CHROME_PATH, "--chrome-path", help="Path to Chrome executable"), 469 | debugging_port: int = typer.Option(DEFAULT_DEBUGGING_PORT, "--port", help="Remote debugging port for Chrome"), 470 | debug: bool = typer.Option(False, "--debug", help="Enable debug logging") 471 | ): 472 | """ 473 | Launch the interactive loop for the Index browser agent 474 | """ 475 | # Set up logging if debug mode is enabled 476 | setup_logging(debug) 477 | 478 | asyncio.run(_interactive_loop( 479 | initial_prompt=prompt, 480 | use_local_chrome=use_local_chrome, 481 | chrome_path=chrome_path, 
482 | debugging_port=debugging_port, 483 | debug=debug 484 | )) 485 | 486 | 487 | @app.command(name="ui") 488 | def run_ui( 489 | prompt: str = typer.Option(None, "--prompt", "-p", help="Initial prompt to send to the agent"), 490 | use_local_chrome: bool = typer.Option(False, "--local-chrome", help="Use local Chrome instance instead of launching a new browser"), 491 | chrome_path: str = typer.Option(DEFAULT_CHROME_PATH, "--chrome-path", help="Path to Chrome executable"), 492 | debugging_port: int = typer.Option(DEFAULT_DEBUGGING_PORT, "--port", help="Remote debugging port for Chrome"), 493 | debug: bool = typer.Option(False, "--debug", help="Enable debug logging") 494 | ): 495 | """ 496 | Launch the graphical UI for the Index browser agent 497 | """ 498 | # Set up logging if debug mode is enabled 499 | setup_logging(debug) 500 | 501 | # Select model and check API key 502 | llm_provider = select_model_and_check_key() 503 | 504 | # Initialize UI with the selected LLM provider 505 | agent_ui = AgentUI() 506 | agent_ui.agent_session = AgentSession( 507 | llm=llm_provider, 508 | use_local_chrome=use_local_chrome, 509 | chrome_path=chrome_path, 510 | debugging_port=debugging_port, 511 | debug=debug 512 | ) 513 | 514 | if prompt: 515 | # If a prompt is provided, we'll send it once the UI is ready 516 | async def send_initial_prompt(): 517 | await asyncio.sleep(0.5) # Give UI time to initialize 518 | agent_ui.query_one("#prompt-input", Input).value = prompt 519 | await agent_ui.action_send() 520 | 521 | agent_ui.set_interval(0.1, lambda: asyncio.create_task(send_initial_prompt())) 522 | 523 | agent_ui.run() 524 | 525 | 526 | def create_llm_provider(provider: str, model: str) -> BaseLLMProvider: 527 | """Create an LLM provider based on model choice""" 528 | if provider == "openai": 529 | # OpenAI model 530 | console.print(f"[cyan]Using OpenAI model: {model}[/]") 531 | return OpenAIProvider(model=model, reasoning_effort="low") 532 | elif provider == "gemini": 533 | # Gemini 
model 534 | if model == "gemini-2.5-pro-preview-03-25": 535 | console.print(f"[cyan]Using Gemini model: {model}[/]") 536 | return GeminiProvider( 537 | model=model, 538 | thinking_token_budget=8192 539 | ) 540 | elif model == "gemini-2.5-flash-preview-04-17": 541 | console.print(f"[cyan]Using Gemini model: {model}[/]") 542 | return GeminiProvider( 543 | model=model, 544 | thinking_token_budget=8192 545 | ) 546 | else: 547 | raise ValueError(f"Unsupported Gemini model: {model}") 548 | elif provider == "anthropic": 549 | # Anthropic model 550 | console.print(f"[cyan]Using Anthropic model: {model}[/]") 551 | return AnthropicProvider( 552 | model=model, 553 | enable_thinking=True, 554 | thinking_token_budget=2048 555 | ) 556 | else: 557 | raise ValueError(f"Unsupported provider: {provider}") 558 | 559 | 560 | def check_and_save_api_key(required_key: str): 561 | """Check if API key exists, prompt for it if missing, and save to .env file""" 562 | if not os.environ.get(required_key): 563 | console.print(f"\n[yellow]API key {required_key} not found in environment.[/]") 564 | api_key = Prompt.ask(f"Enter your {required_key}", password=True) 565 | 566 | # Save to .env file 567 | env_path = ".env" 568 | 569 | if os.path.exists(env_path): 570 | # Read existing content 571 | with open(env_path, "r") as f: 572 | env_content = f.read() 573 | env_content += f"\n{required_key}={api_key}" 574 | 575 | with open(env_path, "w") as f: 576 | f.write(env_content) 577 | console.print(f"[green]Saved {required_key} to .env file[/]") 578 | else: 579 | # Create new .env file 580 | with open(env_path, "w") as f: 581 | f.write(f"{required_key}={api_key}") 582 | console.print("[green]Created .env file[/]") 583 | 584 | # Update environment variable for current session 585 | os.environ[required_key] = api_key 586 | 587 | # Reload dotenv to ensure changes are applied 588 | load_dotenv(override=True) 589 | 590 | 591 | def select_model_and_check_key(): 592 | """Select a model and check for required 
API key""" 593 | console.print("\n[bold green]Choose an LLM model:[/]") 594 | console.print("1. [bold]Gemini 2.5 Pro[/]") 595 | console.print("2. [bold]Gemini 2.5 Flash[/]") 596 | console.print("3. [bold]Claude 3.7 Sonnet[/]") 597 | console.print("4. [bold]OpenAI o4-mini[/]") 598 | 599 | choice = Prompt.ask( 600 | "[bold]Select model[/]", 601 | choices=["1", "2", "3", "4"], 602 | default="1" 603 | ) 604 | 605 | provider = "" 606 | model = "" 607 | required_key = "" 608 | 609 | # Create LLM provider based on selection 610 | if choice == "1": 611 | provider = "gemini" 612 | model = "gemini-2.5-pro-preview-03-25" 613 | required_key = "GEMINI_API_KEY" 614 | elif choice == "2": 615 | provider = "gemini" 616 | model = "gemini-2.5-flash-preview-04-17" 617 | required_key = "GEMINI_API_KEY" 618 | elif choice == "3": 619 | provider = "anthropic" 620 | model = "claude-3-7-sonnet-20250219" 621 | required_key = "ANTHROPIC_API_KEY" 622 | elif choice == "4": 623 | provider = "openai" 624 | model = "o4-mini" 625 | required_key = "OPENAI_API_KEY" 626 | else: 627 | raise ValueError(f"Invalid choice: {choice}") 628 | 629 | # Check and save API key if needed 630 | check_and_save_api_key(required_key) 631 | 632 | return create_llm_provider(provider, model) 633 | 634 | 635 | async def _interactive_loop(initial_prompt: str = None, use_local_chrome: bool = False, chrome_path: str = DEFAULT_CHROME_PATH, debugging_port: int = DEFAULT_DEBUGGING_PORT, debug: bool = False): 636 | """Implementation of the interactive loop mode""" 637 | # Display welcome panel 638 | console.print(Panel.fit( 639 | "Index Browser Agent Interactive Mode\n" 640 | "Type your message and press Enter. 
The agent will respond.\n" 641 | "Press Ctrl+C to exit.", 642 | title="Interactive Mode", 643 | border_style="blue" 644 | )) 645 | 646 | # Select model and check API key 647 | llm_provider = select_model_and_check_key() 648 | 649 | # Create agent session with selected provider 650 | session = AgentSession( 651 | llm=llm_provider, 652 | use_local_chrome=use_local_chrome, 653 | chrome_path=chrome_path, 654 | debugging_port=debugging_port, 655 | debug=debug 656 | ) 657 | 658 | try: 659 | first_message = True 660 | awaiting_human_input = False 661 | 662 | while True: 663 | # Check if we're waiting for the user to return control to the agent 664 | if awaiting_human_input: 665 | console.print("\n[yellow]Agent is waiting for control to be returned.[/]") 666 | console.print("[yellow]Press Enter to return control to the agent...[/]", end="") 667 | input() # Wait for Enter key 668 | user_message = "Returning control back, continue your task" 669 | console.print(f"\n[bold blue]Your message:[/] {user_message}") 670 | awaiting_human_input = False 671 | # Normal message input flow 672 | elif first_message and initial_prompt: 673 | user_message = initial_prompt 674 | console.print(f"\n[bold blue]Your message:[/] {user_message}") 675 | first_message = False 676 | else: 677 | console.print("\n[bold blue]Your message:[/] ", end="") 678 | user_message = input() 679 | first_message = False 680 | 681 | if not user_message.strip(): 682 | continue 683 | 684 | console.print("\n[bold cyan]Agent is working...[/]") 685 | 686 | step_num = 1 687 | human_control_requested = False 688 | 689 | # Run the agent with streaming output 690 | try: 691 | async for chunk in session.stream_run(user_message): 692 | if chunk.type == "step": 693 | action_result = chunk.content.action_result 694 | summary = chunk.content.summary 695 | 696 | # Simple single-line output for steps 697 | console.print(f"[bold blue]Step {step_num}:[/] {summary}") 698 | # Display additional info for special actions as separate 
lines 699 | if action_result and action_result.is_done and not action_result.give_control: 700 | console.print(" [green bold]✓ Task completed successfully![/]") 701 | 702 | if action_result and action_result.give_control: 703 | human_control_requested = True 704 | message = action_result.content or "No message provided" 705 | console.print(" [yellow bold]⚠ Human control requested:[/]") 706 | console.print(f" [yellow]{message}[/]") 707 | 708 | # Increment step counter for next step 709 | step_num += 1 710 | 711 | elif chunk.type == "step_error": 712 | console.print(f"[bold red]Error:[/] {chunk.content}") 713 | 714 | elif chunk.type == "final_output": 715 | # Keep panel for final output 716 | result_content = chunk.content.result.content if chunk.content.result else "No result content" 717 | console.print(Panel( 718 | f"{result_content}", 719 | title="Final Output", 720 | border_style="green", 721 | expand=False 722 | )) 723 | 724 | except Exception as e: 725 | console.print(f"[bold red]Error:[/] {str(e)}") 726 | console.print(f"[dim]Type: {type(e)}[/]") 727 | console.print_exception() 728 | 729 | # After agent completes 730 | if human_control_requested: 731 | console.print("\n[yellow]Agent has requested human control.[/]") 732 | awaiting_human_input = True 733 | else: 734 | console.print("\n[green]Agent has completed the task.[/]") 735 | console.print("[dim]Waiting for your next message...[/]") 736 | 737 | except KeyboardInterrupt: 738 | console.print("\n[yellow]Exiting interactive mode...[/]") 739 | # Close the browser before exiting 740 | await session.close() 741 | 742 | 743 | def main(): 744 | """Entry point for the CLI""" 745 | app() 746 | 747 | 748 | if __name__ == "__main__": 749 | main() -------------------------------------------------------------------------------- /index/controller/controller.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import json 3 | import logging 4 | from dataclasses 
logger = logging.getLogger(__name__)


@dataclass
class Action:
    """Represents a registered action."""
    name: str                     # action name exposed to the LLM (function __name__)
    description: str              # human/LLM-readable description of the action
    function: Callable            # async wrapper around the registered coroutine
    browser_context: bool = False  # True when the action declares a `browser` parameter


class Controller:
    """Controller for browser actions with integrated registry functionality."""

    def __init__(self):
        self._actions: Dict[str, Action] = {}
        # Register default actions
        register_default_actions(self)

    def action(self, description: str = None):
        """
        Decorator for registering actions.

        Args:
            description: Optional description of what the action does.
                If not provided, uses the function's docstring.
        """
        def decorator(func: Callable) -> Callable:
            # Use provided description or fall back to the function docstring.
            action_description = description
            if action_description is None:
                action_description = inspect.getdoc(func) or "No description provided"

            # Clean up docstring (remove indentation).
            action_description = inspect.cleandoc(action_description)

            # Actions that declare a `browser` parameter receive the Browser
            # instance at execution time (see execute_action).
            browser_context = 'browser' in inspect.signature(func).parameters

            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                return await func(*args, **kwargs)

            # Register the action under the function's own name.
            self._actions[func.__name__] = Action(
                name=func.__name__,
                description=action_description,
                function=async_wrapper,
                browser_context=browser_context,
            )
            return func

        return decorator

    async def execute_action(
        self,
        action: ActionModel,
        browser: Browser,
    ) -> ActionResult:
        """Execute an action from an ActionModel.

        Args:
            action: Model carrying the action name and its parameters.
            browser: Browser instance injected into actions that request it.

        Returns:
            The ActionResult produced by the registered action.

        Raises:
            ValueError: If params are missing or the action name is unknown.
            RuntimeError: If the action itself raised an exception.
        """
        action_name = action.name
        params = action.params

        if params is None:
            # FIX: original raised a plain string containing '{action_name}'
            # without an f-prefix, so the name was never interpolated.
            raise ValueError(f'Params are not provided for action: {action_name}')

        with Laminar.start_as_current_span(
            name=action_name,
            input={
                'action': action_name,
                'params': params,
            },
            span_type='TOOL',
        ):
            logger.info(f'Executing action: {action_name} with params: {params}')
            # Renamed local (was `action`) to avoid shadowing the ActionModel argument.
            registered = self._actions.get(action_name)

            if registered is None:
                raise ValueError(f'Action {action_name} not found')

            try:
                kwargs = params.copy()

                # Add browser to kwargs if the action asked for it.
                if registered.browser_context and browser is not None:
                    kwargs['browser'] = browser

                result = await registered.function(**kwargs)

                Laminar.set_span_output(result)
                return result

            except Exception as e:
                raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e

    def get_action_descriptions(self) -> str:
        """Return a JSON description (one object per action) of all registered actions."""
        action_info = []

        for name, action in self._actions.items():
            sig = inspect.signature(action.function)
            type_hints = get_type_hints(action.function)

            # Parse the docstring once and reuse it for both the parameter
            # descriptions and the short description (was parsed twice).
            docstring = inspect.getdoc(action.function)
            parsed_docstring = parse(docstring) if docstring else None

            param_descriptions = {}
            if parsed_docstring:
                for param in parsed_docstring.params:
                    param_descriptions[param.arg_name] = param.description

            # Build parameter info
            params = {}
            for param_name in sig.parameters.keys():
                if param_name == 'browser':  # Skip browser parameter in descriptions
                    continue

                hint = type_hints.get(param_name, Any)
                # FIX: `typing.Any` (and other typing constructs) may lack
                # `__name__`; fall back to str() instead of raising.
                param_type = getattr(hint, '__name__', str(hint))

                params[param_name] = {
                    'type': param_type,
                    'description': param_descriptions.get(param_name, '')
                }

            # Use short description from docstring when available.
            description = action.description
            if parsed_docstring and parsed_docstring.short_description:
                description = parsed_docstring.short_description

            action_info.append(json.dumps({
                'name': name,
                'description': description,
                'parameters': params
            }, indent=2))

        return '\n\n'.join(action_info)
    @controller.action()
    async def done(output: str) -> ActionResult:
        """Use this action when you have completed the task.

        Args:
            output: Output of the task.
        """
        # NOTE: the docstring above doubles as the LLM-facing action
        # description (see Controller.action), so keep its wording stable.
        return ActionResult(is_done=True, content=output)

    @controller.action()
    async def done_with_structured_output(output: Dict[str, Any]) -> ActionResult:
        """Use this action ONLY when you are provided with a structured output model. Otherwise, use simple `done` action.

        Args:
            output: JSON object that adheres to the provided output model.
        """
        # The structured payload is passed through unchanged as the result content.
        return ActionResult(is_done=True, content=output)


    @controller.action()
    async def give_human_control(message: str, browser: Browser) -> ActionResult:
        """Give human control of the browser. Use this action when you need to use user information, such as first name, last name, email, phone number, booking information, login/password, etc. to proceed with the task. Also, if you can't solve the CAPTCHA, use this action.

        Args:
            message: Message to give to the human, explaining why you need human intervention.
        """
        # is_done=True ends the agent run; give_control=True tells the caller
        # (e.g. the CLI loop) to hand the browser session over to the user.
        return ActionResult(give_control=True, content=message, is_done=True)
51 | """ 52 | page = await browser.get_current_page() 53 | await page.goto(f'https://www.google.com/search?q={query}&udm=14', wait_until='domcontentloaded') 54 | await asyncio.sleep(1) 55 | msg = f"Searched for '{query}' in Google" 56 | logger.info(msg) 57 | return ActionResult(content=msg) 58 | 59 | @controller.action() 60 | @retry( 61 | stop=stop_after_attempt(3), 62 | wait=wait_exponential(multiplier=1, min=1, max=10), 63 | reraise=True, 64 | before_sleep=lambda retry_state: logger.warning( 65 | f"Retrying step after error: {retry_state.outcome.exception()}. Attempt {retry_state.attempt_number}" 66 | ) 67 | ) 68 | async def go_to_url(url: str, browser: Browser): 69 | """Navigate to URL in the current tab""" 70 | page = await browser.get_current_page() 71 | await page.goto(url, wait_until='domcontentloaded') 72 | await asyncio.sleep(1.5) 73 | msg = f"Navigated to {url}" 74 | logger.info(msg) 75 | return ActionResult(content=msg) 76 | 77 | @controller.action() 78 | async def go_back_to_previous_page(browser: Browser): 79 | """Go back to the previous page""" 80 | try: 81 | page = await browser.get_current_page() 82 | await page.go_back(wait_until='domcontentloaded') 83 | 84 | await asyncio.sleep(2) 85 | msg = 'Navigated back to the previous page' 86 | logger.info(msg) 87 | return ActionResult(content=msg) 88 | 89 | except Exception as e: 90 | logger.debug(f'During go_back: {e}') 91 | return ActionResult(error=str(e)) 92 | 93 | @controller.action() 94 | async def click_on_spreadsheet_cell(row: str, column: str, browser: Browser) -> ActionResult: 95 | """Click on a spreadsheet cell at a specific row and column. You HAVE to use this action when you need to click on a cell in a spreadsheet. DON'T try to use click_element action, it will not work. 96 | 97 | Args: 98 | row: Row of the cell to click on, it should be a number formatted as a string. e.g. "1" 99 | column: Column of the cell to click on, it should be a letter formatted as a string. e.g. 
"A" 100 | """ 101 | page = await browser.get_current_page() 102 | state = browser.get_state() 103 | 104 | elements = state.interactive_elements.values() 105 | 106 | row_element = next((e for e in elements if e.browser_agent_id == f"row_{row}"), None) 107 | column_element = next((e for e in elements if e.browser_agent_id == f"column_{column}"), None) 108 | 109 | if not row_element or not column_element: 110 | return ActionResult(error='Row or column element not found - pay close attention to the row and column numbers.') 111 | 112 | # reseting click just in case 113 | await page.mouse.click(state.viewport.width / 2, state.viewport.height / 2) 114 | await asyncio.sleep(0.05) 115 | 116 | await page.mouse.click(column_element.center.x, row_element.center.y, click_count=2) 117 | await asyncio.sleep(0.05) 118 | 119 | return ActionResult(content=f'Clicked on spreadsheet cell with row {row} and column {column}') 120 | 121 | 122 | @controller.action() 123 | async def click_element(index: int, browser: Browser, wait_after_click: bool = False): 124 | """ 125 | Click on the element with index. 126 | 127 | Args: 128 | index: Index of the element to click on. 129 | wait_after_click: If True, wait for 2 second after clicking the element. Only set it to True when you think that clicking will trigger loading state, for instance navigation to new page, search, loading of a content, etc. 130 | """ 131 | # clean index if it contains any non-numeric characters 132 | cleaned_index_str = re.sub(r'\D', '', str(index)) 133 | if cleaned_index_str == '': 134 | logger.error(f'Index is not a number. 
    @controller.action(
        description='Use this action to wait for the page to load, if you see that the content on the clean screenshot is empty or loading UI elements such as skeleton screens. This action will wait for page to load. Then you can continue with your actions.',
    )
    async def wait_for_page_to_load() -> ActionResult:
        # NOTE(review): the body performs no explicit wait; presumably the
        # delay comes from the agent loop re-capturing browser state before
        # the next step — confirm against the agent implementation.
        return ActionResult(content='Waited for page to load')
    # Tab Management Actions
    @controller.action('Switch tab')
    async def switch_tab(page_id: int, browser: Browser) -> ActionResult:
        # Switch focus to an already-open tab by its id, then give the page a
        # brief moment to become active before the next state capture.
        await browser.switch_to_tab(page_id)
        await asyncio.sleep(0.5)
        msg = f'Switched to tab {page_id}'
        logger.info(msg)
        return ActionResult(content=msg)

    @controller.action('Open url in new tab')
    async def open_tab(url: str, browser: Browser) -> ActionResult:
        # Open a fresh tab at the given URL; the browser implementation is
        # responsible for making the new tab current.
        await browser.create_new_tab(url)
        msg = f'Opened new tab with {url}'
        logger.info(msg)
        return ActionResult(content=msg)
222 | ) 223 | async def scroll_page_down(browser: Browser): 224 | page = await browser.get_current_page() 225 | state = browser.get_state() 226 | # move mouse to the center of the page 227 | await page.mouse.move(state.viewport.width / 2, state.viewport.height / 2) 228 | await asyncio.sleep(0.1) 229 | # scroll down by one page 230 | await page.mouse.wheel(0, state.viewport.height * 0.8) 231 | return ActionResult(content="Scrolled mouse wheel down (it doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 232 | 233 | 234 | @controller.action( 235 | "Scrolls entire page up. Use this action when you want to scroll the entire page up. Don't use this action if you want to scroll over a specific scrollable area on a page." 236 | ) 237 | async def scroll_page_up(browser: Browser): 238 | page = await browser.get_current_page() 239 | state = browser.get_state() 240 | # move mouse to the center of the page 241 | await page.mouse.move(state.viewport.width / 2, state.viewport.height / 2) 242 | await asyncio.sleep(0.1) 243 | # scroll up by one page 244 | await page.mouse.wheel(0, -state.viewport.height * 0.8) 245 | return ActionResult(content="Scrolled mouse wheel up (it doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 246 | 247 | @controller.action( 248 | "Moves mouse to the element with index `index`, located inside scrollable area of the webpage, identified by scrollbars. Then scrolls mouse wheel down." 
249 | ) 250 | async def scroll_down_over_element(index: int, browser: Browser): 251 | page = await browser.get_current_page() 252 | state = browser.get_state() 253 | 254 | if index not in state.interactive_elements: 255 | return ActionResult(error=f'Element index {index} does not exist - retry or use alternative actions') 256 | 257 | element = state.interactive_elements[index] 258 | 259 | await page.mouse.move(element.center.x, element.center.y) 260 | await asyncio.sleep(0.1) 261 | await page.mouse.wheel(0, state.viewport.height / 3) 262 | 263 | return ActionResult(content=f"Move mouse to element with index {index} and scroll mouse wheel down. (It doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 264 | 265 | @controller.action( 266 | "Moves mouse to the element with index `index`, located inside scrollable area of the webpage, identified by scrollbars. Then scrolls mouse wheel up." 267 | ) 268 | async def scroll_up_over_element(index: int, browser: Browser): 269 | page = await browser.get_current_page() 270 | state = browser.get_state() 271 | 272 | if index not in state.interactive_elements: 273 | return ActionResult(error=f'Element index {index} does not exist - retry or use alternative actions') 274 | 275 | element = state.interactive_elements[index] 276 | 277 | await page.mouse.move(element.center.x, element.center.y) 278 | await asyncio.sleep(0.1) 279 | await page.mouse.wheel(0, -state.viewport.height / 3) 280 | 281 | return ActionResult(content=f"Move mouse to element with index {index} and scroll mouse wheel up. (It doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 282 | 283 | @controller.action( 284 | "Moves mouse at the location of the element with index `index`, which should be inside scrollable area of the webpage, identified by scrollbars. Then scrolls mouse wheel horizontally to the right." 
285 | ) 286 | async def scroll_right_over_element(index: int, browser: Browser): 287 | page = await browser.get_current_page() 288 | state = browser.get_state() 289 | 290 | if index not in state.interactive_elements: 291 | return ActionResult(error=f'Element index {index} does not exist - retry or use an alternative action') 292 | 293 | element = state.interactive_elements[index] 294 | 295 | await page.mouse.move(element.center.x, element.center.y) 296 | await asyncio.sleep(0.1) 297 | await page.mouse.wheel(state.viewport.width / 3, 0) 298 | 299 | return ActionResult(content=f"Moved mouse to element with index {index} and scroll mouse wheel horizontally to the right. (It doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 300 | 301 | 302 | @controller.action( 303 | "Moves mouse at the location of the element with index `index`, which should be inside scrollable area of the webpage, identified by scrollbars. Then scrolls mouse wheel horizontally to the left." 304 | ) 305 | async def scroll_left_over_element(index: int, browser: Browser): 306 | page = await browser.get_current_page() 307 | state = browser.get_state() 308 | 309 | if index not in state.interactive_elements: 310 | return ActionResult(error=f'Element index {index} does not exist - retry or use an alternative action') 311 | 312 | element = state.interactive_elements[index] 313 | 314 | await page.mouse.move(element.center.x, element.center.y) 315 | await asyncio.sleep(0.1) 316 | await page.mouse.wheel(-state.viewport.width / 3, 0) 317 | 318 | return ActionResult(content=f"Moved mouse to element with index {index} and scroll mouse wheel horizontally to the left. (It doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 319 | 320 | 321 | @controller.action( 322 | 'Press enter key. Use this action when you need to submit a form or perform an action that requires pressing enter.' 
323 | ) 324 | async def press_enter(browser: Browser): 325 | page = await browser.get_current_page() 326 | 327 | await page.keyboard.press('Enter') 328 | return ActionResult(content='Pressed enter key') 329 | 330 | @controller.action( 331 | 'Remove all text in the element with index.' 332 | ) 333 | async def clear_text_in_element(index: int, browser: Browser): 334 | page = await browser.get_current_page() 335 | 336 | state = browser.get_state() 337 | 338 | if index not in state.interactive_elements: 339 | return ActionResult(error=f'Element index {index} does not exist - retry or use alternative actions') 340 | 341 | element = state.interactive_elements[index] 342 | 343 | await page.mouse.move(element.center.x, element.center.y) 344 | await page.mouse.click(element.center.x, element.center.y) 345 | await asyncio.sleep(0.1) 346 | 347 | if platform.system() == "Darwin": 348 | await page.keyboard.press('Meta+A') 349 | else: 350 | await page.keyboard.press('Control+A') 351 | await asyncio.sleep(0.1) 352 | await page.keyboard.press('Backspace') 353 | return ActionResult(content='Removed all text in the element with index') 354 | 355 | @controller.action() 356 | async def get_select_options(index: int, browser: Browser) -> ActionResult: 357 | """Get all options from a element by the text (name) of the option. 
Use this after get_select_options and when you need to select an option from a dropdown.', 412 | ) 413 | async def select_dropdown_option( 414 | index: int, 415 | option: str, 416 | browser: Browser, 417 | ) -> ActionResult: 418 | """Select dropdown option by the text of the option you want to select""" 419 | try: 420 | # Get the interactive element 421 | page = await browser.get_current_page() 422 | interactive_elements = browser.get_state().interactive_elements 423 | 424 | # Verify the element exists and is a select 425 | if index not in interactive_elements: 426 | return ActionResult(error=f"No element found with index {index}") 427 | 428 | element = interactive_elements[index] 429 | 430 | # Check if it's a select element 431 | if element.tag_name.lower() != 'select': 432 | return ActionResult(error=f"Element {index} is not a select element, it's a {element.tag_name}") 433 | 434 | logger.debug(f"Attempting to select '{option}' using browser_agent_id: {element.browser_agent_id}") 435 | 436 | # Use JavaScript to select the option using the unique ID 437 | result = await page.evaluate(""" 438 | (args) => { 439 | const uniqueId = args.uniqueId; 440 | const optionText = args.optionText; 441 | 442 | try { 443 | // Find the select element by unique ID - works across frames too 444 | function findElementByUniqueId(root, id) { 445 | // Check in main document first 446 | let element = document.querySelector(`[data-browser-agent-id="${id}"]`); 447 | if (element) return element; 448 | } 449 | 450 | const select = findElementByUniqueId(window, uniqueId); 451 | if (!select) { 452 | return { 453 | success: false, 454 | error: "Select element not found with ID: " + uniqueId 455 | }; 456 | } 457 | 458 | // Find the option with matching text 459 | let found = false; 460 | let selectedValue = null; 461 | let selectedIndex = -1; 462 | 463 | for (let i = 0; i < select.options.length; i++) { 464 | const opt = select.options[i]; 465 | if (opt.text === optionText) { 466 | // Select 
this option 467 | opt.selected = true; 468 | found = true; 469 | selectedValue = opt.value; 470 | selectedIndex = i; 471 | 472 | // Trigger change event 473 | const event = new Event('change', { bubbles: true }); 474 | select.dispatchEvent(event); 475 | break; 476 | } 477 | } 478 | 479 | if (found) { 480 | return { 481 | success: true, 482 | value: selectedValue, 483 | index: selectedIndex 484 | }; 485 | } else { 486 | return { 487 | success: false, 488 | error: "Option not found: " + optionText, 489 | availableOptions: Array.from(select.options).map(o => o.text) 490 | }; 491 | } 492 | } catch (e) { 493 | return { 494 | success: false, 495 | error: e.toString() 496 | }; 497 | } 498 | } 499 | """, {"uniqueId": element.browser_agent_id, "optionText": option}) 500 | 501 | if result.get('success'): 502 | msg = f"Selected option '{option}' with value '{result.get('value')}' at index {result.get('index')}" 503 | logger.info(msg) 504 | return ActionResult(content=msg) 505 | else: 506 | error_msg = result.get('error', 'Unknown error') 507 | if 'availableOptions' in result: 508 | available = result.get('availableOptions', []) 509 | error_msg += f". 
Available options: {', '.join(available)}" 510 | 511 | logger.error(f"Selection failed: {error_msg}") 512 | return ActionResult(error=error_msg) 513 | 514 | except Exception as e: 515 | msg = f'Selection failed: {str(e)}' 516 | logger.error(msg) 517 | return ActionResult(error=msg) 518 | -------------------------------------------------------------------------------- /index/llm/llm.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | from typing import Any, Dict, List, Optional, Union 5 | 6 | from pydantic import BaseModel 7 | 8 | 9 | class MessageRole(Enum): 10 | SYSTEM = "system" 11 | USER = "user" 12 | ASSISTANT = "assistant" 13 | TOOL = "tool" # For OpenAI function calling responses 14 | 15 | @dataclass 16 | class MessageContent: 17 | """Base class for message content""" 18 | cache_control: Optional[bool] = None 19 | 20 | @dataclass 21 | class TextContent(MessageContent): 22 | """Text content in a message""" 23 | text: str = "" 24 | type: str = "text" 25 | 26 | @dataclass 27 | class ImageContent(MessageContent): 28 | """Image content in a message""" 29 | image_b64: Optional[str] = None 30 | image_url: Optional[str] = None 31 | type: str = "image" 32 | 33 | @dataclass 34 | class ThinkingBlock(MessageContent): 35 | """Thinking block in a message""" 36 | thinking: str = "" 37 | signature: str = "" 38 | type: str = "thinking" 39 | 40 | @dataclass 41 | class Message: 42 | """A message in a conversation""" 43 | role: Union[str, MessageRole] 44 | content: Union[str, List[Union[TextContent, ImageContent, ThinkingBlock]]] 45 | name: Optional[str] = None # For tool/function messages 46 | tool_call_id: Optional[str] = None # For tool/function responses 47 | is_state_message: Optional[bool] = False 48 | 49 | def __post_init__(self): 50 | # Convert role enum to string if needed 51 | if isinstance(self.role, MessageRole): 52 | self.role 
= self.role.value 53 | 54 | # Convert string content to TextContent if needed 55 | if isinstance(self.content, str): 56 | self.content = [TextContent(text=self.content)] 57 | elif isinstance(self.content, (TextContent, ImageContent)): 58 | self.content = [self.content] 59 | 60 | def to_openai_format(self) -> Dict: 61 | """Convert to OpenAI message format""" 62 | message = {"role": self.role} 63 | 64 | if isinstance(self.content, str): 65 | message["content"] = self.content 66 | 67 | elif isinstance(self.content, list): 68 | 69 | content_blocks = [] 70 | 71 | for content_block in self.content: 72 | 73 | block = {} 74 | 75 | if isinstance(content_block, TextContent): 76 | block["type"] = "text" 77 | block["text"] = content_block.text 78 | elif isinstance(content_block, ImageContent): 79 | block["type"] = "image_url" 80 | block["image_url"] = { 81 | "url": "data:image/png;base64," + content_block.image_b64 82 | } 83 | 84 | content_blocks.append(block) 85 | 86 | message["content"] = content_blocks 87 | 88 | return message 89 | 90 | def to_groq_format(self) -> Dict: 91 | """Convert to Groq message format""" 92 | message = {"role": self.role} 93 | 94 | if isinstance(self.content, str): 95 | message["content"] = self.content 96 | 97 | elif isinstance(self.content, list): 98 | 99 | content_blocks = [] 100 | 101 | # content of a system and assistant messages in groq can only contain text 102 | if self.role == "system" or self.role == "assistant": 103 | block = self.content[0] 104 | if isinstance(block, TextContent): 105 | message["content"] = block.text 106 | 107 | return message 108 | 109 | for content_block in self.content: 110 | 111 | block = {} 112 | 113 | if isinstance(content_block, TextContent): 114 | block["type"] = "text" 115 | block["text"] = content_block.text 116 | elif isinstance(content_block, ImageContent): 117 | block["type"] = "image_url" 118 | block["image_url"] = { 119 | "url": "data:image/png;base64," + content_block.image_b64 120 | } 121 | 122 | 
content_blocks.append(block) 123 | 124 | message["content"] = content_blocks 125 | 126 | return message 127 | 128 | def to_anthropic_format(self, enable_cache_control: bool = True) -> Dict: 129 | """Convert to Anthropic message format""" 130 | message = {"role": self.role} 131 | 132 | if isinstance(self.content, str): 133 | message["content"] = self.content 134 | 135 | elif isinstance(self.content, list): 136 | 137 | content_blocks = [] 138 | 139 | for content_block in self.content: 140 | 141 | block = {} 142 | 143 | 144 | if isinstance(content_block, TextContent): 145 | block["type"] = "text" 146 | block["text"] = content_block.text 147 | elif isinstance(content_block, ImageContent): 148 | block["type"] = "image" 149 | block["source"] = { 150 | "type": "base64", 151 | "media_type": "image/png", # This should be configurable based on image type 152 | "data": content_block.image_b64 if content_block.image_b64 else content_block.image_url 153 | } 154 | elif isinstance(content_block, ThinkingBlock): 155 | block["type"] = "thinking" 156 | block["thinking"] = content_block.thinking 157 | block["signature"] = content_block.signature 158 | 159 | if content_block.cache_control and enable_cache_control: 160 | block["cache_control"] = {"type": "ephemeral"} 161 | 162 | content_blocks.append(block) 163 | 164 | message["content"] = content_blocks 165 | 166 | return message 167 | 168 | def to_gemini_format(self) -> Dict: 169 | """Convert to Gemini message format""" 170 | parts = [] 171 | 172 | if isinstance(self.content, str): 173 | parts = [{"text": self.content}] 174 | elif isinstance(self.content, list): 175 | for content_block in self.content: 176 | if isinstance(content_block, TextContent): 177 | parts.append({"text": content_block.text}) 178 | elif isinstance(content_block, ImageContent): 179 | if content_block.image_b64: 180 | parts.append({"inline_data": { 181 | "mime_type": "image/png", 182 | "data": content_block.image_b64 183 | }}) 184 | elif content_block.image_url: 
185 | parts.append({"file_data": { 186 | "mime_type": "image/png", 187 | "file_uri": content_block.image_url 188 | }}) 189 | 190 | return { 191 | "role": 'model' if self.role == 'assistant' else 'user', 192 | "parts": parts 193 | } 194 | 195 | def remove_cache_control(self): 196 | if isinstance(self.content, list): 197 | for content_block in self.content: 198 | if isinstance(content_block, TextContent): 199 | content_block.cache_control = None 200 | elif isinstance(content_block, ImageContent): 201 | content_block.cache_control = None 202 | 203 | def add_cache_control_to_state_message(self): 204 | 205 | if not self.is_state_message or not isinstance(self.content, list) or len(self.content) < 3: 206 | return 207 | 208 | if len(self.content) == 3: 209 | self.content[-1].cache_control = True 210 | 211 | def has_cache_control(self): 212 | 213 | if not isinstance(self.content, list): 214 | return False 215 | 216 | return any(content.cache_control for content in self.content) 217 | 218 | 219 | class LLMResponse(BaseModel): 220 | content: str 221 | raw_response: Any 222 | usage: Dict[str, Any] 223 | thinking: Optional[ThinkingBlock] = None 224 | 225 | 226 | class BaseLLMProvider(ABC): 227 | def __init__(self, model: str): 228 | self.model = model 229 | 230 | @abstractmethod 231 | async def call( 232 | self, 233 | messages: List[Message], 234 | temperature: float = 1, 235 | max_tokens: Optional[int] = None, 236 | **kwargs 237 | ) -> LLMResponse: 238 | pass 239 | -------------------------------------------------------------------------------- /index/llm/providers/__init__.py: -------------------------------------------------------------------------------- 1 | from .anthropic import AnthropicProvider 2 | from .anthropic_bedrock import AnthropicBedrockProvider 3 | from .gemini import GeminiProvider 4 | from .openai import OpenAIProvider 5 | 6 | __all__ = [ 7 | "OpenAIProvider", 8 | "AnthropicProvider", 9 | "AnthropicBedrockProvider", 10 | "GeminiProvider", 11 | ] 
    def __init__(self, model: str, enable_thinking: bool = True, thinking_token_budget: Optional[int] = 2048):
        """Anthropic provider with optional extended thinking and a Bedrock fallback.

        Args:
            model: Anthropic model name.
            enable_thinking: Whether to request extended-thinking blocks.
            thinking_token_budget: Token budget allotted to thinking.
        """
        super().__init__(model=model)
        # AsyncAnthropic() picks up credentials from the environment by default.
        self.client = AsyncAnthropic()
        self.thinking_token_budget = thinking_token_budget

        # Bedrock provider used as a fallback when the direct Anthropic call
        # fails (see call()). NOTE(review): the "us.anthropic.{model}-v1:0" id
        # presumably targets a US cross-region inference profile — confirm for
        # non-US deployments.
        self.anthropic_bedrock = AnthropicBedrockProvider(model=f"us.anthropic.{model}-v1:0", enable_thinking=enable_thinking, thinking_token_budget=thinking_token_budget)

        self.enable_thinking = enable_thinking
(attempt {details['tries']})" 30 | ) 31 | ) 32 | async def call( 33 | self, 34 | messages: List[Message], 35 | temperature: float = -1, 36 | max_tokens: Optional[int] = 16000, 37 | **kwargs 38 | ) -> LLMResponse: 39 | # Make a copy of messages to prevent modifying the original list during retries 40 | messages_copy = messages.copy() 41 | 42 | if not messages_copy: 43 | raise ValueError("Messages list cannot be empty.") 44 | 45 | conversation_messages_input: List[Message] = [] 46 | 47 | system = [] 48 | 49 | if messages_copy[0].role == "system": 50 | system = messages_copy[0].content[0].text 51 | conversation_messages_input = messages_copy[1:] 52 | else: 53 | conversation_messages_input = messages_copy 54 | 55 | anthropic_api_messages = [msg.to_anthropic_format() for msg in conversation_messages_input] 56 | 57 | if self.enable_thinking: 58 | 59 | try: 60 | response = await self.client.messages.create( 61 | model=self.model, 62 | system=system, 63 | messages=anthropic_api_messages, 64 | thinking={ 65 | "type": "enabled", 66 | "budget_tokens": self.thinking_token_budget, 67 | }, 68 | max_tokens=max(self.thinking_token_budget + 1, max_tokens), 69 | **kwargs 70 | ) 71 | except Exception as e: 72 | logger.error(f"Error calling Anthropic: {str(e)}") 73 | # Fallback to anthropic_bedrock with the original messages_copy 74 | response = await self.anthropic_bedrock.call( 75 | messages_copy, # Pass original messages_copy, bedrock provider has its own logic 76 | temperature=temperature, # Pass original temperature 77 | max_tokens=max_tokens, # Pass original max_tokens 78 | **kwargs 79 | ) 80 | 81 | return LLMResponse( 82 | content=response.content[1].text, 83 | raw_response=response, 84 | usage=response.usage.model_dump(), 85 | thinking=ThinkingBlock(thinking=response.content[0].thinking, signature=response.content[0].signature) 86 | ) 87 | else: # Not enable_thinking 88 | response = await self.client.messages.create( 89 | model=self.model, 90 | messages=anthropic_api_messages, 
91 | temperature=temperature, # Use adjusted temperature 92 | max_tokens=max_tokens, # Use adjusted max_tokens 93 | system=system, 94 | **kwargs 95 | ) 96 | 97 | return LLMResponse( 98 | content=response.content[0].text, 99 | raw_response=response, 100 | usage=response.usage.model_dump() 101 | ) -------------------------------------------------------------------------------- /index/llm/providers/anthropic_bedrock.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import List, Optional 4 | 5 | import backoff 6 | from anthropic import AsyncAnthropicBedrock 7 | from dotenv import load_dotenv 8 | 9 | from ..llm import BaseLLMProvider, LLMResponse, Message 10 | 11 | load_dotenv() 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class AnthropicBedrockProvider(BaseLLMProvider): 17 | def __init__(self, model: str, enable_thinking: bool = True, thinking_token_budget: Optional[int] = 8192): 18 | super().__init__(model=model) 19 | 20 | self.client = AsyncAnthropicBedrock( 21 | aws_access_key=os.getenv('AWS_ACCESS_KEY_ID'), 22 | aws_secret_key=os.getenv('AWS_SECRET_ACCESS_KEY'), 23 | aws_region=os.getenv('AWS_REGION'), 24 | ) 25 | self.enable_thinking = enable_thinking 26 | self.thinking_token_budget = thinking_token_budget 27 | @backoff.on_exception( # noqa: F821 28 | backoff.constant, # constant backoff 29 | Exception, # retry on any exception 30 | max_tries=3, # stop after 3 attempts 31 | interval=10, 32 | ) 33 | async def call( 34 | self, 35 | messages: List[Message], 36 | temperature: float = 1, 37 | max_tokens: Optional[int] = 2048, 38 | **kwargs 39 | ) -> LLMResponse: 40 | 41 | messages_copy = messages.copy() 42 | 43 | if len(messages_copy) < 2 or messages_copy[0].role != "system": 44 | raise ValueError("System message is required for Anthropic Bedrock and length of messages must be at least 2") 45 | 46 | system_message = messages_copy[0] 47 | 48 | try: 49 | if 
import logging
import os
from typing import List, Optional

import backoff
from anthropic import AsyncAnthropicBedrock
from dotenv import load_dotenv

from ..llm import BaseLLMProvider, LLMResponse, Message

load_dotenv()

logger = logging.getLogger(__name__)


class AnthropicBedrockProvider(BaseLLMProvider):
    """Anthropic provider backed by AWS Bedrock.

    Credentials and region are read from the environment (``.env`` is loaded
    at import time). Requires a leading system message.
    """

    def __init__(self, model: str, enable_thinking: bool = True, thinking_token_budget: Optional[int] = 8192):
        super().__init__(model=model)
        self.client = AsyncAnthropicBedrock(
            aws_access_key=os.getenv('AWS_ACCESS_KEY_ID'),
            aws_secret_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
            aws_region=os.getenv('AWS_REGION'),
        )
        self.enable_thinking = enable_thinking
        self.thinking_token_budget = thinking_token_budget

    @backoff.on_exception(
        backoff.constant,  # constant backoff
        Exception,         # retry on any exception
        max_tries=3,       # stop after 3 attempts
        interval=10,
    )
    async def call(
        self,
        messages: List[Message],
        temperature: float = 1,
        max_tokens: Optional[int] = 2048,
        **kwargs
    ) -> LLMResponse:
        """Send ``messages`` to Anthropic on Bedrock.

        Raises:
            ValueError: If the first message is not a system message or fewer
                than two messages are supplied.
        """
        messages_copy = messages.copy()

        if len(messages_copy) < 2 or messages_copy[0].role != "system":
            raise ValueError("System message is required for Anthropic Bedrock and length of messages must be at least 2")

        system_message = messages_copy[0]

        try:
            if self.enable_thinking:
                response = await self.client.messages.create(
                    model=self.model,
                    system=system_message.to_anthropic_format(enable_cache_control=False)["content"],
                    messages=[msg.to_anthropic_format(enable_cache_control=False) for msg in messages_copy[1:]],
                    temperature=1,  # the API mandates default temperature when thinking is on
                    thinking={
                        "type": "enabled",
                        "budget_tokens": self.thinking_token_budget,
                    },
                    # max_tokens must strictly exceed the thinking budget.
                    max_tokens=max(self.thinking_token_budget + 1, max_tokens),
                    **kwargs
                )

                return LLMResponse(
                    # content[0] is the thinking block; content[1] is the text.
                    content=response.content[1].text,
                    raw_response=response,
                    # BUGFIX: LLMResponse.usage is a plain dict; serialize the
                    # pydantic Usage object (consistent with AnthropicProvider).
                    usage=response.usage.model_dump()
                )
            else:
                response = await self.client.messages.create(
                    model=self.model,
                    messages=[msg.to_anthropic_format(enable_cache_control=False) for msg in messages_copy[1:]],
                    temperature=temperature,
                    max_tokens=max_tokens,
                    system=system_message.to_anthropic_format(enable_cache_control=False)["content"],
                    **kwargs
                )

                return LLMResponse(
                    content=response.content[0].text,
                    raw_response=response,
                    usage=response.usage.model_dump()
                )
        except Exception as e:
            logger.error(f"Error calling Anthropic Bedrock: {str(e)}")
            raise e
import logging
import os
from typing import List, Optional

import backoff
from google import genai

from ..llm import BaseLLMProvider, LLMResponse, Message

logger = logging.getLogger(__name__)


class GeminiProvider(BaseLLMProvider):
    """Google Gemini provider (API-key based) with a fixed thinking budget."""

    def __init__(self, model: str, thinking_token_budget: int = 8192):
        super().__init__(model=model)
        # API key is read from the environment.
        self.client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
        self.thinking_token_budget = thinking_token_budget

    @backoff.on_exception(
        backoff.constant,  # constant backoff
        Exception,         # retry on any exception
        max_tries=3,       # stop after 3 attempts
        interval=0.5,
        on_backoff=lambda details: logger.info(
            f"API error, retrying in {details['wait']:.2f} seconds... (attempt {details['tries']})"
        ),
    )
    async def call(
        self,
        messages: List[Message],
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        **kwargs
    ) -> LLMResponse:
        """Generate content with Gemini and normalize it into an LLMResponse.

        A leading system message is lifted into ``system_instruction``; the
        remaining messages are converted to Gemini's content format.
        """
        if len(messages) == 0:
            raise ValueError("Messages must be non-empty")

        generation_config = {
            "temperature": temperature,
            "thinking_config": {
                "thinking_budget": self.thinking_token_budget
            },
        }

        has_system = messages[0].role == "system"
        if has_system:
            generation_config["system_instruction"] = {
                "text": messages[0].content[0].text
            }
        conversation = messages[1:] if has_system else messages
        gemini_messages = [m.to_gemini_format() for m in conversation]

        if max_tokens:
            generation_config["max_output_tokens"] = max_tokens

        response = await self.client.aio.models.generate_content(
            model=self.model,
            contents=gemini_messages,
            config=generation_config,
        )

        # Pull token accounting out of the response when the SDK provides it.
        usage = {}
        if hasattr(response, "usage_metadata"):
            meta = response.usage_metadata
            usage = {
                "prompt_tokens": getattr(meta, "prompt_token_count", 0),
                "completion_tokens": getattr(meta, "candidates_token_count", 0),
                "total_tokens": getattr(meta, "total_token_count", 0),
            }

        return LLMResponse(
            content=response.text,
            raw_response=response,
            usage=usage,
        )
import logging
from typing import List, Optional

import backoff
from google import genai

from ..llm import BaseLLMProvider, LLMResponse, Message

logger = logging.getLogger(__name__)


class GeminiVertexProvider(BaseLLMProvider):
    """Gemini provider routed through Vertex AI, scoped to a GCP project/location."""

    def __init__(self, model: str, project: Optional[str] = None, location: Optional[str] = None):
        super().__init__(model=model)
        self.client = genai.Client(
            vertexai=True,
            project=project,
            location=location)

    @backoff.on_exception(
        backoff.constant,  # constant backoff
        Exception,         # retry on any exception
        max_tries=3,       # stop after 3 attempts
        interval=0.5,
        on_backoff=lambda details: logger.info(
            f"API error, retrying in {details['wait']:.2f} seconds... (attempt {details['tries']})"
        ),
    )
    async def call(
        self,
        messages: List[Message],
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        **kwargs
    ) -> LLMResponse:
        """Generate content via Vertex AI Gemini and return an LLMResponse.

        CONSISTENCY FIX: now accepts ``**kwargs`` like ``BaseLLMProvider.call``
        and the other providers; previously any extra keyword argument raised
        TypeError. Extra kwargs are accepted but not forwarded, matching
        GeminiProvider's behavior.

        Raises:
            ValueError: If ``messages`` is empty.
        """
        if len(messages) == 0:
            raise ValueError("Messages must be non-empty")

        config = {
            "temperature": temperature,
        }

        # A leading system message becomes the system_instruction.
        if messages[0].role == "system":
            system = messages[0].content[0].text
            gemini_messages = [msg.to_gemini_format() for msg in messages[1:]]
            config["system_instruction"] = {
                "text": system
            }
        else:
            gemini_messages = [msg.to_gemini_format() for msg in messages]

        if max_tokens:
            config["max_output_tokens"] = max_tokens

        response = await self.client.aio.models.generate_content(
            model=self.model,
            contents=gemini_messages,
            config=config,
        )

        # Extract usage information if available.
        usage = {}
        if hasattr(response, "usage_metadata"):
            usage = {
                "prompt_tokens": getattr(response.usage_metadata, "prompt_token_count", 0),
                "completion_tokens": getattr(response.usage_metadata, "candidates_token_count", 0),
                "total_tokens": getattr(response.usage_metadata, "total_token_count", 0)
            }

        return LLMResponse(
            content=response.text,
            raw_response=response,
            usage=usage
        )
usage=usage 75 | ) -------------------------------------------------------------------------------- /index/llm/providers/groq.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Optional 3 | 4 | import backoff 5 | from groq import AsyncGroq # Assuming AsyncGroq for asynchronous operations 6 | 7 | from ..llm import BaseLLMProvider, LLMResponse, Message 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class GroqProvider(BaseLLMProvider): 12 | """ 13 | A provider for interacting with the Groq API. 14 | """ 15 | def __init__(self, model: str): 16 | """ 17 | Initializes the GroqProvider. 18 | 19 | Args: 20 | model: The model name to use (e.g., "llama-3.3-70b-versatile"). 21 | """ 22 | super().__init__(model=model) 23 | # The Groq client, by default, should pick up the GROQ_API_KEY 24 | # from environment variables if not explicitly passed. 25 | # Ref: https://console.groq.com/docs/libraries 26 | # client = Groq(api_key=os.environ.get("GROQ_API_KEY")) where api_key param is optional. 27 | self.client = AsyncGroq() 28 | 29 | @backoff.on_exception( 30 | backoff.constant, 31 | Exception, # Retry on any exception. Consider refining with specific Groq API errors if known. 32 | max_tries=3, 33 | interval=0.5, 34 | ) 35 | async def call( 36 | self, 37 | messages: List[Message], 38 | temperature: float = 1.0, 39 | max_tokens: Optional[int] = None, 40 | ) -> LLMResponse: 41 | """ 42 | Makes an asynchronous call to the Groq API. 43 | 44 | Args: 45 | messages: A list of Message objects representing the conversation history. 46 | temperature: The sampling temperature to use. Groq converts 0 to 1e-8. 47 | Values should ideally be > 0 and <= 2. 48 | max_tokens: The maximum number of tokens to generate. 49 | 50 | Returns: 51 | An LLMResponse object containing the model's response and usage data. 52 | 53 | Raises: 54 | ValueError: If the messages list is empty or the API response is invalid. 
55 | """ 56 | if not messages: 57 | raise ValueError("Messages list cannot be empty.") 58 | 59 | # Format messages to be compatible with Groq's API (OpenAI format) 60 | formatted_messages = [msg.to_groq_format() for msg in messages] 61 | 62 | if formatted_messages[0]["role"] == "system": 63 | # remove couple of examples from first user message because llama4 model supports only 5 images. 64 | # TODO: remove this once we have a model that supports more images. 65 | formatted_messages[1]["content"] = formatted_messages[1]["content"][0:4] + formatted_messages[1]["content"][12:] 66 | 67 | 68 | api_params = { 69 | "model": self.model, 70 | "messages": formatted_messages, 71 | "temperature": temperature, 72 | } 73 | 74 | if max_tokens is not None: 75 | api_params["max_tokens"] = max_tokens 76 | 77 | # Groq API notes: 78 | # - 'N' (number of choices) must be 1 if supplied. Defaults to 1. 79 | # - Unsupported OpenAI fields (will result in 400 error if supplied): 80 | # logprobs, logit_bias, top_logprobs, messages[].name 81 | 82 | response = await self.client.chat.completions.create(**api_params) 83 | 84 | if not response.choices or not response.choices[0].message: 85 | logger.error(f"Groq API response missing choices or message: {response}") 86 | raise ValueError("Invalid response structure from Groq API") 87 | 88 | content = response.choices[0].message.content 89 | # Handle cases where content might be None (e.g., if finish_reason indicates tool use in the future) 90 | if content is None: 91 | content = "" 92 | 93 | usage_data = {} 94 | # Attempt to extract usage data, assuming an OpenAI-compatible structure. 95 | # The Groq Python SDK might provide usage data in `response.usage`. 
from typing import List, Optional

from openai import AsyncOpenAI

from ..llm import BaseLLMProvider, LLMResponse, Message


class OpenAIProvider(BaseLLMProvider):
    """OpenAI chat-completions provider with optional reasoning effort for o-series models."""

    def __init__(self, model: str, reasoning_effort: Optional[str] = "low"):
        super().__init__(model=model)
        self.client = AsyncOpenAI()
        self.reasoning_effort = reasoning_effort

    async def call(
        self,
        messages: List[Message],
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        **kwargs
    ) -> LLMResponse:
        """Call the OpenAI chat completions API and wrap the result.

        CONSISTENCY FIX: ``BaseLLMProvider.call`` declares ``max_tokens`` and
        ``**kwargs``; this override previously omitted both, so callers using
        the common interface raised TypeError and any token cap was ignored.

        Args:
            messages: Conversation history, converted to OpenAI's format.
            temperature: Sampling temperature; forced to 1 for reasoning
                ("o"-prefixed) models, which only accept the default.
            max_tokens: Optional completion-token cap, forwarded when set.
        """
        args = {
            "temperature": temperature,
            **kwargs,
        }

        if max_tokens is not None:
            args["max_tokens"] = max_tokens

        if self.model.startswith("o") and self.reasoning_effort:
            args["reasoning_effort"] = self.reasoning_effort
            args["temperature"] = 1  # reasoning models reject non-default temperature

        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[msg.to_openai_format() for msg in messages],
            **args
        )

        return LLMResponse(
            content=response.choices[0].message.content,
            raw_response=response,
            usage={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "total_tokens": response.usage.total_tokens
            }
        )
["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [tool.hatch.metadata] 6 | allow-direct-references = true 7 | 8 | [tool.hatch.build.targets.wheel] 9 | packages = ["index"] 10 | 11 | [project] 12 | name = "lmnr-index" 13 | version = "0.1.13" 14 | description = "Index - SOTA browser AI agent for autonomous task execution on the web" 15 | readme = "README.md" 16 | requires-python = ">=3.10" 17 | 18 | dependencies = [ 19 | "anthropic[bedrock]>=0.52.0", 20 | "backoff>=2.2.1", 21 | "lmnr[anthropic,openai,groq]>=0.6.2", 22 | "openai>=1.65.2", 23 | "playwright>=1.50.0", 24 | "tenacity>=9.0.0", 25 | "pillow>=11.1.0", 26 | "rich>=13.5.0", 27 | "textual>=0.50.1", 28 | "typer>=0.9.0", 29 | "google-genai>=1.11.0", 30 | "docstring-parser>=0.16", 31 | "groq>=0.24.0", 32 | ] 33 | 34 | [project.scripts] 35 | index = "index.cli:main" 36 | 37 | [tool.uv] 38 | dev-dependencies = [ 39 | "pytest>=8.3.3", 40 | "pytest-asyncio" 41 | ] 42 | 43 | [project.license] 44 | file = "LICENSE" 45 | 46 | [tool.pytest.ini_options] 47 | asyncio_mode = "auto" 48 | testpaths = ["tests"] 49 | python_files = ["test_*.py"] 50 | addopts = "-v -ra -q" 51 | -------------------------------------------------------------------------------- /static/logo_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/static/logo_dark.png -------------------------------------------------------------------------------- /static/logo_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/static/logo_light.png -------------------------------------------------------------------------------- /static/traces.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/static/traces.png -------------------------------------------------------------------------------- /tests/agent/test_utils.py: -------------------------------------------------------------------------------- 1 | # Test cases for agent utility functions 2 | 3 | import pytest 4 | 5 | from index.agent.models import ( # Assuming ActionModel is part of AgentLLMOutput 6 | ActionModel, 7 | AgentLLMOutput, 8 | ) 9 | from index.agent.utils import generate_proper_json, validate_json 10 | from index.llm.llm import ( # Assuming LLMResponse is the type returned by llm.call 11 | BaseLLMProvider, 12 | LLMResponse, 13 | Message, 14 | ) 15 | 16 | 17 | # Mock LLM Provider 18 | class MockLLMProvider(BaseLLMProvider): 19 | def __init__(self, responses=None, call_should_fail=False, exception_to_raise=None): 20 | self.responses = responses if responses is not None else [] 21 | self.call_history = [] 22 | self.call_should_fail = call_should_fail 23 | self.exception_to_raise = exception_to_raise if exception_to_raise else Exception("LLM call failed") 24 | 25 | async def call(self, messages: list[Message]) -> LLMResponse: 26 | self.call_history.append(messages) 27 | if self.call_should_fail: 28 | raise self.exception_to_raise 29 | if self.responses: 30 | response_content = self.responses.pop(0) 31 | # Simulate LLMResponse structure; adjust if it's different 32 | return LLMResponse(content=response_content, thinking=None, raw_response=None, cost=None, usage={"prompt_tokens": 10, "completion_tokens": 10}) 33 | return LLMResponse(content="", thinking=None, raw_response=None, cost=None, usage={"prompt_tokens": 0, "completion_tokens": 0}) # Default empty response 34 | 35 | def get_token_limit(self) -> int: 36 | return 4096 # Dummy value 37 | 38 | def count_tokens(self, text: str) -> int: 39 | return len(text.split()) # Dummy value 40 | 41 | # --- Tests for validate_json --- 42 | 43 | @pytest.mark.asyncio 44 
async def test_validate_json_valid_with_output_tags():
    # NOTE(review): the name mentions output tags but the fixture is plain
    # JSON — the tags were presumably stripped from the fixture; confirm.
    raw_response = "{\"action\": {\"name\": \"click\", \"params\": {\"selector\": \".btn\"}}, \"thought\": \"Thinking...\", \"summary\": \"Clicked button\"}"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="click", params={"selector": ".btn"})
    expected_output = AgentLLMOutput(action=expected_action, thought="Thinking...", summary="Clicked button")

    result = await validate_json(raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0  # LLM should not be called

@pytest.mark.asyncio
async def test_validate_json_valid_with_json_markdown():
    # Valid JSON wrapped in a ```json fenced block must parse without an LLM fix.
    raw_response = "```json\n{\"action\": {\"name\": \"type\", \"params\": {\"text\": \"hello\"}}, \"thought\": \"Typing...\", \"summary\": \"Typed hello\"}\n```"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="type", params={"text": "hello"})
    expected_output = AgentLLMOutput(action=expected_action, thought="Typing...", summary="Typed hello")

    result = await validate_json(raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0

@pytest.mark.asyncio
async def test_validate_json_valid_plain_json_no_tags_no_markdown():
    # Bare, well-formed JSON with no wrappers at all.
    raw_response = "{\"action\": {\"name\": \"scroll\", \"params\": {\"direction\": \"down\"}}, \"thought\": \"Scrolling...\", \"summary\": \"Scrolled down\"}"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="scroll", params={"direction": "down"})
    expected_output = AgentLLMOutput(action=expected_action, thought="Scrolling...", summary="Scrolled down")

    result = await validate_json(raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0

@pytest.mark.asyncio
async def test_validate_json_needs_cleaning_escaped_chars():
    # Contains \\n which should be cleaned to \n by the first cleaning pass
    # Changed input to use standard JSON escape \n instead of \\\\n
    raw_response = "{\"action\": {\"name\": \"navigate\", \"params\": {\"url\": \"test.com\"}}, \"thought\": \"Navigating...\\nNext line.\", \"summary\": \"Navigated\"}"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="navigate", params={"url": "test.com"})
    # Expected output still has a real newline
    expected_output = AgentLLMOutput(action=expected_action, thought="Navigating...\nNext line.", summary="Navigated")

    result = await validate_json(raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought  # Direct comparison
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0

@pytest.mark.asyncio
async def test_validate_json_needs_cleaning_control_chars():
    # Contains a control character (bell \x07) that should be removed
    raw_response = "{\"action\": {\"name\": \"wait\", \"params\": {}}, \"thought\": \"Waiting...\x07\", \"summary\": \"Waited\"}"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="wait", params={})
    expected_output = AgentLLMOutput(action=expected_action, thought="Waiting...", summary="Waited")

    result = await validate_json(raw_response, mock_llm)

    assert result.action.name == expected_action.name
    assert result.action.params == expected_action.params
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0

# --- Tests for generate_proper_json (can be simple, as it's a direct LLM call) ---
@pytest.mark.asyncio
async def test_generate_proper_json_calls_llm_and_strips():
    # The fixed JSON from the LLM arrives padded with whitespace and a
    # markdown fence; generate_proper_json must strip both.
    mock_llm = MockLLMProvider(responses=[" ```json\n{\"key\": \"fixed_value\"}``` "])
    malformed_json = "{key: 'broken_value'"

    result = await generate_proper_json(mock_llm, malformed_json)

    assert result == "{\"key\": \"fixed_value\"}"
    assert len(mock_llm.call_history) == 1
    # Check prompt content
    assert "Problematic JSON string:" in mock_llm.call_history[0][0].content[0].text
    assert malformed_json in mock_llm.call_history[0][0].content[0].text

# More tests for validate_json involving LLM fixes and failures will follow

@pytest.mark.asyncio
async def test_validate_json_llm_fix_succeeds_on_first_llm_call():
    # Malformed input (missing comma before "summary") is repaired by one LLM call.
    malformed_raw_response = "{\"action\": {\"name\": \"bad\", \"params\": {\"selector\": \".err\"}}, \"thought\": \"Oops\"summary\": \"Bad JSON\"}"
    corrected_json_str = "{\"action\": {\"name\": \"fixed\", \"params\": {\"detail\": \"good\"}}, \"thought\": \"Fixed!\", \"summary\": \"JSON is now good\"}"

    mock_llm = MockLLMProvider(responses=[corrected_json_str])

    expected_action = ActionModel(name="fixed", params={"detail": "good"})
    expected_output = AgentLLMOutput(action=expected_action, thought="Fixed!", summary="JSON is now good")

    result = await validate_json(malformed_raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 1  # LLM called once to fix
    assert "Problematic JSON string:" in mock_llm.call_history[0][0].content[0].text
    # The string passed to LLM should be the extracted content from output tags
    assert "{\"action\": {\"name\": \"bad\", \"params\": {\"selector\": \".err\"}}, \"thought\": \"Oops\"summary\": \"Bad JSON\"}" in mock_llm.call_history[0][0].content[0].text

@pytest.mark.asyncio
async def test_validate_json_llm_fix_succeeds_after_one_failed_llm_fix_attempt():
    # First LLM fix attempt is still broken; the second one succeeds.
    malformed_raw_response = "this is very broken"
    still_malformed_json_from_llm1 = "{still: \"broken\""
    corrected_json_str_from_llm2 = "{\"action\": {\"name\": \"finally_fixed\", \"params\": {}}, \"thought\": \"Phew!\", \"summary\": \"Fixed on second try\"}"

    mock_llm = MockLLMProvider(responses=[still_malformed_json_from_llm1, corrected_json_str_from_llm2])

    expected_action = ActionModel(name="finally_fixed", params={})
    expected_output = AgentLLMOutput(action=expected_action, thought="Phew!", summary="Fixed on second try")

    result = await validate_json(malformed_raw_response, mock_llm, max_retries=3)

    assert result.action.name == expected_action.name
    assert result.action.params == expected_action.params
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 2  # LLM called twice
    # Check what was sent to LLM on first call
    assert "this is very broken" in mock_llm.call_history[0][0].content[0].text
    # Check what was sent to LLM on second call
    assert still_malformed_json_from_llm1 in mock_llm.call_history[1][0].content[0].text

@pytest.mark.asyncio
async def test_validate_json_fails_after_max_retries_with_llm():
    # Every LLM fix attempt yields invalid JSON; validate_json must give up.
    malformed_raw_response = "totally unfixable {"
    bad_fix1 = "{attempt1: 'bad'"
    bad_fix2 = "{attempt2: 'still bad'"
    bad_fix3 = "{attempt3: 'nope'"  # Assuming max_retries is 3 by default in validate_json

    mock_llm = MockLLMProvider(responses=[bad_fix1, bad_fix2, bad_fix3])

    with pytest.raises(ValueError) as excinfo:
        await validate_json(malformed_raw_response, mock_llm, max_retries=3)

    assert "Could not parse or validate response after 3 attempts" in str(excinfo.value)
    # NOTE(review): with max_retries=3 only 2 LLM calls happen (the first
    # attempt parses the raw input without an LLM) — hence 2, not 3.
    assert len(mock_llm.call_history) == 2  # Corrected from 3 to 2
    # The final problematic string in the error message should be the last one LLM produced
    assert f"Final problematic JSON string after all attempts: '{bad_fix2[:500]}" in str(excinfo.value)  # LLM is called twice, so bad_fix2 is the last output from LLM

@pytest.mark.asyncio
async def test_validate_json_empty_string_after_extraction():
    # Scenario: response is whitespace only (comment appears garbled in the
    # original — presumably empty output tags were stripped here).
    raw_response = "   "
    mock_llm = MockLLMProvider()  # Returns empty string by default

    with pytest.raises(ValueError) as excinfo:
        await validate_json(raw_response, mock_llm)

    assert "Could not parse or validate response" in str(excinfo.value)
    assert "Final problematic JSON string after all attempts: '...'" in str(excinfo.value)
    # LLM is called max_retries - 1 = 2 times in this path
    assert len(mock_llm.call_history) == 2

@pytest.mark.asyncio
async def test_validate_json_llm_call_itself_fails():
    # The LLM transport itself raises; validate_json still fails cleanly.
    malformed_raw_response = "broken { "
    mock_llm = MockLLMProvider(call_should_fail=True, exception_to_raise=RuntimeError("LLM service down"))

    with pytest.raises(ValueError) as excinfo:
        await validate_json(malformed_raw_response, mock_llm, max_retries=3)

    assert "Could not parse or validate response after 3 attempts" in str(excinfo.value)
    assert len(mock_llm.call_history) == 2  # Ensure LLM call count is 2
    # Check that the error message ENDS with the expected final string part
    expected_ending = "Final problematic JSON string after all attempts: 'broken {...'"
    assert str(excinfo.value).endswith(expected_ending)

@pytest.mark.asyncio
async def test_validate_json_llm_fix_unescaped_quotes():
    # Input has unescaped double quotes inside string values
    malformed_core = '''{
  "action": {
    "name": "click_element",
    "params": {
      "index": 24,
      "wait_after_click": true
    }
  },
  "thought": "The available options for batches are "ik12" (index 24).",
  "summary": "Trying to click on "ik12" which could be X25."
}
'''
    malformed_raw_response = f"{malformed_core.strip()}"

    # Expected corrected JSON from LLM (with escaped quotes)
    corrected_json_string = """
{
  "action": {
    "name": "click_element",
    "params": {
      "index": 24,
      "wait_after_click": true
    }
  },
  "thought": "The available options for batches are \\\"ik12\\\" (index 24).",
  "summary": "Trying to click on \\\"ik12\\\" which could be X25."
}
"""

    # Mock LLM returns the corrected version on the first call
    mock_llm = MockLLMProvider(responses=[corrected_json_string.strip()])

    # Expected Python object representation
    expected_action = ActionModel(
        name="click_element",
        params={"index": 24, "wait_after_click": True}
    )
    expected_thought = 'The available options for batches are "ik12" (index 24).'
    expected_summary = 'Trying to click on "ik12" which could be X25.'

    # Run the validation
    result = await validate_json(malformed_raw_response, mock_llm)

    # Assertions
    assert result.action == expected_action
    assert result.thought == expected_thought
    assert result.summary == expected_summary
    assert len(mock_llm.call_history) == 1  # LLM should be called exactly once
    # Check that the LLM was called with the initially extracted (malformed) string
    assert malformed_core.strip() in mock_llm.call_history[0][0].content[0].text