├── .github └── workflows │ └── publish.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── evals ├── eval_generate_json.py └── eval_output_model.py ├── index ├── __init__.py ├── agent │ ├── agent.py │ ├── demo_images │ │ ├── complex_layout_highlight.png │ │ ├── complex_layout_small_elements.png │ │ ├── loading.png │ │ └── scroll.png │ ├── message_manager.py │ ├── models.py │ ├── prompts.py │ └── utils.py ├── browser │ ├── browser.py │ ├── detector.py │ ├── findVisibleInteractiveElements.js │ ├── fonts │ │ └── OpenSans-Medium.ttf │ ├── models.py │ └── utils.py ├── cli.py ├── controller │ ├── controller.py │ └── default_actions.py └── llm │ ├── llm.py │ └── providers │ ├── __init__.py │ ├── anthropic.py │ ├── anthropic_bedrock.py │ ├── gemini.py │ ├── gemini_vertex.py │ ├── groq.py │ └── openai.py ├── pyproject.toml ├── static ├── logo_dark.png ├── logo_light.png └── traces.png ├── tests └── agent │ └── test_utils.py └── uv.lock /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | publish: 13 | runs-on: ubuntu-latest 14 | environment: 15 | name: pypi 16 | url: https://pypi.org/p/lmnr/ 17 | permissions: 18 | id-token: write 19 | steps: 20 | - uses: actions/checkout@v4 21 | - name: Install uv 22 | uses: astral-sh/setup-uv@v4 23 | - name: Set up Python 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: '3.10' 27 | - name: Install the project 28 | run: uv sync --all-extras --dev 29 | - name: Verify tag matches package version 30 | run: | 31 | # Extract version from tag (remove 'v' prefix) 32 | TAG_VERSION=${GITHUB_REF#refs/tags/v} 33 | # Extract version from pyproject.toml 34 | PACKAGE_VERSION=$(grep -oP '(?<=version = ")[^"]+' pyproject.toml) 35 | echo "Tag version: $TAG_VERSION" 36 | echo "Package version: $PACKAGE_VERSION" 37 | # 
Check if versions match 38 | if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then 39 | echo "Error: Tag version ($TAG_VERSION) does not match package version ($PACKAGE_VERSION)" 40 | exit 1 41 | fi 42 | - name: Build package 43 | run: uv build 44 | - name: Publish package 45 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 112 | .pdm.toml 113 | .pdm-python 114 | .pdm-build/ 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.codeActionsOnSave": { 4 | "source.fixAll": "explicit", 5 | "source.organizeImports": "explicit" 6 | }, 7 | "editor.defaultFormatter": "charliermarsh.ruff" 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [2025] [LMNR AI, Inc.] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![GitHub stars](https://img.shields.io/github/stars/lmnr-ai/index?style=social) 2 | ![Static Badge](https://img.shields.io/badge/Y%20Combinator-S24-orange) 3 | ![X (formerly Twitter) Follow](https://img.shields.io/twitter/follow/lmnrai) 4 | ![Static Badge](https://img.shields.io/badge/Join_Discord-464646?&logo=discord&logoColor=5865F2) 5 | 6 | 7 | 8 | 9 | Laminar logo 10 | 11 | 12 | # Index 13 | 14 | Index is a state-of-the-art open-source browser agent that autonomously executes complex web tasks. It turns any website into an accessible API and can be seamlessly integrated with just a few lines of code. 15 | 16 | - [x] Powered by reasoning LLMs with vision capabilities. 
17 | - [x] Gemini 2.5 Pro (really fast and accurate) 18 | - [x] Claude 3.7 Sonnet with extended thinking (reliable and accurate) 19 | - [x] OpenAI o4-mini (depending on the reasoning effort, provides good balance between speed, cost and accuracy) 20 | - [x] Gemini 2.5 Flash (really fast, cheap, and good for less complex tasks) 21 | - [x] `pip install lmnr-index` and use it in your project 22 | - [x] `index run` to run the agent in the interactive CLI 23 | - [x] Supports structured output via Pydantic schemas for reliable data extraction. 24 | - [x] Index is also available as a [serverless API.](https://docs.lmnr.ai/index-agent/api/getting-started) 25 | - [x] You can also try out Index via [Chat UI](https://lmnr.ai/chat). 26 | - [x] Supports advanced [browser agent observability](https://docs.lmnr.ai/index-agent/tracing) powered by open-source platform [Laminar](https://github.com/lmnr-ai/lmnr). 27 | 28 | prompt: go to ycombinator.com. summarize first 3 companies in the W25 batch and make new spreadsheet in google sheets. 
29 | 30 | https://github.com/user-attachments/assets/2b46ee20-81b6-4188-92fb-4d97fe0b3d6a 31 | 32 | ## Documentation 33 | 34 | Check out full documentation [here](https://docs.lmnr.ai/index-agent/getting-started) 35 | 36 | ## Quickstart 37 | 38 | ### Install dependencies 39 | ```bash 40 | pip install lmnr-index 'lmnr[all]' 41 | 42 | # Install playwright 43 | playwright install chromium 44 | ``` 45 | 46 | ### Setup model API keys 47 | 48 | Setup your model API keys in `.env` file in your project root: 49 | ``` 50 | GEMINI_API_KEY= 51 | ANTHROPIC_API_KEY= 52 | OPENAI_API_KEY= 53 | # Optional, to trace the agent's actions and record browser session 54 | LMNR_PROJECT_API_KEY= 55 | ``` 56 | 57 | ### Run Index with code 58 | ```python 59 | import asyncio 60 | from index import Agent, GeminiProvider 61 | from pydantic import BaseModel 62 | from lmnr import Laminar 63 | import os 64 | 65 | # to trace the agent's actions and record browser session 66 | Laminar.initialize() 67 | 68 | # Define Pydantic schema for structured output 69 | class NewsSummary(BaseModel): 70 | title: str 71 | summary: str 72 | 73 | async def main(): 74 | 75 | llm = GeminiProvider(model="gemini-2.5-pro-preview-05-06") 76 | agent = Agent(llm=llm) 77 | 78 | # Example of getting structured output 79 | output = await agent.run( 80 | prompt="Navigate to news.ycombinator.com, find a post about AI, extract its title and provide a concise summary.", 81 | output_model=NewsSummary 82 | ) 83 | 84 | summary = NewsSummary.model_validate(output.result.content) 85 | print(f"Title: {summary.title}") 86 | print(f"Summary: {summary.summary}") 87 | 88 | if __name__ == "__main__": 89 | asyncio.run(main()) 90 | ``` 91 | 92 | ### Run Index with CLI 93 | 94 | Index CLI features: 95 | - Browser state persistence between sessions 96 | - Follow-up messages with support for "give human control" action 97 | - Real-time streaming updates 98 | - Beautiful terminal UI using Textual 99 | 100 | You can run Index CLI with the 
following command. 101 | ```bash 102 | index run 103 | ``` 104 | 105 | Output will look like this: 106 | 107 | ``` 108 | Loaded existing browser state 109 | ╭───────────────────── Interactive Mode ─────────────────────╮ 110 | │ Index Browser Agent Interactive Mode │ 111 | │ Type your message and press Enter. The agent will respond. │ 112 | │ Press Ctrl+C to exit. │ 113 | ╰────────────────────────────────────────────────────────────╯ 114 | 115 | Choose an LLM model: 116 | 1. Gemini 2.5 Flash 117 | 2. Claude 3.7 Sonnet 118 | 3. OpenAI o4-mini 119 | Select model [1/2] (1): 3 120 | Using OpenAI model: o4-mini 121 | Loaded existing browser state 122 | 123 | Your message: go to lmnr.ai, summarize pricing page 124 | 125 | Agent is working... 126 | Step 1: Opening lmnr.ai 127 | Step 2: Opening Pricing page 128 | Step 3: Scrolling for more pricing details 129 | Step 4: Scrolling back up to view pricing tiers 130 | Step 5: Provided concise summary of the three pricing tiers 131 | ``` 132 | 133 | ### Running CLI with a personal Chrome instance 134 | 135 | You can use Index with personal Chrome browser instance instead of launching a new browser. Main advantage is that all your existing logged-in sessions will be available. 136 | 137 | ```bash 138 | # Basic usage with default Chrome path 139 | index run --local-chrome 140 | ``` 141 | 142 | ## Use Index via API 143 | 144 | The easiest way to use Index in production is with [serverless API](https://docs.lmnr.ai/index-agent/api/getting-started). Index API manages remote browser sessions, agent infrastructure and [browser observability](https://docs.lmnr.ai/index-agent/api/tracing). To get started, create a project API key in [Laminar](https://lmnr.ai). 
145 | 146 | ### Install Laminar 147 | ```bash 148 | pip install lmnr 149 | ``` 150 | 151 | ### Use Index via API 152 | ```python 153 | from lmnr import Laminar, LaminarClient 154 | # you can also set LMNR_PROJECT_API_KEY environment variable 155 | 156 | # Initialize tracing 157 | Laminar.initialize(project_api_key="your_api_key") 158 | 159 | # Initialize the client 160 | client = LaminarClient(project_api_key="your_api_key") 161 | 162 | for chunk in client.agent.run( 163 | stream=True, 164 | model_provider="gemini", 165 | model="gemini-2.5-pro-preview-05-06", 166 | prompt="Navigate to news.ycombinator.com, find a post about AI, and summarize it" 167 | ): 168 | print(chunk) 169 | 170 | ``` 171 | 172 | 173 | ## Browser agent observability 174 | 175 | Both code run and API run provide advanced browser observability. To trace Index agent's actions and record browser session you simply need to initialize Laminar tracing before running the agent. 176 | 177 | ```python 178 | from lmnr import Laminar 179 | 180 | Laminar.initialize(project_api_key="your_api_key") 181 | ``` 182 | 183 | Then you will get full observability on the agent's actions synced with the browser session in the Laminar platform. Learn more about browser agent observability in the [documentation](https://docs.lmnr.ai/index-agent/tracing). 
184 | 185 | 186 | Index observability 187 | 188 | 189 | --- 190 | 191 | Made with ❤️ by the [Laminar team](https://lmnr.ai) 192 | -------------------------------------------------------------------------------- /evals/eval_generate_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from lmnr import evaluate 5 | 6 | from index import AnthropicProvider 7 | from index.agent.utils import generate_proper_json 8 | 9 | llm = AnthropicProvider(model="claude-3-7-sonnet-20250219", enable_thinking=True, thinking_token_budget=1024) 10 | 11 | async def run_json_correction(data: Dict[str, Any]): 12 | """Execute the JSON correction function.""" 13 | malformed_json = data["malformed_json"] 14 | # We'll need an LLM provider. Let's use GeminiProvider as in the reference. 15 | # In a real scenario, you might want to configure this or pass it differently. 16 | 17 | corrected_json_str = await generate_proper_json(llm=llm, json_str=malformed_json) 18 | 19 | # The function returns a string, let's try to parse it to ensure it's valid JSON for the eval 20 | try: 21 | return json.loads(corrected_json_str) 22 | except json.JSONDecodeError: 23 | # If it's not valid JSON, return the string itself for the evaluator to handle 24 | return corrected_json_str 25 | 26 | 27 | async def eval_json_correction(output: Any, target: Dict[str, Any]): 28 | """Evaluate the JSON correction accuracy.""" 29 | # Assuming target is a Python dict representing the expected JSON 30 | # And output is also a Python dict (if parsing was successful) or a string 31 | 32 | if isinstance(output, str): 33 | # This means the corrected_json_str was not valid JSON 34 | # For this simple eval, we can consider this a failure if the target is a dict 35 | # Or, if the target itself is expected to be a non-JSON string (e.g. an error message) 36 | # For now, let's assume target is always a valid JSON object. 
37 | try: 38 | # Attempt to parse the output string here for comparison 39 | output_dict = json.loads(output) 40 | exact_match = output_dict == target 41 | except json.JSONDecodeError: 42 | exact_match = False # Output was not valid JSON 43 | else: # Output is already a dict 44 | exact_match = output == target 45 | 46 | return exact_match 47 | 48 | test_data = [ 49 | { 50 | "data": { 51 | # Trailing comma, single quotes 52 | "malformed_json": "{'name': 'John Doe', 'age': 30, 'city': 'New York',}", 53 | }, 54 | "target": { 55 | "name": "John Doe", 56 | "age": 30, 57 | "city": "New York" 58 | } 59 | }, 60 | { 61 | "data": { 62 | "malformed_json": '''{ 63 | "item": "Book", 64 | "details": { 65 | "title": "The "Great Gatsby"", 66 | "author": "F. Scott Fitzgerald" 67 | }, 68 | "price": 10.99 69 | }''' 70 | }, 71 | "target": { 72 | "item": "Book", 73 | "details": { 74 | "title": "The \"Great Gatsby\"", 75 | "author": "F. Scott Fitzgerald" 76 | }, 77 | "price": 10.99 78 | } 79 | }, 80 | { 81 | "data": { 82 | # No closing brace 83 | "malformed_json": '''{ 84 | "key1": "value1", 85 | "key2": "value2" 86 | ''' # Corrected: Removed trailing content that looked like a comment inside string 87 | }, 88 | "target": { 89 | "key1": "value1", 90 | "key2": "value2" 91 | } 92 | }, 93 | { 94 | "data": { 95 | # JSON with comments (not standard, should be removed by the fixer) 96 | "malformed_json": '''{ 97 | // This is a comment 98 | "product_id": 123, 99 | "status": "active" 100 | }''' 101 | }, 102 | "target": { 103 | "product_id": 123, 104 | "status": "active" 105 | } 106 | }, 107 | # Example of a more complex malformed JSON 108 | { 109 | "data": { 110 | "malformed_json": "{\"name\": \"incomplete, \"value\": [1, 2, \"unfinished_array\"" # Missing closing bracket and quote 111 | }, 112 | "target": { # Assuming the LLM can make a reasonable guess or fix structure 113 | "name": "incomplete", 114 | "value": [1, 2, "unfinished_array"] 115 | } 116 | }, 117 | { 118 | "data": { 119 | 
"malformed_json": "{'key with space': 'value', 'another key': true, 'numeric_string': '123.45' }" # Single quotes, boolean 120 | }, 121 | "target": { 122 | "key with space": "value", 123 | "another key": True, # Python bool 124 | "numeric_string": "123.45" 125 | } 126 | } 127 | ] 128 | 129 | # Run the evaluation 130 | evaluate( 131 | data=test_data, 132 | executor=run_json_correction, 133 | evaluators={"json_correction_accuracy": eval_json_correction}, 134 | concurrency_limit=10, 135 | group_name="json_correction_eval", 136 | ) 137 | -------------------------------------------------------------------------------- /evals/eval_output_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from lmnr import evaluate 5 | from pydantic import BaseModel 6 | 7 | from index import Agent, GeminiProvider 8 | 9 | 10 | class CountryInfo(BaseModel): 11 | """Model for country information extraction""" 12 | country: str 13 | capital: str 14 | currency: str 15 | 16 | 17 | async def run_agent(data: Dict[str, Any]): 18 | """Execute the agent with data extraction based on output_model""" 19 | prompt = data["prompt"] 20 | output_model = data.get("output_model") 21 | start_url = data.get("start_url") 22 | 23 | llm = GeminiProvider(model="gemini-2.5-pro-preview-03-25") 24 | 25 | agent = Agent(llm=llm) 26 | output = await agent.run( 27 | prompt=prompt, 28 | output_model=output_model, 29 | start_url=start_url 30 | ) 31 | 32 | return output.result.content 33 | 34 | 35 | async def eval_extraction(output: Dict[str, Any], target: Dict[str, Any]): 36 | """Evaluate the extraction accuracy""" 37 | 38 | exact_match = json.dumps(output, sort_keys=True) == json.dumps(target, sort_keys=True) 39 | 40 | return exact_match 41 | 42 | data = [ 43 | { 44 | "data": { 45 | "prompt": "Extract information about France. 
For currency only use text description, such as 'Euro'.", 46 | "output_model": CountryInfo, 47 | "start_url": "https://en.wikipedia.org/wiki/France" 48 | }, 49 | "target": { 50 | "country": "France", 51 | "capital": "Paris", 52 | "currency": "Euro" 53 | } 54 | }, 55 | { 56 | "data": { 57 | "prompt": "Extract information about Japan. For currency only use text description, such as 'Euro'.", 58 | "output_model": CountryInfo, 59 | "start_url": "https://en.wikipedia.org/wiki/Japan" 60 | }, 61 | "target": { 62 | "country": "Japan", 63 | "capital": "Tokyo", 64 | "currency": "Japanese yen" 65 | } 66 | }, 67 | { 68 | "data": { 69 | "prompt": "Extract information about Brazil. For currency only use text description, such as 'Euro'.", 70 | "output_model": CountryInfo, 71 | "start_url": "https://en.wikipedia.org/wiki/Brazil" 72 | }, 73 | "target": { 74 | "country": "Brazil", 75 | "capital": "Brasília", 76 | "currency": "Real" 77 | } 78 | }, 79 | ] 80 | 81 | evaluate( 82 | data=data, 83 | executor=run_agent, 84 | evaluators={"accuracy": eval_extraction}, 85 | concurrency_limit=1, 86 | group_name="country_extraction", 87 | ) 88 | -------------------------------------------------------------------------------- /index/__init__.py: -------------------------------------------------------------------------------- 1 | from index.agent.agent import Agent 2 | from index.agent.models import ActionModel, ActionResult, AgentOutput 3 | from index.browser.browser import Browser, BrowserConfig 4 | from index.browser.detector import Detector 5 | from index.browser.models import InteractiveElement 6 | from index.llm.providers.anthropic import AnthropicProvider 7 | from index.llm.providers.anthropic_bedrock import AnthropicBedrockProvider 8 | from index.llm.providers.gemini import GeminiProvider 9 | from index.llm.providers.gemini_vertex import GeminiVertexProvider 10 | from index.llm.providers.groq import GroqProvider 11 | from index.llm.providers.openai import OpenAIProvider 12 | 13 | __all__ 
= [ 14 | 'Agent', 15 | 'Browser', 16 | 'BrowserConfig', 17 | 'ActionResult', 18 | 'ActionModel', 19 | 'AnthropicProvider', 20 | 'AnthropicBedrockProvider', 21 | 'OpenAIProvider', 22 | 'GeminiProvider', 23 | 'GeminiVertexProvider', 24 | 'GroqProvider', 25 | 'AgentOutput', 26 | 'Detector', 27 | 'InteractiveElement', 28 | ] 29 | -------------------------------------------------------------------------------- /index/agent/agent.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import time 5 | import uuid 6 | from typing import AsyncGenerator, Optional 7 | 8 | from dotenv import load_dotenv 9 | from lmnr import Laminar, LaminarSpanContext, observe, use_span 10 | from pydantic import BaseModel 11 | 12 | from index.agent.message_manager import MessageManager 13 | from index.agent.models import ( 14 | ActionResult, 15 | AgentLLMOutput, 16 | AgentOutput, 17 | AgentState, 18 | AgentStreamChunk, 19 | FinalOutputChunk, 20 | StepChunk, 21 | StepChunkContent, 22 | StepChunkError, 23 | TimeoutChunk, 24 | TimeoutChunkContent, 25 | ) 26 | from index.agent.utils import validate_json 27 | from index.browser.browser import Browser, BrowserConfig 28 | from index.controller.controller import Controller 29 | from index.llm.llm import BaseLLMProvider, Message 30 | 31 | load_dotenv() 32 | logger = logging.getLogger(__name__) 33 | 34 | class Agent: 35 | def __init__( 36 | self, 37 | llm: BaseLLMProvider, 38 | browser_config: BrowserConfig | None = None 39 | ): 40 | self.llm = llm 41 | self.controller = Controller() 42 | 43 | # Initialize browser or use the provided one 44 | self.browser = Browser(config=browser_config if browser_config is not None else BrowserConfig()) 45 | 46 | action_descriptions = self.controller.get_action_descriptions() 47 | 48 | self.message_manager = MessageManager( 49 | action_descriptions=action_descriptions, 50 | ) 51 | 52 | self.state = AgentState( 53 | 
messages=[], 54 | ) 55 | 56 | async def step(self, step: int, previous_result: ActionResult | None = None, step_span_context: Optional[LaminarSpanContext] = None) -> tuple[ActionResult, str]: 57 | """Execute one step of the task""" 58 | 59 | with Laminar.start_as_current_span( 60 | name="agent.step", 61 | parent_span_context=step_span_context, 62 | input={ 63 | "step": step, 64 | }, 65 | ): 66 | state = await self.browser.update_state() 67 | 68 | if previous_result: 69 | self.message_manager.add_current_state_message(state, previous_result) 70 | 71 | input_messages = self.message_manager.get_messages() 72 | 73 | try: 74 | model_output = await self._generate_action(input_messages) 75 | except Exception as e: 76 | # model call failed, remove last state message from history before retrying 77 | self.message_manager.remove_last_message() 78 | raise e 79 | 80 | if previous_result: 81 | # we're removing the state message that we've just added because we want to append it in a different format 82 | self.message_manager.remove_last_message() 83 | 84 | self.message_manager.add_message_from_model_output(step, previous_result, model_output, state.screenshot) 85 | 86 | try: 87 | result: ActionResult = await self.controller.execute_action( 88 | model_output.action, 89 | self.browser 90 | ) 91 | 92 | if result.is_done: 93 | logger.info(f'Result: {result.content}') 94 | self.final_output = result.content 95 | 96 | return result, model_output.summary 97 | 98 | except Exception as e: 99 | raise e 100 | 101 | 102 | @observe(name='agent.generate_action', ignore_input=True) 103 | async def _generate_action(self, input_messages: list[Message]) -> AgentLLMOutput: 104 | """Get next action from LLM based on current state""" 105 | 106 | response = await self.llm.call(input_messages) 107 | 108 | try: 109 | # Pass the raw LLM response content to validate_json 110 | output = await validate_json(response.content, self.llm) 111 | 112 | logger.info(f'💡 Thought: {output.thought}') 113 | 
logger.info(f'💡 Summary: {output.summary}') 114 | logger.info(f'🛠️ Action: {output.action.model_dump_json(exclude_unset=True)}') 115 | 116 | if response.thinking: 117 | output.thinking_block = response.thinking 118 | 119 | return output 120 | except ValueError as e: 121 | # Re-raise the ValueError from validate_json, which now includes detailed context 122 | logger.error(f"Failed to generate and validate action after multiple retries: {e}") 123 | raise e 124 | 125 | async def _setup_messages(self, 126 | prompt: str, 127 | agent_state: str | None = None, 128 | start_url: str | None = None, 129 | output_model: BaseModel | str | None = None 130 | ): 131 | """Set up messages based on state dict or initialize with system message""" 132 | if agent_state: 133 | # assuming that the structure of the state.messages is correct 134 | state = AgentState.model_validate_json(agent_state) 135 | self.message_manager.set_messages(state.messages) 136 | # Update browser_context to browser 137 | browser_state = await self.browser.update_state() 138 | self.message_manager.add_current_state_message(browser_state, user_follow_up_message=prompt) 139 | else: 140 | self.message_manager.add_system_message_and_user_prompt(prompt, output_model) 141 | 142 | if start_url: 143 | await self.browser.goto(start_url) 144 | browser_state = await self.browser.update_state() 145 | self.message_manager.add_current_state_message(browser_state) 146 | 147 | 148 | async def run(self, 149 | prompt: str, 150 | max_steps: int = 100, 151 | agent_state: str | None = None, 152 | parent_span_context: Optional[LaminarSpanContext] = None, 153 | close_context: bool = True, 154 | session_id: str | None = None, 155 | return_agent_state: bool = False, 156 | return_storage_state: bool = False, 157 | start_url: str | None = None, 158 | output_model: BaseModel | str | None = None 159 | ) -> AgentOutput: 160 | """Execute the task with maximum number of steps and return the final result 161 | 162 | Args: 163 | prompt: The 
prompt to execute the task with 164 | max_steps: The maximum number of steps to execute the task with. Defaults to 100. 165 | agent_state: Optional, the state of the agent to execute the task with 166 | parent_span_context: Optional, parent span context in Laminar format to execute the task with 167 | close_context: Whether to close the browser context after the task is executed 168 | session_id: Optional, Agent session id 169 | return_agent_state: Whether to return the agent state with the final output 170 | return_storage_state: Whether to return the storage state with the final output 171 | start_url: Optional, the URL to start the task with 172 | output_model: Optional, the output model to use for the task 173 | """ 174 | 175 | if prompt is None and agent_state is None: 176 | raise ValueError("Either prompt or agent_state must be provided") 177 | 178 | with Laminar.start_as_current_span( 179 | name="agent.run", 180 | parent_span_context=parent_span_context, 181 | input={ 182 | "prompt": prompt, 183 | "max_steps": max_steps, 184 | "stream": False, 185 | }, 186 | ) as span: 187 | if session_id is not None: 188 | span.set_attribute("lmnr.internal.agent_session_id", session_id) 189 | 190 | await self._setup_messages(prompt, agent_state, start_url, output_model) 191 | 192 | step = 0 193 | result = None 194 | is_done = False 195 | 196 | trace_id = str(uuid.UUID(int=span.get_span_context().trace_id)) 197 | 198 | try: 199 | while not is_done and step < max_steps: 200 | logger.info(f'📍 Step {step}') 201 | result, _ = await self.step(step, result) 202 | step += 1 203 | is_done = result.is_done 204 | 205 | if is_done: 206 | logger.info(f'✅ Task completed successfully in {step} steps') 207 | break 208 | 209 | if not is_done: 210 | logger.info('❌ Maximum number of steps reached') 211 | 212 | except Exception as e: 213 | logger.info(f'❌ Error in run: {e}') 214 | raise e 215 | finally: 216 | storage_state = await self.browser.get_storage_state() 217 | 218 | if close_context: 
219 | # Update to close the browser directly 220 | await self.browser.close() 221 | 222 | span.set_attribute("lmnr.span.output", result.model_dump_json()) 223 | 224 | return AgentOutput( 225 | agent_state=self.get_state() if return_agent_state else None, 226 | result=result, 227 | storage_state=storage_state if return_storage_state else None, 228 | step_count=step, 229 | trace_id=trace_id, 230 | ) 231 | 232 | async def run_stream(self, 233 | prompt: str, 234 | max_steps: int = 100, 235 | agent_state: str | None = None, 236 | parent_span_context: Optional[LaminarSpanContext] = None, 237 | close_context: bool = True, 238 | timeout: Optional[int] = None, 239 | session_id: str | None = None, 240 | return_screenshots: bool = False, 241 | return_agent_state: bool = False, 242 | return_storage_state: bool = False, 243 | start_url: str | None = None, 244 | output_model: BaseModel | str | None = None 245 | ) -> AsyncGenerator[AgentStreamChunk, None]: 246 | """Execute the task with maximum number of steps and stream step chunks as they happen 247 | 248 | Args: 249 | prompt: The prompt to execute the task with 250 | max_steps: The maximum number of steps to execute the task with 251 | agent_state: The state of the agent to execute the task with 252 | parent_span_context: Parent span context in Laminar format to execute the task with 253 | close_context: Whether to close the browser context after the task is executed 254 | timeout: The timeout for the task 255 | session_id: Agent session id 256 | return_screenshots: Whether to return screenshots with the step chunks 257 | return_agent_state: Whether to return the agent state with the final output chunk 258 | return_storage_state: Whether to return the storage state with the final output chunk 259 | start_url: Optional, the URL to start the task with 260 | output_model: Optional, the output model to use for the task 261 | """ 262 | 263 | # Create a span for the streaming execution 264 | span = Laminar.start_span( 265 | 
name="agent.run_stream", 266 | parent_span_context=parent_span_context, 267 | input={ 268 | "prompt": prompt, 269 | "max_steps": max_steps, 270 | "stream": True, 271 | }, 272 | ) 273 | 274 | trace_id = str(uuid.UUID(int=span.get_span_context().trace_id)) 275 | 276 | if session_id is not None: 277 | span.set_attribute("lmnr.internal.agent_session_id", session_id) 278 | 279 | with use_span(span): 280 | await self._setup_messages(prompt, agent_state, start_url, output_model) 281 | 282 | step = 0 283 | result = None 284 | is_done = False 285 | 286 | if timeout is not None: 287 | start_time = time.time() 288 | 289 | try: 290 | # Execute steps and yield results 291 | while not is_done and step < max_steps: 292 | logger.info(f'📍 Step {step}') 293 | 294 | with use_span(span): 295 | result, summary = await self.step(step, result) 296 | 297 | step += 1 298 | is_done = result.is_done 299 | 300 | screenshot = None 301 | if return_screenshots: 302 | state = self.browser.get_state() 303 | screenshot = state.screenshot 304 | 305 | if timeout is not None and time.time() - start_time > timeout: 306 | 307 | yield TimeoutChunk( 308 | content=TimeoutChunkContent( 309 | action_result=result, 310 | summary=summary, 311 | step=step, 312 | agent_state=self.get_state() if return_agent_state else None, 313 | screenshot=screenshot, 314 | trace_id=trace_id 315 | ) 316 | ) 317 | return 318 | 319 | yield StepChunk( 320 | content=StepChunkContent( 321 | action_result=result, 322 | summary=summary, 323 | trace_id=trace_id, 324 | screenshot=screenshot 325 | ) 326 | ) 327 | 328 | if is_done: 329 | logger.info(f'✅ Task completed successfully in {step} steps') 330 | 331 | storage_state = await self.browser.get_storage_state() 332 | 333 | # Yield the final output as a chunk 334 | final_output = AgentOutput( 335 | agent_state=self.get_state() if return_agent_state else None, 336 | result=result, 337 | storage_state=storage_state if return_storage_state else None, 338 | step_count=step, 339 | 
trace_id=trace_id, 340 | ) 341 | 342 | span.set_attribute("lmnr.span.output", result.model_dump_json()) 343 | yield FinalOutputChunk(content=final_output) 344 | 345 | break 346 | 347 | if not is_done: 348 | logger.info('❌ Maximum number of steps reached') 349 | yield StepChunkError(content=f'Maximum number of steps reached: {max_steps}') 350 | 351 | except Exception as e: 352 | logger.info(f'❌ Error in run: {e}') 353 | span.record_exception(e) 354 | 355 | yield StepChunkError(content=f'Error in run stream: {e}') 356 | finally: 357 | # Clean up resources 358 | if close_context: 359 | # Update to close the browser directly 360 | await self.browser.close() 361 | 362 | span.end() 363 | logger.info('Stream complete, span closed') 364 | 365 | def get_state(self) -> AgentState: 366 | 367 | self.state.messages = self.message_manager.get_messages() 368 | 369 | return self.state 370 | -------------------------------------------------------------------------------- /index/agent/demo_images/complex_layout_highlight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/complex_layout_highlight.png -------------------------------------------------------------------------------- /index/agent/demo_images/complex_layout_small_elements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/complex_layout_small_elements.png -------------------------------------------------------------------------------- /index/agent/demo_images/loading.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/loading.png 
-------------------------------------------------------------------------------- /index/agent/demo_images/scroll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/scroll.png -------------------------------------------------------------------------------- /index/agent/message_manager.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import json 4 | import logging 5 | from datetime import datetime 6 | from typing import List, Optional, Type 7 | 8 | from pydantic import BaseModel 9 | 10 | from index.agent.models import ActionResult, AgentLLMOutput 11 | from index.agent.prompts import system_message 12 | from index.agent.utils import load_demo_image_as_b64, pydantic_to_custom_jtd 13 | from index.browser.models import BrowserState 14 | from index.browser.utils import scale_b64_image 15 | from index.llm.llm import ImageContent, Message, TextContent 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class MessageManager: 21 | def __init__( 22 | self, 23 | action_descriptions: str, 24 | ): 25 | self._messages: List[Message] = [] 26 | self.action_descriptions = action_descriptions 27 | 28 | 29 | def add_system_message_and_user_prompt(self, prompt: str, output_model: Type[BaseModel] | str | None = None) -> None: 30 | 31 | complex_layout_highlight = load_demo_image_as_b64('complex_layout_highlight.png') 32 | complex_layout_small_elements = load_demo_image_as_b64('complex_layout_small_elements.png') 33 | still_loading = load_demo_image_as_b64('loading.png') 34 | scroll_over_element_example = load_demo_image_as_b64('scroll.png') 35 | system_msg = Message( 36 | role="system", 37 | content=[ 38 | TextContent(text=system_message(self.action_descriptions), cache_control=True), 39 | ], 40 | ) 41 | 42 | self._messages.append(system_msg) 43 | 
output_model_str = '' 44 | if output_model: 45 | output_format = '' 46 | if isinstance(output_model, type) and issubclass(output_model, BaseModel): 47 | output_format = json.dumps(pydantic_to_custom_jtd(output_model), indent=2) 48 | elif isinstance(output_model, str): 49 | output_format = output_model 50 | 51 | output_model_str = f""" 52 | 53 | When you are ready to complete the task use `done_with_structured_output` action. Strictly provide output in the following JSON format and infer which fields best match the information you have gathered: 54 | 55 | 56 | {output_format} 57 | 58 | """ 59 | 60 | self._messages.append(Message( 61 | role="user", 62 | content=[ 63 | TextContent(text=''), 64 | TextContent(text="Here's an example of a complex layout. As an example, if you want to select a 'Roster' section for Colorado Rockies. Then you need to click on element with index 121."), 65 | ImageContent(image_b64=complex_layout_highlight), 66 | TextContent(text=''), 67 | TextContent(text=''), 68 | TextContent(text="Here's an example of small elements on the page and their functions. Element 7, represented by 'x' icon, is a 'clear text' button. Element 8 is a 'submit' button, represented by '=' icon. This clarification should help you better understand similar layouts."), 69 | ImageContent(image_b64=complex_layout_small_elements), 70 | TextContent(text=''), 71 | TextContent(text=''), 72 | TextContent(text="Here is an example of a loading page. If the main content on the page is empty or if there are loading elements, such as 'skeleton' screens or loading indicators, page is still loading. Then, you HAVE to perform `wait_for_page_to_load` action because you can't interact with the page until it is fully loaded."), 73 | ImageContent(image_b64=still_loading), 74 | TextContent(text=''), 75 | TextContent(text=''), 76 | TextContent(text="In some cases, to reveal more content, you need to scroll in scrollable areas of the webpage. 
Scrollable areas have VERTICAL scrollbars very clearly visible on their right side. In the screenshot below, you can clearly see a scrollbar on the right side of the list of search items. This indicates that the list is scrollable. To scroll over this area, you need to identify any element within the scrollable area and use its index with `scroll_down_over_element` action to scroll over it. In this example, appropriate element is with index 15."), 77 | ImageContent(image_b64=scroll_over_element_example), 78 | TextContent(text='', cache_control=True), 79 | TextContent(text=f"""Here is the task you need to complete: 80 | 81 | 82 | {prompt} 83 | 84 | 85 | Today's date and time is: {datetime.now().strftime('%B %d, %Y, %I:%M%p')} - keep this date and time in mind when planning your actions.{output_model_str}"""), 86 | ] 87 | )) 88 | 89 | def get_messages_as_state(self) -> List[Message]: 90 | """Get messages as state messages""" 91 | return [msg for msg in self._messages if msg.is_state_message] 92 | 93 | 94 | def remove_last_message(self) -> None: 95 | """Remove last message from history""" 96 | if len(self._messages) > 1: 97 | self._messages.pop() 98 | 99 | def add_current_state_message( 100 | self, 101 | state: BrowserState, 102 | previous_result: ActionResult | None = None, 103 | user_follow_up_message: str | None = None, 104 | ) -> None: 105 | """Add browser state as a user message""" 106 | 107 | if state.interactive_elements: 108 | highlighted_elements = '' 109 | for element in state.interactive_elements.values(): 110 | 111 | # exclude sheets elements 112 | if element.browser_agent_id.startswith("row_") or element.browser_agent_id.startswith("column_"): 113 | continue 114 | 115 | start_tag = f"[{element.index}]<{element.tag_name}" 116 | 117 | if element.input_type: 118 | start_tag += f" type=\"{element.input_type}\"" 119 | 120 | start_tag += ">" 121 | element_text = element.text.replace('\n', ' ') 122 | highlighted_elements += f"{start_tag}{element_text}\n" 123 | 
else: 124 | highlighted_elements = '' 125 | 126 | scroll_distance_above_viewport = state.viewport.scroll_distance_above_viewport or 0 127 | scroll_distance_below_viewport = state.viewport.scroll_distance_below_viewport or 0 128 | 129 | if scroll_distance_above_viewport > 0: 130 | elements_text = f'{scroll_distance_above_viewport}px scroll distance above current viewport\n' 131 | else: 132 | elements_text = '[Start of page]\n' 133 | 134 | if highlighted_elements != '': 135 | elements_text += f'\nHighlighted elements:\n{highlighted_elements}' 136 | 137 | if scroll_distance_below_viewport > 0: 138 | elements_text += f'\n{scroll_distance_below_viewport}px scroll distance below current viewport\n' 139 | else: 140 | elements_text += '\n[End of page]' 141 | 142 | previous_action_output = '' 143 | if previous_result: 144 | previous_action_output = f'\n{previous_result.content}\n\n\n' if previous_result.content else '' 145 | 146 | if previous_result.error: 147 | previous_action_output += f'\n{previous_result.error}\n\n\n' 148 | 149 | if user_follow_up_message: 150 | user_follow_up_message = f'\n{user_follow_up_message}\n\n\n' 151 | else: 152 | user_follow_up_message = '' 153 | 154 | state_description = f"""{previous_action_output}{user_follow_up_message} 155 | 156 | Current URL: {state.url} 157 | 158 | Open tabs: 159 | {state.tabs} 160 | 161 | Current viewport information: 162 | {elements_text} 163 | """ 164 | 165 | state_msg = Message( 166 | role='user', 167 | content=[ 168 | TextContent(text=state_description), 169 | TextContent(text=''), 170 | ImageContent(image_b64=state.screenshot), 171 | TextContent(text=''), 172 | TextContent(text=''), 173 | ImageContent(image_b64=state.screenshot_with_highlights), 174 | TextContent(text=''), 175 | ] 176 | ) 177 | 178 | self._messages.append(state_msg) 179 | 180 | def add_message_from_model_output(self, step: int, previous_result: ActionResult | None, model_output: AgentLLMOutput, screenshot: Optional[str] = None) -> None: 181 | 
"""Add model output as AI message""" 182 | 183 | previous_action_output = '' 184 | 185 | for msg in self._messages: 186 | if msg.is_state_message: 187 | msg.content = [msg.content[0]] 188 | 189 | if previous_result and screenshot: 190 | previous_action_output = f'\n{previous_result.content}\n' if previous_result.content else '' 191 | 192 | if previous_result.error: 193 | previous_action_output += f'\n{previous_result.error}\n' 194 | 195 | usr_msg = Message( 196 | role='user', 197 | content=[ 198 | TextContent(text=previous_action_output, cache_control=True), 199 | TextContent(text=f""), 200 | ImageContent(image_b64=scale_b64_image(screenshot, 0.75)), 201 | TextContent(text=f""), 202 | ], 203 | is_state_message=True, 204 | ) 205 | self._messages.append(usr_msg) 206 | 207 | assistant_content = [ 208 | TextContent(text=f""" 209 | {model_output.model_dump_json(indent=2, include={"thought", "action", "summary"}).strip()} 210 | """), 211 | ] 212 | 213 | if model_output.thinking_block: 214 | assistant_content = [ 215 | model_output.thinking_block, 216 | ] + assistant_content 217 | 218 | msg = Message( 219 | role='assistant', 220 | content=assistant_content, 221 | ) 222 | 223 | self._messages.append(msg) 224 | 225 | def get_messages(self) -> List[Message]: 226 | 227 | found_first_cache_control = False 228 | 229 | # clear all past cache control except the latest one 230 | for msg in self._messages[::-1]: 231 | 232 | # ignore system messages 233 | if msg.role == 'system': 234 | continue 235 | 236 | if found_first_cache_control: 237 | msg.remove_cache_control() 238 | 239 | if msg.has_cache_control(): 240 | found_first_cache_control = True 241 | 242 | 243 | return self._messages 244 | 245 | def set_messages(self, messages: List[Message]) -> None: 246 | """Set messages""" 247 | self._messages = messages 248 | -------------------------------------------------------------------------------- /index/agent/models.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Dict, Literal, Optional 4 | 5 | from playwright.async_api import StorageState 6 | from pydantic import BaseModel 7 | 8 | from index.llm.llm import Message, ThinkingBlock 9 | 10 | 11 | class AgentState(BaseModel): 12 | """State of the agent""" 13 | 14 | messages: list[Message] 15 | 16 | class ActionResult(BaseModel): 17 | """Result of executing an action""" 18 | 19 | is_done: Optional[bool] = False 20 | content: Optional[str | Dict[str, Any]] = None 21 | error: Optional[str] = None 22 | give_control: Optional[bool] = False 23 | 24 | class ActionModel(BaseModel): 25 | """Model for an action""" 26 | 27 | name: str 28 | params: Dict[str, Any] 29 | 30 | class AgentLLMOutput(BaseModel): 31 | """Output model for agent""" 32 | 33 | action: ActionModel 34 | thought: Optional[str] = None 35 | summary: Optional[str] = None 36 | thinking_block: Optional[ThinkingBlock] = None 37 | 38 | class AgentOutput(BaseModel): 39 | """Output model for agent""" 40 | 41 | agent_state: Optional[AgentState] = None 42 | result: ActionResult 43 | step_count: int = 0 44 | storage_state: Optional[StorageState] = None 45 | trace_id: str | None = None 46 | 47 | class AgentStreamChunk(BaseModel): 48 | """Base class for chunks in the agent stream""" 49 | type: str 50 | 51 | class StepChunkContent(BaseModel): 52 | action_result: ActionResult 53 | summary: str 54 | trace_id: str | None = None 55 | screenshot: Optional[str] = None 56 | 57 | class StepChunk(AgentStreamChunk): 58 | """Chunk containing a step result""" 59 | type: Literal["step"] = "step" 60 | content: StepChunkContent 61 | 62 | class TimeoutChunkContent(BaseModel): 63 | action_result: ActionResult 64 | summary: str 65 | step: int 66 | agent_state: AgentState | None = None 67 | trace_id: str | None = None 68 | screenshot: Optional[str] = None 69 | 70 | class TimeoutChunk(AgentStreamChunk): 71 
| """Chunk containing a timeout""" 72 | type: Literal["step_timeout"] = "step_timeout" 73 | content: TimeoutChunkContent 74 | 75 | class StepChunkError(AgentStreamChunk): 76 | """Chunk containing an error""" 77 | type: Literal["step_error"] = "step_error" 78 | content: str 79 | 80 | class FinalOutputChunk(AgentStreamChunk): 81 | """Chunk containing the final output""" 82 | type: Literal["final_output"] = "final_output" 83 | content: AgentOutput 84 | -------------------------------------------------------------------------------- /index/agent/prompts.py: -------------------------------------------------------------------------------- 1 | def system_message(action_descriptions: str) -> str: 2 | return f"""You are an advanced AI assistant designed to interact with a web browser and complete user tasks. Your capabilities include analyzing web page screenshots, interacting with page elements, and navigating through websites to accomplish various objectives. 3 | 4 | First, let's review the available actions you can perform: 5 | 6 | 7 | {action_descriptions} 8 | 9 | 10 | Your goal is to complete the user's task by carefully analyzing the current state of the web page, planning your actions, reflecting on the outcomes of the previous actions, and avoiding repetition of unsuccessful approaches. Follow the guidelines below: 11 | 12 | 1. Element Identification: 13 | - Interactable elements on the page are enclosed in uniquely colored bounding boxes with numbered labels. 14 | - Label corresponding to its bounding box is placed at the top right corner of the bounding box, and has exact same color as the bounding box. If the label is larger than the bounding box, the label is placed right outside and tangent to the bounding box. 15 | - Carefully match labels to their corresponding bounding boxes based on the color and position of the label, as labels might slightly overlap with unrelated bounding boxes. 
16 | - If bounding box doesn't enclose any element, simply ignore it (most likely the bounding box was incorrectly detected). 17 | - Screenshot enclosed in tag contains clean screenshot of a current browser window. 18 | - Screenshot enclosed in tag has bounding boxes with labels drawn around interactable elements. 19 | - Carefully analyze both screenshots to understand the layout of the page and accurately map bounding boxes to their corresponding elements. 20 | - Remember: each bounding box and corresponding label have the same unique color. 21 | 22 | 2. Element Interaction: 23 | - Infer role and function of elements based on their appearance, text/icon inside the element, and location on the page. 24 | - Interact only with visible elements on the screen. 25 | - Before entering a text into an input area, make sure that you have clicked on the target input area first. 26 | - Scroll or interact with elements to reveal more content if necessary information is not visible. 27 | - To scroll within areas with scrollbars, first identify any element inside the scrollable area and use its index with `scroll_down_over_element` or `scroll_up_over_element` actions instead of scrolling the entire page. Pay attention to the scrollbar position and direction to identify the correct element. 28 | - Some pages have navigation menu on the left, which might contain useful information, such as filters, categories, navigation, etc. Pay close attention to whether the side menu has scrollbars. If it does, scroll over it using an element within the side menu. 29 | - For clicking on a cell in a spreadsheet, first identify the correct column and row that corresponds to the cell you want to click on. Then, strictly use the `click_on_spreadsheet_cell` action to click on the cell. Don't use `click_element` action for interacting with a spreadsheet cells. 30 | 31 | 3. 
Task Execution: 32 | - After you perform an action, analyze the state screenshot to verify that the intended result was achieved (filter was applied, correct date range was selected, text was entered, etc.). If the result was not achieved, identify the problem and fix it. Be creative and persistent in your approach and don't repeat the same actions that failed. 33 | - Break down multi-step tasks into sub-tasks and complete each sub-task one by one. 34 | - Thoroughly explore all possible approaches before declaring the task complete. 35 | - If you encounter obstacles, consider alternative approaches such as returning to a previous page, initiating a new search, or opening a new tab. 36 | - Understand elements on the page and infer the most relevant ones for the current step of the task. 37 | - Ensure that your final output fully addresses all aspects of the user's request. 38 | - Include ALL requested information in the "done" action. Include markdown-formatted links where relevant and useful. 39 | - Important: For research tasks, be persistent and explore multiple results (at least 5-10) before giving up. 40 | - Be persistent and creative in your approach, e.g., using site-specific Google searches to find precise information. 41 | 42 | 4. Special Situations: 43 | - Cookie popups: Click "I accept" if present. If it persists after clicking, ignore it. 44 | - CAPTCHA: Attempt to solve logically. If unsuccessful, open a new tab and continue the task. 45 | 46 | 5. Returning control to human: 47 | - For steps that require user information to proceed, such as providing first name, last name, email, phone number, booking information, login, password, credit card information, credentials, etc., unless this information was provided in the initial prompt, you must use `give_human_control` action to give human control of the browser. 
48 | - If you can't solve the CAPTCHA, use the `give_human_control` action to give human control of the browser to aid you in solving the CAPTCHA. 49 | - Control is guaranteed to be returned to you after the human has entered the information or solved the CAPTCHA, so you should plan your next actions accordingly. 50 | 51 | 6. Source citations: 52 | - When you perform research tasks, include links to the websites that you found the information in your final output. 53 | - In general, include links to the websites that you found the information in your final output. 54 | - Strictly use markdown format for the links, because the final output will be rendered as markdown. 55 | 56 | 7. Spreadsheet interaction: 57 | - To click on a cell in a spreadsheet, use the `click_on_spreadsheet_cell` action to click on a specific cell. DON'T use `click_element` action for interacting with a spreadsheet cells or other elements when the goal is to click on a specific cell. 58 | - To input text into a spreadsheet cell, first click on the cell using the `click_on_spreadsheet_cell` action, then use the `enter_text` action to input text. 59 | 60 | Your response must always be in the following JSON format, enclosed in tags: 61 | 62 | 63 | {{ 64 | "thought": "EITHER a very short summary of your thinking process with key points OR exact information that you need to remember for the future (in case of research tasks).", 65 | "action": {{ 66 | "name": "action_name", 67 | "params": {{ 68 | "param1": "value1", 69 | "param2": "value2" 70 | }} 71 | }}, 72 | "summary": "Extremely brief summary of what you are doing to display to the user to help them understand what you are doing" 73 | }} 74 | 75 | 76 | Remember: 77 | - Think concisely. 78 | - Output only a single action per response. 79 | - You will be prompted again after each action. 80 | - Always provide an output in the specified JSON format, enclosed in tags. 
81 | - Reflect on the outcomes of the past actions to avoid repeating unsuccessful approaches. 82 | - Be creative and persistent in trying different strategies within the boundaries of the website. 83 | - Break down multi-step tasks into sub-tasks and complete each sub-task one by one. 84 | - For research tasks, be thorough and explore multiple results before concluding that the desired information is unavailable. 85 | 86 | Continue this process until you are absolutely certain that you have completed the user's task fully and accurately. Be thorough, creative, and persistent in your approach. 87 | 88 | Your final output should consist only of the correctly formatted JSON object enclosed in tags and should not duplicate or rehash any of the work you did in the thinking block.""" -------------------------------------------------------------------------------- /index/agent/utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import enum 3 | import importlib.resources 4 | import json 5 | import logging 6 | import re 7 | from typing import Any, Dict, Type 8 | 9 | from pydantic import BaseModel, ValidationError 10 | 11 | from index.agent.models import AgentLLMOutput 12 | from index.browser.utils import scale_b64_image 13 | from index.llm.llm import BaseLLMProvider, Message 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | def load_demo_image_as_b64(image_name: str) -> str: 18 | """ 19 | Load an image from the demo_images directory and return it as a base64 string. 20 | Works reliably whether the package is used directly or as a library. 
21 | 22 | Args: 23 | image_name: Name of the image file (including extension) 24 | 25 | Returns: 26 | Base64 encoded string of the image 27 | """ 28 | try: 29 | # Using importlib.resources to reliably find package data 30 | with importlib.resources.path('index.agent.demo_images', image_name) as img_path: 31 | with open(img_path, 'rb') as img_file: 32 | b64 = base64.b64encode(img_file.read()).decode('utf-8') 33 | return scale_b64_image(b64, 0.75) 34 | except Exception as e: 35 | logger.error(f"Error loading demo image {image_name}: {e}") 36 | raise 37 | 38 | def pydantic_to_custom_jtd(model_class: Type[BaseModel]) -> Dict[str, Any]: 39 | """ 40 | Convert a Pydantic model class to a custom JSON Typedef-like schema 41 | with proper array and object handling. 42 | """ 43 | def python_type_to_jtd_type(annotation): 44 | if annotation is str: 45 | return {"type": "string"} 46 | elif annotation is int: 47 | return {"type": "int32"} 48 | elif annotation is float: 49 | return {"type": "float64"} 50 | elif annotation is bool: 51 | return {"type": "boolean"} 52 | elif isinstance(annotation, type) and issubclass(annotation, enum.Enum): 53 | values = [e.value for e in annotation] 54 | return {"type": "string", "enum": values} 55 | else: 56 | return {"type": "string"} # fallback 57 | 58 | def process_model(model): 59 | model_schema = { 60 | "type": "object", 61 | "properties": {}, 62 | "required": [], 63 | "additionalProperties": False 64 | } 65 | 66 | for name, field in model.model_fields.items(): 67 | annotation = field.annotation 68 | origin = getattr(annotation, "__origin__", None) 69 | 70 | if origin is list: 71 | inner = annotation.__args__[0] 72 | if isinstance(inner, type) and issubclass(inner, enum.Enum): 73 | item_schema = {"type": "string", "enum": [e.value for e in inner]} 74 | elif hasattr(inner, "mro") and BaseModel in inner.mro(): 75 | item_schema = process_model(inner) 76 | else: 77 | item_schema = python_type_to_jtd_type(inner) 78 | 79 | 
model_schema["properties"][name] = { 80 | "type": "array", 81 | "items": item_schema 82 | } 83 | elif isinstance(annotation, type) and issubclass(annotation, enum.Enum): 84 | model_schema["properties"][name] = { 85 | "type": "string", 86 | "enum": [e.value for e in annotation] 87 | } 88 | elif hasattr(annotation, "mro") and BaseModel in annotation.mro(): 89 | model_schema["properties"][name] = process_model(annotation) 90 | else: 91 | model_schema["properties"][name] = python_type_to_jtd_type(annotation) 92 | 93 | if field.is_required(): 94 | model_schema["required"].append(name) 95 | 96 | return model_schema 97 | 98 | return process_model(model_class) 99 | 100 | 101 | async def generate_proper_json(llm: BaseLLMProvider, json_str: str) -> str: 102 | 103 | prompt = f"""The following JSON string is malformed or has issues. Please correct it while preserving the original structure and content as much as possible. 104 | Return ONLY the corrected JSON string, without any surrounding text, comments, or markdown. Do not add any explanations. 105 | 106 | Problematic JSON string: 107 | {json_str} 108 | """ 109 | 110 | input_messages = [ 111 | Message(role="user", content=prompt) 112 | ] 113 | 114 | response = await llm.call(input_messages) 115 | corrected_json_str = response.content.strip() 116 | if corrected_json_str.startswith("```json"): 117 | corrected_json_str = corrected_json_str[7:] 118 | if corrected_json_str.endswith("```"): 119 | corrected_json_str = corrected_json_str[:-3] 120 | return corrected_json_str.strip() 121 | 122 | 123 | async def validate_json(raw_llm_response_content: str, llm: BaseLLMProvider, max_retries: int = 3) -> AgentLLMOutput: 124 | """ 125 | Extracts, validates, and parses a JSON string from raw LLM output, 126 | attempting to fix it if necessary using retries with cleaning and LLM-based correction. 127 | 128 | Args: 129 | raw_llm_response_content: The raw string content from the LLM response. 
130 | llm: The LLM provider instance for fixing JSON if needed. 131 | max_retries: Maximum number of attempts to parse the JSON. 132 | 133 | Returns: 134 | An AgentLLMOutput object. 135 | 136 | Raises: 137 | ValueError: If the JSON string cannot be parsed or validated after all retries. 138 | """ 139 | # 1. Regex extraction from raw_llm_response_content 140 | pattern = r"]*)>(.*?)]*)>" 141 | match = re.search(pattern, raw_llm_response_content, re.DOTALL) 142 | 143 | current_json_str = "" 144 | if not match: 145 | # if we couldn't find the tags, it most likely means the tag is not present in the response 146 | # remove closing and opening tags just in case 147 | closing_tag_pattern = r"]*)>" 148 | json_str_no_closing = re.sub(closing_tag_pattern, "", raw_llm_response_content).strip() 149 | open_tag_pattern = r"]*)>" 150 | json_str_no_tags = re.sub(open_tag_pattern, "", json_str_no_closing).strip() 151 | # Also remove potential markdown code blocks if not already handled by regex 152 | current_json_str = json_str_no_tags.replace("```json", "").replace("```", "").strip() 153 | else: 154 | current_json_str = match.group(1).strip() 155 | 156 | last_exception = None 157 | 158 | for attempt in range(max_retries): 159 | logger.debug(f"JSON parsing attempt {attempt + 1}/{max_retries}") 160 | 161 | # Stage 1: Try to parse the current_json_str as is 162 | try: 163 | # Remove potential markdown that might have been added by LLM fix 164 | temp_json_str = current_json_str 165 | if temp_json_str.startswith("```json"): 166 | temp_json_str = temp_json_str[7:] 167 | if temp_json_str.endswith("```"): 168 | temp_json_str = temp_json_str[:-3] 169 | temp_json_str = temp_json_str.strip() 170 | 171 | logger.debug(f"Attempting to parse JSON on attempt {attempt + 1}. 
Raw JSON: '{temp_json_str}'") 172 | output = AgentLLMOutput.model_validate_json(temp_json_str) 173 | logger.debug(f"Successfully parsed JSON on attempt {attempt + 1}.") 174 | return output 175 | except (json.JSONDecodeError, ValidationError) as e1: 176 | logger.warning(f"Direct JSON parsing failed on attempt {attempt + 1}: {e1}") 177 | last_exception = e1 178 | 179 | # Stage 2: Try to parse after cleaning common issues 180 | try: 181 | json_str_cleaned = current_json_str # Start with the current_json_str for cleaning 182 | # Removed explicit replacement of \n, \r, \t - rely on JSON parser 183 | # json_str_cleaned = json_str_cleaned.replace('\\\\n', '\n').replace('\\\\r', '\r').replace('\\\\t', '\t') 184 | # Keep control character removal 185 | json_str_cleaned = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', json_str_cleaned) 186 | 187 | if json_str_cleaned.startswith("```json"): 188 | json_str_cleaned = json_str_cleaned[7:] 189 | if json_str_cleaned.endswith("```"): 190 | json_str_cleaned = json_str_cleaned[:-3] 191 | json_str_cleaned = json_str_cleaned.strip() 192 | 193 | logger.debug(f"Attempting to parse cleaned JSON on attempt {attempt + 1}. Cleaned JSON: '{json_str_cleaned[:250]}...'") 194 | output = AgentLLMOutput.model_validate_json(json_str_cleaned) 195 | logger.debug(f"Successfully parsed JSON on attempt {attempt + 1} (after cleaning).") 196 | return output 197 | except (json.JSONDecodeError, ValidationError) as e2: 198 | logger.warning(f"Cleaned JSON parsing failed on attempt {attempt + 1}: {e2}") 199 | last_exception = e2 200 | 201 | if attempt < max_retries - 1: 202 | logger.debug(f"Attempt {attempt + 1} failed. 
Attempting to fix JSON with LLM.") 203 | try: 204 | # Pass the original problematic string (before this attempt's cleaning) to LLM 205 | current_json_str = await generate_proper_json(llm, current_json_str) 206 | logger.debug(f"LLM proposed a new JSON string: '{current_json_str}'") 207 | except Exception as llm_fix_exception: 208 | logger.error(f"LLM call to fix JSON failed during attempt {attempt + 1}: {llm_fix_exception}") 209 | # If LLM fix fails, loop continues with the previous current_json_str, 210 | # and will eventually fail if parsing doesn't succeed. 211 | pass 212 | else: 213 | logger.error(f"All {max_retries} attempts to parse JSON failed. Final attempt was with: '{current_json_str[:250]}...'") 214 | break 215 | 216 | raise ValueError( 217 | f"Could not parse or validate response after {max_retries} attempts. " 218 | f"Last error: {str(last_exception)}\\n" 219 | f"Final problematic JSON string after all attempts: '{current_json_str[:500]}...'" 220 | ) from last_exception 221 | 222 | -------------------------------------------------------------------------------- /index/browser/browser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Streamlined Playwright browser implementation. 
3 | """ 4 | 5 | import asyncio 6 | import base64 7 | import io 8 | import logging 9 | from dataclasses import dataclass, field 10 | from importlib import resources 11 | from typing import Any, Optional 12 | 13 | from lmnr import observe 14 | from PIL import Image 15 | from playwright.async_api import ( 16 | Browser as PlaywrightBrowser, 17 | ) 18 | from playwright.async_api import ( 19 | BrowserContext as PlaywrightBrowserContext, 20 | ) 21 | from playwright.async_api import ( 22 | Page, 23 | Playwright, 24 | StorageState, 25 | async_playwright, 26 | ) 27 | from tenacity import ( 28 | retry, 29 | retry_if_exception_type, 30 | stop_after_attempt, 31 | wait_exponential, 32 | ) 33 | from typing_extensions import TypedDict # to account for older python versions 34 | 35 | # Import detector class 36 | from index.browser.detector import Detector 37 | from index.browser.models import ( 38 | BrowserError, 39 | BrowserState, 40 | InteractiveElementsData, 41 | TabInfo, 42 | ) 43 | from index.browser.utils import ( 44 | filter_elements, 45 | put_highlight_elements_on_screenshot, 46 | scale_b64_image, 47 | ) 48 | 49 | logger = logging.getLogger(__name__) 50 | 51 | INTERACTIVE_ELEMENTS_JS_CODE = resources.read_text('index.browser', 'findVisibleInteractiveElements.js') 52 | 53 | class ViewportSize(TypedDict): 54 | width: int 55 | height: int 56 | 57 | @dataclass 58 | class BrowserConfig: 59 | """ 60 | Simplified configuration for the Browser. 61 | 62 | Parameters: 63 | cdp_url: Optional[str] = None 64 | Connect to a browser instance via CDP 65 | 66 | viewport_size: ViewportSize = {"width": 1024, "height": 768} 67 | Default browser window size 68 | 69 | storage_state: Optional[StorageState] = None 70 | Storage state to set 71 | 72 | detector: Optional[Detector] = None 73 | Detector instance for CV element detection. If None, CV detection is disabled. 
74 | 75 | """ 76 | cdp_url: Optional[str] = None 77 | viewport_size: ViewportSize = field(default_factory=lambda: {"width": 1024, "height": 768}) 78 | storage_state: Optional[StorageState] = None 79 | detector: Optional[Detector] = None 80 | 81 | class Browser: 82 | """ 83 | Unified Browser responsible for interacting with the browser via Playwright. 84 | """ 85 | 86 | def __init__(self, config: BrowserConfig = BrowserConfig(), close_context: bool = True): 87 | logger.debug('Initializing browser') 88 | self.config = config 89 | self.close_context = close_context 90 | # Playwright-related attributes 91 | self.playwright: Optional[Playwright] = None 92 | self.playwright_browser: Optional[PlaywrightBrowser] = None 93 | self.context: Optional[PlaywrightBrowserContext] = None 94 | 95 | # Page and state management 96 | self.current_page: Optional[Page] = None 97 | self._state: Optional[BrowserState] = None 98 | self._cdp_session = None 99 | 100 | # CV detection-related attributes 101 | self.detector: Optional[Detector] = config.detector 102 | 103 | self.screenshot_scale_factor = None 104 | 105 | # Initialize state 106 | self._init_state() 107 | 108 | async def __aenter__(self): 109 | """Async context manager entry""" 110 | await self._init_browser() 111 | return self 112 | 113 | async def __aexit__(self, exc_type, exc_val, exc_tb): 114 | """Async context manager exit""" 115 | if self.close_context: 116 | await self.close() 117 | 118 | def _init_state(self, url: str = '') -> None: 119 | """Initialize browser state""" 120 | self._state = BrowserState( 121 | url=url, 122 | screenshot_with_highlights=None, 123 | tabs=[], 124 | interactive_elements={}, 125 | ) 126 | 127 | async def _init_browser(self): 128 | """Initialize the browser and context""" 129 | logger.debug('Initializing browser context') 130 | # Start playwright if needed 131 | if self.playwright is None: 132 | self.playwright = await async_playwright().start() 133 | 134 | # Initialize browser if needed 135 | if 
self.playwright_browser is None: 136 | if self.config.cdp_url: 137 | logger.info(f'Connecting to remote browser via CDP {self.config.cdp_url}') 138 | attempts = 0 139 | while True: 140 | try: 141 | self.playwright_browser = await self.playwright.chromium.connect_over_cdp( 142 | self.config.cdp_url, 143 | timeout=2500, 144 | ) 145 | break 146 | except Exception as e: 147 | logger.error(f'Failed to connect to remote browser via CDP {self.config.cdp_url}: {e}. Retrying...') 148 | await asyncio.sleep(1) 149 | attempts += 1 150 | if attempts > 3: 151 | raise e 152 | logger.info(f'Connected to remote browser via CDP {self.config.cdp_url}') 153 | else: 154 | logger.info('Launching new browser instance') 155 | self.playwright_browser = await self.playwright.chromium.launch( 156 | headless=False, 157 | args=[ 158 | '--no-sandbox', 159 | '--disable-blink-features=AutomationControlled', 160 | '--disable-web-security', 161 | '--disable-site-isolation-trials', 162 | '--disable-features=IsolateOrigins,site-per-process', 163 | f'--window-size={self.config.viewport_size["width"]},{self.config.viewport_size["height"]}', 164 | ] 165 | ) 166 | 167 | # Create context if needed 168 | if self.context is None: 169 | 170 | if len(self.playwright_browser.contexts) > 0: 171 | self.context = self.playwright_browser.contexts[0] 172 | else: 173 | self.context = await self.playwright_browser.new_context( 174 | viewport=self.config.viewport_size, 175 | user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36', 176 | java_script_enabled=True, 177 | bypass_csp=True, 178 | ignore_https_errors=True 179 | ) 180 | 181 | # Apply anti-detection scripts 182 | await self._apply_anti_detection_scripts() 183 | 184 | self.context.on('page', self._on_page_change) 185 | 186 | if self.config.storage_state and 'cookies' in self.config.storage_state: 187 | await self.context.add_cookies(self.config.storage_state['cookies']) 188 | 189 | # 
Create page if needed 190 | if self.current_page is None: 191 | if len(self.context.pages) > 0: 192 | self.current_page = self.context.pages[-1] 193 | else: 194 | self.current_page = await self.context.new_page() 195 | 196 | return self 197 | 198 | async def _on_page_change(self, page: Page): 199 | """Handle page change events""" 200 | logger.info(f'Current page changed to {page.url}') 201 | 202 | self._cdp_session = await self.context.new_cdp_session(page) 203 | self.current_page = page 204 | 205 | async def _apply_anti_detection_scripts(self): 206 | """Apply scripts to avoid detection as automation""" 207 | await self.context.add_init_script( 208 | """ 209 | // Webdriver property 210 | Object.defineProperty(navigator, 'webdriver', { 211 | get: () => undefined 212 | }); 213 | 214 | // Languages 215 | Object.defineProperty(navigator, 'languages', { 216 | get: () => ['en-US'] 217 | }); 218 | 219 | // Plugins 220 | Object.defineProperty(navigator, 'plugins', { 221 | get: () => [1, 2, 3, 4, 5] 222 | }); 223 | 224 | // Chrome runtime 225 | window.chrome = { runtime: {} }; 226 | 227 | // Permissions 228 | const originalQuery = window.navigator.permissions.query; 229 | window.navigator.permissions.query = (parameters) => ( 230 | parameters.name === 'notifications' ? 
231 | Promise.resolve({ state: Notification.permission }) : 232 | originalQuery(parameters) 233 | ); 234 | (function () { 235 | const originalAttachShadow = Element.prototype.attachShadow; 236 | Element.prototype.attachShadow = function attachShadow(options) { 237 | return originalAttachShadow.call(this, { ...options, mode: "open" }); 238 | }; 239 | })(); 240 | """ 241 | ) 242 | 243 | async def close(self): 244 | """Close the browser instance and cleanup resources""" 245 | logger.debug('Closing browser') 246 | 247 | try: 248 | 249 | # Close CDP session if exists 250 | self._cdp_session = None 251 | 252 | # Close context 253 | if self.context: 254 | try: 255 | await self.context.close() 256 | except Exception as e: 257 | logger.debug(f'Failed to close context: {e}') 258 | self.context = None 259 | 260 | # Close browser 261 | if self.playwright_browser: 262 | try: 263 | await self.playwright_browser.close() 264 | except Exception as e: 265 | logger.debug(f'Failed to close browser: {e}') 266 | self.playwright_browser = None 267 | 268 | # Stop playwright 269 | if self.playwright: 270 | await self.playwright.stop() 271 | self.playwright = None 272 | except Exception as e: 273 | logger.error(f'Error during browser cleanup: {e}') 274 | finally: 275 | self.context = None 276 | self.current_page = None 277 | self._state = None 278 | self.playwright_browser = None 279 | self.playwright = None 280 | 281 | async def goto(self, url: str): 282 | """Navigate to a URL""" 283 | page = await self.get_current_page() 284 | await page.goto(url, wait_until='domcontentloaded') 285 | await asyncio.sleep(2) 286 | 287 | async def get_tabs_info(self) -> list[TabInfo]: 288 | """Get information about all tabs""" 289 | 290 | tabs_info = [] 291 | for page_id, page in enumerate(self.context.pages): 292 | tab_info = TabInfo(page_id=page_id, url=page.url, title=await page.title()) 293 | tabs_info.append(tab_info) 294 | 295 | return tabs_info 296 | 297 | async def switch_to_tab(self, page_id: int) 
-> None: 298 | """Switch to a specific tab by its page_id""" 299 | if self.context is None: 300 | await self._init_browser() 301 | 302 | pages = self.context.pages 303 | if page_id >= len(pages): 304 | raise BrowserError(f'No tab found with page_id: {page_id}') 305 | 306 | page = pages[page_id] 307 | self.current_page = page 308 | 309 | await page.bring_to_front() 310 | await page.wait_for_load_state('domcontentloaded') 311 | 312 | async def create_new_tab(self, url: str | None = None) -> None: 313 | """Create a new tab and optionally navigate to a URL""" 314 | if self.context is None: 315 | await self._init_browser() 316 | 317 | new_page = await self.context.new_page() 318 | self.current_page = new_page 319 | 320 | await new_page.wait_for_load_state('domcontentloaded') 321 | 322 | if url: 323 | await new_page.goto(url, wait_until='domcontentloaded') 324 | 325 | async def close_current_tab(self): 326 | """Close the current tab""" 327 | if self.current_page is None: 328 | return 329 | 330 | await self.current_page.close() 331 | 332 | # Switch to the first available tab if any exist 333 | if self.context and self.context.pages: 334 | await self.switch_to_tab(0) 335 | 336 | async def get_current_page(self) -> Page: 337 | """Get the current page""" 338 | if self.current_page is None: 339 | await self._init_browser() 340 | return self.current_page 341 | 342 | def get_state(self) -> BrowserState: 343 | """Get the current browser state""" 344 | return self._state 345 | 346 | @observe(name='browser.update_state', ignore_output=True) 347 | async def update_state(self) -> BrowserState: 348 | """Update the browser state with current page information and return it""" 349 | self._state = await self._update_state() 350 | return self._state 351 | 352 | @observe(name='browser._update_state', ignore_output=True) 353 | async def _update_state(self) -> BrowserState: 354 | """Update and return state.""" 355 | @retry( 356 | stop=stop_after_attempt(3), 357 | 
wait=wait_exponential(multiplier=0.5, min=0.5, max=2), 358 | retry=retry_if_exception_type((Exception)), 359 | reraise=True 360 | ) 361 | async def get_stable_state(): 362 | if self.current_page is None: 363 | await self._init_browser() 364 | url = self.current_page.url 365 | 366 | detect_sheets = 'docs.google.com/spreadsheets/d' in url 367 | 368 | screenshot_b64 = await self.fast_screenshot() 369 | 370 | interactive_elements_data = await self.get_interactive_elements(screenshot_b64, detect_sheets) 371 | interactive_elements = {element.index: element for element in interactive_elements_data.elements} 372 | 373 | # Create highlighted version of the screenshot 374 | screenshot_with_highlights = put_highlight_elements_on_screenshot( 375 | interactive_elements, 376 | screenshot_b64 377 | ) 378 | 379 | tabs = await self.get_tabs_info() 380 | 381 | return BrowserState( 382 | url=url, 383 | tabs=tabs, 384 | screenshot_with_highlights=screenshot_with_highlights, 385 | screenshot=screenshot_b64, 386 | viewport=interactive_elements_data.viewport, 387 | interactive_elements=interactive_elements, 388 | ) 389 | 390 | try: 391 | self._state = await get_stable_state() 392 | return self._state 393 | except Exception as e: 394 | logger.error(f'Failed to update state after multiple attempts: {str(e)}') 395 | # Return last known good state if available 396 | if hasattr(self, '_state'): 397 | return self._state 398 | raise 399 | 400 | @observe(name='browser.detect_browser_elements') 401 | async def detect_browser_elements(self) -> InteractiveElementsData: 402 | """Get all interactive elements on the page""" 403 | page = await self.get_current_page() 404 | result = await page.evaluate(INTERACTIVE_ELEMENTS_JS_CODE) 405 | interactive_elements_data = InteractiveElementsData(**result) 406 | 407 | return interactive_elements_data 408 | 409 | @observe(name='browser.get_interactive_elements', ignore_inputs=["screenshot_b64"]) 410 | async def get_interactive_elements(self, screenshot_b64: str, 
detect_sheets: bool = False) -> InteractiveElementsData: 411 | """ 412 | Get interactive elements using combined browser and CV detection. 413 | 414 | Args: 415 | screenshot_b64: Optional base64 encoded screenshot. If None, a new screenshot will be taken. 416 | detect_sheets: Whether to detect sheets elements 417 | Returns: 418 | Combined detection results 419 | """ 420 | 421 | elements = [] 422 | 423 | if self.detector is not None: 424 | browser_elements_data = await self.detect_browser_elements() 425 | 426 | scale_factor = browser_elements_data.viewport.width / 1024 427 | 428 | cv_elements = await self.detector.detect_from_image(screenshot_b64, scale_factor, detect_sheets) 429 | 430 | # Combine and filter detections 431 | elements = filter_elements(browser_elements_data.elements + cv_elements) 432 | else: 433 | browser_elements_data = await self.detect_browser_elements() 434 | elements = browser_elements_data.elements 435 | 436 | # Create new InteractiveElementsData with combined elements 437 | return InteractiveElementsData( 438 | viewport=browser_elements_data.viewport, 439 | elements=elements 440 | ) 441 | 442 | async def get_cdp_session(self): 443 | """Get or create a CDP session for the current page""" 444 | 445 | # Create a new session if we don't have one or the page has changed 446 | if (self._cdp_session is None or 447 | not hasattr(self._cdp_session, '_page') or 448 | self._cdp_session._page != self.current_page): 449 | self._cdp_session = await self.context.new_cdp_session(self.current_page) 450 | # Store reference to the page this session belongs to 451 | self._cdp_session._page = self.current_page 452 | 453 | return self._cdp_session 454 | 455 | @observe(name='browser.take_screenshot', ignore_output=True) 456 | async def fast_screenshot(self) -> str: 457 | """ 458 | Returns a base64 encoded screenshot of the current page. 
459 | 460 | Returns: 461 | Base64 encoded screenshot 462 | """ 463 | # Use cached CDP session instead of creating a new one each time 464 | cdp_session = await self.get_cdp_session() 465 | screenshot_params = { 466 | "format": "png", 467 | "fromSurface": False, 468 | "captureBeyondViewport": False, 469 | } 470 | 471 | # Capture screenshot using CDP Session 472 | screenshot_data = await cdp_session.send("Page.captureScreenshot", screenshot_params) 473 | screenshot_b64 = screenshot_data["data"] 474 | 475 | if self.screenshot_scale_factor is None: 476 | 477 | test_img_data = base64.b64decode(screenshot_b64) 478 | test_img = Image.open(io.BytesIO(test_img_data)) 479 | logger.info(f'Test image size: {test_img.size}') 480 | self.screenshot_scale_factor = 1024 / test_img.size[0] 481 | logger.info(f'Screenshot scale factor: {self.screenshot_scale_factor}') 482 | 483 | screenshot_b64 = scale_b64_image(screenshot_b64, self.screenshot_scale_factor) 484 | return screenshot_b64 485 | 486 | async def get_cookies(self) -> list[dict[str, Any]]: 487 | """Get cookies from the browser""" 488 | if self.context: 489 | cookies = await self.context.cookies() 490 | return cookies 491 | return [] 492 | 493 | async def get_storage_state(self) -> dict[str, Any]: 494 | """Get local storage from the browser""" 495 | 496 | if self.context: 497 | cookies = await self.context.cookies() 498 | 499 | return { 500 | 'cookies': cookies, 501 | } 502 | return {} 503 | -------------------------------------------------------------------------------- /index/browser/detector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Computer vision detector module. 
3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from typing import List 7 | 8 | from index.browser.models import InteractiveElement 9 | 10 | 11 | class Detector(ABC): 12 | """Abstract interface for object detection in browser screenshots.""" 13 | 14 | @abstractmethod 15 | async def detect_from_image(self, image_b64: str, scale_factor: float, detect_sheets: bool = False) -> List[InteractiveElement]: 16 | """ 17 | Detect interactive elements from a base64 encoded image. 18 | 19 | Args: 20 | image_b64: Base64 encoded image screenshot. 21 | scale_factor: Scale factor to scale the coordinates of screenshot to browser viewport coordinates. 22 | detect_sheets: Flag to indicate if specialized sheet detection should be used. 23 | 24 | Returns: 25 | List of detected InteractiveElement objects. 26 | """ 27 | pass -------------------------------------------------------------------------------- /index/browser/fonts/OpenSans-Medium.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/browser/fonts/OpenSans-Medium.ttf -------------------------------------------------------------------------------- /index/browser/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | 4 | from pydantic import BaseModel, ConfigDict 5 | from pydantic.alias_generators import to_camel 6 | 7 | 8 | # Pydantic 9 | class TabInfo(BaseModel): 10 | """Represents information about a browser tab""" 11 | 12 | page_id: int 13 | url: str 14 | title: str 15 | 16 | class Coordinates(BaseModel): 17 | x: int 18 | y: int 19 | width: Optional[int] = None 20 | height: Optional[int] = None 21 | 22 | class Rect(BaseModel): 23 | left: int 24 | top: int 25 | right: int 26 | bottom: int 27 | width: int 28 | height: int 29 | 30 | class InteractiveElement(BaseModel): 31 | 
"""Represents an interactive element on the page""" 32 | model_config = ConfigDict( 33 | alias_generator=to_camel, 34 | populate_by_name=True, 35 | from_attributes=True, 36 | ) 37 | 38 | index: int 39 | tag_name: str 40 | text: str 41 | attributes: dict[str, str] 42 | viewport: Coordinates 43 | page: Coordinates 44 | center: Coordinates 45 | weight: float 46 | browser_agent_id: str 47 | input_type: Optional[str] = field(default=None) 48 | rect: Rect 49 | z_index: int 50 | 51 | class BrowserError(Exception): 52 | """Base class for all browser errors""" 53 | 54 | 55 | class URLNotAllowedError(BrowserError): 56 | """Error raised when a URL is not allowed""" 57 | 58 | class Viewport(BaseModel): 59 | """Represents the viewport of the browser""" 60 | model_config = ConfigDict( 61 | alias_generator=to_camel, 62 | populate_by_name=True, 63 | from_attributes=True, 64 | ) 65 | 66 | width: int = field(default_factory=lambda: 1024) 67 | height: int = field(default_factory=lambda: 768) 68 | scroll_x: int = field(default_factory=lambda: 0) 69 | scroll_y: int = field(default_factory=lambda: 0) 70 | device_pixel_ratio: float = field(default_factory=lambda: 1) 71 | scroll_distance_above_viewport: int = field(default_factory=lambda: 0) 72 | scroll_distance_below_viewport: int = field(default_factory=lambda: 0) 73 | 74 | class InteractiveElementsData(BaseModel): 75 | """Represents the data returned by the interactive elements script""" 76 | 77 | viewport: Viewport 78 | elements: list[InteractiveElement] 79 | 80 | @dataclass 81 | class BrowserState: 82 | url: str 83 | tabs: list[TabInfo] 84 | viewport: Viewport = field(default_factory=Viewport) 85 | screenshot_with_highlights: Optional[str] = None 86 | screenshot: Optional[str] = None 87 | interactive_elements: dict[int, InteractiveElement] = field(default_factory=dict) 88 | -------------------------------------------------------------------------------- /index/browser/utils.py: 
-------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | from io import BytesIO 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from PIL import Image, ImageDraw, ImageFont 8 | 9 | from index.browser.models import InteractiveElement, Rect 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | def put_highlight_elements_on_screenshot(elements: dict[int, InteractiveElement], screenshot_b64: str) -> str: 14 | """Highlight elements using Pillow instead of OpenCV""" 15 | try: 16 | # Decode base64 to PIL Image 17 | image_data = base64.b64decode(screenshot_b64) 18 | image = Image.open(BytesIO(image_data)) 19 | draw = ImageDraw.Draw(image) 20 | 21 | # Colors (RGB format for PIL) 22 | base_colors = [ 23 | (204, 0, 0), 24 | (0, 136, 0), 25 | (0, 0, 204), 26 | (204, 112, 0), 27 | (102, 0, 102), 28 | (0, 102, 102), 29 | (204, 51, 153), 30 | (44, 0, 102), 31 | (204, 35, 0), 32 | (28, 102, 66), 33 | (170, 0, 0), 34 | (36, 82, 123) 35 | ] 36 | placed_labels = [] 37 | 38 | def generate_unique_color(base_color, element_idx): 39 | """Generate a unique color variation based on element index""" 40 | r, g, b = base_color 41 | # Use prime numbers to create deterministic but non-repeating patterns 42 | offset_r = (element_idx * 17) % 31 - 15 # Range: -15 to 15 43 | offset_g = (element_idx * 23) % 29 - 14 # Range: -14 to 14 44 | offset_b = (element_idx * 13) % 27 - 13 # Range: -13 to 13 45 | 46 | # Ensure RGB values stay within 0-255 range 47 | r = max(0, min(255, r + offset_r)) 48 | g = max(0, min(255, g + offset_g)) 49 | b = max(0, min(255, b + offset_b)) 50 | 51 | return (r, g, b) 52 | 53 | # Load custom font from the package 54 | try: 55 | # Path to your packaged font 56 | font_path = Path(__file__).parent / "fonts" / "OpenSans-Medium.ttf" 57 | font = ImageFont.truetype(str(font_path), 11) 58 | except Exception as e: 59 | logger.warning(f"Could not load custom font: {e}, falling back to default") 60 | 
font = ImageFont.load_default() 61 | 62 | for idx, element in elements.items(): 63 | 64 | # don't draw sheets elements 65 | if element.browser_agent_id.startswith("row_") or element.browser_agent_id.startswith("column_"): 66 | continue 67 | 68 | base_color = base_colors[idx % len(base_colors)] 69 | color = generate_unique_color(base_color, idx) 70 | 71 | rect = element.rect 72 | 73 | # Draw rectangle 74 | draw.rectangle( 75 | [(rect.left, rect.top), (rect.right, rect.bottom)], 76 | outline=color, 77 | width=2 78 | ) 79 | 80 | # Prepare label 81 | text = str(idx) 82 | 83 | # Get precise text dimensions for proper centering 84 | text_bbox = draw.textbbox((0, 0), text, font=font) 85 | text_width = text_bbox[2] - text_bbox[0] 86 | text_height = text_bbox[3] - text_bbox[1] 87 | 88 | # Make label size exactly proportional for better aesthetics 89 | label_width = text_width + 4 90 | label_height = text_height + 4 91 | 92 | # Positioning logic 93 | if label_width > rect.width or label_height > rect.height: 94 | label_x = rect.left + rect.width 95 | label_y = rect.top 96 | else: 97 | label_x = rect.left + rect.width - label_width 98 | label_y = rect.top 99 | 100 | # Check for overlaps with existing labels 101 | label_rect = { 102 | 'left': label_x, 'top': label_y, 103 | 'right': label_x + label_width, 'bottom': label_y + label_height 104 | } 105 | 106 | for existing in placed_labels: 107 | if not (label_rect['right'] < existing['left'] or 108 | label_rect['left'] > existing['right'] or 109 | label_rect['bottom'] < existing['top'] or 110 | label_rect['top'] > existing['bottom']): 111 | label_y = existing['bottom'] + 2 112 | label_rect['top'] = label_y 113 | label_rect['bottom'] = label_y + label_height 114 | break 115 | 116 | # Ensure label is visible within image boundaries 117 | img_width, img_height = image.size 118 | if label_x < 0: 119 | label_x = 0 120 | elif label_x + label_width >= img_width: 121 | label_x = img_width - label_width - 1 122 | 123 | if label_y < 0: 124 
| label_y = 0 125 | elif label_y + label_height >= img_height: 126 | label_y = img_height - label_height - 1 127 | 128 | # Draw label background 129 | draw.rectangle( 130 | [(label_x, label_y), (label_x + label_width, label_y + label_height)], 131 | fill=color 132 | ) 133 | 134 | # magic numbers to center the text 135 | text_x = label_x + 3 136 | text_y = label_y - 1 137 | 138 | # Draw text 139 | draw.text( 140 | (text_x, text_y), 141 | text, 142 | fill=(255, 255, 255), 143 | font=font 144 | ) 145 | 146 | placed_labels.append(label_rect) 147 | 148 | # Convert back to base64 149 | buffer = BytesIO() 150 | image.save(buffer, format="PNG") 151 | new_image_base64 = base64.b64encode(buffer.getvalue()).decode() 152 | 153 | return new_image_base64 154 | 155 | except Exception as e: 156 | logger.error(f"Failed to add highlights to screenshot: {str(e)}") 157 | return screenshot_b64 158 | 159 | 160 | def scale_b64_image(image_b64: str, scale_factor: float) -> str: 161 | """ 162 | Scale down a base64 encoded image using Pillow. 
163 | 164 | Args: 165 | image_b64: Base64 encoded image string 166 | scale_factor: Factor to scale the image by (0.5 = half size) 167 | 168 | Returns: 169 | Base64 encoded scaled image 170 | """ 171 | try: 172 | # Decode base64 to PIL Image 173 | image_data = base64.b64decode(image_b64) 174 | image = Image.open(BytesIO(image_data)) 175 | 176 | if image is None: 177 | return image_b64 178 | 179 | # Get original dimensions 180 | width, height = image.size 181 | 182 | # Calculate new dimensions 183 | new_width = int(width * scale_factor) 184 | new_height = int(height * scale_factor) 185 | 186 | # Resize the image using high quality resampling 187 | resized_image = image.resize( 188 | (new_width, new_height), 189 | Image.LANCZOS 190 | ) 191 | 192 | # Convert back to base64 193 | buffer = BytesIO() 194 | resized_image.save(buffer, format="PNG") 195 | resized_image_b64 = base64.b64encode(buffer.getvalue()).decode() 196 | 197 | return resized_image_b64 198 | 199 | except Exception: 200 | return image_b64 201 | 202 | 203 | def calculate_iou(rect1: Rect, rect2: Rect) -> float: 204 | """ 205 | Calculate Intersection over Union between two rectangles. 
206 | 207 | Args: 208 | rect1: First rectangle with left, top, right, bottom keys 209 | rect2: Second rectangle with left, top, right, bottom keys 210 | 211 | Returns: 212 | IoU value 213 | """ 214 | # Calculate intersection 215 | intersect_left = max(rect1.left, rect2.left) 216 | intersect_top = max(rect1.top, rect2.top) 217 | intersect_right = min(rect1.right, rect2.right) 218 | intersect_bottom = min(rect1.bottom, rect2.bottom) 219 | 220 | # Check if intersection exists 221 | if intersect_right < intersect_left or intersect_bottom < intersect_top: 222 | return 0.0 # No intersection 223 | 224 | # Calculate area of each rectangle 225 | area1 = (rect1.right - rect1.left) * (rect1.bottom - rect1.top) 226 | area2 = (rect2.right - rect2.left) * (rect2.bottom - rect2.top) 227 | 228 | # Calculate area of intersection 229 | intersection_area = (intersect_right - intersect_left) * (intersect_bottom - intersect_top) 230 | 231 | # Calculate union area 232 | union_area = area1 + area2 - intersection_area 233 | 234 | # Calculate IoU 235 | return intersection_area / union_area if union_area > 0 else 0.0 236 | 237 | 238 | def is_fully_contained(rect1: Rect, rect2: Rect) -> bool: 239 | """ 240 | Check if rect1 is fully contained within rect2. 241 | 242 | Args: 243 | rect1: First rectangle with left, top, right, bottom keys 244 | rect2: Second rectangle with left, top, right, bottom keys 245 | 246 | Returns: 247 | True if rect1 is fully contained within rect2 248 | """ 249 | return (rect1.left >= rect2.left and 250 | rect1.right <= rect2.right and 251 | rect1.top >= rect2.top and 252 | rect1.bottom <= rect2.bottom) 253 | 254 | 255 | def filter_overlapping_elements(elements: List[InteractiveElement], iou_threshold: float = 0.7) -> List[InteractiveElement]: 256 | """ 257 | Filter overlapping elements using weight and IoU. 
258 | 259 | Args: 260 | elements: Elements to filter 261 | iou_threshold: Threshold for considering elements as overlapping 262 | 263 | Returns: 264 | Filtered elements 265 | """ 266 | if not elements: 267 | return [] 268 | 269 | # Sort by area (descending), then by weight (descending) 270 | elements.sort(key=lambda e: ( 271 | -(e.rect.width * e.rect.height), # Negative area for descending sort 272 | -e.weight # Negative weight for descending sort 273 | )) 274 | 275 | filtered_elements: List[InteractiveElement] = [] 276 | 277 | # Add elements one by one, checking against already added elements 278 | for current in elements: 279 | should_add = True 280 | 281 | # For each element already in our filtered list 282 | for existing in filtered_elements: 283 | # Check overlap with IoU 284 | iou = calculate_iou(current.rect, existing.rect) 285 | if iou > iou_threshold: 286 | should_add = False 287 | break 288 | 289 | # Check if current element is fully contained within an existing element with higher weight 290 | if is_fully_contained(current.rect, existing.rect): 291 | if existing.weight >= current.weight and existing.z_index == current.z_index: 292 | should_add = False 293 | break 294 | else: 295 | # If current element has higher weight and is more than 50% of the size of the existing element, remove the existing element 296 | if current.rect.width * current.rect.height >= existing.rect.width * existing.rect.height * 0.5: 297 | filtered_elements.remove(existing) 298 | break 299 | 300 | if should_add: 301 | filtered_elements.append(current) 302 | 303 | return filtered_elements 304 | 305 | 306 | def sort_elements_by_position(elements: List[InteractiveElement]) -> List[InteractiveElement]: 307 | """ 308 | Sort elements by position (top to bottom, left to right). 
309 | 310 | Args: 311 | elements: Elements to sort 312 | 313 | Returns: 314 | Sorted elements 315 | """ 316 | if not elements: 317 | return [] 318 | 319 | # Define what "same row" means 320 | ROW_THRESHOLD = 20 # pixels 321 | 322 | # First, group elements into rows based on Y position 323 | rows = [] 324 | current_row = [] 325 | 326 | # Copy and sort elements by Y position 327 | sorted_by_y = sorted(elements, key=lambda e: e.rect.top) 328 | 329 | # Group into rows 330 | for element in sorted_by_y: 331 | if not current_row: 332 | # Start a new row 333 | current_row.append(element) 334 | else: 335 | # Check if this element is in the same row as the previous ones 336 | last_element = current_row[-1] 337 | if abs(element.rect.top - last_element.rect.top) <= ROW_THRESHOLD: 338 | # Same row 339 | current_row.append(element) 340 | else: 341 | # New row 342 | rows.append(list(current_row)) 343 | current_row = [element] 344 | 345 | # Add the last row if not empty 346 | if current_row: 347 | rows.append(current_row) 348 | 349 | # Sort each row by X position (left to right) 350 | for row in rows: 351 | row.sort(key=lambda e: e.rect.left) 352 | 353 | # Flatten the rows back into a single array 354 | elements = [element for row in rows for element in row] 355 | 356 | for i, element in enumerate(elements): 357 | element.index = i 358 | 359 | return elements 360 | 361 | 362 | def filter_elements( 363 | elements: List[InteractiveElement], 364 | iou_threshold: float = 0.7 365 | ) -> List[InteractiveElement]: 366 | """ 367 | Combine interactive elements from multiple detection methods and filter duplicates. 
368 | 369 | Args: 370 | elements: Interactive elements from multiple detection methods 371 | iou_threshold: Threshold for considering elements as overlapping 372 | 373 | Returns: 374 | Combined and filtered elements 375 | """ 376 | #Filter overlapping elements 377 | filtered = filter_overlapping_elements(elements, iou_threshold) 378 | 379 | # Sort elements by position 380 | sorted_elements = sort_elements_by_position(filtered) 381 | 382 | return sorted_elements -------------------------------------------------------------------------------- /index/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import asyncio 3 | import json 4 | import logging 5 | import os 6 | import subprocess 7 | import time 8 | from typing import Dict, List, Optional 9 | 10 | import requests 11 | import typer 12 | from dotenv import load_dotenv 13 | from rich.console import Console 14 | from rich.logging import RichHandler 15 | from rich.markdown import Markdown 16 | from rich.panel import Panel 17 | from rich.prompt import Prompt 18 | from textual.app import App 19 | from textual.containers import Container, Horizontal, Vertical 20 | from textual.reactive import reactive 21 | from textual.widgets import Button, Footer, Header, Input, Static 22 | 23 | from index.agent.agent import Agent 24 | from index.agent.models import AgentOutput, AgentState 25 | from index.browser.browser import BrowserConfig 26 | from index.llm.llm import BaseLLMProvider 27 | from index.llm.providers.anthropic import AnthropicProvider 28 | from index.llm.providers.gemini import GeminiProvider 29 | from index.llm.providers.openai import OpenAIProvider 30 | 31 | load_dotenv() 32 | 33 | # Create Typer app 34 | app = typer.Typer(help="Index - Browser AI agent CLI") 35 | 36 | # Configuration constants 37 | BROWSER_STATE_FILE = "browser_state.json" 38 | DEFAULT_CHROME_PATH = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" 39 | 
DEFAULT_DEBUGGING_PORT = 9222 40 | 41 | console = Console() 42 | 43 | def setup_logging(debug: bool = False): 44 | """Configure logging based on debug flag""" 45 | log_level = logging.INFO if debug else logging.WARNING 46 | 47 | # Configure root logger 48 | logging.basicConfig( 49 | level=log_level, 50 | format="%(message)s", 51 | datefmt="[%X]", 52 | handlers=[RichHandler(rich_tracebacks=True, console=console)] 53 | ) 54 | 55 | # Set specific logger levels 56 | logging.getLogger("index").setLevel(log_level) 57 | logging.getLogger("playwright").setLevel(logging.WARNING) # Always keep playwright at WARNING 58 | 59 | if debug: 60 | console.print("[yellow]Debug mode enabled - logging set to INFO level[/]") 61 | 62 | class AgentSession: 63 | """Manages an agent session with state persistence""" 64 | 65 | def __init__(self, llm: Optional[BaseLLMProvider] = None, use_local_chrome: bool = False, chrome_path: str = DEFAULT_CHROME_PATH, debugging_port: int = DEFAULT_DEBUGGING_PORT, debug: bool = False): 66 | self.llm = llm 67 | self.chrome_process = None 68 | self.use_local_chrome = use_local_chrome 69 | self.chrome_path = chrome_path 70 | self.debugging_port = debugging_port 71 | self.logger = logging.getLogger("index.agent_session") 72 | 73 | browser_config = None 74 | 75 | if os.path.exists(BROWSER_STATE_FILE) and not use_local_chrome: 76 | with open(BROWSER_STATE_FILE, "r") as f: 77 | self.storage_state = json.load(f) 78 | console.print("[green]Loaded existing browser state[/green]") 79 | browser_config = BrowserConfig( 80 | storage_state=self.storage_state, 81 | viewport_size={ 82 | "width": 1200, 83 | "height": 800 84 | } 85 | ) 86 | else: 87 | if use_local_chrome: 88 | # Launch Chrome and connect to it 89 | self._launch_local_chrome() 90 | browser_config = BrowserConfig( 91 | cdp_url="http://localhost:" + str(self.debugging_port), 92 | ) 93 | else: 94 | browser_config = BrowserConfig( 95 | viewport_size={ 96 | "width": 1200, 97 | "height": 800 98 | } 99 | ) 100 | 101 
| self.agent = Agent(llm=self.llm, browser_config=browser_config) 102 | self.agent_state: Optional[str] = None 103 | self.step_count: int = 0 104 | self.action_results: List[Dict] = [] 105 | self.is_running: bool = False 106 | self.storage_state: Optional[Dict] = None 107 | 108 | def _launch_local_chrome(self): 109 | """Launch a local Chrome instance with remote debugging enabled""" 110 | # Check if Chrome is already running with the specified debugging port 111 | try: 112 | response = requests.get(f"http://localhost:{self.debugging_port}/json/version", timeout=2) 113 | if response.status_code == 200: 114 | console.print(f"[green]Connected to already running Chrome instance on port {self.debugging_port}[/green]") 115 | self.logger.info(f"Connected to existing Chrome instance on port {self.debugging_port}") 116 | return 117 | except requests.RequestException: 118 | # No running Chrome instance found on the specified port, proceed with launching a new one 119 | pass 120 | 121 | console.print(f"[blue]Launching Chrome from {self.chrome_path} with debugging port {self.debugging_port}[/blue]") 122 | 123 | try: 124 | self.chrome_process = subprocess.Popen( 125 | [self.chrome_path, f"--remote-debugging-port={self.debugging_port}", "--no-first-run", "--no-default-browser-check"], 126 | stdout=subprocess.DEVNULL, 127 | stderr=subprocess.DEVNULL, 128 | ) 129 | console.print("[green]Chrome launched successfully[/green]") 130 | self.logger.info(f"Chrome process started with PID {self.chrome_process.pid}") 131 | # Give Chrome time to start up 132 | time.sleep(2) 133 | except Exception as e: 134 | self.logger.error(f"Failed to launch Chrome: {str(e)}") 135 | console.print(f"[red]Failed to launch Chrome: {str(e)}[/red]") 136 | raise 137 | 138 | def save_state(self, agent_output: AgentOutput): 139 | """Save agent state to file""" 140 | 141 | if agent_output.storage_state: 142 | with open(BROWSER_STATE_FILE, "w") as f: 143 | json.dump(agent_output.storage_state, f) 144 | 145 | 
self.logger.info("Agent state saved to file") 146 | console.print("[green]Saved agent state[/green]") 147 | 148 | async def run_agent(self, prompt: str) -> AgentOutput: 149 | """Run the agent with the given prompt""" 150 | self.is_running = True 151 | self.logger.info(f"Running agent with prompt: {prompt}") 152 | 153 | try: 154 | # Run the agent 155 | if self.agent_state: 156 | result = await self.agent.run( 157 | prompt=prompt, 158 | agent_state=self.agent_state, 159 | close_context=False, 160 | return_storage_state=True, 161 | return_agent_state=True 162 | ) 163 | else: 164 | result = await self.agent.run( 165 | prompt=prompt, 166 | close_context=False, 167 | return_storage_state=True, 168 | return_agent_state=True 169 | ) 170 | 171 | self.step_count = result.step_count 172 | self.agent_state = result.agent_state.model_dump_json() 173 | self.save_state(result) 174 | 175 | return result 176 | finally: 177 | self.is_running = False 178 | 179 | async def stream_run(self, prompt: str): 180 | """Run the agent with streaming output""" 181 | self.is_running = True 182 | self.logger.info(f"Running agent with streaming and prompt: {prompt}") 183 | 184 | try: 185 | # Run the agent with streaming 186 | if self.agent_state: 187 | stream = self.agent.run_stream( 188 | prompt=prompt, 189 | agent_state=self.agent_state, 190 | close_context=False, 191 | max_steps=500, # large number to allow the agent to run for a long time 192 | return_agent_state=True, 193 | return_storage_state=True 194 | ) 195 | else: 196 | stream = self.agent.run_stream( 197 | prompt=prompt, 198 | close_context=False, 199 | max_steps=500, # large number to allow the agent to run for a long time 200 | return_agent_state=True, 201 | return_storage_state=True 202 | ) 203 | 204 | final_output = None 205 | async for chunk in stream: 206 | # Directly yield the raw chunk without any modifications 207 | yield chunk 208 | 209 | # Store final output for state saving 210 | if chunk.type == "final_output": 211 | 
final_output = chunk.content 212 | 213 | if final_output: 214 | self.step_count = final_output.step_count 215 | self.agent_state = final_output.agent_state.model_dump_json() 216 | self.save_state(final_output) 217 | 218 | finally: 219 | self.is_running = False 220 | 221 | def reset(self): 222 | """Reset agent state""" 223 | if os.path.exists(BROWSER_STATE_FILE): 224 | os.remove(BROWSER_STATE_FILE) 225 | self.agent_state = None 226 | self.step_count = 0 227 | self.action_results = [] 228 | self.logger.info("Agent state reset") 229 | console.print("[yellow]Agent state reset[/yellow]") 230 | 231 | async def close(self): 232 | """Close the agent and any associated resources""" 233 | # Close the browser instance 234 | if self.agent and self.agent.browser: 235 | self.logger.info("Closing browser instance") 236 | await self.agent.browser.close() 237 | 238 | # Terminate Chrome process if launched locally 239 | if self.chrome_process: 240 | self.logger.info(f"Terminating Chrome process with PID {self.chrome_process.pid}") 241 | console.print("[yellow]Terminating local Chrome instance...[/yellow]") 242 | self.chrome_process.terminate() 243 | self.chrome_process = None 244 | 245 | 246 | class AgentUI(App): 247 | """Textual-based UI for interacting with the agent""" 248 | 249 | CSS = """ 250 | Header { 251 | background: #3b82f6; 252 | color: white; 253 | text-align: center; 254 | padding: 1; 255 | } 256 | 257 | Footer { 258 | background: #1e3a8a; 259 | color: white; 260 | text-align: center; 261 | padding: 1; 262 | } 263 | 264 | #prompt-input { 265 | padding: 1 2; 266 | border: tall $accent; 267 | margin: 1 1; 268 | height: 3; 269 | } 270 | 271 | #output-container { 272 | height: 1fr; 273 | border: solid #ccc; 274 | background: #f8fafc; 275 | padding: 1; 276 | margin: 0 1; 277 | overflow-y: auto; 278 | } 279 | 280 | #action-results { 281 | height: 15; 282 | border: solid #ccc; 283 | background: #f8fafc; 284 | margin: 0 1 1 1; 285 | overflow-y: auto; 286 | } 287 | 288 | 
.action-result { 289 | border: solid #e5e7eb; 290 | margin: 1 0; 291 | padding: 1; 292 | } 293 | 294 | .action-title { 295 | color: #3b82f6; 296 | text-style: bold; 297 | } 298 | 299 | .action-content { 300 | margin-top: 1; 301 | } 302 | 303 | Button { 304 | margin: 1 1; 305 | } 306 | 307 | #buttons-container { 308 | height: auto; 309 | align: center middle; 310 | } 311 | 312 | .running { 313 | color: #f97316; 314 | text-style: bold; 315 | } 316 | 317 | .completed { 318 | color: #22c55e; 319 | text-style: bold; 320 | } 321 | 322 | .error { 323 | color: #ef4444; 324 | text-style: bold; 325 | } 326 | """ 327 | 328 | TITLE = "Index Browser Agent CLI" 329 | BINDINGS = [ 330 | ("q", "quit", "Quit"), 331 | ("r", "reset", "Reset Agent"), 332 | ("ctrl+s", "send", "Send Message"), 333 | ] 334 | 335 | agent_session = None 336 | status = reactive("Ready") 337 | 338 | def compose(self): 339 | yield Header() 340 | 341 | with Vertical(): 342 | with Container(id="output-container"): 343 | yield Static(id="output", expand=True) 344 | 345 | with Container(id="action-results"): 346 | yield Static(id="results", expand=True) 347 | 348 | with Horizontal(id="buttons-container"): 349 | yield Button("Send", id="send-btn", variant="primary") 350 | yield Button("Reset", id="reset-btn", variant="error") 351 | 352 | yield Input(placeholder="Enter your task or follow-up message...", id="prompt-input") 353 | 354 | yield Footer() 355 | 356 | def update_output(self): 357 | """Update the output display""" 358 | output = "" 359 | 360 | if self.agent_session.agent_state: 361 | state = AgentState.model_validate_json(self.agent_session.agent_state) 362 | 363 | # Get the latest user and assistant messages 364 | user_msgs = [m for m in state.messages if m.role == "user"] 365 | assistant_msgs = [m for m in state.messages if m.role == "assistant"] 366 | 367 | if user_msgs: 368 | latest_user = user_msgs[-1] 369 | output += f"[bold blue]User:[/] {latest_user.content}\n\n" 370 | 371 | if assistant_msgs: 372 
| latest_assistant = assistant_msgs[-1] 373 | output += f"[bold green]Assistant:[/] {latest_assistant.content}\n\n" 374 | 375 | output += f"[dim]Steps completed: {self.agent_session.step_count}[/]\n" 376 | output += f"[dim]Status: {self.status}[/]\n" 377 | else: 378 | output = "[italic]No previous session. Start by sending a task.[/]" 379 | 380 | self.query_one("#output", Static).update(Markdown(output)) 381 | 382 | # Update action results 383 | if self.agent_session.action_results: 384 | results_output = "" 385 | for i, result in enumerate(reversed(self.agent_session.action_results[-5:])): 386 | action_type = result.get("type", "unknown") 387 | content = result.get("content", {}) 388 | 389 | if action_type == "step": 390 | action_result = content.get("action_result", {}) 391 | summary = content.get("summary", "No summary available") 392 | 393 | results_output += f"[bold]Step {i+1}[/]\n" 394 | results_output += f"Summary: {summary}\n" 395 | 396 | if action_result.get("is_done"): 397 | results_output += "[green]Task completed[/]\n" 398 | 399 | if action_result.get("give_control"): 400 | results_output += "[yellow]Agent requested human control[/]\n" 401 | results_output += f"Message: {action_result.get('content', '')}\n" 402 | 403 | results_output += "\n" 404 | 405 | elif action_type == "error": 406 | results_output += "[bold red]Error[/]\n" 407 | results_output += f"{content}\n\n" 408 | 409 | self.query_one("#results", Static).update(Markdown(results_output)) 410 | 411 | async def on_button_pressed(self, event: Button.Pressed): 412 | """Handle button presses""" 413 | if event.button.id == "send-btn": 414 | await self.action_send() 415 | elif event.button.id == "reset-btn": 416 | self.action_reset() 417 | 418 | def action_reset(self): 419 | """Reset the agent state""" 420 | self.agent_session.reset() 421 | self.agent_session.action_results = [] 422 | self.update_output() 423 | 424 | async def action_send(self): 425 | """Send the current prompt to the agent""" 426 | 
prompt = self.query_one("#prompt-input", Input).value 427 | 428 | if not prompt.strip(): 429 | return 430 | 431 | self.status = "Running..." 432 | self.query_one("#prompt-input", Input).value = "" 433 | self.update_output() 434 | 435 | try: 436 | # Stream the results to provide real-time feedback 437 | async for chunk in self.agent_session.stream_run(prompt): 438 | self.agent_session.action_results.append(chunk) 439 | self.update_output() 440 | await asyncio.sleep(0.1) # Small delay to ensure UI updates 441 | 442 | self.status = "Ready" 443 | except Exception as e: 444 | self.status = f"Error: {str(e)}" 445 | finally: 446 | self.update_output() 447 | 448 | async def on_mount(self): 449 | """Called when the app is mounted""" 450 | # Register cleanup handler 451 | self.set_interval(0.1, self._check_exit) 452 | 453 | async def _check_exit(self): 454 | """Check if app is exiting and clean up resources""" 455 | if self.exiting: 456 | if self.agent_session: 457 | await self.agent_session.close() 458 | 459 | def action_quit(self): 460 | """Quit the application""" 461 | self.exit() 462 | 463 | 464 | @app.command() 465 | def run( 466 | prompt: str = typer.Option(None, "--prompt", "-p", help="Initial prompt to send to the agent"), 467 | use_local_chrome: bool = typer.Option(False, "--local-chrome", help="Use local Chrome instance instead of launching a new browser"), 468 | chrome_path: str = typer.Option(DEFAULT_CHROME_PATH, "--chrome-path", help="Path to Chrome executable"), 469 | debugging_port: int = typer.Option(DEFAULT_DEBUGGING_PORT, "--port", help="Remote debugging port for Chrome"), 470 | debug: bool = typer.Option(False, "--debug", help="Enable debug logging") 471 | ): 472 | """ 473 | Launch the interactive loop for the Index browser agent 474 | """ 475 | # Set up logging if debug mode is enabled 476 | setup_logging(debug) 477 | 478 | asyncio.run(_interactive_loop( 479 | initial_prompt=prompt, 480 | use_local_chrome=use_local_chrome, 481 | chrome_path=chrome_path, 
482 | debugging_port=debugging_port, 483 | debug=debug 484 | )) 485 | 486 | 487 | @app.command(name="ui") 488 | def run_ui( 489 | prompt: str = typer.Option(None, "--prompt", "-p", help="Initial prompt to send to the agent"), 490 | use_local_chrome: bool = typer.Option(False, "--local-chrome", help="Use local Chrome instance instead of launching a new browser"), 491 | chrome_path: str = typer.Option(DEFAULT_CHROME_PATH, "--chrome-path", help="Path to Chrome executable"), 492 | debugging_port: int = typer.Option(DEFAULT_DEBUGGING_PORT, "--port", help="Remote debugging port for Chrome"), 493 | debug: bool = typer.Option(False, "--debug", help="Enable debug logging") 494 | ): 495 | """ 496 | Launch the graphical UI for the Index browser agent 497 | """ 498 | # Set up logging if debug mode is enabled 499 | setup_logging(debug) 500 | 501 | # Select model and check API key 502 | llm_provider = select_model_and_check_key() 503 | 504 | # Initialize UI with the selected LLM provider 505 | agent_ui = AgentUI() 506 | agent_ui.agent_session = AgentSession( 507 | llm=llm_provider, 508 | use_local_chrome=use_local_chrome, 509 | chrome_path=chrome_path, 510 | debugging_port=debugging_port, 511 | debug=debug 512 | ) 513 | 514 | if prompt: 515 | # If a prompt is provided, we'll send it once the UI is ready 516 | async def send_initial_prompt(): 517 | await asyncio.sleep(0.5) # Give UI time to initialize 518 | agent_ui.query_one("#prompt-input", Input).value = prompt 519 | await agent_ui.action_send() 520 | 521 | agent_ui.set_interval(0.1, lambda: asyncio.create_task(send_initial_prompt())) 522 | 523 | agent_ui.run() 524 | 525 | 526 | def create_llm_provider(provider: str, model: str) -> BaseLLMProvider: 527 | """Create an LLM provider based on model choice""" 528 | if provider == "openai": 529 | # OpenAI model 530 | console.print(f"[cyan]Using OpenAI model: {model}[/]") 531 | return OpenAIProvider(model=model, reasoning_effort="low") 532 | elif provider == "gemini": 533 | # Gemini 
model 534 | if model == "gemini-2.5-pro-preview-03-25": 535 | console.print(f"[cyan]Using Gemini model: {model}[/]") 536 | return GeminiProvider( 537 | model=model, 538 | thinking_token_budget=8192 539 | ) 540 | elif model == "gemini-2.5-flash-preview-04-17": 541 | console.print(f"[cyan]Using Gemini model: {model}[/]") 542 | return GeminiProvider( 543 | model=model, 544 | thinking_token_budget=8192 545 | ) 546 | else: 547 | raise ValueError(f"Unsupported Gemini model: {model}") 548 | elif provider == "anthropic": 549 | # Anthropic model 550 | console.print(f"[cyan]Using Anthropic model: {model}[/]") 551 | return AnthropicProvider( 552 | model=model, 553 | enable_thinking=True, 554 | thinking_token_budget=2048 555 | ) 556 | else: 557 | raise ValueError(f"Unsupported provider: {provider}") 558 | 559 | 560 | def check_and_save_api_key(required_key: str): 561 | """Check if API key exists, prompt for it if missing, and save to .env file""" 562 | if not os.environ.get(required_key): 563 | console.print(f"\n[yellow]API key {required_key} not found in environment.[/]") 564 | api_key = Prompt.ask(f"Enter your {required_key}", password=True) 565 | 566 | # Save to .env file 567 | env_path = ".env" 568 | 569 | if os.path.exists(env_path): 570 | # Read existing content 571 | with open(env_path, "r") as f: 572 | env_content = f.read() 573 | env_content += f"\n{required_key}={api_key}" 574 | 575 | with open(env_path, "w") as f: 576 | f.write(env_content) 577 | console.print(f"[green]Saved {required_key} to .env file[/]") 578 | else: 579 | # Create new .env file 580 | with open(env_path, "w") as f: 581 | f.write(f"{required_key}={api_key}") 582 | console.print("[green]Created .env file[/]") 583 | 584 | # Update environment variable for current session 585 | os.environ[required_key] = api_key 586 | 587 | # Reload dotenv to ensure changes are applied 588 | load_dotenv(override=True) 589 | 590 | 591 | def select_model_and_check_key(): 592 | """Select a model and check for required 
API key""" 593 | console.print("\n[bold green]Choose an LLM model:[/]") 594 | console.print("1. [bold]Gemini 2.5 Pro[/]") 595 | console.print("2. [bold]Gemini 2.5 Flash[/]") 596 | console.print("3. [bold]Claude 3.7 Sonnet[/]") 597 | console.print("4. [bold]OpenAI o4-mini[/]") 598 | 599 | choice = Prompt.ask( 600 | "[bold]Select model[/]", 601 | choices=["1", "2", "3", "4"], 602 | default="1" 603 | ) 604 | 605 | provider = "" 606 | model = "" 607 | required_key = "" 608 | 609 | # Create LLM provider based on selection 610 | if choice == "1": 611 | provider = "gemini" 612 | model = "gemini-2.5-pro-preview-03-25" 613 | required_key = "GEMINI_API_KEY" 614 | elif choice == "2": 615 | provider = "gemini" 616 | model = "gemini-2.5-flash-preview-04-17" 617 | required_key = "GEMINI_API_KEY" 618 | elif choice == "3": 619 | provider = "anthropic" 620 | model = "claude-3-7-sonnet-20250219" 621 | required_key = "ANTHROPIC_API_KEY" 622 | elif choice == "4": 623 | provider = "openai" 624 | model = "o4-mini" 625 | required_key = "OPENAI_API_KEY" 626 | else: 627 | raise ValueError(f"Invalid choice: {choice}") 628 | 629 | # Check and save API key if needed 630 | check_and_save_api_key(required_key) 631 | 632 | return create_llm_provider(provider, model) 633 | 634 | 635 | async def _interactive_loop(initial_prompt: str = None, use_local_chrome: bool = False, chrome_path: str = DEFAULT_CHROME_PATH, debugging_port: int = DEFAULT_DEBUGGING_PORT, debug: bool = False): 636 | """Implementation of the interactive loop mode""" 637 | # Display welcome panel 638 | console.print(Panel.fit( 639 | "Index Browser Agent Interactive Mode\n" 640 | "Type your message and press Enter. 
The agent will respond.\n" 641 | "Press Ctrl+C to exit.", 642 | title="Interactive Mode", 643 | border_style="blue" 644 | )) 645 | 646 | # Select model and check API key 647 | llm_provider = select_model_and_check_key() 648 | 649 | # Create agent session with selected provider 650 | session = AgentSession( 651 | llm=llm_provider, 652 | use_local_chrome=use_local_chrome, 653 | chrome_path=chrome_path, 654 | debugging_port=debugging_port, 655 | debug=debug 656 | ) 657 | 658 | try: 659 | first_message = True 660 | awaiting_human_input = False 661 | 662 | while True: 663 | # Check if we're waiting for the user to return control to the agent 664 | if awaiting_human_input: 665 | console.print("\n[yellow]Agent is waiting for control to be returned.[/]") 666 | console.print("[yellow]Press Enter to return control to the agent...[/]", end="") 667 | input() # Wait for Enter key 668 | user_message = "Returning control back, continue your task" 669 | console.print(f"\n[bold blue]Your message:[/] {user_message}") 670 | awaiting_human_input = False 671 | # Normal message input flow 672 | elif first_message and initial_prompt: 673 | user_message = initial_prompt 674 | console.print(f"\n[bold blue]Your message:[/] {user_message}") 675 | first_message = False 676 | else: 677 | console.print("\n[bold blue]Your message:[/] ", end="") 678 | user_message = input() 679 | first_message = False 680 | 681 | if not user_message.strip(): 682 | continue 683 | 684 | console.print("\n[bold cyan]Agent is working...[/]") 685 | 686 | step_num = 1 687 | human_control_requested = False 688 | 689 | # Run the agent with streaming output 690 | try: 691 | async for chunk in session.stream_run(user_message): 692 | if chunk.type == "step": 693 | action_result = chunk.content.action_result 694 | summary = chunk.content.summary 695 | 696 | # Simple single-line output for steps 697 | console.print(f"[bold blue]Step {step_num}:[/] {summary}") 698 | # Display additional info for special actions as separate 
lines 699 | if action_result and action_result.is_done and not action_result.give_control: 700 | console.print(" [green bold]✓ Task completed successfully![/]") 701 | 702 | if action_result and action_result.give_control: 703 | human_control_requested = True 704 | message = action_result.content or "No message provided" 705 | console.print(" [yellow bold]⚠ Human control requested:[/]") 706 | console.print(f" [yellow]{message}[/]") 707 | 708 | # Increment step counter for next step 709 | step_num += 1 710 | 711 | elif chunk.type == "step_error": 712 | console.print(f"[bold red]Error:[/] {chunk.content}") 713 | 714 | elif chunk.type == "final_output": 715 | # Keep panel for final output 716 | result_content = chunk.content.result.content if chunk.content.result else "No result content" 717 | console.print(Panel( 718 | f"{result_content}", 719 | title="Final Output", 720 | border_style="green", 721 | expand=False 722 | )) 723 | 724 | except Exception as e: 725 | console.print(f"[bold red]Error:[/] {str(e)}") 726 | console.print(f"[dim]Type: {type(e)}[/]") 727 | console.print_exception() 728 | 729 | # After agent completes 730 | if human_control_requested: 731 | console.print("\n[yellow]Agent has requested human control.[/]") 732 | awaiting_human_input = True 733 | else: 734 | console.print("\n[green]Agent has completed the task.[/]") 735 | console.print("[dim]Waiting for your next message...[/]") 736 | 737 | except KeyboardInterrupt: 738 | console.print("\n[yellow]Exiting interactive mode...[/]") 739 | # Close the browser before exiting 740 | await session.close() 741 | 742 | 743 | def main(): 744 | """Entry point for the CLI""" 745 | app() 746 | 747 | 748 | if __name__ == "__main__": 749 | main() -------------------------------------------------------------------------------- /index/controller/controller.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import json 3 | import logging 4 | from dataclasses 
logger = logging.getLogger(__name__)


@dataclass
class Action:
    """Represents a registered action."""
    name: str                     # action name exposed to the LLM (function __name__)
    description: str              # human/LLM-readable description of the action
    function: Callable            # async wrapper around the registered coroutine
    browser_context: bool = False  # True when the action declares a `browser` parameter


class Controller:
    """Controller for browser actions with integrated registry functionality."""

    def __init__(self):
        self._actions: Dict[str, Action] = {}
        # Register default actions
        register_default_actions(self)

    def action(self, description: str = None):
        """
        Decorator for registering actions.

        Args:
            description: Optional description of what the action does.
                If not provided, uses the function's docstring.
        """
        def decorator(func: Callable) -> Callable:
            # Use provided description or fall back to the function docstring.
            action_description = description
            if action_description is None:
                action_description = inspect.getdoc(func) or "No description provided"

            # Clean up docstring (remove indentation).
            action_description = inspect.cleandoc(action_description)

            # Actions that declare a `browser` parameter receive the Browser
            # instance at execution time (see execute_action).
            browser_context = 'browser' in inspect.signature(func).parameters

            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                return await func(*args, **kwargs)

            # Register the action under the function's own name.
            self._actions[func.__name__] = Action(
                name=func.__name__,
                description=action_description,
                function=async_wrapper,
                browser_context=browser_context,
            )
            return func

        return decorator

    async def execute_action(
        self,
        action: ActionModel,
        browser: Browser,
    ) -> ActionResult:
        """Execute an action from an ActionModel.

        Args:
            action: Model carrying the action name and its parameters.
            browser: Browser instance injected into actions that request it.

        Returns:
            The ActionResult produced by the registered action.

        Raises:
            ValueError: If params are missing or the action name is unknown.
            RuntimeError: If the action itself raised an exception.
        """
        action_name = action.name
        params = action.params

        if params is None:
            # FIX: original raised a plain string containing '{action_name}'
            # without an f-prefix, so the name was never interpolated.
            raise ValueError(f'Params are not provided for action: {action_name}')

        with Laminar.start_as_current_span(
            name=action_name,
            input={
                'action': action_name,
                'params': params,
            },
            span_type='TOOL',
        ):
            logger.info(f'Executing action: {action_name} with params: {params}')
            # Renamed local (was `action`) to avoid shadowing the ActionModel argument.
            registered = self._actions.get(action_name)

            if registered is None:
                raise ValueError(f'Action {action_name} not found')

            try:
                kwargs = params.copy()

                # Add browser to kwargs if the action asked for it.
                if registered.browser_context and browser is not None:
                    kwargs['browser'] = browser

                result = await registered.function(**kwargs)

                Laminar.set_span_output(result)
                return result

            except Exception as e:
                raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e

    def get_action_descriptions(self) -> str:
        """Return a JSON description (one object per action) of all registered actions."""
        action_info = []

        for name, action in self._actions.items():
            sig = inspect.signature(action.function)
            type_hints = get_type_hints(action.function)

            # Parse the docstring once and reuse it for both the parameter
            # descriptions and the short description (was parsed twice).
            docstring = inspect.getdoc(action.function)
            parsed_docstring = parse(docstring) if docstring else None

            param_descriptions = {}
            if parsed_docstring:
                for param in parsed_docstring.params:
                    param_descriptions[param.arg_name] = param.description

            # Build parameter info
            params = {}
            for param_name in sig.parameters.keys():
                if param_name == 'browser':  # Skip browser parameter in descriptions
                    continue

                hint = type_hints.get(param_name, Any)
                # FIX: `typing.Any` (and other typing constructs) may lack
                # `__name__`; fall back to str() instead of raising.
                param_type = getattr(hint, '__name__', str(hint))

                params[param_name] = {
                    'type': param_type,
                    'description': param_descriptions.get(param_name, '')
                }

            # Use short description from docstring when available.
            description = action.description
            if parsed_docstring and parsed_docstring.short_description:
                description = parsed_docstring.short_description

            action_info.append(json.dumps({
                'name': name,
                'description': description,
                'parameters': params
            }, indent=2))

        return '\n\n'.join(action_info)
    @controller.action()
    async def done(output: str) -> ActionResult:
        """Use this action when you have completed the task.

        Args:
            output: Output of the task.
        """
        # NOTE: the docstring above doubles as the LLM-facing action
        # description (see Controller.action), so keep its wording stable.
        return ActionResult(is_done=True, content=output)

    @controller.action()
    async def done_with_structured_output(output: Dict[str, Any]) -> ActionResult:
        """Use this action ONLY when you are provided with a structured output model. Otherwise, use simple `done` action.

        Args:
            output: JSON object that adheres to the provided output model.
        """
        # The structured payload is passed through unchanged as the result content.
        return ActionResult(is_done=True, content=output)


    @controller.action()
    async def give_human_control(message: str, browser: Browser) -> ActionResult:
        """Give human control of the browser. Use this action when you need to use user information, such as first name, last name, email, phone number, booking information, login/password, etc. to proceed with the task. Also, if you can't solve the CAPTCHA, use this action.

        Args:
            message: Message to give to the human, explaining why you need human intervention.
        """
        # is_done=True ends the agent run; give_control=True tells the caller
        # (e.g. the CLI loop) to hand the browser session over to the user.
        return ActionResult(give_control=True, content=message, is_done=True)
51 | """ 52 | page = await browser.get_current_page() 53 | await page.goto(f'https://www.google.com/search?q={query}&udm=14', wait_until='domcontentloaded') 54 | await asyncio.sleep(1) 55 | msg = f"Searched for '{query}' in Google" 56 | logger.info(msg) 57 | return ActionResult(content=msg) 58 | 59 | @controller.action() 60 | @retry( 61 | stop=stop_after_attempt(3), 62 | wait=wait_exponential(multiplier=1, min=1, max=10), 63 | reraise=True, 64 | before_sleep=lambda retry_state: logger.warning( 65 | f"Retrying step after error: {retry_state.outcome.exception()}. Attempt {retry_state.attempt_number}" 66 | ) 67 | ) 68 | async def go_to_url(url: str, browser: Browser): 69 | """Navigate to URL in the current tab""" 70 | page = await browser.get_current_page() 71 | await page.goto(url, wait_until='domcontentloaded') 72 | await asyncio.sleep(1.5) 73 | msg = f"Navigated to {url}" 74 | logger.info(msg) 75 | return ActionResult(content=msg) 76 | 77 | @controller.action() 78 | async def go_back_to_previous_page(browser: Browser): 79 | """Go back to the previous page""" 80 | try: 81 | page = await browser.get_current_page() 82 | await page.go_back(wait_until='domcontentloaded') 83 | 84 | await asyncio.sleep(2) 85 | msg = 'Navigated back to the previous page' 86 | logger.info(msg) 87 | return ActionResult(content=msg) 88 | 89 | except Exception as e: 90 | logger.debug(f'During go_back: {e}') 91 | return ActionResult(error=str(e)) 92 | 93 | @controller.action() 94 | async def click_on_spreadsheet_cell(row: str, column: str, browser: Browser) -> ActionResult: 95 | """Click on a spreadsheet cell at a specific row and column. You HAVE to use this action when you need to click on a cell in a spreadsheet. DON'T try to use click_element action, it will not work. 96 | 97 | Args: 98 | row: Row of the cell to click on, it should be a number formatted as a string. e.g. "1" 99 | column: Column of the cell to click on, it should be a letter formatted as a string. e.g. 
"A" 100 | """ 101 | page = await browser.get_current_page() 102 | state = browser.get_state() 103 | 104 | elements = state.interactive_elements.values() 105 | 106 | row_element = next((e for e in elements if e.browser_agent_id == f"row_{row}"), None) 107 | column_element = next((e for e in elements if e.browser_agent_id == f"column_{column}"), None) 108 | 109 | if not row_element or not column_element: 110 | return ActionResult(error='Row or column element not found - pay close attention to the row and column numbers.') 111 | 112 | # reseting click just in case 113 | await page.mouse.click(state.viewport.width / 2, state.viewport.height / 2) 114 | await asyncio.sleep(0.05) 115 | 116 | await page.mouse.click(column_element.center.x, row_element.center.y, click_count=2) 117 | await asyncio.sleep(0.05) 118 | 119 | return ActionResult(content=f'Clicked on spreadsheet cell with row {row} and column {column}') 120 | 121 | 122 | @controller.action() 123 | async def click_element(index: int, browser: Browser, wait_after_click: bool = False): 124 | """ 125 | Click on the element with index. 126 | 127 | Args: 128 | index: Index of the element to click on. 129 | wait_after_click: If True, wait for 2 second after clicking the element. Only set it to True when you think that clicking will trigger loading state, for instance navigation to new page, search, loading of a content, etc. 130 | """ 131 | # clean index if it contains any non-numeric characters 132 | cleaned_index_str = re.sub(r'\D', '', str(index)) 133 | if cleaned_index_str == '': 134 | logger.error(f'Index is not a number. 
    @controller.action(
        description='Use this action to wait for the page to load, if you see that the content on the clean screenshot is empty or loading UI elements such as skeleton screens. This action will wait for page to load. Then you can continue with your actions.',
    )
    async def wait_for_page_to_load() -> ActionResult:
        # NOTE(review): the body performs no explicit wait; presumably the
        # delay comes from the agent loop re-capturing browser state before
        # the next step — confirm against the agent implementation.
        return ActionResult(content='Waited for page to load')
    # Tab Management Actions
    @controller.action('Switch tab')
    async def switch_tab(page_id: int, browser: Browser) -> ActionResult:
        # Switch focus to an already-open tab by its id, then give the page a
        # brief moment to become active before the next state capture.
        await browser.switch_to_tab(page_id)
        await asyncio.sleep(0.5)
        msg = f'Switched to tab {page_id}'
        logger.info(msg)
        return ActionResult(content=msg)

    @controller.action('Open url in new tab')
    async def open_tab(url: str, browser: Browser) -> ActionResult:
        # Open a fresh tab at the given URL; the browser implementation is
        # responsible for making the new tab current.
        await browser.create_new_tab(url)
        msg = f'Opened new tab with {url}'
        logger.info(msg)
        return ActionResult(content=msg)
222 | ) 223 | async def scroll_page_down(browser: Browser): 224 | page = await browser.get_current_page() 225 | state = browser.get_state() 226 | # move mouse to the center of the page 227 | await page.mouse.move(state.viewport.width / 2, state.viewport.height / 2) 228 | await asyncio.sleep(0.1) 229 | # scroll down by one page 230 | await page.mouse.wheel(0, state.viewport.height * 0.8) 231 | return ActionResult(content="Scrolled mouse wheel down (it doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 232 | 233 | 234 | @controller.action( 235 | "Scrolls entire page up. Use this action when you want to scroll the entire page up. Don't use this action if you want to scroll over a specific scrollable area on a page." 236 | ) 237 | async def scroll_page_up(browser: Browser): 238 | page = await browser.get_current_page() 239 | state = browser.get_state() 240 | # move mouse to the center of the page 241 | await page.mouse.move(state.viewport.width / 2, state.viewport.height / 2) 242 | await asyncio.sleep(0.1) 243 | # scroll up by one page 244 | await page.mouse.wheel(0, -state.viewport.height * 0.8) 245 | return ActionResult(content="Scrolled mouse wheel up (it doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 246 | 247 | @controller.action( 248 | "Moves mouse to the element with index `index`, located inside scrollable area of the webpage, identified by scrollbars. Then scrolls mouse wheel down." 
249 | ) 250 | async def scroll_down_over_element(index: int, browser: Browser): 251 | page = await browser.get_current_page() 252 | state = browser.get_state() 253 | 254 | if index not in state.interactive_elements: 255 | return ActionResult(error=f'Element index {index} does not exist - retry or use alternative actions') 256 | 257 | element = state.interactive_elements[index] 258 | 259 | await page.mouse.move(element.center.x, element.center.y) 260 | await asyncio.sleep(0.1) 261 | await page.mouse.wheel(0, state.viewport.height / 3) 262 | 263 | return ActionResult(content=f"Move mouse to element with index {index} and scroll mouse wheel down. (It doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 264 | 265 | @controller.action( 266 | "Moves mouse to the element with index `index`, located inside scrollable area of the webpage, identified by scrollbars. Then scrolls mouse wheel up." 267 | ) 268 | async def scroll_up_over_element(index: int, browser: Browser): 269 | page = await browser.get_current_page() 270 | state = browser.get_state() 271 | 272 | if index not in state.interactive_elements: 273 | return ActionResult(error=f'Element index {index} does not exist - retry or use alternative actions') 274 | 275 | element = state.interactive_elements[index] 276 | 277 | await page.mouse.move(element.center.x, element.center.y) 278 | await asyncio.sleep(0.1) 279 | await page.mouse.wheel(0, -state.viewport.height / 3) 280 | 281 | return ActionResult(content=f"Move mouse to element with index {index} and scroll mouse wheel up. (It doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 282 | 283 | @controller.action( 284 | "Moves mouse at the location of the element with index `index`, which should be inside scrollable area of the webpage, identified by scrollbars. Then scrolls mouse wheel horizontally to the right." 
285 | ) 286 | async def scroll_right_over_element(index: int, browser: Browser): 287 | page = await browser.get_current_page() 288 | state = browser.get_state() 289 | 290 | if index not in state.interactive_elements: 291 | return ActionResult(error=f'Element index {index} does not exist - retry or use an alternative action') 292 | 293 | element = state.interactive_elements[index] 294 | 295 | await page.mouse.move(element.center.x, element.center.y) 296 | await asyncio.sleep(0.1) 297 | await page.mouse.wheel(state.viewport.width / 3, 0) 298 | 299 | return ActionResult(content=f"Moved mouse to element with index {index} and scroll mouse wheel horizontally to the right. (It doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 300 | 301 | 302 | @controller.action( 303 | "Moves mouse at the location of the element with index `index`, which should be inside scrollable area of the webpage, identified by scrollbars. Then scrolls mouse wheel horizontally to the left." 304 | ) 305 | async def scroll_left_over_element(index: int, browser: Browser): 306 | page = await browser.get_current_page() 307 | state = browser.get_state() 308 | 309 | if index not in state.interactive_elements: 310 | return ActionResult(error=f'Element index {index} does not exist - retry or use an alternative action') 311 | 312 | element = state.interactive_elements[index] 313 | 314 | await page.mouse.move(element.center.x, element.center.y) 315 | await asyncio.sleep(0.1) 316 | await page.mouse.wheel(-state.viewport.width / 3, 0) 317 | 318 | return ActionResult(content=f"Moved mouse to element with index {index} and scroll mouse wheel horizontally to the left. (It doesn't guarantee that something has scrolled, you need to check new state screenshot to confirm)") 319 | 320 | 321 | @controller.action( 322 | 'Press enter key. Use this action when you need to submit a form or perform an action that requires pressing enter.' 
323 | ) 324 | async def press_enter(browser: Browser): 325 | page = await browser.get_current_page() 326 | 327 | await page.keyboard.press('Enter') 328 | return ActionResult(content='Pressed enter key') 329 | 330 | @controller.action( 331 | 'Remove all text in the element with index.' 332 | ) 333 | async def clear_text_in_element(index: int, browser: Browser): 334 | page = await browser.get_current_page() 335 | 336 | state = browser.get_state() 337 | 338 | if index not in state.interactive_elements: 339 | return ActionResult(error=f'Element index {index} does not exist - retry or use alternative actions') 340 | 341 | element = state.interactive_elements[index] 342 | 343 | await page.mouse.move(element.center.x, element.center.y) 344 | await page.mouse.click(element.center.x, element.center.y) 345 | await asyncio.sleep(0.1) 346 | 347 | if platform.system() == "Darwin": 348 | await page.keyboard.press('Meta+A') 349 | else: 350 | await page.keyboard.press('Control+A') 351 | await asyncio.sleep(0.1) 352 | await page.keyboard.press('Backspace') 353 | return ActionResult(content='Removed all text in the element with index') 354 | 355 | @controller.action() 356 | async def get_select_options(index: int, browser: Browser) -> ActionResult: 357 | """Get all options from a element by the text (name) of the option. 
Use this after get_select_options and when you need to select an option from a dropdown.', 412 | ) 413 | async def select_dropdown_option( 414 | index: int, 415 | option: str, 416 | browser: Browser, 417 | ) -> ActionResult: 418 | """Select dropdown option by the text of the option you want to select""" 419 | try: 420 | # Get the interactive element 421 | page = await browser.get_current_page() 422 | interactive_elements = browser.get_state().interactive_elements 423 | 424 | # Verify the element exists and is a select 425 | if index not in interactive_elements: 426 | return ActionResult(error=f"No element found with index {index}") 427 | 428 | element = interactive_elements[index] 429 | 430 | # Check if it's a select element 431 | if element.tag_name.lower() != 'select': 432 | return ActionResult(error=f"Element {index} is not a select element, it's a {element.tag_name}") 433 | 434 | logger.debug(f"Attempting to select '{option}' using browser_agent_id: {element.browser_agent_id}") 435 | 436 | # Use JavaScript to select the option using the unique ID 437 | result = await page.evaluate(""" 438 | (args) => { 439 | const uniqueId = args.uniqueId; 440 | const optionText = args.optionText; 441 | 442 | try { 443 | // Find the select element by unique ID - works across frames too 444 | function findElementByUniqueId(root, id) { 445 | // Check in main document first 446 | let element = document.querySelector(`[data-browser-agent-id="${id}"]`); 447 | if (element) return element; 448 | } 449 | 450 | const select = findElementByUniqueId(window, uniqueId); 451 | if (!select) { 452 | return { 453 | success: false, 454 | error: "Select element not found with ID: " + uniqueId 455 | }; 456 | } 457 | 458 | // Find the option with matching text 459 | let found = false; 460 | let selectedValue = null; 461 | let selectedIndex = -1; 462 | 463 | for (let i = 0; i < select.options.length; i++) { 464 | const opt = select.options[i]; 465 | if (opt.text === optionText) { 466 | // Select 
this option 467 | opt.selected = true; 468 | found = true; 469 | selectedValue = opt.value; 470 | selectedIndex = i; 471 | 472 | // Trigger change event 473 | const event = new Event('change', { bubbles: true }); 474 | select.dispatchEvent(event); 475 | break; 476 | } 477 | } 478 | 479 | if (found) { 480 | return { 481 | success: true, 482 | value: selectedValue, 483 | index: selectedIndex 484 | }; 485 | } else { 486 | return { 487 | success: false, 488 | error: "Option not found: " + optionText, 489 | availableOptions: Array.from(select.options).map(o => o.text) 490 | }; 491 | } 492 | } catch (e) { 493 | return { 494 | success: false, 495 | error: e.toString() 496 | }; 497 | } 498 | } 499 | """, {"uniqueId": element.browser_agent_id, "optionText": option}) 500 | 501 | if result.get('success'): 502 | msg = f"Selected option '{option}' with value '{result.get('value')}' at index {result.get('index')}" 503 | logger.info(msg) 504 | return ActionResult(content=msg) 505 | else: 506 | error_msg = result.get('error', 'Unknown error') 507 | if 'availableOptions' in result: 508 | available = result.get('availableOptions', []) 509 | error_msg += f". 
Available options: {', '.join(available)}" 510 | 511 | logger.error(f"Selection failed: {error_msg}") 512 | return ActionResult(error=error_msg) 513 | 514 | except Exception as e: 515 | msg = f'Selection failed: {str(e)}' 516 | logger.error(msg) 517 | return ActionResult(error=msg) 518 | -------------------------------------------------------------------------------- /index/llm/llm.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | from typing import Any, Dict, List, Optional, Union 5 | 6 | from pydantic import BaseModel 7 | 8 | 9 | class MessageRole(Enum): 10 | SYSTEM = "system" 11 | USER = "user" 12 | ASSISTANT = "assistant" 13 | TOOL = "tool" # For OpenAI function calling responses 14 | 15 | @dataclass 16 | class MessageContent: 17 | """Base class for message content""" 18 | cache_control: Optional[bool] = None 19 | 20 | @dataclass 21 | class TextContent(MessageContent): 22 | """Text content in a message""" 23 | text: str = "" 24 | type: str = "text" 25 | 26 | @dataclass 27 | class ImageContent(MessageContent): 28 | """Image content in a message""" 29 | image_b64: Optional[str] = None 30 | image_url: Optional[str] = None 31 | type: str = "image" 32 | 33 | @dataclass 34 | class ThinkingBlock(MessageContent): 35 | """Thinking block in a message""" 36 | thinking: str = "" 37 | signature: str = "" 38 | type: str = "thinking" 39 | 40 | @dataclass 41 | class Message: 42 | """A message in a conversation""" 43 | role: Union[str, MessageRole] 44 | content: Union[str, List[Union[TextContent, ImageContent, ThinkingBlock]]] 45 | name: Optional[str] = None # For tool/function messages 46 | tool_call_id: Optional[str] = None # For tool/function responses 47 | is_state_message: Optional[bool] = False 48 | 49 | def __post_init__(self): 50 | # Convert role enum to string if needed 51 | if isinstance(self.role, MessageRole): 52 | self.role 
= self.role.value 53 | 54 | # Convert string content to TextContent if needed 55 | if isinstance(self.content, str): 56 | self.content = [TextContent(text=self.content)] 57 | elif isinstance(self.content, (TextContent, ImageContent)): 58 | self.content = [self.content] 59 | 60 | def to_openai_format(self) -> Dict: 61 | """Convert to OpenAI message format""" 62 | message = {"role": self.role} 63 | 64 | if isinstance(self.content, str): 65 | message["content"] = self.content 66 | 67 | elif isinstance(self.content, list): 68 | 69 | content_blocks = [] 70 | 71 | for content_block in self.content: 72 | 73 | block = {} 74 | 75 | if isinstance(content_block, TextContent): 76 | block["type"] = "text" 77 | block["text"] = content_block.text 78 | elif isinstance(content_block, ImageContent): 79 | block["type"] = "image_url" 80 | block["image_url"] = { 81 | "url": "data:image/png;base64," + content_block.image_b64 82 | } 83 | 84 | content_blocks.append(block) 85 | 86 | message["content"] = content_blocks 87 | 88 | return message 89 | 90 | def to_groq_format(self) -> Dict: 91 | """Convert to Groq message format""" 92 | message = {"role": self.role} 93 | 94 | if isinstance(self.content, str): 95 | message["content"] = self.content 96 | 97 | elif isinstance(self.content, list): 98 | 99 | content_blocks = [] 100 | 101 | # content of a system and assistant messages in groq can only contain text 102 | if self.role == "system" or self.role == "assistant": 103 | block = self.content[0] 104 | if isinstance(block, TextContent): 105 | message["content"] = block.text 106 | 107 | return message 108 | 109 | for content_block in self.content: 110 | 111 | block = {} 112 | 113 | if isinstance(content_block, TextContent): 114 | block["type"] = "text" 115 | block["text"] = content_block.text 116 | elif isinstance(content_block, ImageContent): 117 | block["type"] = "image_url" 118 | block["image_url"] = { 119 | "url": "data:image/png;base64," + content_block.image_b64 120 | } 121 | 122 | 
content_blocks.append(block) 123 | 124 | message["content"] = content_blocks 125 | 126 | return message 127 | 128 | def to_anthropic_format(self, enable_cache_control: bool = True) -> Dict: 129 | """Convert to Anthropic message format""" 130 | message = {"role": self.role} 131 | 132 | if isinstance(self.content, str): 133 | message["content"] = self.content 134 | 135 | elif isinstance(self.content, list): 136 | 137 | content_blocks = [] 138 | 139 | for content_block in self.content: 140 | 141 | block = {} 142 | 143 | 144 | if isinstance(content_block, TextContent): 145 | block["type"] = "text" 146 | block["text"] = content_block.text 147 | elif isinstance(content_block, ImageContent): 148 | block["type"] = "image" 149 | block["source"] = { 150 | "type": "base64", 151 | "media_type": "image/png", # This should be configurable based on image type 152 | "data": content_block.image_b64 if content_block.image_b64 else content_block.image_url 153 | } 154 | elif isinstance(content_block, ThinkingBlock): 155 | block["type"] = "thinking" 156 | block["thinking"] = content_block.thinking 157 | block["signature"] = content_block.signature 158 | 159 | if content_block.cache_control and enable_cache_control: 160 | block["cache_control"] = {"type": "ephemeral"} 161 | 162 | content_blocks.append(block) 163 | 164 | message["content"] = content_blocks 165 | 166 | return message 167 | 168 | def to_gemini_format(self) -> Dict: 169 | """Convert to Gemini message format""" 170 | parts = [] 171 | 172 | if isinstance(self.content, str): 173 | parts = [{"text": self.content}] 174 | elif isinstance(self.content, list): 175 | for content_block in self.content: 176 | if isinstance(content_block, TextContent): 177 | parts.append({"text": content_block.text}) 178 | elif isinstance(content_block, ImageContent): 179 | if content_block.image_b64: 180 | parts.append({"inline_data": { 181 | "mime_type": "image/png", 182 | "data": content_block.image_b64 183 | }}) 184 | elif content_block.image_url: 
185 | parts.append({"file_data": { 186 | "mime_type": "image/png", 187 | "file_uri": content_block.image_url 188 | }}) 189 | 190 | return { 191 | "role": 'model' if self.role == 'assistant' else 'user', 192 | "parts": parts 193 | } 194 | 195 | def remove_cache_control(self): 196 | if isinstance(self.content, list): 197 | for content_block in self.content: 198 | if isinstance(content_block, TextContent): 199 | content_block.cache_control = None 200 | elif isinstance(content_block, ImageContent): 201 | content_block.cache_control = None 202 | 203 | def add_cache_control_to_state_message(self): 204 | 205 | if not self.is_state_message or not isinstance(self.content, list) or len(self.content) < 3: 206 | return 207 | 208 | if len(self.content) == 3: 209 | self.content[-1].cache_control = True 210 | 211 | def has_cache_control(self): 212 | 213 | if not isinstance(self.content, list): 214 | return False 215 | 216 | return any(content.cache_control for content in self.content) 217 | 218 | 219 | class LLMResponse(BaseModel): 220 | content: str 221 | raw_response: Any 222 | usage: Dict[str, Any] 223 | thinking: Optional[ThinkingBlock] = None 224 | 225 | 226 | class BaseLLMProvider(ABC): 227 | def __init__(self, model: str): 228 | self.model = model 229 | 230 | @abstractmethod 231 | async def call( 232 | self, 233 | messages: List[Message], 234 | temperature: float = 1, 235 | max_tokens: Optional[int] = None, 236 | **kwargs 237 | ) -> LLMResponse: 238 | pass 239 | -------------------------------------------------------------------------------- /index/llm/providers/__init__.py: -------------------------------------------------------------------------------- 1 | from .anthropic import AnthropicProvider 2 | from .anthropic_bedrock import AnthropicBedrockProvider 3 | from .gemini import GeminiProvider 4 | from .openai import OpenAIProvider 5 | 6 | __all__ = [ 7 | "OpenAIProvider", 8 | "AnthropicProvider", 9 | "AnthropicBedrockProvider", 10 | "GeminiProvider", 11 | ] 
    def __init__(self, model: str, enable_thinking: bool = True, thinking_token_budget: Optional[int] = 2048):
        """Anthropic provider with optional extended thinking and a Bedrock fallback.

        Args:
            model: Anthropic model name.
            enable_thinking: Whether to request extended-thinking blocks.
            thinking_token_budget: Token budget allotted to thinking.
        """
        super().__init__(model=model)
        # AsyncAnthropic() picks up credentials from the environment by default.
        self.client = AsyncAnthropic()
        self.thinking_token_budget = thinking_token_budget

        # Bedrock provider used as a fallback when the direct Anthropic call
        # fails (see call()). NOTE(review): the "us.anthropic.{model}-v1:0" id
        # presumably targets a US cross-region inference profile — confirm for
        # non-US deployments.
        self.anthropic_bedrock = AnthropicBedrockProvider(model=f"us.anthropic.{model}-v1:0", enable_thinking=enable_thinking, thinking_token_budget=thinking_token_budget)

        self.enable_thinking = enable_thinking
(attempt {details['tries']})" 30 | ) 31 | ) 32 | async def call( 33 | self, 34 | messages: List[Message], 35 | temperature: float = -1, 36 | max_tokens: Optional[int] = 16000, 37 | **kwargs 38 | ) -> LLMResponse: 39 | # Make a copy of messages to prevent modifying the original list during retries 40 | messages_copy = messages.copy() 41 | 42 | if not messages_copy: 43 | raise ValueError("Messages list cannot be empty.") 44 | 45 | conversation_messages_input: List[Message] = [] 46 | 47 | system = [] 48 | 49 | if messages_copy[0].role == "system": 50 | system = messages_copy[0].content[0].text 51 | conversation_messages_input = messages_copy[1:] 52 | else: 53 | conversation_messages_input = messages_copy 54 | 55 | anthropic_api_messages = [msg.to_anthropic_format() for msg in conversation_messages_input] 56 | 57 | if self.enable_thinking: 58 | 59 | try: 60 | response = await self.client.messages.create( 61 | model=self.model, 62 | system=system, 63 | messages=anthropic_api_messages, 64 | thinking={ 65 | "type": "enabled", 66 | "budget_tokens": self.thinking_token_budget, 67 | }, 68 | max_tokens=max(self.thinking_token_budget + 1, max_tokens), 69 | **kwargs 70 | ) 71 | except Exception as e: 72 | logger.error(f"Error calling Anthropic: {str(e)}") 73 | # Fallback to anthropic_bedrock with the original messages_copy 74 | response = await self.anthropic_bedrock.call( 75 | messages_copy, # Pass original messages_copy, bedrock provider has its own logic 76 | temperature=temperature, # Pass original temperature 77 | max_tokens=max_tokens, # Pass original max_tokens 78 | **kwargs 79 | ) 80 | 81 | return LLMResponse( 82 | content=response.content[1].text, 83 | raw_response=response, 84 | usage=response.usage.model_dump(), 85 | thinking=ThinkingBlock(thinking=response.content[0].thinking, signature=response.content[0].signature) 86 | ) 87 | else: # Not enable_thinking 88 | response = await self.client.messages.create( 89 | model=self.model, 90 | messages=anthropic_api_messages, 
91 | temperature=temperature, # Use adjusted temperature 92 | max_tokens=max_tokens, # Use adjusted max_tokens 93 | system=system, 94 | **kwargs 95 | ) 96 | 97 | return LLMResponse( 98 | content=response.content[0].text, 99 | raw_response=response, 100 | usage=response.usage.model_dump() 101 | ) -------------------------------------------------------------------------------- /index/llm/providers/anthropic_bedrock.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import List, Optional 4 | 5 | import backoff 6 | from anthropic import AsyncAnthropicBedrock 7 | from dotenv import load_dotenv 8 | 9 | from ..llm import BaseLLMProvider, LLMResponse, Message 10 | 11 | load_dotenv() 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class AnthropicBedrockProvider(BaseLLMProvider): 17 | def __init__(self, model: str, enable_thinking: bool = True, thinking_token_budget: Optional[int] = 8192): 18 | super().__init__(model=model) 19 | 20 | self.client = AsyncAnthropicBedrock( 21 | aws_access_key=os.getenv('AWS_ACCESS_KEY_ID'), 22 | aws_secret_key=os.getenv('AWS_SECRET_ACCESS_KEY'), 23 | aws_region=os.getenv('AWS_REGION'), 24 | ) 25 | self.enable_thinking = enable_thinking 26 | self.thinking_token_budget = thinking_token_budget 27 | @backoff.on_exception( # noqa: F821 28 | backoff.constant, # constant backoff 29 | Exception, # retry on any exception 30 | max_tries=3, # stop after 3 attempts 31 | interval=10, 32 | ) 33 | async def call( 34 | self, 35 | messages: List[Message], 36 | temperature: float = 1, 37 | max_tokens: Optional[int] = 2048, 38 | **kwargs 39 | ) -> LLMResponse: 40 | 41 | messages_copy = messages.copy() 42 | 43 | if len(messages_copy) < 2 or messages_copy[0].role != "system": 44 | raise ValueError("System message is required for Anthropic Bedrock and length of messages must be at least 2") 45 | 46 | system_message = messages_copy[0] 47 | 48 | try: 49 | if 
import logging
import os
from typing import List, Optional

import backoff
from anthropic import AsyncAnthropicBedrock
from dotenv import load_dotenv

from ..llm import BaseLLMProvider, LLMResponse, Message

load_dotenv()

logger = logging.getLogger(__name__)


class AnthropicBedrockProvider(BaseLLMProvider):
    """Anthropic provider backed by AWS Bedrock.

    Credentials and region are read from the environment (``.env`` is loaded
    at import time). Requires a leading system message.
    """

    def __init__(self, model: str, enable_thinking: bool = True, thinking_token_budget: Optional[int] = 8192):
        super().__init__(model=model)
        self.client = AsyncAnthropicBedrock(
            aws_access_key=os.getenv('AWS_ACCESS_KEY_ID'),
            aws_secret_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
            aws_region=os.getenv('AWS_REGION'),
        )
        self.enable_thinking = enable_thinking
        self.thinking_token_budget = thinking_token_budget

    @backoff.on_exception(
        backoff.constant,  # constant backoff
        Exception,         # retry on any exception
        max_tries=3,       # stop after 3 attempts
        interval=10,
    )
    async def call(
        self,
        messages: List[Message],
        temperature: float = 1,
        max_tokens: Optional[int] = 2048,
        **kwargs
    ) -> LLMResponse:
        """Send ``messages`` to Anthropic on Bedrock.

        Raises:
            ValueError: If the first message is not a system message or fewer
                than two messages are supplied.
        """
        messages_copy = messages.copy()

        if len(messages_copy) < 2 or messages_copy[0].role != "system":
            raise ValueError("System message is required for Anthropic Bedrock and length of messages must be at least 2")

        system_message = messages_copy[0]

        try:
            if self.enable_thinking:
                response = await self.client.messages.create(
                    model=self.model,
                    system=system_message.to_anthropic_format(enable_cache_control=False)["content"],
                    messages=[msg.to_anthropic_format(enable_cache_control=False) for msg in messages_copy[1:]],
                    temperature=1,  # the API mandates default temperature when thinking is on
                    thinking={
                        "type": "enabled",
                        "budget_tokens": self.thinking_token_budget,
                    },
                    # max_tokens must strictly exceed the thinking budget.
                    max_tokens=max(self.thinking_token_budget + 1, max_tokens),
                    **kwargs
                )

                return LLMResponse(
                    # content[0] is the thinking block; content[1] is the text.
                    content=response.content[1].text,
                    raw_response=response,
                    # BUGFIX: LLMResponse.usage is a plain dict; serialize the
                    # pydantic Usage object (consistent with AnthropicProvider).
                    usage=response.usage.model_dump()
                )
            else:
                response = await self.client.messages.create(
                    model=self.model,
                    messages=[msg.to_anthropic_format(enable_cache_control=False) for msg in messages_copy[1:]],
                    temperature=temperature,
                    max_tokens=max_tokens,
                    system=system_message.to_anthropic_format(enable_cache_control=False)["content"],
                    **kwargs
                )

                return LLMResponse(
                    content=response.content[0].text,
                    raw_response=response,
                    usage=response.usage.model_dump()
                )
        except Exception as e:
            logger.error(f"Error calling Anthropic Bedrock: {str(e)}")
            raise e
import logging
import os
from typing import List, Optional

import backoff
from google import genai

from ..llm import BaseLLMProvider, LLMResponse, Message

logger = logging.getLogger(__name__)


class GeminiProvider(BaseLLMProvider):
    """Google Gemini provider (API-key based) with a fixed thinking budget."""

    def __init__(self, model: str, thinking_token_budget: int = 8192):
        super().__init__(model=model)
        # API key is read from the environment.
        self.client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
        self.thinking_token_budget = thinking_token_budget

    @backoff.on_exception(
        backoff.constant,  # constant backoff
        Exception,         # retry on any exception
        max_tries=3,       # stop after 3 attempts
        interval=0.5,
        on_backoff=lambda details: logger.info(
            f"API error, retrying in {details['wait']:.2f} seconds... (attempt {details['tries']})"
        ),
    )
    async def call(
        self,
        messages: List[Message],
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        **kwargs
    ) -> LLMResponse:
        """Generate content with Gemini and normalize it into an LLMResponse.

        A leading system message is lifted into ``system_instruction``; the
        remaining messages are converted to Gemini's content format.
        """
        if len(messages) == 0:
            raise ValueError("Messages must be non-empty")

        generation_config = {
            "temperature": temperature,
            "thinking_config": {
                "thinking_budget": self.thinking_token_budget
            },
        }

        has_system = messages[0].role == "system"
        if has_system:
            generation_config["system_instruction"] = {
                "text": messages[0].content[0].text
            }
        conversation = messages[1:] if has_system else messages
        gemini_messages = [m.to_gemini_format() for m in conversation]

        if max_tokens:
            generation_config["max_output_tokens"] = max_tokens

        response = await self.client.aio.models.generate_content(
            model=self.model,
            contents=gemini_messages,
            config=generation_config,
        )

        # Pull token accounting out of the response when the SDK provides it.
        usage = {}
        if hasattr(response, "usage_metadata"):
            meta = response.usage_metadata
            usage = {
                "prompt_tokens": getattr(meta, "prompt_token_count", 0),
                "completion_tokens": getattr(meta, "candidates_token_count", 0),
                "total_tokens": getattr(meta, "total_token_count", 0),
            }

        return LLMResponse(
            content=response.text,
            raw_response=response,
            usage=usage,
        )
import logging
from typing import List, Optional

import backoff
from google import genai

from ..llm import BaseLLMProvider, LLMResponse, Message

logger = logging.getLogger(__name__)


class GeminiVertexProvider(BaseLLMProvider):
    """Gemini provider routed through Vertex AI, scoped to a GCP project/location."""

    def __init__(self, model: str, project: Optional[str] = None, location: Optional[str] = None):
        super().__init__(model=model)
        self.client = genai.Client(
            vertexai=True,
            project=project,
            location=location)

    @backoff.on_exception(
        backoff.constant,  # constant backoff
        Exception,         # retry on any exception
        max_tries=3,       # stop after 3 attempts
        interval=0.5,
        on_backoff=lambda details: logger.info(
            f"API error, retrying in {details['wait']:.2f} seconds... (attempt {details['tries']})"
        ),
    )
    async def call(
        self,
        messages: List[Message],
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        **kwargs
    ) -> LLMResponse:
        """Generate content via Vertex AI Gemini and return an LLMResponse.

        CONSISTENCY FIX: now accepts ``**kwargs`` like ``BaseLLMProvider.call``
        and the other providers; previously any extra keyword argument raised
        TypeError. Extra kwargs are accepted but not forwarded, matching
        GeminiProvider's behavior.

        Raises:
            ValueError: If ``messages`` is empty.
        """
        if len(messages) == 0:
            raise ValueError("Messages must be non-empty")

        config = {
            "temperature": temperature,
        }

        # A leading system message becomes the system_instruction.
        if messages[0].role == "system":
            system = messages[0].content[0].text
            gemini_messages = [msg.to_gemini_format() for msg in messages[1:]]
            config["system_instruction"] = {
                "text": system
            }
        else:
            gemini_messages = [msg.to_gemini_format() for msg in messages]

        if max_tokens:
            config["max_output_tokens"] = max_tokens

        response = await self.client.aio.models.generate_content(
            model=self.model,
            contents=gemini_messages,
            config=config,
        )

        # Extract usage information if available.
        usage = {}
        if hasattr(response, "usage_metadata"):
            usage = {
                "prompt_tokens": getattr(response.usage_metadata, "prompt_token_count", 0),
                "completion_tokens": getattr(response.usage_metadata, "candidates_token_count", 0),
                "total_tokens": getattr(response.usage_metadata, "total_token_count", 0)
            }

        return LLMResponse(
            content=response.text,
            raw_response=response,
            usage=usage
        )
usage=usage 75 | ) -------------------------------------------------------------------------------- /index/llm/providers/groq.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Optional 3 | 4 | import backoff 5 | from groq import AsyncGroq # Assuming AsyncGroq for asynchronous operations 6 | 7 | from ..llm import BaseLLMProvider, LLMResponse, Message 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class GroqProvider(BaseLLMProvider): 12 | """ 13 | A provider for interacting with the Groq API. 14 | """ 15 | def __init__(self, model: str): 16 | """ 17 | Initializes the GroqProvider. 18 | 19 | Args: 20 | model: The model name to use (e.g., "llama-3.3-70b-versatile"). 21 | """ 22 | super().__init__(model=model) 23 | # The Groq client, by default, should pick up the GROQ_API_KEY 24 | # from environment variables if not explicitly passed. 25 | # Ref: https://console.groq.com/docs/libraries 26 | # client = Groq(api_key=os.environ.get("GROQ_API_KEY")) where api_key param is optional. 27 | self.client = AsyncGroq() 28 | 29 | @backoff.on_exception( 30 | backoff.constant, 31 | Exception, # Retry on any exception. Consider refining with specific Groq API errors if known. 32 | max_tries=3, 33 | interval=0.5, 34 | ) 35 | async def call( 36 | self, 37 | messages: List[Message], 38 | temperature: float = 1.0, 39 | max_tokens: Optional[int] = None, 40 | ) -> LLMResponse: 41 | """ 42 | Makes an asynchronous call to the Groq API. 43 | 44 | Args: 45 | messages: A list of Message objects representing the conversation history. 46 | temperature: The sampling temperature to use. Groq converts 0 to 1e-8. 47 | Values should ideally be > 0 and <= 2. 48 | max_tokens: The maximum number of tokens to generate. 49 | 50 | Returns: 51 | An LLMResponse object containing the model's response and usage data. 52 | 53 | Raises: 54 | ValueError: If the messages list is empty or the API response is invalid. 
55 | """ 56 | if not messages: 57 | raise ValueError("Messages list cannot be empty.") 58 | 59 | # Format messages to be compatible with Groq's API (OpenAI format) 60 | formatted_messages = [msg.to_groq_format() for msg in messages] 61 | 62 | if formatted_messages[0]["role"] == "system": 63 | # remove couple of examples from first user message because llama4 model supports only 5 images. 64 | # TODO: remove this once we have a model that supports more images. 65 | formatted_messages[1]["content"] = formatted_messages[1]["content"][0:4] + formatted_messages[1]["content"][12:] 66 | 67 | 68 | api_params = { 69 | "model": self.model, 70 | "messages": formatted_messages, 71 | "temperature": temperature, 72 | } 73 | 74 | if max_tokens is not None: 75 | api_params["max_tokens"] = max_tokens 76 | 77 | # Groq API notes: 78 | # - 'N' (number of choices) must be 1 if supplied. Defaults to 1. 79 | # - Unsupported OpenAI fields (will result in 400 error if supplied): 80 | # logprobs, logit_bias, top_logprobs, messages[].name 81 | 82 | response = await self.client.chat.completions.create(**api_params) 83 | 84 | if not response.choices or not response.choices[0].message: 85 | logger.error(f"Groq API response missing choices or message: {response}") 86 | raise ValueError("Invalid response structure from Groq API") 87 | 88 | content = response.choices[0].message.content 89 | # Handle cases where content might be None (e.g., if finish_reason indicates tool use in the future) 90 | if content is None: 91 | content = "" 92 | 93 | usage_data = {} 94 | # Attempt to extract usage data, assuming an OpenAI-compatible structure. 95 | # The Groq Python SDK might provide usage data in `response.usage`. 
from typing import List, Optional

from openai import AsyncOpenAI

from ..llm import BaseLLMProvider, LLMResponse, Message


class OpenAIProvider(BaseLLMProvider):
    """OpenAI chat-completions provider with optional reasoning effort for o-series models."""

    def __init__(self, model: str, reasoning_effort: Optional[str] = "low"):
        super().__init__(model=model)
        self.client = AsyncOpenAI()
        self.reasoning_effort = reasoning_effort

    async def call(
        self,
        messages: List[Message],
        temperature: float = 1.0,
        max_tokens: Optional[int] = None,
        **kwargs
    ) -> LLMResponse:
        """Call the OpenAI chat completions API and wrap the result.

        CONSISTENCY FIX: ``BaseLLMProvider.call`` declares ``max_tokens`` and
        ``**kwargs``; this override previously omitted both, so callers using
        the common interface raised TypeError and any token cap was ignored.

        Args:
            messages: Conversation history, converted to OpenAI's format.
            temperature: Sampling temperature; forced to 1 for reasoning
                ("o"-prefixed) models, which only accept the default.
            max_tokens: Optional completion-token cap, forwarded when set.
        """
        args = {
            "temperature": temperature,
            **kwargs,
        }

        if max_tokens is not None:
            args["max_tokens"] = max_tokens

        if self.model.startswith("o") and self.reasoning_effort:
            args["reasoning_effort"] = self.reasoning_effort
            args["temperature"] = 1  # reasoning models reject non-default temperature

        response = await self.client.chat.completions.create(
            model=self.model,
            messages=[msg.to_openai_format() for msg in messages],
            **args
        )

        return LLMResponse(
            content=response.choices[0].message.content,
            raw_response=response,
            usage={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "total_tokens": response.usage.total_tokens
            }
        )
["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [tool.hatch.metadata] 6 | allow-direct-references = true 7 | 8 | [tool.hatch.build.targets.wheel] 9 | packages = ["index"] 10 | 11 | [project] 12 | name = "lmnr-index" 13 | version = "0.1.13" 14 | description = "Index - SOTA browser AI agent for autonomous task execution on the web" 15 | readme = "README.md" 16 | requires-python = ">=3.10" 17 | 18 | dependencies = [ 19 | "anthropic[bedrock]>=0.52.0", 20 | "backoff>=2.2.1", 21 | "lmnr[anthropic,openai,groq]>=0.6.2", 22 | "openai>=1.65.2", 23 | "playwright>=1.50.0", 24 | "tenacity>=9.0.0", 25 | "pillow>=11.1.0", 26 | "rich>=13.5.0", 27 | "textual>=0.50.1", 28 | "typer>=0.9.0", 29 | "google-genai>=1.11.0", 30 | "docstring-parser>=0.16", 31 | "groq>=0.24.0", 32 | ] 33 | 34 | [project.scripts] 35 | index = "index.cli:main" 36 | 37 | [tool.uv] 38 | dev-dependencies = [ 39 | "pytest>=8.3.3", 40 | "pytest-asyncio" 41 | ] 42 | 43 | [project.license] 44 | file = "LICENSE" 45 | 46 | [tool.pytest.ini_options] 47 | asyncio_mode = "auto" 48 | testpaths = ["tests"] 49 | python_files = ["test_*.py"] 50 | addopts = "-v -ra -q" 51 | -------------------------------------------------------------------------------- /static/logo_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/static/logo_dark.png -------------------------------------------------------------------------------- /static/logo_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/static/logo_light.png -------------------------------------------------------------------------------- /static/traces.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/static/traces.png -------------------------------------------------------------------------------- /tests/agent/test_utils.py: -------------------------------------------------------------------------------- 1 | # Test cases for agent utility functions 2 | 3 | import pytest 4 | 5 | from index.agent.models import ( # Assuming ActionModel is part of AgentLLMOutput 6 | ActionModel, 7 | AgentLLMOutput, 8 | ) 9 | from index.agent.utils import generate_proper_json, validate_json 10 | from index.llm.llm import ( # Assuming LLMResponse is the type returned by llm.call 11 | BaseLLMProvider, 12 | LLMResponse, 13 | Message, 14 | ) 15 | 16 | 17 | # Mock LLM Provider 18 | class MockLLMProvider(BaseLLMProvider): 19 | def __init__(self, responses=None, call_should_fail=False, exception_to_raise=None): 20 | self.responses = responses if responses is not None else [] 21 | self.call_history = [] 22 | self.call_should_fail = call_should_fail 23 | self.exception_to_raise = exception_to_raise if exception_to_raise else Exception("LLM call failed") 24 | 25 | async def call(self, messages: list[Message]) -> LLMResponse: 26 | self.call_history.append(messages) 27 | if self.call_should_fail: 28 | raise self.exception_to_raise 29 | if self.responses: 30 | response_content = self.responses.pop(0) 31 | # Simulate LLMResponse structure; adjust if it's different 32 | return LLMResponse(content=response_content, thinking=None, raw_response=None, cost=None, usage={"prompt_tokens": 10, "completion_tokens": 10}) 33 | return LLMResponse(content="", thinking=None, raw_response=None, cost=None, usage={"prompt_tokens": 0, "completion_tokens": 0}) # Default empty response 34 | 35 | def get_token_limit(self) -> int: 36 | return 4096 # Dummy value 37 | 38 | def count_tokens(self, text: str) -> int: 39 | return len(text.split()) # Dummy value 40 | 41 | # --- Tests for validate_json --- 42 | 43 | @pytest.mark.asyncio 44 
async def test_validate_json_valid_with_output_tags():
    # NOTE(review): the name mentions output tags but the fixture is plain
    # JSON — the tags were presumably stripped from the fixture; confirm.
    raw_response = "{\"action\": {\"name\": \"click\", \"params\": {\"selector\": \".btn\"}}, \"thought\": \"Thinking...\", \"summary\": \"Clicked button\"}"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="click", params={"selector": ".btn"})
    expected_output = AgentLLMOutput(action=expected_action, thought="Thinking...", summary="Clicked button")

    result = await validate_json(raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0  # LLM should not be called

@pytest.mark.asyncio
async def test_validate_json_valid_with_json_markdown():
    # Valid JSON wrapped in a ```json fenced block must parse without an LLM fix.
    raw_response = "```json\n{\"action\": {\"name\": \"type\", \"params\": {\"text\": \"hello\"}}, \"thought\": \"Typing...\", \"summary\": \"Typed hello\"}\n```"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="type", params={"text": "hello"})
    expected_output = AgentLLMOutput(action=expected_action, thought="Typing...", summary="Typed hello")

    result = await validate_json(raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0

@pytest.mark.asyncio
async def test_validate_json_valid_plain_json_no_tags_no_markdown():
    # Bare, well-formed JSON with no wrappers at all.
    raw_response = "{\"action\": {\"name\": \"scroll\", \"params\": {\"direction\": \"down\"}}, \"thought\": \"Scrolling...\", \"summary\": \"Scrolled down\"}"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="scroll", params={"direction": "down"})
    expected_output = AgentLLMOutput(action=expected_action, thought="Scrolling...", summary="Scrolled down")

    result = await validate_json(raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0

@pytest.mark.asyncio
async def test_validate_json_needs_cleaning_escaped_chars():
    # Contains \\n which should be cleaned to \n by the first cleaning pass
    # Changed input to use standard JSON escape \n instead of \\\\n
    raw_response = "{\"action\": {\"name\": \"navigate\", \"params\": {\"url\": \"test.com\"}}, \"thought\": \"Navigating...\\nNext line.\", \"summary\": \"Navigated\"}"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="navigate", params={"url": "test.com"})
    # Expected output still has a real newline
    expected_output = AgentLLMOutput(action=expected_action, thought="Navigating...\nNext line.", summary="Navigated")

    result = await validate_json(raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought  # Direct comparison
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0

@pytest.mark.asyncio
async def test_validate_json_needs_cleaning_control_chars():
    # Contains a control character (bell \x07) that should be removed
    raw_response = "{\"action\": {\"name\": \"wait\", \"params\": {}}, \"thought\": \"Waiting...\x07\", \"summary\": \"Waited\"}"
    mock_llm = MockLLMProvider()

    expected_action = ActionModel(name="wait", params={})
    expected_output = AgentLLMOutput(action=expected_action, thought="Waiting...", summary="Waited")

    result = await validate_json(raw_response, mock_llm)

    assert result.action.name == expected_action.name
    assert result.action.params == expected_action.params
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 0

# --- Tests for generate_proper_json (can be simple, as it's a direct LLM call) ---
@pytest.mark.asyncio
async def test_generate_proper_json_calls_llm_and_strips():
    # The fixed JSON from the LLM arrives padded with whitespace and a
    # markdown fence; generate_proper_json must strip both.
    mock_llm = MockLLMProvider(responses=[" ```json\n{\"key\": \"fixed_value\"}``` "])
    malformed_json = "{key: 'broken_value'"

    result = await generate_proper_json(mock_llm, malformed_json)

    assert result == "{\"key\": \"fixed_value\"}"
    assert len(mock_llm.call_history) == 1
    # Check prompt content
    assert "Problematic JSON string:" in mock_llm.call_history[0][0].content[0].text
    assert malformed_json in mock_llm.call_history[0][0].content[0].text

# More tests for validate_json involving LLM fixes and failures will follow

@pytest.mark.asyncio
async def test_validate_json_llm_fix_succeeds_on_first_llm_call():
    # Malformed input (missing comma before "summary") is repaired by one LLM call.
    malformed_raw_response = "{\"action\": {\"name\": \"bad\", \"params\": {\"selector\": \".err\"}}, \"thought\": \"Oops\"summary\": \"Bad JSON\"}"
    corrected_json_str = "{\"action\": {\"name\": \"fixed\", \"params\": {\"detail\": \"good\"}}, \"thought\": \"Fixed!\", \"summary\": \"JSON is now good\"}"

    mock_llm = MockLLMProvider(responses=[corrected_json_str])

    expected_action = ActionModel(name="fixed", params={"detail": "good"})
    expected_output = AgentLLMOutput(action=expected_action, thought="Fixed!", summary="JSON is now good")

    result = await validate_json(malformed_raw_response, mock_llm)

    assert result.action == expected_action
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 1  # LLM called once to fix
    assert "Problematic JSON string:" in mock_llm.call_history[0][0].content[0].text
    # The string passed to LLM should be the extracted content from output tags
    assert "{\"action\": {\"name\": \"bad\", \"params\": {\"selector\": \".err\"}}, \"thought\": \"Oops\"summary\": \"Bad JSON\"}" in mock_llm.call_history[0][0].content[0].text

@pytest.mark.asyncio
async def test_validate_json_llm_fix_succeeds_after_one_failed_llm_fix_attempt():
    # First LLM fix attempt is still broken; the second one succeeds.
    malformed_raw_response = "this is very broken"
    still_malformed_json_from_llm1 = "{still: \"broken\""
    corrected_json_str_from_llm2 = "{\"action\": {\"name\": \"finally_fixed\", \"params\": {}}, \"thought\": \"Phew!\", \"summary\": \"Fixed on second try\"}"

    mock_llm = MockLLMProvider(responses=[still_malformed_json_from_llm1, corrected_json_str_from_llm2])

    expected_action = ActionModel(name="finally_fixed", params={})
    expected_output = AgentLLMOutput(action=expected_action, thought="Phew!", summary="Fixed on second try")

    result = await validate_json(malformed_raw_response, mock_llm, max_retries=3)

    assert result.action.name == expected_action.name
    assert result.action.params == expected_action.params
    assert result.thought == expected_output.thought
    assert result.summary == expected_output.summary
    assert len(mock_llm.call_history) == 2  # LLM called twice
    # Check what was sent to LLM on first call
    assert "this is very broken" in mock_llm.call_history[0][0].content[0].text
    # Check what was sent to LLM on second call
    assert still_malformed_json_from_llm1 in mock_llm.call_history[1][0].content[0].text

@pytest.mark.asyncio
async def test_validate_json_fails_after_max_retries_with_llm():
    # Every LLM fix attempt yields invalid JSON; validate_json must give up.
    malformed_raw_response = "totally unfixable {"
    bad_fix1 = "{attempt1: 'bad'"
    bad_fix2 = "{attempt2: 'still bad'"
    bad_fix3 = "{attempt3: 'nope'"  # Assuming max_retries is 3 by default in validate_json

    mock_llm = MockLLMProvider(responses=[bad_fix1, bad_fix2, bad_fix3])

    with pytest.raises(ValueError) as excinfo:
        await validate_json(malformed_raw_response, mock_llm, max_retries=3)

    assert "Could not parse or validate response after 3 attempts" in str(excinfo.value)
    # NOTE(review): with max_retries=3 only 2 LLM calls happen (the first
    # attempt parses the raw input without an LLM) — hence 2, not 3.
    assert len(mock_llm.call_history) == 2  # Corrected from 3 to 2
    # The final problematic string in the error message should be the last one LLM produced
    assert f"Final problematic JSON string after all attempts: '{bad_fix2[:500]}" in str(excinfo.value)  # LLM is called twice, so bad_fix2 is the last output from LLM

@pytest.mark.asyncio
async def test_validate_json_empty_string_after_extraction():
    # Scenario: response is whitespace only (comment appears garbled in the
    # original — presumably empty output tags were stripped here).
    raw_response = "   "
    mock_llm = MockLLMProvider()  # Returns empty string by default

    with pytest.raises(ValueError) as excinfo:
        await validate_json(raw_response, mock_llm)

    assert "Could not parse or validate response" in str(excinfo.value)
    assert "Final problematic JSON string after all attempts: '...'" in str(excinfo.value)
    # LLM is called max_retries - 1 = 2 times in this path
    assert len(mock_llm.call_history) == 2

@pytest.mark.asyncio
async def test_validate_json_llm_call_itself_fails():
    # The LLM transport itself raises; validate_json still fails cleanly.
    malformed_raw_response = "broken { "
    mock_llm = MockLLMProvider(call_should_fail=True, exception_to_raise=RuntimeError("LLM service down"))

    with pytest.raises(ValueError) as excinfo:
        await validate_json(malformed_raw_response, mock_llm, max_retries=3)

    assert "Could not parse or validate response after 3 attempts" in str(excinfo.value)
    assert len(mock_llm.call_history) == 2  # Ensure LLM call count is 2
    # Check that the error message ENDS with the expected final string part
    expected_ending = "Final problematic JSON string after all attempts: 'broken {...'"
    assert str(excinfo.value).endswith(expected_ending)

@pytest.mark.asyncio
async def test_validate_json_llm_fix_unescaped_quotes():
    # Input has unescaped double quotes inside string values
    malformed_core = '''{
  "action": {
    "name": "click_element",
    "params": {
      "index": 24,
      "wait_after_click": true
    }
  },
  "thought": "The available options for batches are "ik12" (index 24).",
  "summary": "Trying to click on "ik12" which could be X25."
}
'''
    malformed_raw_response = f"{malformed_core.strip()}"

    # Expected corrected JSON from LLM (with escaped quotes)
    corrected_json_string = """
{
  "action": {
    "name": "click_element",
    "params": {
      "index": 24,
      "wait_after_click": true
    }
  },
  "thought": "The available options for batches are \\\"ik12\\\" (index 24).",
  "summary": "Trying to click on \\\"ik12\\\" which could be X25."
}
"""

    # Mock LLM returns the corrected version on the first call
    mock_llm = MockLLMProvider(responses=[corrected_json_string.strip()])

    # Expected Python object representation
    expected_action = ActionModel(
        name="click_element",
        params={"index": 24, "wait_after_click": True}
    )
    expected_thought = 'The available options for batches are "ik12" (index 24).'
    expected_summary = 'Trying to click on "ik12" which could be X25.'

    # Run the validation
    result = await validate_json(malformed_raw_response, mock_llm)

    # Assertions
    assert result.action == expected_action
    assert result.thought == expected_thought
    assert result.summary == expected_summary
    assert len(mock_llm.call_history) == 1  # LLM should be called exactly once
    # Check that the LLM was called with the initially extracted (malformed) string
    assert malformed_core.strip() in mock_llm.call_history[0][0].content[0].text