├── .github
└── workflows
│ └── publish.yml
├── .gitignore
├── .vscode
└── settings.json
├── LICENSE
├── README.md
├── evals
├── eval_generate_json.py
└── eval_output_model.py
├── index
├── __init__.py
├── agent
│ ├── agent.py
│ ├── demo_images
│ │ ├── complex_layout_highlight.png
│ │ ├── complex_layout_small_elements.png
│ │ ├── loading.png
│ │ └── scroll.png
│ ├── message_manager.py
│ ├── models.py
│ ├── prompts.py
│ └── utils.py
├── browser
│ ├── browser.py
│ ├── detector.py
│ ├── findVisibleInteractiveElements.js
│ ├── fonts
│ │ └── OpenSans-Medium.ttf
│ ├── models.py
│ └── utils.py
├── cli.py
├── controller
│ ├── controller.py
│ └── default_actions.py
└── llm
│ ├── llm.py
│ └── providers
│ ├── __init__.py
│ ├── anthropic.py
│ ├── anthropic_bedrock.py
│ ├── gemini.py
│ ├── gemini_vertex.py
│ ├── groq.py
│ └── openai.py
├── pyproject.toml
├── static
├── logo_dark.png
├── logo_light.png
└── traces.png
├── tests
└── agent
│ └── test_utils.py
└── uv.lock
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish Python Package
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*'
7 |
8 | permissions:
9 | contents: read
10 |
11 | jobs:
12 | publish:
13 | runs-on: ubuntu-latest
14 | environment:
15 | name: pypi
16 | url: https://pypi.org/p/lmnr-index/
17 | permissions:
18 | id-token: write
19 | steps:
20 | - uses: actions/checkout@v4
21 | - name: Install uv
22 | uses: astral-sh/setup-uv@v4
23 | - name: Set up Python
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: '3.10'
27 | - name: Install the project
28 | run: uv sync --all-extras --dev
29 | - name: Verify tag matches package version
30 | run: |
31 | # Extract version from tag (remove 'v' prefix)
32 | TAG_VERSION=${GITHUB_REF#refs/tags/v}
33 | # Extract version from pyproject.toml
34 | PACKAGE_VERSION=$(grep -oP '(?<=version = ")[^"]+' pyproject.toml)
35 | echo "Tag version: $TAG_VERSION"
36 | echo "Package version: $PACKAGE_VERSION"
37 | # Check if versions match
38 | if [ "$TAG_VERSION" != "$PACKAGE_VERSION" ]; then
39 | echo "Error: Tag version ($TAG_VERSION) does not match package version ($PACKAGE_VERSION)"
40 | exit 1
41 | fi
42 | - name: Build package
43 | run: uv build
44 | - name: Publish package
45 | uses: pypa/gh-action-pypi-publish@release/v1
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 |
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112 | .pdm.toml
113 | .pdm-python
114 | .pdm-build/
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "[python]": {
3 | "editor.codeActionsOnSave": {
4 | "source.fixAll": "explicit",
5 | "source.organizeImports": "explicit"
6 | },
7 | "editor.defaultFormatter": "charliermarsh.ruff"
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [2025] [LMNR AI, Inc.]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | # Index
13 |
14 | Index is a state-of-the-art open-source browser agent that autonomously executes complex web tasks. It turns any website into an accessible API and can be seamlessly integrated with just a few lines of code.
15 |
16 | - [x] Powered by reasoning LLMs with vision capabilities.
17 | - [x] Gemini 2.5 Pro (really fast and accurate)
18 | - [x] Claude 3.7 Sonnet with extended thinking (reliable and accurate)
19 | - [x] OpenAI o4-mini (depending on the reasoning effort, provides good balance between speed, cost and accuracy)
20 | - [x] Gemini 2.5 Flash (really fast, cheap, and good for less complex tasks)
21 | - [x] `pip install lmnr-index` and use it in your project
22 | - [x] `index run` to run the agent in the interactive CLI
23 | - [x] Supports structured output via Pydantic schemas for reliable data extraction.
24 | - [x] Index is also available as a [serverless API.](https://docs.lmnr.ai/index-agent/api/getting-started)
25 | - [x] You can also try out Index via [Chat UI](https://lmnr.ai/chat).
26 | - [x] Supports advanced [browser agent observability](https://docs.lmnr.ai/index-agent/tracing) powered by open-source platform [Laminar](https://github.com/lmnr-ai/lmnr).
27 |
28 | prompt: go to ycombinator.com. summarize first 3 companies in the W25 batch and make new spreadsheet in google sheets.
29 |
30 | https://github.com/user-attachments/assets/2b46ee20-81b6-4188-92fb-4d97fe0b3d6a
31 |
32 | ## Documentation
33 |
34 | Check out full documentation [here](https://docs.lmnr.ai/index-agent/getting-started)
35 |
36 | ## Quickstart
37 |
38 | ### Install dependencies
39 | ```bash
40 | pip install lmnr-index 'lmnr[all]'
41 |
42 | # Install playwright
43 | playwright install chromium
44 | ```
45 |
46 | ### Setup model API keys
47 |
48 | Setup your model API keys in `.env` file in your project root:
49 | ```
50 | GEMINI_API_KEY=
51 | ANTHROPIC_API_KEY=
52 | OPENAI_API_KEY=
53 | # Optional, to trace the agent's actions and record browser session
54 | LMNR_PROJECT_API_KEY=
55 | ```
56 |
57 | ### Run Index with code
58 | ```python
59 | import asyncio
60 | from index import Agent, GeminiProvider
61 | from pydantic import BaseModel
62 | from lmnr import Laminar
63 | import os
64 |
65 | # to trace the agent's actions and record browser session
66 | Laminar.initialize()
67 |
68 | # Define Pydantic schema for structured output
69 | class NewsSummary(BaseModel):
70 | title: str
71 | summary: str
72 |
73 | async def main():
74 |
75 | llm = GeminiProvider(model="gemini-2.5-pro-preview-05-06")
76 | agent = Agent(llm=llm)
77 |
78 | # Example of getting structured output
79 | output = await agent.run(
80 | prompt="Navigate to news.ycombinator.com, find a post about AI, extract its title and provide a concise summary.",
81 | output_model=NewsSummary
82 | )
83 |
84 | summary = NewsSummary.model_validate(output.result.content)
85 | print(f"Title: {summary.title}")
86 | print(f"Summary: {summary.summary}")
87 |
88 | if __name__ == "__main__":
89 | asyncio.run(main())
90 | ```
91 |
92 | ### Run Index with CLI
93 |
94 | Index CLI features:
95 | - Browser state persistence between sessions
96 | - Follow-up messages with support for "give human control" action
97 | - Real-time streaming updates
98 | - Beautiful terminal UI using Textual
99 |
100 | You can run Index CLI with the following command.
101 | ```bash
102 | index run
103 | ```
104 |
105 | Output will look like this:
106 |
107 | ```
108 | Loaded existing browser state
109 | ╭───────────────────── Interactive Mode ─────────────────────╮
110 | │ Index Browser Agent Interactive Mode │
111 | │ Type your message and press Enter. The agent will respond. │
112 | │ Press Ctrl+C to exit. │
113 | ╰────────────────────────────────────────────────────────────╯
114 |
115 | Choose an LLM model:
116 | 1. Gemini 2.5 Flash
117 | 2. Claude 3.7 Sonnet
118 | 3. OpenAI o4-mini
119 | Select model [1/2] (1): 3
120 | Using OpenAI model: o4-mini
121 | Loaded existing browser state
122 |
123 | Your message: go to lmnr.ai, summarize pricing page
124 |
125 | Agent is working...
126 | Step 1: Opening lmnr.ai
127 | Step 2: Opening Pricing page
128 | Step 3: Scrolling for more pricing details
129 | Step 4: Scrolling back up to view pricing tiers
130 | Step 5: Provided concise summary of the three pricing tiers
131 | ```
132 |
133 | ### Running CLI with a personal Chrome instance
134 |
135 | You can use Index with a personal Chrome browser instance instead of launching a new browser. The main advantage is that all your existing logged-in sessions will be available.
136 |
137 | ```bash
138 | # Basic usage with default Chrome path
139 | index run --local-chrome
140 | ```
141 |
142 | ## Use Index via API
143 |
144 | The easiest way to use Index in production is with [serverless API](https://docs.lmnr.ai/index-agent/api/getting-started). Index API manages remote browser sessions, agent infrastructure and [browser observability](https://docs.lmnr.ai/index-agent/api/tracing). To get started, create a project API key in [Laminar](https://lmnr.ai).
145 |
146 | ### Install Laminar
147 | ```bash
148 | pip install lmnr
149 | ```
150 |
151 | ### Use Index via API
152 | ```python
153 | from lmnr import Laminar, LaminarClient
154 | # you can also set LMNR_PROJECT_API_KEY environment variable
155 |
156 | # Initialize tracing
157 | Laminar.initialize(project_api_key="your_api_key")
158 |
159 | # Initialize the client
160 | client = LaminarClient(project_api_key="your_api_key")
161 |
162 | for chunk in client.agent.run(
163 | stream=True,
164 | model_provider="gemini",
165 | model="gemini-2.5-pro-preview-05-06",
166 | prompt="Navigate to news.ycombinator.com, find a post about AI, and summarize it"
167 | ):
168 | print(chunk)
169 |
170 | ```
171 |
172 |
173 | ## Browser agent observability
174 |
175 | Both code run and API run provide advanced browser observability. To trace Index agent's actions and record browser session you simply need to initialize Laminar tracing before running the agent.
176 |
177 | ```python
178 | from lmnr import Laminar
179 |
180 | Laminar.initialize(project_api_key="your_api_key")
181 | ```
182 |
183 | Then you will get full observability on the agent's actions synced with the browser session in the Laminar platform. Learn more about browser agent observability in the [documentation](https://docs.lmnr.ai/index-agent/tracing).
184 |
185 |
186 |
187 |
188 |
189 | ---
190 |
191 | Made with ❤️ by the [Laminar team](https://lmnr.ai)
192 |
--------------------------------------------------------------------------------
/evals/eval_generate_json.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any, Dict
3 |
4 | from lmnr import evaluate
5 |
6 | from index import AnthropicProvider
7 | from index.agent.utils import generate_proper_json
8 |
9 | llm = AnthropicProvider(model="claude-3-7-sonnet-20250219", enable_thinking=True, thinking_token_budget=1024)
10 |
async def run_json_correction(data: Dict[str, Any]):
    """Executor for one JSON-correction eval datapoint.

    Args:
        data: Eval datapoint; must contain the key "malformed_json" with
            the raw malformed JSON string to repair.

    Returns:
        The parsed Python object when the LLM-corrected string is valid
        JSON, otherwise the raw corrected string so the evaluator can
        decide how to score it.
    """
    malformed_json = data["malformed_json"]

    # Uses the module-level AnthropicProvider instance configured above.
    corrected_json_str = await generate_proper_json(llm=llm, json_str=malformed_json)

    # generate_proper_json returns a string; parse it so the evaluator
    # receives a structured value whenever the correction is valid JSON.
    try:
        return json.loads(corrected_json_str)
    except json.JSONDecodeError:
        # Not valid JSON — hand the raw string to the evaluator.
        return corrected_json_str
25 |
26 |
async def eval_json_correction(output: Any, target: Dict[str, Any]):
    """Score a JSON-correction result by exact equality with the target.

    ``output`` is either an already-parsed Python object, or a string
    when the executor could not parse the corrected JSON. A string is
    parsed here one more time before comparison; if it still is not
    valid JSON the datapoint is scored as a failure.
    """
    # Structured output: compare directly against the expected object.
    if not isinstance(output, str):
        return output == target

    # String output: give it one last chance to parse before failing.
    try:
        return json.loads(output) == target
    except json.JSONDecodeError:
        return False
47 |
48 | test_data = [
49 | {
50 | "data": {
51 | # Trailing comma, single quotes
52 | "malformed_json": "{'name': 'John Doe', 'age': 30, 'city': 'New York',}",
53 | },
54 | "target": {
55 | "name": "John Doe",
56 | "age": 30,
57 | "city": "New York"
58 | }
59 | },
60 | {
61 | "data": {
62 | "malformed_json": '''{
63 | "item": "Book",
64 | "details": {
65 | "title": "The "Great Gatsby"",
66 | "author": "F. Scott Fitzgerald"
67 | },
68 | "price": 10.99
69 | }'''
70 | },
71 | "target": {
72 | "item": "Book",
73 | "details": {
74 | "title": "The \"Great Gatsby\"",
75 | "author": "F. Scott Fitzgerald"
76 | },
77 | "price": 10.99
78 | }
79 | },
80 | {
81 | "data": {
82 | # No closing brace
83 | "malformed_json": '''{
84 | "key1": "value1",
85 | "key2": "value2"
86 | ''' # Corrected: Removed trailing content that looked like a comment inside string
87 | },
88 | "target": {
89 | "key1": "value1",
90 | "key2": "value2"
91 | }
92 | },
93 | {
94 | "data": {
95 | # JSON with comments (not standard, should be removed by the fixer)
96 | "malformed_json": '''{
97 | // This is a comment
98 | "product_id": 123,
99 | "status": "active"
100 | }'''
101 | },
102 | "target": {
103 | "product_id": 123,
104 | "status": "active"
105 | }
106 | },
107 | # Example of a more complex malformed JSON
108 | {
109 | "data": {
110 | "malformed_json": "{\"name\": \"incomplete, \"value\": [1, 2, \"unfinished_array\"" # Missing closing bracket and quote
111 | },
112 | "target": { # Assuming the LLM can make a reasonable guess or fix structure
113 | "name": "incomplete",
114 | "value": [1, 2, "unfinished_array"]
115 | }
116 | },
117 | {
118 | "data": {
119 | "malformed_json": "{'key with space': 'value', 'another key': true, 'numeric_string': '123.45' }" # Single quotes, boolean
120 | },
121 | "target": {
122 | "key with space": "value",
123 | "another key": True, # Python bool
124 | "numeric_string": "123.45"
125 | }
126 | }
127 | ]
128 |
129 | # Run the evaluation
130 | evaluate(
131 | data=test_data,
132 | executor=run_json_correction,
133 | evaluators={"json_correction_accuracy": eval_json_correction},
134 | concurrency_limit=10,
135 | group_name="json_correction_eval",
136 | )
137 |
--------------------------------------------------------------------------------
/evals/eval_output_model.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any, Dict
3 |
4 | from lmnr import evaluate
5 | from pydantic import BaseModel
6 |
7 | from index import Agent, GeminiProvider
8 |
9 |
class CountryInfo(BaseModel):
    """Pydantic schema for structured country-information extraction.

    Passed as ``output_model`` to ``Agent.run`` so the agent returns
    validated JSON containing exactly these fields.
    """
    country: str   # country name, e.g. "France"
    capital: str   # capital city, e.g. "Paris"
    currency: str  # currency as a text description, e.g. "Euro"
15 |
16 |
async def run_agent(data: Dict[str, Any]):
    """Run the browser agent on one eval datapoint.

    Args:
        data: Datapoint with a required "prompt" and optional
            "output_model" (pydantic schema) and "start_url".

    Returns:
        The content of the agent's final result.
    """
    agent = Agent(llm=GeminiProvider(model="gemini-2.5-pro-preview-03-25"))

    output = await agent.run(
        prompt=data["prompt"],
        output_model=data.get("output_model"),
        start_url=data.get("start_url"),
    )

    return output.result.content
33 |
34 |
async def eval_extraction(output: Dict[str, Any], target: Dict[str, Any]):
    """Score extraction accuracy by exact match of canonical JSON forms."""

    def _canon(value: Dict[str, Any]) -> str:
        # Key-sorted serialization makes the comparison order-insensitive.
        return json.dumps(value, sort_keys=True)

    return _canon(output) == _canon(target)
41 |
42 | data = [
43 | {
44 | "data": {
45 | "prompt": "Extract information about France. For currency only use text description, such as 'Euro'.",
46 | "output_model": CountryInfo,
47 | "start_url": "https://en.wikipedia.org/wiki/France"
48 | },
49 | "target": {
50 | "country": "France",
51 | "capital": "Paris",
52 | "currency": "Euro"
53 | }
54 | },
55 | {
56 | "data": {
57 | "prompt": "Extract information about Japan. For currency only use text description, such as 'Euro'.",
58 | "output_model": CountryInfo,
59 | "start_url": "https://en.wikipedia.org/wiki/Japan"
60 | },
61 | "target": {
62 | "country": "Japan",
63 | "capital": "Tokyo",
64 | "currency": "Japanese yen"
65 | }
66 | },
67 | {
68 | "data": {
69 | "prompt": "Extract information about Brazil. For currency only use text description, such as 'Euro'.",
70 | "output_model": CountryInfo,
71 | "start_url": "https://en.wikipedia.org/wiki/Brazil"
72 | },
73 | "target": {
74 | "country": "Brazil",
75 | "capital": "Brasília",
76 | "currency": "Real"
77 | }
78 | },
79 | ]
80 |
81 | evaluate(
82 | data=data,
83 | executor=run_agent,
84 | evaluators={"accuracy": eval_extraction},
85 | concurrency_limit=1,
86 | group_name="country_extraction",
87 | )
88 |
--------------------------------------------------------------------------------
/index/__init__.py:
--------------------------------------------------------------------------------
1 | from index.agent.agent import Agent
2 | from index.agent.models import ActionModel, ActionResult, AgentOutput
3 | from index.browser.browser import Browser, BrowserConfig
4 | from index.browser.detector import Detector
5 | from index.browser.models import InteractiveElement
6 | from index.llm.providers.anthropic import AnthropicProvider
7 | from index.llm.providers.anthropic_bedrock import AnthropicBedrockProvider
8 | from index.llm.providers.gemini import GeminiProvider
9 | from index.llm.providers.gemini_vertex import GeminiVertexProvider
10 | from index.llm.providers.groq import GroqProvider
11 | from index.llm.providers.openai import OpenAIProvider
12 |
# Public names re-exported at package level (`from index import ...`).
# Keep in sync with the imports above — every imported symbol is listed here.
__all__ = [
    'Agent',
    'Browser',
    'BrowserConfig',
    'ActionResult',
    'ActionModel',
    'AnthropicProvider',
    'AnthropicBedrockProvider',
    'OpenAIProvider',
    'GeminiProvider',
    'GeminiVertexProvider',
    'GroqProvider',
    'AgentOutput',
    'Detector',
    'InteractiveElement',
]
29 |
--------------------------------------------------------------------------------
/index/agent/agent.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import time
5 | import uuid
6 | from typing import AsyncGenerator, Optional
7 |
8 | from dotenv import load_dotenv
9 | from lmnr import Laminar, LaminarSpanContext, observe, use_span
10 | from pydantic import BaseModel
11 |
12 | from index.agent.message_manager import MessageManager
13 | from index.agent.models import (
14 | ActionResult,
15 | AgentLLMOutput,
16 | AgentOutput,
17 | AgentState,
18 | AgentStreamChunk,
19 | FinalOutputChunk,
20 | StepChunk,
21 | StepChunkContent,
22 | StepChunkError,
23 | TimeoutChunk,
24 | TimeoutChunkContent,
25 | )
26 | from index.agent.utils import validate_json
27 | from index.browser.browser import Browser, BrowserConfig
28 | from index.controller.controller import Controller
29 | from index.llm.llm import BaseLLMProvider, Message
30 |
31 | load_dotenv()
32 | logger = logging.getLogger(__name__)
33 |
class Agent:
    """Autonomous browser agent.

    Repeatedly observes the browser state, asks the LLM for the next action,
    and executes it until the task is done or a step limit is reached. Runs
    are traced with Laminar spans.
    """

    def __init__(
        self,
        llm: BaseLLMProvider,
        browser_config: BrowserConfig | None = None
    ):
        """
        Args:
            llm: Provider used to generate the next action on each step.
            browser_config: Optional browser configuration; a default
                BrowserConfig is used when omitted.
        """
        self.llm = llm
        self.controller = Controller()

        # Initialize browser or use the provided one
        self.browser = Browser(config=browser_config if browser_config is not None else BrowserConfig())

        action_descriptions = self.controller.get_action_descriptions()

        self.message_manager = MessageManager(
            action_descriptions=action_descriptions,
        )

        self.state = AgentState(
            messages=[],
        )

    async def step(self, step: int, previous_result: ActionResult | None = None, step_span_context: Optional[LaminarSpanContext] = None) -> tuple[ActionResult, str]:
        """Execute one step of the task.

        Args:
            step: Zero-based step counter (used for tracing and message history).
            previous_result: Result of the previous step, if any.
            step_span_context: Optional parent span context for tracing.

        Returns:
            Tuple of (result of the executed action, model's short summary of the step).
        """
        with Laminar.start_as_current_span(
            name="agent.step",
            parent_span_context=step_span_context,
            input={
                "step": step,
            },
        ):
            state = await self.browser.update_state()

            if previous_result:
                self.message_manager.add_current_state_message(state, previous_result)

            input_messages = self.message_manager.get_messages()

            try:
                model_output = await self._generate_action(input_messages)
            except Exception:
                # Model call failed: drop the state message we just appended so a
                # retry starts from a clean history. Bare `raise` preserves the
                # original traceback (the previous `raise e` re-bound it).
                self.message_manager.remove_last_message()
                raise

            if previous_result:
                # we're removing the state message that we've just added because we want to append it in a different format
                self.message_manager.remove_last_message()

            self.message_manager.add_message_from_model_output(step, previous_result, model_output, state.screenshot)

            # NOTE: a try/except that only re-raised the exception previously
            # wrapped this call; it was a no-op and has been removed.
            result: ActionResult = await self.controller.execute_action(
                model_output.action,
                self.browser
            )

            if result.is_done:
                logger.info(f'Result: {result.content}')
                self.final_output = result.content

            return result, model_output.summary

    @observe(name='agent.generate_action', ignore_input=True)
    async def _generate_action(self, input_messages: list[Message]) -> AgentLLMOutput:
        """Get next action from LLM based on current state.

        Raises:
            ValueError: Propagated from validate_json when the model output
                cannot be parsed/validated after retries.
        """
        response = await self.llm.call(input_messages)

        try:
            # Pass the raw LLM response content to validate_json
            output = await validate_json(response.content, self.llm)

            logger.info(f'💡 Thought: {output.thought}')
            logger.info(f'💡 Summary: {output.summary}')
            logger.info(f'🛠️ Action: {output.action.model_dump_json(exclude_unset=True)}')

            if response.thinking:
                output.thinking_block = response.thinking

            return output
        except ValueError as e:
            # Re-raise the ValueError from validate_json, which now includes detailed context
            logger.error(f"Failed to generate and validate action after multiple retries: {e}")
            raise

    async def _setup_messages(self,
                              prompt: str,
                              agent_state: str | None = None,
                              start_url: str | None = None,
                              output_model: BaseModel | str | None = None
                              ):
        """Set up messages based on state dict or initialize with system message.

        When `agent_state` is given, the serialized history is restored and the
        prompt is treated as a follow-up message; otherwise a fresh system
        message + user prompt is created (optionally navigating to `start_url`).
        """
        if agent_state:
            # assuming that the structure of the state.messages is correct
            state = AgentState.model_validate_json(agent_state)
            self.message_manager.set_messages(state.messages)
            browser_state = await self.browser.update_state()
            self.message_manager.add_current_state_message(browser_state, user_follow_up_message=prompt)
        else:
            self.message_manager.add_system_message_and_user_prompt(prompt, output_model)

            if start_url:
                await self.browser.goto(start_url)
                browser_state = await self.browser.update_state()
                self.message_manager.add_current_state_message(browser_state)

    async def run(self,
                  prompt: str,
                  max_steps: int = 100,
                  agent_state: str | None = None,
                  parent_span_context: Optional[LaminarSpanContext] = None,
                  close_context: bool = True,
                  session_id: str | None = None,
                  return_agent_state: bool = False,
                  return_storage_state: bool = False,
                  start_url: str | None = None,
                  output_model: BaseModel | str | None = None
                  ) -> AgentOutput:
        """Execute the task with maximum number of steps and return the final result

        Args:
            prompt: The prompt to execute the task with
            max_steps: The maximum number of steps to execute the task with. Defaults to 100.
            agent_state: Optional, the state of the agent to execute the task with
            parent_span_context: Optional, parent span context in Laminar format to execute the task with
            close_context: Whether to close the browser context after the task is executed
            session_id: Optional, Agent session id
            return_agent_state: Whether to return the agent state with the final output
            return_storage_state: Whether to return the storage state with the final output
            start_url: Optional, the URL to start the task with
            output_model: Optional, the output model to use for the task
        """

        if prompt is None and agent_state is None:
            raise ValueError("Either prompt or agent_state must be provided")

        with Laminar.start_as_current_span(
            name="agent.run",
            parent_span_context=parent_span_context,
            input={
                "prompt": prompt,
                "max_steps": max_steps,
                "stream": False,
            },
        ) as span:
            if session_id is not None:
                span.set_attribute("lmnr.internal.agent_session_id", session_id)

            await self._setup_messages(prompt, agent_state, start_url, output_model)

            step = 0
            result = None
            is_done = False

            trace_id = str(uuid.UUID(int=span.get_span_context().trace_id))

            try:
                while not is_done and step < max_steps:
                    logger.info(f'📍 Step {step}')
                    result, _ = await self.step(step, result)
                    step += 1
                    is_done = result.is_done

                    if is_done:
                        logger.info(f'✅ Task completed successfully in {step} steps')
                        break

                if not is_done:
                    logger.info('❌ Maximum number of steps reached')

            except Exception as e:
                logger.info(f'❌ Error in run: {e}')
                # Bare `raise` preserves the original traceback.
                raise
            finally:
                storage_state = await self.browser.get_storage_state()

                if close_context:
                    # Update to close the browser directly
                    await self.browser.close()

            # Fix: guard against `result` still being None (e.g. max_steps <= 0);
            # the unconditional result.model_dump_json() crashed with AttributeError.
            if result is not None:
                span.set_attribute("lmnr.span.output", result.model_dump_json())

            return AgentOutput(
                agent_state=self.get_state() if return_agent_state else None,
                result=result,
                storage_state=storage_state if return_storage_state else None,
                step_count=step,
                trace_id=trace_id,
            )

    async def run_stream(self,
                         prompt: str,
                         max_steps: int = 100,
                         agent_state: str | None = None,
                         parent_span_context: Optional[LaminarSpanContext] = None,
                         close_context: bool = True,
                         timeout: Optional[int] = None,
                         session_id: str | None = None,
                         return_screenshots: bool = False,
                         return_agent_state: bool = False,
                         return_storage_state: bool = False,
                         start_url: str | None = None,
                         output_model: BaseModel | str | None = None
                         ) -> AsyncGenerator[AgentStreamChunk, None]:
        """Execute the task with maximum number of steps and stream step chunks as they happen

        Args:
            prompt: The prompt to execute the task with
            max_steps: The maximum number of steps to execute the task with
            agent_state: The state of the agent to execute the task with
            parent_span_context: Parent span context in Laminar format to execute the task with
            close_context: Whether to close the browser context after the task is executed
            timeout: The timeout for the task
            session_id: Agent session id
            return_screenshots: Whether to return screenshots with the step chunks
            return_agent_state: Whether to return the agent state with the final output chunk
            return_storage_state: Whether to return the storage state with the final output chunk
            start_url: Optional, the URL to start the task with
            output_model: Optional, the output model to use for the task
        """

        # Create a span for the streaming execution. It is ended manually in the
        # finally block because the generator may be abandoned by the consumer.
        span = Laminar.start_span(
            name="agent.run_stream",
            parent_span_context=parent_span_context,
            input={
                "prompt": prompt,
                "max_steps": max_steps,
                "stream": True,
            },
        )

        trace_id = str(uuid.UUID(int=span.get_span_context().trace_id))

        if session_id is not None:
            span.set_attribute("lmnr.internal.agent_session_id", session_id)

        with use_span(span):
            await self._setup_messages(prompt, agent_state, start_url, output_model)

            step = 0
            result = None
            is_done = False

            if timeout is not None:
                start_time = time.time()

            try:
                # Execute steps and yield results
                while not is_done and step < max_steps:
                    logger.info(f'📍 Step {step}')

                    # Re-enter the span context: it can be lost across `yield`s.
                    with use_span(span):
                        result, summary = await self.step(step, result)

                    step += 1
                    is_done = result.is_done

                    screenshot = None
                    if return_screenshots:
                        # NOTE(review): uses the browser's cached state (no await),
                        # unlike update_state() used in step() — confirm intended.
                        state = self.browser.get_state()
                        screenshot = state.screenshot

                    if timeout is not None and time.time() - start_time > timeout:

                        # Timed out: emit a resumable chunk (with agent state if
                        # requested) and stop the stream.
                        yield TimeoutChunk(
                            content=TimeoutChunkContent(
                                action_result=result,
                                summary=summary,
                                step=step,
                                agent_state=self.get_state() if return_agent_state else None,
                                screenshot=screenshot,
                                trace_id=trace_id
                            )
                        )
                        return

                    yield StepChunk(
                        content=StepChunkContent(
                            action_result=result,
                            summary=summary,
                            trace_id=trace_id,
                            screenshot=screenshot
                        )
                    )

                    if is_done:
                        logger.info(f'✅ Task completed successfully in {step} steps')

                        storage_state = await self.browser.get_storage_state()

                        # Yield the final output as a chunk
                        final_output = AgentOutput(
                            agent_state=self.get_state() if return_agent_state else None,
                            result=result,
                            storage_state=storage_state if return_storage_state else None,
                            step_count=step,
                            trace_id=trace_id,
                        )

                        span.set_attribute("lmnr.span.output", result.model_dump_json())
                        yield FinalOutputChunk(content=final_output)

                        break

                if not is_done:
                    logger.info('❌ Maximum number of steps reached')
                    yield StepChunkError(content=f'Maximum number of steps reached: {max_steps}')

            except Exception as e:
                # Streaming runs report errors as chunks instead of raising.
                logger.info(f'❌ Error in run: {e}')
                span.record_exception(e)

                yield StepChunkError(content=f'Error in run stream: {e}')
            finally:
                # Clean up resources
                if close_context:
                    # Update to close the browser directly
                    await self.browser.close()

                span.end()
                logger.info('Stream complete, span closed')

    def get_state(self) -> AgentState:
        """Snapshot the current agent state (message history) for serialization."""
        self.state.messages = self.message_manager.get_messages()

        return self.state
370 |
--------------------------------------------------------------------------------
/index/agent/demo_images/complex_layout_highlight.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/complex_layout_highlight.png
--------------------------------------------------------------------------------
/index/agent/demo_images/complex_layout_small_elements.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/complex_layout_small_elements.png
--------------------------------------------------------------------------------
/index/agent/demo_images/loading.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/loading.png
--------------------------------------------------------------------------------
/index/agent/demo_images/scroll.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lmnr-ai/index/d64bce88d95ce459f75e514a442c6260930f703c/index/agent/demo_images/scroll.png
--------------------------------------------------------------------------------
/index/agent/message_manager.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 | import logging
5 | from datetime import datetime
6 | from typing import List, Optional, Type
7 |
8 | from pydantic import BaseModel
9 |
10 | from index.agent.models import ActionResult, AgentLLMOutput
11 | from index.agent.prompts import system_message
12 | from index.agent.utils import load_demo_image_as_b64, pydantic_to_custom_jtd
13 | from index.browser.models import BrowserState
14 | from index.browser.utils import scale_b64_image
15 | from index.llm.llm import ImageContent, Message, TextContent
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
class MessageManager:
    """Builds and maintains the LLM conversation history for the agent:
    system prompt with demo images, per-step browser-state messages, and
    assistant messages, with prompt-cache control markers."""

    def __init__(
        self,
        action_descriptions: str,
    ):
        """
        Args:
            action_descriptions: Text describing available actions; embedded
                into the system message.
        """
        self._messages: List[Message] = []
        self.action_descriptions = action_descriptions


    def add_system_message_and_user_prompt(self, prompt: str, output_model: Type[BaseModel] | str | None = None) -> None:
        """Append the system message (with few-shot demo screenshots) and the
        initial user prompt. When `output_model` is given (a pydantic model or
        a raw schema string), instructions for structured output are appended.

        NOTE(review): several empty TextContent('') entries below appear to be
        stripped tag markers from the original prompt — confirm against upstream.
        """
        complex_layout_highlight = load_demo_image_as_b64('complex_layout_highlight.png')
        complex_layout_small_elements = load_demo_image_as_b64('complex_layout_small_elements.png')
        still_loading = load_demo_image_as_b64('loading.png')
        scroll_over_element_example = load_demo_image_as_b64('scroll.png')
        system_msg = Message(
            role="system",
            content=[
                # cache_control marks this prefix as cacheable by the provider.
                TextContent(text=system_message(self.action_descriptions), cache_control=True),
            ],
        )

        self._messages.append(system_msg)
        output_model_str = ''
        if output_model:
            output_format = ''
            # Accept either a pydantic model class (converted to a JTD-like
            # schema) or a pre-rendered schema string.
            if isinstance(output_model, type) and issubclass(output_model, BaseModel):
                output_format = json.dumps(pydantic_to_custom_jtd(output_model), indent=2)
            elif isinstance(output_model, str):
                output_format = output_model

            output_model_str = f"""

When you are ready to complete the task use `done_with_structured_output` action. Strictly provide output in the following JSON format and infer which fields best match the information you have gathered:


{output_format}

"""

        self._messages.append(Message(
            role="user",
            content=[
                TextContent(text=''),
                TextContent(text="Here's an example of a complex layout. As an example, if you want to select a 'Roster' section for Colorado Rockies. Then you need to click on element with index 121."),
                ImageContent(image_b64=complex_layout_highlight),
                TextContent(text=''),
                TextContent(text=''),
                TextContent(text="Here's an example of small elements on the page and their functions. Element 7, represented by 'x' icon, is a 'clear text' button. Element 8 is a 'submit' button, represented by '=' icon. This clarification should help you better understand similar layouts."),
                ImageContent(image_b64=complex_layout_small_elements),
                TextContent(text=''),
                TextContent(text=''),
                TextContent(text="Here is an example of a loading page. If the main content on the page is empty or if there are loading elements, such as 'skeleton' screens or loading indicators, page is still loading. Then, you HAVE to perform `wait_for_page_to_load` action because you can't interact with the page until it is fully loaded."),
                ImageContent(image_b64=still_loading),
                TextContent(text=''),
                TextContent(text=''),
                TextContent(text="In some cases, to reveal more content, you need to scroll in scrollable areas of the webpage. Scrollable areas have VERTICAL scrollbars very clearly visible on their right side. In the screenshot below, you can clearly see a scrollbar on the right side of the list of search items. This indicates that the list is scrollable. To scroll over this area, you need to identify any element within the scrollable area and use its index with `scroll_down_over_element` action to scroll over it. In this example, appropriate element is with index 15."),
                ImageContent(image_b64=scroll_over_element_example),
                TextContent(text='', cache_control=True),
                TextContent(text=f"""Here is the task you need to complete:


{prompt}


Today's date and time is: {datetime.now().strftime('%B %d, %Y, %I:%M%p')} - keep this date and time in mind when planning your actions.{output_model_str}"""),
            ]
        ))

    def get_messages_as_state(self) -> List[Message]:
        """Get messages as state messages"""
        return [msg for msg in self._messages if msg.is_state_message]


    def remove_last_message(self) -> None:
        """Remove last message from history"""
        # Never pop the system message (always kept at index 0).
        if len(self._messages) > 1:
            self._messages.pop()

    def add_current_state_message(
        self,
        state: BrowserState,
        previous_result: ActionResult | None = None,
        user_follow_up_message: str | None = None,
    ) -> None:
        """Add browser state as a user message"""

        if state.interactive_elements:
            highlighted_elements = ''
            for element in state.interactive_elements.values():

                # exclude sheets elements
                if element.browser_agent_id.startswith("row_") or element.browser_agent_id.startswith("column_"):
                    continue

                # Render each element as e.g. [12]<button type="submit">Text...
                start_tag = f"[{element.index}]<{element.tag_name}"

                if element.input_type:
                    start_tag += f" type=\"{element.input_type}\""

                start_tag += ">"
                element_text = element.text.replace('\n', ' ')
                highlighted_elements += f"{start_tag}{element_text}{element.tag_name}>\n"
        else:
            highlighted_elements = ''

        scroll_distance_above_viewport = state.viewport.scroll_distance_above_viewport or 0
        scroll_distance_below_viewport = state.viewport.scroll_distance_below_viewport or 0

        # Describe scroll position so the model knows whether more content
        # exists above/below the current viewport.
        if scroll_distance_above_viewport > 0:
            elements_text = f'{scroll_distance_above_viewport}px scroll distance above current viewport\n'
        else:
            elements_text = '[Start of page]\n'

        if highlighted_elements != '':
            elements_text += f'\nHighlighted elements:\n{highlighted_elements}'

        if scroll_distance_below_viewport > 0:
            elements_text += f'\n{scroll_distance_below_viewport}px scroll distance below current viewport\n'
        else:
            elements_text += '\n[End of page]'

        previous_action_output = ''
        if previous_result:
            previous_action_output = f'\n{previous_result.content}\n\n\n' if previous_result.content else ''

            if previous_result.error:
                previous_action_output += f'\n{previous_result.error}\n\n\n'

        if user_follow_up_message:
            user_follow_up_message = f'\n{user_follow_up_message}\n\n\n'
        else:
            user_follow_up_message = ''

        state_description = f"""{previous_action_output}{user_follow_up_message}

Current URL: {state.url}

Open tabs:
{state.tabs}

Current viewport information:
{elements_text}
"""

        # Both the clean screenshot and the highlighted one are attached.
        state_msg = Message(
            role='user',
            content=[
                TextContent(text=state_description),
                TextContent(text=''),
                ImageContent(image_b64=state.screenshot),
                TextContent(text=''),
                TextContent(text=''),
                ImageContent(image_b64=state.screenshot_with_highlights),
                TextContent(text=''),
            ]
        )

        self._messages.append(state_msg)

    def add_message_from_model_output(self, step: int, previous_result: ActionResult | None, model_output: AgentLLMOutput, screenshot: Optional[str] = None) -> None:
        """Add model output as AI message.

        Also trims older state messages down to their first content item
        (dropping stale screenshots) to keep the history compact.
        """
        previous_action_output = ''

        # Keep only the first content item of past state messages.
        for msg in self._messages:
            if msg.is_state_message:
                msg.content = [msg.content[0]]

        if previous_result and screenshot:
            previous_action_output = f'\n{previous_result.content}\n' if previous_result.content else ''

            if previous_result.error:
                previous_action_output += f'\n{previous_result.error}\n'

            usr_msg = Message(
                role='user',
                content=[
                    TextContent(text=previous_action_output, cache_control=True),
                    TextContent(text=f""),
                    # Screenshot is downscaled to reduce token/byte cost.
                    ImageContent(image_b64=scale_b64_image(screenshot, 0.75)),
                    TextContent(text=f""),
                ],
                is_state_message=True,
            )
            self._messages.append(usr_msg)

        assistant_content = [
            TextContent(text=f"""
{model_output.model_dump_json(indent=2, include={"thought", "action", "summary"}).strip()}
"""),
        ]

        # Provider "thinking" blocks must precede the text content.
        if model_output.thinking_block:
            assistant_content = [
                model_output.thinking_block,
            ] + assistant_content

        msg = Message(
            role='assistant',
            content=assistant_content,
        )

        self._messages.append(msg)

    def get_messages(self) -> List[Message]:
        """Return the history, keeping a cache-control marker only on the
        latest non-system message that has one (providers allow few markers)."""
        found_first_cache_control = False

        # clear all past cache control except the latest one
        for msg in self._messages[::-1]:

            # ignore system messages
            if msg.role == 'system':
                continue

            if found_first_cache_control:
                msg.remove_cache_control()

            if msg.has_cache_control():
                found_first_cache_control = True


        return self._messages

    def set_messages(self, messages: List[Message]) -> None:
        """Set messages"""
        self._messages = messages
248 |
--------------------------------------------------------------------------------
/index/agent/models.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Any, Dict, Literal, Optional
4 |
5 | from playwright.async_api import StorageState
6 | from pydantic import BaseModel
7 |
8 | from index.llm.llm import Message, ThinkingBlock
9 |
10 |
class AgentState(BaseModel):
    """State of the agent"""

    # Full conversation history; serialized/restored to resume a run.
    messages: list[Message]
15 |
class ActionResult(BaseModel):
    """Result of executing an action"""

    is_done: Optional[bool] = False  # True when this action completed the overall task
    content: Optional[str | Dict[str, Any]] = None  # action output: free text or structured data
    error: Optional[str] = None  # error description when the action failed
    give_control: Optional[bool] = False  # True when control should be handed to a human
23 |
class ActionModel(BaseModel):
    """Model for an action"""

    name: str  # action identifier, matched against the controller's registered actions
    params: Dict[str, Any]  # keyword arguments passed to the action implementation
29 |
class AgentLLMOutput(BaseModel):
    """Output model for agent"""

    action: ActionModel  # next action the LLM chose to execute
    thought: Optional[str] = None  # model's reasoning for this step
    summary: Optional[str] = None  # short human-readable summary of the step
    thinking_block: Optional[ThinkingBlock] = None  # raw provider "thinking" content, if any
37 |
class AgentOutput(BaseModel):
    """Output model for agent"""

    agent_state: Optional[AgentState] = None  # final message history, when requested
    result: ActionResult  # result of the last executed action
    step_count: int = 0  # number of steps taken during the run
    storage_state: Optional[StorageState] = None  # Playwright storage state, when requested
    trace_id: str | None = None  # Laminar trace id for the run
46 |
class AgentStreamChunk(BaseModel):
    """Base class for chunks in the agent stream"""
    type: str  # discriminator; subclasses pin it with a Literal value
50 |
class StepChunkContent(BaseModel):
    # Payload for a single completed step in the stream.
    action_result: ActionResult  # result of the step's action
    summary: str  # model's short summary of the step
    trace_id: str | None = None  # Laminar trace id for the run
    screenshot: Optional[str] = None  # base64 screenshot, when requested
56 |
class StepChunk(AgentStreamChunk):
    """Chunk containing a step result"""
    type: Literal["step"] = "step"
    content: StepChunkContent
61 |
class TimeoutChunkContent(BaseModel):
    # Payload emitted when a streaming run hits its timeout; includes enough
    # state to resume the run later.
    action_result: ActionResult  # result of the last completed step
    summary: str  # model's short summary of that step
    step: int  # number of steps completed before timing out
    agent_state: AgentState | None = None  # resumable agent state, when requested
    trace_id: str | None = None  # Laminar trace id for the run
    screenshot: Optional[str] = None  # base64 screenshot, when requested
69 |
class TimeoutChunk(AgentStreamChunk):
    """Chunk containing a timeout"""
    type: Literal["step_timeout"] = "step_timeout"
    content: TimeoutChunkContent
74 |
class StepChunkError(AgentStreamChunk):
    """Chunk containing an error"""
    type: Literal["step_error"] = "step_error"
    content: str  # human-readable error description
79 |
class FinalOutputChunk(AgentStreamChunk):
    """Chunk containing the final output"""
    type: Literal["final_output"] = "final_output"
    content: AgentOutput
84 |
--------------------------------------------------------------------------------
/index/agent/prompts.py:
--------------------------------------------------------------------------------
1 | def system_message(action_descriptions: str) -> str:
2 | return f"""You are an advanced AI assistant designed to interact with a web browser and complete user tasks. Your capabilities include analyzing web page screenshots, interacting with page elements, and navigating through websites to accomplish various objectives.
3 |
4 | First, let's review the available actions you can perform:
5 |
6 |
7 | {action_descriptions}
8 |
9 |
10 | Your goal is to complete the user's task by carefully analyzing the current state of the web page, planning your actions, reflecting on the outcomes of the previous actions, and avoiding repetition of unsuccessful approaches. Follow the guidelines below:
11 |
12 | 1. Element Identification:
13 | - Interactable elements on the page are enclosed in uniquely colored bounding boxes with numbered labels.
14 | - Label corresponding to its bounding box is placed at the top right corner of the bounding box, and has exact same color as the bounding box. If the label is larger than the bounding box, the label is placed right outside and tangent to the bounding box.
15 | - Carefully match labels to their corresponding bounding boxes based on the color and position of the label, as labels might slightly overlap with unrelated bounding boxes.
16 | - If bounding box doesn't enclose any element, simply ignore it (most likely the bounding box was incorrectly detected).
17 | - Screenshot enclosed in tag contains clean screenshot of a current browser window.
18 | - Screenshot enclosed in tag has bounding boxes with labels drawn around interactable elements.
19 | - Carefully analyze both screenshots to understand the layout of the page and accurately map bounding boxes to their corresponding elements.
20 | - Remember: each bounding box and corresponding label have the same unique color.
21 |
22 | 2. Element Interaction:
23 | - Infer role and function of elements based on their appearance, text/icon inside the element, and location on the page.
24 | - Interact only with visible elements on the screen.
25 | - Before entering a text into an input area, make sure that you have clicked on the target input area first.
26 | - Scroll or interact with elements to reveal more content if necessary information is not visible.
27 | - To scroll within areas with scrollbars, first identify any element inside the scrollable area and use its index with `scroll_down_over_element` or `scroll_up_over_element` actions instead of scrolling the entire page. Pay attention to the scrollbar position and direction to identify the correct element.
28 | - Some pages have navigation menu on the left, which might contain useful information, such as filters, categories, navigation, etc. Pay close attention to whether the side menu has scrollbars. If it does, scroll over it using an element within the side menu.
29 | - For clicking on a cell in a spreadsheet, first identify the correct column and row that corresponds to the cell you want to click on. Then, strictly use the `click_on_spreadsheet_cell` action to click on the cell. Don't use `click_element` action for interacting with a spreadsheet cells.
30 |
31 | 3. Task Execution:
32 | - After you perform an action, analyze the state screenshot to verify that the intended result was achieved (filter was applied, correct date range was selected, text was entered, etc.). If the result was not achieved, identify the problem and fix it. Be creative and persistent in your approach and don't repeat the same actions that failed.
33 | - Break down multi-step tasks into sub-tasks and complete each sub-task one by one.
34 | - Thoroughly explore all possible approaches before declaring the task complete.
35 | - If you encounter obstacles, consider alternative approaches such as returning to a previous page, initiating a new search, or opening a new tab.
36 | - Understand elements on the page and infer the most relevant ones for the current step of the task.
37 | - Ensure that your final output fully addresses all aspects of the user's request.
38 | - Include ALL requested information in the "done" action. Include markdown-formatted links where relevant and useful.
39 | - Important: For research tasks, be persistent and explore multiple results (at least 5-10) before giving up.
40 | - Be persistent and creative in your approach, e.g., using site-specific Google searches to find precise information.
41 |
42 | 4. Special Situations:
43 | - Cookie popups: Click "I accept" if present. If it persists after clicking, ignore it.
44 | - CAPTCHA: Attempt to solve logically. If unsuccessful, open a new tab and continue the task.
45 |
46 | 5. Returning control to human:
47 | - For steps that require user information to proceed, such as providing first name, last name, email, phone number, booking information, login, password, credit card information, credentials, etc., unless this information was provided in the initial prompt, you must use `give_human_control` action to give human control of the browser.
48 | - If you can't solve the CAPTCHA, use the `give_human_control` action to give human control of the browser to aid you in solving the CAPTCHA.
49 | - Control is guaranteed to be returned to you after the human has entered the information or solved the CAPTCHA, so you should plan your next actions accordingly.
50 |
51 | 6. Source citations:
52 | - When you perform research tasks, include links to the websites that you found the information in your final output.
53 | - In general, include links to the websites that you found the information in your final output.
54 | - Strictly use markdown format for the links, because the final output will be rendered as markdown.
55 |
56 | 7. Spreadsheet interaction:
57 | - To click on a cell in a spreadsheet, use the `click_on_spreadsheet_cell` action to click on a specific cell. DON'T use `click_element` action for interacting with a spreadsheet cells or other elements when the goal is to click on a specific cell.
58 | - To input text into a spreadsheet cell, first click on the cell using the `click_on_spreadsheet_cell` action, then use the `enter_text` action to input text.
59 |
60 | Your response must always be in the following JSON format, enclosed in