├── .env example
├── .gitignore
├── LICENSE
├── README.md
├── poetry.lock
├── pyproject.toml
└── src
    ├── __init__.py
    ├── autogen_configuration
        ├── autogen_config.py
        └── utils
        │   └── oai_config_list.json
    ├── configs
        └── logging
        │   ├── color_formatter.py
        │   ├── logging_config.json
        │   └── logging_config.py
    ├── create_oai_agent
        ├── oai_agent_creator.py
        ├── tool_definition_factory.py
        └── utils
        │   ├── oai_instructions.json
        │   └── tool_definitions.json
    ├── data
        └── assistant_id.json
    ├── oai_agent
        ├── oai_agent.py
        └── utils
        │   ├── create_oai_agent.py
        │   ├── load_assistant_id.py
        │   └── prompt.py
    ├── tests
        ├── test_autogen_config.py
        ├── test_logging_config.py
        └── test_webdriver.py
    ├── tools
        ├── analyze_content.py
        ├── click_element.py
        ├── go_back.py
        ├── input_text.py
        ├── jump_to_search_engine.py
        ├── read_url.py
        ├── save_to_file.py
        ├── scroll.py
        ├── utils
        │   ├── analyze_image.py
        │   ├── get_b64_screenshot.py
        │   ├── get_webdriver_instance.py
        │   ├── highlight_elements.py
        │   ├── load_context.py
        │   ├── mark_page.js
        │   ├── openai_client.py
        │   ├── prompts.json
        │   ├── save_screenshot.py
        │   └── vision_template.py
        └── wait.py
    └── webdriver
        └── webdriver.py


/.env example:
--------------------------------------------------------------------------------
1 | PYTHONDONTWRITEBYTECODE=1
2 | OPENAI_API_KEY=""
3 | OPENAI_MODEL="gpt-4-turbo-preview"


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | src/data/chrome_profile
163 | src/data/screenshots
164 | src/data/saved_data


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MultimodalWebAgent
 2 | 
 3 | [![Watch the video](https://img.youtube.com/vi/jQ2Os682Ybs/0.jpg)](https://www.youtube.com/watch?v=jQ2Os682Ybs&t=0s)
 4 | 
 5 | ## Roadmap
 6 | 
 7 | I aim to develop an open-source variant inspired by [MultiOn AI](https://www.multion.ai/) that focuses on automating web workflows, complemented by a user-friendly interface. My current effort is towards crafting a streamlined yet sophisticated version of this WebAgent within the AutoGen Repository, specifically in the [multimodal_web_surfer branch](https://github.com/microsoft/autogen/tree/multimodal_web_surfer). If you're interested in collaborating on this project or want to create a product that makes a difference, I'm eager to connect.
 8 | 
 9 | ## Introduction
10 | 
11 | This is a multimodal web agent that can understand and generate natural language and visual content. It is implemented using the [AutoGen](https://microsoft.github.io/autogen/) framework and the [Assistants API](https://platform.openai.com/docs/assistants/overview).\
12 | It is based on the paper [WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models](https://arxiv.org/abs/2401.13919).
13 | 
14 | ## Disclaimer
15 | 
16 | This project is still in development and not yet ready for use.\
17 | I managed to get the agent to work, but the results are not yet satisfactory.
18 | The prompt has to be carefully crafted to get good results. For example, the prompt should describe in detail every step the agent has to perform, such as "Go to the website, click on the button using tool X, scroll down using tool Y, click on the next button using tool Z, etc.". With this approach the web agent works quite well, so feel free to try it out, give feedback, and contribute.
19 | 
20 | ## Installation
21 | 
22 | 1. Rename the file '.env example' to '.env' and fill in your OpenAI API key.
23 | 2. Install the required packages using the following command:
24 |    `poetry install`
25 | 
26 | ## Run the agent
27 | 
28 | 1. Craft a precise prompt for the agent. The prompt should describe in detail every step the agent has to perform.
29 |    You can find an example prompt in the file `src/oai_agent/utils/prompt.py`.
30 | 2. Run the agent using the following command:
31 |    `poetry run webagent`
32 | 
33 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "multimodalwebagent"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["schauppi <46676413+schauppi@users.noreply.github.com>"]
 6 | license = "Apache-2.0"
 7 | readme = "README.md"
 8 | packages = [
 9 |     { include = "src", from = ".", format = "sdist" },
10 | ]
11 | 
12 | [tool.poetry.dependencies]
13 | python = ">=3.11,<3.13"
14 | python-dotenv = "^1.0.1"
15 | pyautogen = "^0.2.17"
16 | openai = "^1.13.3"
17 | colorama = "^0.4.6"
18 | tzlocal = "^5.2"
19 | playwright = "^1.42.0"
20 | pytest-playwright = "^0.4.4"
21 | instructor = "^0.6.4"
22 | 
23 | 
24 | [build-system]
25 | requires = ["poetry-core"]
26 | build-backend = "poetry.core.masonry.api"
27 | 
28 | [tool.poetry.scripts]
29 | webagent = 'src.oai_agent.oai_agent:main'
30 | 


--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/schauppi/MultimodalWebAgent/eb57e9f7efc7718c558d614255cbe18d480f7d20/src/__init__.py


--------------------------------------------------------------------------------
/src/autogen_configuration/autogen_config.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | import autogen
 3 | from autogen import config_list_from_json
 4 | import logging
 5 | from dotenv import load_dotenv
 6 | 
 7 | import os
 8 | 
 9 | setup_logging()
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | dotenv_path = os.path.normpath(os.path.join(
14 |     os.path.dirname(__file__), '..', '..', '.env'))
15 | 
16 | try:
17 |     load_dotenv(dotenv_path=dotenv_path)
18 |     logger.info("Environment variables loaded successfully.")
19 | except Exception as e:
20 |     logger.error("Failed to load the .env file.", exc_info=e)
21 | 
22 | 
class GetConfig:
    """
    Get and enrich config from config file.

    Attributes:
        api_key (str): Value of the OPENAI_API_KEY environment variable
            ('' when it is not set).
        config_list (dict): ``{'config_list': [...]}`` as returned by
            load_and_enrich_config_list().

    Methods:
        __init__():
            Initialize with API key and config list.
        base_dir() -> str:
            Returns the base directory path.
        load_and_enrich_config_list() -> dict:
            Loads config list from a JSON file and enriches it with the API key.

    """

    def __init__(self) -> None:
        """
        Initialize with API key and config list.

        Args:
            None
        """
        logger.info('Initializing GetConfig class')
        self.api_key = os.environ.get('OPENAI_API_KEY', '')
        if not self.api_key:
            logger.error('OPENAI_API_KEY not found in environment variables')
        self.config_list = self.load_and_enrich_config_list()

    @property
    def base_dir(self) -> str:
        """
        Returns the base directory path.

        Args:
            None

        Returns:
            str: Absolute path of the directory one level above the directory
                containing this module.
        """
        # abspath() guards against a relative __file__ (e.g. when the module
        # is executed as a script), so the result does not depend on the CWD.
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        logger.info(f'Retrieved base directory path: {base_dir}')
        return base_dir

    def load_and_enrich_config_list(self) -> dict:
        """
        Loads config list from a JSON file and enriches it with the API key.

        The JSON file is located relative to ``base_dir`` rather than the
        current working directory. Entries are filtered by exact match on the
        model named in the OPENAI_MODEL environment variable.

        Args:
            None

        Returns:
            dict: ``{'config_list': [...]}``; the list is empty when loading
                fails or when no entry matches OPENAI_MODEL.
        """
        config_path = os.path.join(
            self.base_dir, 'autogen_configuration', 'utils',
            'oai_config_list.json')
        model = os.environ.get('OPENAI_MODEL', '')
        if not model:
            logger.warning(
                'OPENAI_MODEL not found in environment variables; '
                'no configuration will match the model filter')
        try:
            # filter_dict values are collections of acceptable values. Passing
            # a bare string would turn the membership test into a substring
            # match, so the model name is wrapped in a list.
            config_list = config_list_from_json(
                env_or_file=config_path,
                filter_dict={"model": [model]}
            )
            logger.info('Config list loaded successfully')
            if not config_list:
                logger.warning(
                    'No configuration in %s matches model %r',
                    config_path, model)
            for config in config_list:
                config['api_key'] = self.api_key
            logger.info('Config list enriched with API key')
        except Exception as e:
            logger.error(
                "Failed to load or enrich the config list.", exc_info=e)
            config_list = []
        return {'config_list': config_list}
90 | 
91 | 
# Module-level GetConfig instance, created as a side effect of importing this
# module (this runs __init__, which reads the environment and loads the JSON
# config list).
# NOTE(review): originally labelled "test" - confirm whether other modules
# import `config` before removing or guarding this line.
config = GetConfig()
94 | 


--------------------------------------------------------------------------------
/src/autogen_configuration/utils/oai_config_list.json:
--------------------------------------------------------------------------------
1 | [
2 |     {
3 |         "model": "gpt-4-turbo-preview",
4 |         "api_key": ""
5 |     }
6 | ]


--------------------------------------------------------------------------------
/src/configs/logging/color_formatter.py:
--------------------------------------------------------------------------------
 1 | from colorama import init, Fore, Back, Style
 2 | import logging
 3 | 
 4 | init()
 5 | 
 6 | 
 7 | class ColoredFormatter(logging.Formatter):
 8 |     """
 9 |     A custom logging formatter that adds color to log messages based on their log level.
10 | 
11 |     Methods:
12 |         format(record: logging.LogRecord) -> str:
13 |             Formats the log record and adds color to the log message based on the log level.
14 | 
15 |     """
16 | 
17 |     COLOR_CODES = {
18 |         "DEBUG": Fore.WHITE,
19 |         "INFO": Fore.GREEN,
20 |         "WARNING": Fore.YELLOW,
21 |         "ERROR": Fore.RED,
22 |         "CRITICAL": Back.RED,
23 |     }
24 |     RESET = Style.RESET_ALL
25 | 
26 |     def format(self, record: logging.LogRecord) -> str:
27 |         """
28 |         Formats the log record and adds color to the log message based on the log level.
29 | 
30 |         Args:
31 |             record (logging.LogRecord): The log record to format.
32 | 
33 |         Returns:
34 |             str: The formatted log message with color added.
35 | 
36 |         """
37 |         color_code = self.COLOR_CODES.get(record.levelname, self.RESET)
38 |         message = super().format(record)
39 |         return f"{color_code}{message}{self.RESET}"
40 | 


--------------------------------------------------------------------------------
/src/configs/logging/logging_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "version": 1,
 3 |     "handlers": {
 4 |         "console_info": {
 5 |             "class": "logging.StreamHandler",
 6 |             "level": "INFO",
 7 |             "formatter": "simple",
 8 |             "stream": "ext://sys.stdout"
 9 |         },
10 |         "console_warning": {
11 |             "class": "logging.StreamHandler",
12 |             "level": "WARNING",
13 |             "formatter": "simple",
14 |             "stream": "ext://sys.stdout"
15 |         },
16 |         "console_error": {
17 |             "class": "logging.StreamHandler",
18 |             "level": "ERROR",
19 |             "formatter": "error",
20 |             "stream": "ext://sys.stderr"
21 |         }
22 |     },
23 |     "formatters": {
24 |         "simple": {
25 |             "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
26 |         },
27 |         "error": {
28 |             "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
29 |         }
30 |     },
31 |     "root": {
32 |         "level": "INFO",
33 |         "handlers": ["console_info", "console_warning", "console_error"]
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/src/configs/logging/logging_config.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import logging.config
 4 | from typing import Optional
 5 | 
 6 | from src.configs.logging.color_formatter import ColoredFormatter
 7 | 
 8 | 
 9 | def setup_logging(default_path: str = 'src/configs/logging/logging_config.json', default_level: int = logging.INFO) -> None:
10 |     """
11 |     Setup logging configuration.
12 | 
13 |     Args:
14 |         default_path (str): The default path to the logging configuration file.
15 |         default_level (int): The default logging level.
16 | 
17 |     Returns:
18 |         None
19 |     """
20 |     path = default_path
21 |     try:
22 |         if os.path.exists(path):
23 |             with open(path, 'rt') as f:
24 |                 config = json.load(f)
25 | 
26 |             logging.config.dictConfig(config)
27 | 
28 |             for _, handler_details in config['handlers'].items():
29 |                 if 'formatter' in handler_details:
30 |                     formatter_config = config['formatters'][handler_details['formatter']]
31 |                     formatter = ColoredFormatter(
32 |                         fmt=formatter_config['format'])
33 |                     logging.getLogger().handlers[0].setFormatter(formatter)
34 |         else:
35 |             logging.basicConfig(level=default_level)
36 |     except Exception as e:
37 |         print(f"Error occurred while setting up logging: {e}")
38 | 


--------------------------------------------------------------------------------
/src/create_oai_agent/oai_agent_creator.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import os
  4 | from datetime import datetime
  5 | from dotenv import load_dotenv
  6 | import openai
  7 | 
  8 | from src.configs.logging.logging_config import setup_logging
  9 | from src.create_oai_agent.tool_definition_factory import ToolDefinitionFactory
 10 | 
 11 | setup_logging()
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | class OAIAssistantCreator:
 16 |     """
 17 |     Class responsible for creating an OpenAI assistant with specified tools and instructions.
 18 | 
 19 |     Methods:
 20 |         load_environment() -> None:
 21 |             Load environment variables from the .env file.
 22 |         initialize_openai_client() -> None:
 23 |             Initialize the OpenAI client with the API key from environment variables.
 24 |         load_instructions() -> str:
 25 |             Load instructions for the assistant from the specified JSON file
 26 |     """
 27 | 
 28 |     def __init__(self, dotenv_path: str, config_path: str, instruction_path: str, assistant_save_path: str, assistant_type: str) -> None:
 29 |         """
 30 |         Initialize the creator with paths to necessary configurations and instructions.
 31 | 
 32 |         Args:
 33 |             dotenv_path (str): Path to the .env file.
 34 |             config_path (str): Path to the tool definitions JSON file.
 35 |             instruction_path (str): Path to the instructions JSON file.
 36 |             assistant_save_path (str): Path to the file to save the assistant ID.
 37 |             assistant_name (str): Name of the assistant to be created.
 38 | 
 39 |         Returns:
 40 |             None
 41 |         """
 42 |         self.dotenv_path = dotenv_path
 43 |         self.config_path = config_path
 44 |         self.instruction_path = instruction_path
 45 |         self.assistant_save_path = assistant_save_path
 46 |         self.assistant_type = assistant_type
 47 |         self.client = None
 48 |         logger.info(
 49 |             "OAIAssistantCreator initialized with config and instruction paths.")
 50 | 
 51 |     def load_environment(self) -> None:
 52 |         """
 53 |         Load environment variables from the .env file.
 54 | 
 55 |         Args:
 56 |             None
 57 | 
 58 |         Returns:
 59 |             None
 60 |         """
 61 |         try:
 62 |             load_dotenv(dotenv_path=self.dotenv_path)
 63 |             logger.info("Environment variables loaded successfully.")
 64 |         except Exception as e:
 65 |             logger.error("Failed to load the .env file.", exc_info=True)
 66 | 
 67 |     def initialize_openai_client(self) -> None:
 68 |         """
 69 |         Initialize the OpenAI client with the API key from environment variables.
 70 | 
 71 |         Args:
 72 |             None
 73 | 
 74 |         Returns:
 75 |             None
 76 |         """
 77 |         try:
 78 |             self.client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
 79 |             logger.info("OpenAI client initialized successfully.")
 80 |         except KeyError as e:
 81 |             logger.error(
 82 |                 "OPENAI_API_KEY not found in environment variables.", exc_info=True)
 83 |             raise
 84 | 
 85 |     def load_instructions(self) -> str:
 86 |         """
 87 |         Load instructions for the assistant from the specified JSON file.
 88 | 
 89 |         Args:
 90 |             None
 91 | 
 92 |         Returns:
 93 |             str: The loaded instructions.
 94 |         """
 95 |         try:
 96 |             with open(self.instruction_path, 'r') as file:
 97 |                 instruction = json.load(file)["instruction"]
 98 |             logger.info("Instructions loaded successfully.")
 99 |             return instruction
100 |         except (FileNotFoundError, json.JSONDecodeError) as e:
101 |             logger.error("Failed to load instructions.", exc_info=True)
102 |             raise
103 | 
104 |     def create_assistant(self) -> str:
105 |         """
106 |         Create an OpenAI assistant with the loaded tools and instructions.
107 | 
108 |         Args:
109 |             None
110 | 
111 |         Returns:
112 |             str: The ID of the created assistant.
113 |         """
114 |         try:
115 |             factory = ToolDefinitionFactory(self.config_path)
116 |             oai_tools = [factory.create_tool_definition(
117 |                 tool_type) for tool_type in factory.config]
118 | 
119 |             instruction = self.load_instructions()
120 |             assistant = self.client.beta.assistants.create(
121 |                 model=os.environ.get('OPENAI_MODEL', ''),
122 |                 instructions=instruction,
123 |                 tools=oai_tools,
124 |                 name=self.assistant_type
125 |             )
126 |             logger.info(
127 |                 f"Assistant created successfully with ID: {assistant.id}")
128 |             return assistant.id
129 |         except Exception as e:
130 |             logger.error("Failed to create assistant.", exc_info=True)
131 |             raise
132 | 
133 |     def save_assistant_id(self, assistant_id: str) -> None:
134 |         """
135 |         Save the assistant's ID to a JSON file.
136 | 
137 |         Args:
138 |             assistant_id (str): The ID of the assistant to save.
139 | 
140 |         Returns:
141 |             None
142 |         """
143 |         data_to_append = {
144 |             "type": self.assistant_type,
145 |             "id": assistant_id,
146 |             "date": str(datetime.now())
147 |         }
148 | 
149 |         try:
150 |             with open(self.assistant_save_path, "r") as file:
151 |                 data = json.load(file)
152 |                 if not isinstance(data, list):
153 |                     data = []
154 |             logger.info("Existing assistant data loaded successfully.")
155 |         except (FileNotFoundError, json.JSONDecodeError) as e:
156 |             data = []
157 |             logger.info(
158 |                 "No existing assistant data found or error in loading; starting fresh.")
159 | 
160 |         data.append(data_to_append)
161 | 
162 |         with open(self.assistant_save_path, "w") as file:
163 |             json.dump(data, file, indent=4)
164 |             logger.info(f"Assistant ID {assistant_id} saved successfully.")
165 | 
166 |     def run(self) -> None:
167 |         """
168 |         Run the process of creating an assistant and saving its ID.
169 | 
170 |         Args:
171 |             None
172 | 
173 |         Returns:
174 |             None
175 |         """
176 |         logger.info("Starting the OAIAssistantCreator process.")
177 |         self.load_environment()
178 |         self.initialize_openai_client()
179 |         assistant_id = self.create_assistant()
180 |         self.save_assistant_id(assistant_id)
181 |         logger.info("OAIAssistantCreator process completed successfully.")
182 | 
183 | 
184 | """if __name__ == "__main__":
185 |     dotenv_path = os.path.normpath(os.path.join(
186 |         os.path.dirname(__file__), '..', '..', '.env'))
187 |     config_path = 'src/create_oai_agent/utils/tool_definitions.json'
188 |     instruction_path = 'src/create_oai_agent/utils/oai_instructions.json'
189 |     assistant_save_path = "src/data/assistant_id.json"
190 | 
191 |     creator = OAIAssistantCreator(
192 |         dotenv_path, config_path, instruction_path, assistant_save_path, "BrowsingAgent")
193 |     creator.run()"""
194 | 


--------------------------------------------------------------------------------
/src/create_oai_agent/tool_definition_factory.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | import json
 3 | import logging
 4 | 
# Configure logging when this module is imported.
setup_logging()

# Module-scoped logger used by ToolDefinitionFactory for error reporting.
logger = logging.getLogger(__name__)
 8 | 
 9 | 
class ToolDefinitionFactory:
    """
    Builds OpenAI function-tool definitions from a JSON configuration file.

    Methods:
        load_config(path: str) -> dict:
            Read and parse the JSON configuration file.
        create_tool_definition(tool_type: str) -> dict:
            Produce the function-tool definition for one configured tool type.
    """

    def __init__(self, config_path: str) -> None:
        """
        Load the tool configuration and keep it on the instance.

        Args:
            config_path (str): The path to the configuration file.
        """
        self.config = self.load_config(config_path)

    def load_config(self, path: str) -> dict:
        """
        Parse the JSON configuration file at the given path.

        Args:
            path (str): The path to the configuration file.

        Returns:
            dict: The parsed configuration data.

        Raises:
            FileNotFoundError: If no file exists at the path (logged first).
            json.JSONDecodeError: If the file is not valid JSON (logged first).
        """
        try:
            with open(path, 'r') as config_file:
                parsed = json.load(config_file)
        except FileNotFoundError:
            logger.error(f"Config file not found at path: {path}")
            raise
        except json.JSONDecodeError:
            logger.error(
                f"Failed to parse JSON from config file at path: {path}")
            raise
        return parsed

    def create_tool_definition(self, tool_type: str) -> dict:
        """
        Build the function-tool definition for the given tool type.

        Args:
            tool_type (str): The key of the tool in the loaded configuration.

        Returns:
            dict: A definition of type 'function' carrying the tool's name,
            description and an object-typed parameter schema.

        Raises:
            ValueError: If the tool type is absent from the configuration or
                its entry is empty.
            KeyError: If the entry lacks 'name', 'description', 'parameters'
                or 'required'.
        """
        entry = self.config.get(tool_type)
        if entry:
            try:
                # Read the fields in a fixed order so the first missing key is
                # the one reported.
                name = entry['name']
                description = entry['description']
                properties = entry['parameters']
                required = entry['required']
            except KeyError as e:
                logger.error(
                    f"Key {e} not found in tool config for type: {tool_type}")
                raise

            schema = {
                "properties": properties,
                "required": required,
                "type": "object"
            }
            function_spec = {
                "name": name,
                "description": description,
                "parameters": schema
            }
            return {"type": 'function', "function": function_spec}

        # Missing or empty entry: log, then signal the caller.
        logger.error(f"No tool definition found for type: {tool_type}")
        raise ValueError(f"No tool definition found for type: {tool_type}")
83 | 


--------------------------------------------------------------------------------
/src/create_oai_agent/utils/oai_instructions.json:
--------------------------------------------------------------------------------
1 | {
2 |     "instruction": "As an advanced web browsing assistant, you possess capabilities that mimic human web interaction but are enhanced by specialized tools for web navigation and analysis. Your primary role involves structured and efficient web page interaction, utilizing a suite of tools designed for a broad spectrum of web-related tasks. These tasks include text input, element clicking, navigating search engines, URL reading, page scrolling, waiting for elements to load, analyzing content via screenshots, and navigating back in browser history.\n\n##Your Toolkit Includes:\n\n- **Input Text Tool:** Enables interaction with web page input fields. Use it to fill forms or conduct site-specific searches by entering text as described by users.\n\n- **Click Element Tool:** Allows you to click on described webpage elements, such as buttons, links, or checkboxes, facilitating site navigation and interaction.\n\n- **Jump to Search Engine Tool:** Directs you to a search engine (Google) for initiating searches or web navigation from a known point.\n\n- **Read URL Tool:** Lets you load webpages from specific URLs, granting access to any required site.\n\n- **Scroll Tool:** Enables vertical webpage scrolling to uncover content not immediately visible upon page load.\n\n- **Wait Tool:** Pauses operations for a specified duration, useful for allowing page elements to load or introducing delays between interactions.\n\n- **Analyze Content Tool:** Analyzes webpage content via screenshots, enabling comprehension and reporting on page information or layout.\n\n- **Go Back Tool:** Navigates back one page in the browser history, useful for retracing steps or revisiting previously viewed content efficiently.\n\n- **Save to File Tool:** Saves important information to a file for record-keeping or later use.\n\n##Objective:\n\nUtilize these tools to fulfill user-assigned tasks, ranging from locating specific webpage information, form submissions, to navigating through webpages for 
complex workflows. Your responses should be accurate and efficient, addressing the user's request while navigating the dynamic web environment.\n\n##Execution Steps:\n\n1. **Identify the Goal:** Understand the user's request and the desired outcome.\n2. **Select Appropriate Tools:** Choose the tools necessary for accomplishing the goal.\n3. **Execute the Task:** Employ the selected tools to interact with the web as required.\n4. **Report Back:** Provide the user with requested information, action confirmations, or findings explanations.\n\nEfficiency, accuracy in interpreting user needs, selecting the right tools, and effective navigation are key metrics of your performance."
3 | }


--------------------------------------------------------------------------------
/src/create_oai_agent/utils/tool_definitions.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "input_text": {
  3 |       "name": "input_text",
  4 |       "description": "Interacts with a webpage by sending keys to input fields based on a provided query, and returns a response string with the result of the action or an error message. The process includes highlighting input elements, taking a screenshot for analysis, and then processing the inputs as per the query.",
  5 |       "parameters": {
  6 |         "query": {
  7 |           "description": "The query to be used for identifying the input fields and the keys to be sent.",
  8 |           "examples": [
  9 |             "Type 'OpenAI' into the 'Search' input field.",
 10 |             "Type example@gmail.com into the 'Email' input field, and type 'securePassword!' into the 'Password' input field.",
 11 |             "Enter '123 Main St' into the address field and select 'United States' from the country dropdown."
 12 |           ],
 13 |           "title": "Query",
 14 |           "type": "string"
 15 |         }
 16 |       },
 17 |       "required": ["query"]
 18 |     },
 19 |     "click_element": {
 20 |       "name": "click_element",
 21 |       "description": "Clicks on a webpage element based on a user query.",
 22 |       "parameters": {
 23 |         "query": {
 24 |           "description": "A query string representing the user's description of the target element to click.",
 25 |           "examples": [
 26 |             "Click the 'Submit' button.",
 27 |             "Click on the link titled 'Read More'.",
 28 |             "Select the checkbox next to 'I Agree'."
 29 |           ],
 30 |           "title": "Query",
 31 |           "type": "string"
 32 |         }
 33 |       },
 34 |       "required": ["query"]
 35 |     },
 36 |     "jump_to_search_engine": {
 37 |       "name": "jump_to_search_engine",
 38 |       "description": "Navigates to the Google search engine using a WebDriver instance, and returns a response string with the result of the action and the current URL.",
 39 |       "parameters": {},
 40 |       "required": []
 41 |     },
 42 |     "read_url": {
 43 |       "name": "read_url",
 44 |       "description": "Reads the content of a URL using a WebDriver instance adapted for Playwright, executes a script to remove popups, and returns the current URL.",
 45 |       "parameters": {
 46 |         "url": {
 47 |           "description": "The URL to read.",
 48 |           "examples": [
 49 |             "https://www.example.com",
 50 |             "www.example.com"
 51 |           ],
 52 |           "title": "URL",
 53 |           "type": "string"
 54 |         }
 55 |       },
 56 |       "required": ["url"]
 57 |     },
 58 |     "scroll": {
 59 |       "name": "scroll",
 60 |       "description": "Scrolls the webpage in a specified direction ('up' or 'down') by a fixed amount of pixels and returns a response string with the result of the action.",
 61 |       "parameters": {
 62 |         "direction": {
 63 |           "description": "The direction to scroll the webpage in.",
 64 |           "enum": ["up", "down"],
 65 |           "examples": ["up", "down"],
 66 |           "title": "Direction",
 67 |           "type": "string"
 68 |         }
 69 |       },
 70 |       "required": ["direction"]
 71 |     },
 72 |     "wait": {
 73 |       "name": "wait",
 74 |       "description": "Waits for 5 seconds and returns a response string with the result of the action and the current URL.",
 75 |       "parameters": {},
 76 |       "required": []
 77 |     },
 78 |     "analyze_content": {
 79 |       "name": "analyze_content",
 80 |       "description": "Analyzes the content of a webpage based on a screenshot and a user query, and returns a response string with insights and answers.",
 81 |       "parameters": {
 82 |         "query": {
 83 |           "description": "A query string representing the user's inquiry about the contents of the current web browser window.",
 84 |           "examples": [
 85 |             "What are the main headlines in this news article screenshot?",
 86 |             "Is there any contact information available on this page?"
 87 |           ],
 88 |           "title": "Query",
 89 |           "type": "string"
 90 |         }
 91 |       },
 92 |       "required": ["query"]
 93 |     },
 94 |     "go_back": {
 95 |       "name": "go_back",
 96 |       "description": "Navigates back one page in the browser history using a WebDriver instance, and returns a response string with the result of the action and the current URL.",
 97 |       "parameters": {},
 98 |       "required": []
 99 |     },
100 |     "save_to_file": {
101 |       "name": "save_to_file",
102 |       "description": "Saves the data to a file.",
103 |       "parameters": {
104 |         "data": {
105 |           "description": "The data to save to the file.",
106 |           "examples": [
107 |             "This is some example data.",
108 |             "{\"name\": \"John\", \"age\": 30}",
109 |             "12345"
110 |           ],
111 |           "title": "Data",
112 |           "type": "string"
113 |         }
114 |       },
115 |       "required": ["data"]
116 |     }
117 |   }
118 |   


--------------------------------------------------------------------------------
/src/data/assistant_id.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {
 3 |         "type": "oai_browsing_assistant",
 4 |         "id": "asst_gX0omAoCJ8A9vtbJkMLYtXd6",
 5 |         "date": "2024-03-10 11:03:05.827440"
 6 |     },
 7 |     {
 8 |         "type": "oai_browsing_assistant",
 9 |         "id": "asst_WoIiFNMSnhKOgq6c36ET1wWb",
10 |         "date": "2024-03-10 11:06:14.870218"
11 |     },
12 |     {
13 |         "type": "oai_browsing_assistant",
14 |         "id": "asst_cvX7mpjg9oacOrUwGMD0RX5C",
15 |         "date": "2024-03-10 11:08:08.545855"
16 |     },
17 |     {
18 |         "type": "oai_browsing_assistant",
19 |         "id": "asst_6vU1tjrRqMf1QhB04DEDLobF",
20 |         "date": "2024-03-11 13:15:11.977078"
21 |     },
22 |     {
23 |         "type": "oai_browsing_assistant",
24 |         "id": "asst_gnT2lP147V1QCW4TFv6pDFim",
25 |         "date": "2024-03-11 13:17:07.707864"
26 |     },
27 |     {
28 |         "type": "oai_browsing_assistant",
29 |         "id": "asst_hZmZxOScHFufdhTsQOlw4dV0",
30 |         "date": "2024-03-11 13:41:12.295331"
31 |     },
32 |     {
33 |         "type": "oai_browsing_assistant",
34 |         "id": "asst_BzvWlaRSCb38ulMSCZXW3d2L",
35 |         "date": "2024-03-13 18:35:25.562663"
36 |     },
37 |     {
38 |         "type": "BrowsingAgent",
39 |         "id": "asst_YGjquz12EX0M1hgk8QR1sTKY",
40 |         "date": "2024-03-13 18:38:23.557789"
41 |     },
42 |     {
43 |         "type": "BrowsingAgent",
44 |         "id": "asst_prAKf2fRPin7eWyotdcaEVHh",
45 |         "date": "2024-03-14 08:00:30.929771"
46 |     }
47 | ]


--------------------------------------------------------------------------------
/src/oai_agent/oai_agent.py:
--------------------------------------------------------------------------------
  1 | from src.configs.logging.logging_config import setup_logging
  2 | from src.oai_agent.utils.load_assistant_id import load_assistant_id
  3 | from src.oai_agent.utils.create_oai_agent import create_agent
  4 | from src.autogen_configuration.autogen_config import GetConfig
  5 | from src.tools.read_url import read_url
  6 | from src.tools.scroll import scroll
  7 | from src.tools.jump_to_search_engine import jump_to_search_engine
  8 | from src.tools.go_back import go_back
  9 | from src.tools.wait import wait
 10 | from src.tools.click_element import click_element
 11 | from src.tools.input_text import input_text
 12 | from src.tools.analyze_content import analyze_content
 13 | from src.tools.save_to_file import save_to_file
 14 | from src.oai_agent.utils.prompt import prompt
 15 | 
 16 | import logging
 17 | import autogen
 18 | from autogen.agentchat import AssistantAgent
 19 | from autogen.agentchat.contrib.gpt_assistant_agent import GPTAssistantAgent
 20 | 
 21 | import openai
 22 | 
# Configure logging when this module is imported, then bind a module-scoped
# logger used by the agent setup functions below.
setup_logging()
logger = logging.getLogger(__name__)
 25 | 
 26 | 
 27 | def configure_agent(assistant_type: str) -> GPTAssistantAgent:
 28 |     """
 29 |     Configure the GPT Assistant Agent with the specified tools and instructions.
 30 | 
 31 |     Args:
 32 |         None
 33 | 
 34 |     Returns:
 35 |         GPTAssistantAgent: An instance of the GPTAssistantAgent.
 36 |     """
 37 |     try:
 38 |         logger.info("Configuring GPT Assistant Agent...")
 39 |         assistant_id = load_assistant_id(assistant_type)
 40 |         llm_config = GetConfig().config_list
 41 |         oai_config = {
 42 |             "config_list": llm_config["config_list"], "assistant_id": assistant_id}
 43 |         gpt_assistant = GPTAssistantAgent(
 44 |             name=assistant_type, instructions=AssistantAgent.DEFAULT_SYSTEM_MESSAGE, llm_config=oai_config
 45 |         )
 46 |         logger.info("GPT Assistant Agent configured.")
 47 |         return gpt_assistant
 48 |     except openai.NotFoundError:
 49 |         logger.warning("Assistant not found. Creating new assistant...")
 50 |         create_agent(assistant_type)
 51 |         return configure_agent()
 52 |     except Exception as e:
 53 |         logger.error(f"Unexpected error during agent configuration: {str(e)}")
 54 |         raise
 55 | 
 56 | 
 57 | def register_functions(agent):
 58 |     """
 59 |     Register the functions used by the GPT Assistant Agent.
 60 | 
 61 |     Args:
 62 |         agent (GPTAssistantAgent): An instance of the GPTAssistantAgent.
 63 | 
 64 |     Returns:
 65 |         None
 66 |     """
 67 |     logger.info("Registering functions...")
 68 |     function_map = {
 69 |         "analyze_content": analyze_content,
 70 |         "click_element": click_element,
 71 |         "go_back": go_back,
 72 |         "input_text": input_text,
 73 |         "jump_to_search_engine": jump_to_search_engine,
 74 |         "read_url": read_url,
 75 |         "scroll": scroll,
 76 |         "wait": wait,
 77 |         "save_to_file": save_to_file,
 78 |     }
 79 |     agent.register_function(function_map=function_map)
 80 |     logger.info("Functions registered.")
 81 | 
 82 | 
 83 | def create_user_proxy():
 84 |     """
 85 |     Create a User Proxy Agent.
 86 | 
 87 |     Args:
 88 |         None
 89 | 
 90 |     Returns:
 91 |         UserProxyAgent: An instance of the UserProxyAgent.
 92 |     """
 93 |     logger.info("Creating User Proxy Agent...")
 94 |     user_proxy = autogen.UserProxyAgent(
 95 |         name="user_proxy",
 96 |         is_termination_msg=lambda msg: "TERMINATE" in msg["content"],
 97 |         human_input_mode="NEVER",
 98 |         code_execution_config={
 99 |             "work_dir": "coding",
100 |             "use_docker": False,
101 |         },
102 |     )
103 |     logger.info("User Proxy Agent created.")
104 |     return user_proxy
105 | 
106 | 
def main():
    """
    Entry point: build the browsing assistant and start the chat.

    Configures the "BrowsingAgent" assistant, registers its tools, creates the
    user proxy and lets the proxy open the conversation with the task prompt.
    Any failure along the way is logged rather than propagated.

    Args:
        None

    Returns:
        None
    """
    try:
        assistant = configure_agent("BrowsingAgent")
        register_functions(assistant)
        proxy = create_user_proxy()
        proxy.initiate_chat(assistant, message=prompt)
    except Exception as err:
        logger.error(f"An error occurred: {str(err)}")


if __name__ == "__main__":
    main()
129 | 


--------------------------------------------------------------------------------
/src/oai_agent/utils/create_oai_agent.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | from src.create_oai_agent.oai_agent_creator import OAIAssistantCreator
 3 | 
 4 | import logging
 5 | import os
 6 | 
 7 | setup_logging()
 8 | logger = logging.getLogger()
 9 | 
10 | 
def create_agent(assistant_type: str):
    """
    Create an OpenAI assistant with the specified tools and instructions.

    Creation is best-effort: any error is logged and not re-raised.

    Args:
        assistant_type (str): The type of assistant to create.

    Returns:
        None
    """
    try:
        # This module lives in src/oai_agent/utils, so the repository root
        # (where the .env file is kept) is three directories up.
        dotenv_path = os.path.normpath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        # The remaining paths are relative to the current working directory.
        config_path = 'src/create_oai_agent/utils/tool_definitions.json'
        instruction_path = 'src/create_oai_agent/utils/oai_instructions.json'
        assistant_save_path = "src/data/assistant_id.json"
        logger.info("Creating OpenAI Assistant...")
        creator = OAIAssistantCreator(
            dotenv_path, config_path, instruction_path, assistant_save_path, assistant_type)
        creator.run()
    except Exception as e:
        logger.error(f"An error occurred while creating the agent: {str(e)}")
33 | 


--------------------------------------------------------------------------------
/src/oai_agent/utils/load_assistant_id.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | 
 3 | import logging
 4 | import json
 5 | import datetime
 6 | 
 7 | setup_logging()
 8 | logger = logging.getLogger()
 9 | 
10 | 
def load_assistant_id(assistant_type: str) -> str:
    """
    Load the ID of the most recently saved assistant of the given type.

    Reads the list of records in src/data/assistant_id.json (relative to the
    current working directory), keeps those whose "type" matches and returns
    the "id" of the one with the latest "date".

    Args:
    - assistant_type (str): The type of assistant to load.

    Returns:
    - assistant_id (str): The assistant ID, or None when the file is missing
      or invalid, or when no record of the requested type exists. Errors are
      logged, never raised.
    """
    try:
        with open("src/data/assistant_id.json", "r") as file:
            data = json.load(file)

        matching = [
            entry for entry in data if entry['type'] == assistant_type]
        if not matching:
            # Report the real cause instead of an opaque "max() arg is an
            # empty sequence" error.
            logger.warning(
                f"No saved assistant found for type: {assistant_type}")
            return None

        # fromisoformat accepts timestamps with or without a fractional
        # second, so a record saved at exactly zero microseconds still parses.
        latest_entry = max(
            matching,
            key=lambda entry: datetime.datetime.fromisoformat(entry['date']))
        return latest_entry['id']
    except FileNotFoundError:
        logger.error("Assistant ID file not found.")
        return None
    except (json.JSONDecodeError, KeyError) as e:
        logger.error(f"Error loading assistant ID: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"An error occurred while loading assistant ID: {str(e)}")
        return None
40 | 


--------------------------------------------------------------------------------
/src/oai_agent/utils/prompt.py:
--------------------------------------------------------------------------------
 1 | prompt = """
 2 | 
 3 | TASK: Go to the Amazon website and search for a laptop, filter for laptops with more than 4 stars, select the first and put in in the cart.
 4 | 
 5 | 1. Go to the website https://www.amazon.com using the 'read_url'.
 6 | 2. Search for 'laptop' using the 'input_text'.
 7 | 3. Click on the more than 4 stars filter using the 'click_element'.
 8 | 3. Click on the first product using the 'click_element'.
 9 | 4. Add the item to the cart clicking on the 'Add to Cart' button using the 'click_element'.
10 | 
11 | 
12 | Write 'TERMINATE' to end the conversation.
13 | 
14 | """
15 | 


--------------------------------------------------------------------------------
/src/tests/test_autogen_config.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from unittest.mock import patch
 3 | from src.autogen_configuration.autogen_config import GetConfig
 4 | 
 5 | 
 6 | class TestGetConfig(unittest.TestCase):
 7 | 
 8 |     def setUp(self):
 9 |         self.get_config = GetConfig()
10 | 
11 |     @patch('src.autogen_configuration.autogen_config.config_list_from_json')
12 |     def test_load_and_enrich_config_list(self, mock_config_list_from_json):
13 |         mock_config_list = [
14 |             {'model': 'model1'},
15 |             {'model': 'model2'}
16 |         ]
17 |         mock_config_list_from_json.return_value = mock_config_list
18 | 
19 |         expected_config_list = [
20 |             {'model': 'model1', 'api_key': 'test_api_key'},
21 |             {'model': 'model2', 'api_key': 'test_api_key'}
22 |         ]
23 | 
24 |         self.get_config.api_key = 'test_api_key'
25 |         config_list = self.get_config.load_and_enrich_config_list()
26 | 
27 |         self.assertEqual(config_list['config_list'], expected_config_list)
28 | 
29 | 
30 | if __name__ == '__main__':
31 |     unittest.main()
32 | 


--------------------------------------------------------------------------------
/src/tests/test_logging_config.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from unittest.mock import patch, mock_open
 3 | import os
 4 | import logging.config
 5 | # Assuming your updated code is in a file named log_config.py
 6 | from src.configs.logging.logging_config import setup_logging
 7 | 
 8 | 
 9 | class TestSetupLogging(unittest.TestCase):
10 | 
11 |     @patch('os.path.exists', return_value=True)
12 |     @patch('builtins.open', new_callable=mock_open, read_data='{"version": 1}')
13 |     @patch('logging.config.dictConfig')
14 |     def test_default_path(self, mock_dict_config, mock_file, mock_exists):
15 |         setup_logging()
16 |         mock_exists.assert_called_once_with(
17 |             'src/configs/logging/logging_config.json')
18 |         mock_file.assert_called_once_with(
19 |             'src/configs/logging/logging_config.json', 'rt')
20 |         mock_dict_config.assert_called_once()
21 | 
22 |     @patch('os.path.exists', return_value=False)
23 |     @patch('logging.basicConfig')
24 |     def test_fallback_to_basic_config(self, mock_basic_config, mock_exists):
25 |         setup_logging()
26 |         mock_exists.assert_called_once_with(
27 |             'src/configs/logging/logging_config.json')
28 |         mock_basic_config.assert_called_once_with(level=logging.INFO)
29 | 
30 | 
31 | if __name__ == '__main__':
32 |     unittest.main()
33 | 


--------------------------------------------------------------------------------
/src/tests/test_webdriver.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from unittest.mock import patch, MagicMock
 3 | from src.webdriver.webdriver import WebDriver
 4 | import locale
 5 | from tzlocal import get_localzone_name
 6 | 
 7 | 
 8 | class TestWebDriver(unittest.TestCase):
 9 | 
10 |     def setUp(self):
11 |         self.webdriver = WebDriver.getInstance()
12 | 
13 |     def tearDown(self):
14 |         self.webdriver.closeDriver()
15 | 
16 |     @patch('src.webdriver.webdriver.sync_playwright')
17 |     def test_createDriver(self, mock_sync_playwright):
18 |         mock_playwright = MagicMock()
19 |         mock_browser = MagicMock()
20 |         mock_page = MagicMock()
21 |         mock_sync_playwright.return_value.start.return_value = mock_playwright
22 |         mock_playwright.chromium.launch_persistent_context.return_value = mock_browser
23 |         mock_browser.new_page.return_value = mock_page
24 | 
25 |         self.webdriver.createDriver()
26 | 
27 |         mock_sync_playwright.assert_called_once()
28 |         timezone_id = get_localzone_name()
29 |         system_locale = locale.getlocale()
30 |         mock_sync_playwright.return_value.start.assert_called_once()
31 |         mock_playwright.chromium.launch_persistent_context.assert_called_once_with(
32 |             user_data_dir="src/data/chrome_profile",
33 |             headless=False,
34 |             args=[
35 |                 "--disable-gpu",
36 |                 "--disable-dev-shm-usage",
37 |                 "--no-sandbox",
38 |                 "--disable-web-security",
39 |                 "--allow-running-insecure-content",
40 |             ],
41 |             locale=system_locale[0],
42 |             timezone_id=timezone_id,
43 |         )
44 |         mock_browser.new_page.assert_called_once()
45 |         mock_page.set_viewport_size.assert_called_once_with(
46 |             {"width": 960, "height": 1080})
47 | 
48 |     def test_getDriver(self):
49 |         page = self.webdriver.getDriver()
50 |         self.assertIsNotNone(page)
51 | 
52 |     def test_closeCurrentTab(self):
53 |         self.webdriver.closeCurrentTab()
54 |         page = self.webdriver.getDriver()
55 |         self.assertIsNotNone(page)
56 | 
57 | 
# Run the tests in this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
60 | 


--------------------------------------------------------------------------------
/src/tools/analyze_content.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from src.configs.logging.logging_config import setup_logging
 3 | from src.tools.utils.get_b64_screenshot import get_b64_screenshot
 4 | from src.tools.utils.vision_template import get_vision_template
 5 | from src.tools.utils.analyze_image import analyze_image
 6 | from src.tools.utils.load_context import load_context
 7 | from src.tools.utils.get_webdriver_instance import get_webdriver_instance
 8 | 
 9 | import logging
10 | 
11 | setup_logging()
12 | logger = logging.getLogger()
13 | 
14 | 
def analyze_content(query: str) -> str:
    """
    Analyzes the content of a webpage based on a screenshot and a user query, and returns a response string with insights and answers.

    Args:
        query (str): A query string representing the user's inquiry about the contents of the current web browser window.

    Returns:
        str: The vision model's answer about the active browser window, or a
        generic error message if the screenshot or the analysis failed.
    """
    failure_message = "An error occurred while analyzing the webpage content. Please try again later."
    try:
        logger.info("Initializing WebDriver to capture a webpage screenshot.")
        driver = get_webdriver_instance()

        screenshot_b64 = get_b64_screenshot(driver)
        if not screenshot_b64:
            # get_b64_screenshot() reports failure by returning an empty
            # string; sending an empty image to the model would only produce
            # a meaningless (and billable) request.
            logger.error("Screenshot capture returned no data; skipping analysis.")
            return failure_message

        context = load_context("analyze_content")

        logger.info("Generating message history for image analysis.")
        message_history = get_vision_template(context, screenshot_b64, query)

        logger.info("Analyzing the webpage screenshot.")
        return analyze_image(message_history)
    except Exception:
        # Tool boundary: the agent expects a string back, never an exception.
        logger.exception("Failed to analyze content.")
        return failure_message
43 | 


--------------------------------------------------------------------------------
/src/tools/click_element.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from src.configs.logging.logging_config import setup_logging
 3 | from src.tools.utils.highlight_elements import highlight_elements
 4 | from src.tools.utils.get_b64_screenshot import get_b64_screenshot
 5 | from src.tools.utils.vision_template import get_vision_template
 6 | from src.tools.utils.load_context import load_context
 7 | from src.tools.utils.analyze_image import analyze_image
 8 | from src.tools.utils.get_webdriver_instance import get_webdriver_instance
 9 | 
10 | import logging
11 | import time
12 | 
13 | setup_logging()
14 | logger = logging.getLogger()
15 | 
16 | 
def click_element(query: str) -> str:
    """
    Clicks on a webpage element based on a user query, and returns a response string with the result of the action.

    Args:
        query (str): A description of the element the user wants to click in the current web browser window.

    Returns:
        str: A message describing the outcome of the click (the clicked
        element and current URL on success, or an error description).
    """
    try:
        driver = get_webdriver_instance()
        logger.info("Highlighting clickable elements on the page.")
        highlighted = highlight_elements(driver, "click")
        if isinstance(highlighted, str):
            # highlight_elements() signals failure by returning the error
            # text instead of the usual 3-tuple.
            raise RuntimeError(highlighted)
        bbox_descriptions, bbox_coordinates, driver = highlighted

        screenshot = get_b64_screenshot(driver)

        # Do not rebind `driver` to the cleanup result: on failure it is an
        # error string, which would break the subsequent click.
        cleanup_result = highlight_elements(driver, "remove")
        if isinstance(cleanup_result, str):
            logger.warning(
                "Failed to remove element highlights: %s", cleanup_result)
    except Exception as e:
        logger.error("Error highlighting elements: %s", e, exc_info=True)
        return "Failed to highlight clickable elements. Please check the logs for more details."

    try:
        click_template = load_context("click_template")
        enriched_query = f"{query}.\n\nText on all visible clickable elements: {bbox_descriptions}"
        message_history = get_vision_template(
            click_template, screenshot, enriched_query)
        return process_click(driver, message_history, bbox_coordinates, bbox_descriptions)
    except Exception as e:
        logger.error("Error processing click action: %s", e, exc_info=True)
        return "Failed to process click action. Please check the logs for more details."
47 | 
48 | 
def process_click(driver, message_history, bbox_coordinates, bbox_descriptions) -> str:
    """
    Processes the click action by analyzing the image and clicking the identified element.

    The vision model is asked up to three times; each reply is expected to
    contain the index of the element to click, or the word "none".

    Args:
        driver (WebDriver): An instance of the WebDriver.
        message_history (list): The message history sent to the image analysis.
        bbox_coordinates (list): A list of bounding box coordinates.
        bbox_descriptions (str): A JSON string of elements' descriptions.

    Returns:
        str: A response string with the result of the click action.
    """
    import re  # local import: this module does not import `re` at file level

    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            logger.info("Analyzing image to identify the clickable element.")
            message = analyze_image(message_history)
            if "none" in message.lower():
                return "No element found matching the description."

            # Use the first integer in the reply. Joining *all* digits would
            # turn e.g. "element 4 of 12" into index 412.
            index_match = re.search(r"\d+", message)
            if index_match is None:
                raise ValueError(
                    f"No element index found in model response: {message!r}")
            element_index = int(index_match.group())
            if element_index >= len(bbox_coordinates):
                raise IndexError(
                    f"Element index {element_index} is out of range "
                    f"(only {len(bbox_coordinates)} elements are highlighted).")

            bbox = bbox_coordinates[element_index]
            return click_field(driver, bbox, bbox_descriptions, element_index)
        except Exception as e:
            logger.warning("Attempt %d: %s", attempt, e, exc_info=True)
    return "Failed to click on the element after several attempts."
76 | 
77 | 
def _element_description(bbox_descriptions, element) -> str:
    """Return the label text recorded for element index *element* ("" if unavailable)."""
    import json  # local import: this module does not import `json` at file level

    try:
        # The descriptions arrive as a JSON string encoding a list of
        # single-entry dicts: [{"0": "label"}, {"1": "label"}, ...].
        entries = (json.loads(bbox_descriptions)
                   if isinstance(bbox_descriptions, str) else bbox_descriptions)
        entry = entries[element]
        if isinstance(entry, dict):
            return entry.get(str(element)) or ""
        return str(entry)
    except (ValueError, TypeError, IndexError, KeyError):
        return ""


def click_field(driver, bbox, bbox_descriptions, element) -> str:
    """
    Executes the click action on the specified element.

    Args:
        driver (WebDriver): An instance of the WebDriver.
        bbox (tuple): A tuple containing the x and y coordinates of the element.
        bbox_descriptions (str): A JSON string of elements' descriptions.
        element (int): The index of the element to click.

    Returns:
        str: A response string with the result of the click action.
    """
    try:
        driver.mouse.click(bbox[0], bbox[1])
        time.sleep(3)  # Wait for potential page changes
        # Indexing the JSON string directly would return a single character
        # of the serialized text, so decode it to find the element's label.
        description = _element_description(bbox_descriptions, element)
        logger.info("Clicked on element: %s", description)
        return f"Clicked on element {element}. Text on clicked element: '{description}'. Current URL is {driver.url}."
    except Exception as e:
        logger.error("Error during click action: %s", e, exc_info=True)
        return "Failed to click on the element. Please check the logs for more details."
99 | 


--------------------------------------------------------------------------------
/src/tools/go_back.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from src.configs.logging.logging_config import setup_logging
 3 | from src.tools.utils.get_webdriver_instance import get_webdriver_instance
 4 | 
 5 | import logging
 6 | import time
 7 | 
 8 | setup_logging()
 9 | logger = logging.getLogger()
10 | 
11 | 
def go_back() -> str:
    """
    Step one entry back in the browser history.

    Returns:
        str: A success message ending with the URL the browser is on after
        navigating back, or a generic error message if anything failed.
    """
    success_prefix = "Success. Went back 1 page. Current URL is: "
    try:
        page = get_webdriver_instance()
        logger.info("Navigating back 1 page...")
        page.go_back()
        time.sleep(3)  # pause before reading the URL
        return success_prefix + page.url
    except Exception as error:
        logger.error("An error occurred while navigating back: %s", str(error))
        return "Error occurred while navigating back. Please check the logs for more details."
35 | 


--------------------------------------------------------------------------------
/src/tools/input_text.py:
--------------------------------------------------------------------------------
  1 | from src.webdriver.webdriver import WebDriver
  2 | from src.configs.logging.logging_config import setup_logging
  3 | from src.tools.utils.highlight_elements import highlight_elements
  4 | from src.tools.utils.get_b64_screenshot import get_b64_screenshot
  5 | from src.tools.utils.vision_template import get_vision_template
  6 | from src.tools.utils.load_context import load_context
  7 | from src.tools.utils.analyze_image import analyze_image
  8 | from src.tools.utils.get_webdriver_instance import get_webdriver_instance
  9 | 
 10 | import logging
 11 | import time
 12 | import json
 13 | import platform
 14 | 
 15 | setup_logging()
 16 | logger = logging.getLogger(__name__)
 17 | 
 18 | 
 19 | def input_text(query: str) -> str:
 20 |     """
 21 |     Sends keys to input fields based on a query and returns a result or error message.
 22 | 
 23 |     Args:
 24 |         query (str): A string representing the user's input text query.
 25 | 
 26 |     Returns:
 27 |         str: A response string indicating the success or failure of the input action.
 28 |     """
 29 |     try:
 30 |         driver = get_webdriver_instance()
 31 |         logger.info("Highlighting input elements on the page.")
 32 |         bbox_descriptions, bbox_coordinates, driver = highlight_elements(
 33 |             driver, "input")
 34 |         screenshot = get_b64_screenshot(driver)
 35 |         highlight_elements(driver, "remove")
 36 |     except Exception as e:
 37 |         logger.error("Error highlighting input elements: %s", e, exc_info=True)
 38 |         return "Failed to highlight input elements. Please check the logs for more details."
 39 | 
 40 |     try:
 41 |         input_template = load_context("input_template")
 42 |         enriched_query = f"{query}.\n\nText on all visible input elements: {bbox_descriptions}"
 43 |         message_history = get_vision_template(
 44 |             input_template, screenshot, enriched_query)
 45 |         return process_input(driver, message_history, bbox_coordinates)
 46 |     except Exception as e:
 47 |         logger.error("Error processing input action: %s", e, exc_info=True)
 48 |         return "Failed to process input action. Please check the logs for more details."
 49 | 
 50 | 
 51 | def process_input(driver, message_history, bbox_coordinates) -> str:
 52 |     """
 53 |     Identifies input fields from an image and attempts to fill them.
 54 | 
 55 |     Args:
 56 |         driver (WebDriver): An instance of the WebDriver.
 57 |         message_history (str): A string representing the message history for image analysis.
 58 |         bbox_coordinates (list): A list of bounding box coordinates for input elements.
 59 | 
 60 |     Returns:
 61 |         str: A response string indicating the success or failure of the input action.
 62 |     """
 63 |     for attempt in range(3):
 64 |         try:
 65 |             logger.info("Analyzing the image to identify input fields.")
 66 |             message = analyze_image(message_history)
 67 |             if "none" in message.lower():
 68 |                 return "No matching element found. Use AnalyzeContent tool for further analysis."
 69 |             input_elements = extract_input_elements(message)
 70 |             fill_input_fields(driver, input_elements, bbox_coordinates)
 71 |             return f"Inserted text into the following elements: {list(input_elements.keys())}"
 72 |         except Exception as e:
 73 |             logger.warning(f"Attempt {attempt+1}: {e}", exc_info=True)
 74 |             if attempt == 2:  # Last attempt
 75 |                 return "Failed to input text after several attempts."
 76 |     return "Failed to input text. Use AnalyzeContent tool for further analysis."
 77 | 
 78 | 
 79 | def extract_input_elements(message: str) -> dict:
 80 |     """
 81 |     Extracts input elements from a message.
 82 | 
 83 |     Args:
 84 |         message (str): A string representing the message from image analysis.
 85 | 
 86 |     Returns:
 87 |         dict: A dictionary containing input elements and their corresponding values.
 88 |     """
 89 |     logger.info("Extracting input elements from the message.")
 90 |     try:
 91 |         start = message.find('```json') + len('```json\n')
 92 |         end = message.rfind('```')
 93 |         json_str = message[start:end].strip()
 94 |         return json.loads(json_str)
 95 |     except json.JSONDecodeError as e:
 96 |         logger.error("Error parsing JSON from message: %s", e, exc_info=True)
 97 |         raise
 98 | 
 99 | 
def fill_input_fields(driver, input_elements, bbox_coordinates) -> None:
    """
    Type each requested value into its input field.

    Args:
        driver (WebDriver): An instance of the WebDriver.
        input_elements (dict): Maps an element index (convertible with
            ``int``) to the value to type into that element.
        bbox_coordinates (list): Bounding box coordinates, indexed by element.

    Returns:
        None

    Raises:
        Exception: Any error raised while filling is logged and re-raised.
    """
    logger.info("Filling input fields with provided values.")
    try:
        for field_index, field_value in input_elements.items():
            position = bbox_coordinates[int(field_index)]
            click_and_fill(driver, position, str(field_value))
        logger.info("Input successfully completed.")
    except Exception as error:
        logger.error("Error during input: %s", error, exc_info=True)
        raise
121 | 
122 | 
def click_and_fill(driver, bbox, text):
    """
    Focus an input field by clicking it, replace its content and submit.

    After the click the sequence is: select all, Backspace, type the text,
    Enter, with a one-second pause after every keyboard step.

    Args:
        driver (WebDriver): An instance of the WebDriver.
        bbox (tuple): The x and y coordinates of the input field.
        text (str): The text to type into the input field.

    Returns:
        None
    """
    driver.mouse.click(bbox[0], bbox[1])

    # macOS uses the Command (Meta) key for "select all"; other platforms use Control.
    if platform.system() == "Darwin":
        select_all_shortcut = "Meta+A"
    else:
        select_all_shortcut = "Control+A"

    keyboard_steps = (
        ("press", select_all_shortcut),
        ("press", "Backspace"),
        ("type", text),
        ("press", "Enter"),
    )
    for method_name, argument in keyboard_steps:
        getattr(driver.keyboard, method_name)(argument)
        time.sleep(1)

    logger.info(f"Filled input field with text: {text}")
146 | 


--------------------------------------------------------------------------------
/src/tools/jump_to_search_engine.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from src.configs.logging.logging_config import setup_logging
 3 | from src.tools.utils.get_webdriver_instance import get_webdriver_instance
 4 | 
 5 | import logging
 6 | import time
 7 | 
 8 | setup_logging()
 9 | logger = logging.getLogger()
10 | 
11 | 
def jump_to_search_engine() -> str:
    """
    Open the Google search engine in the current browser page.

    Returns:
        str: A success message ending with the current URL, or a message
        containing the error text if the navigation failed.
    """
    success_prefix = "Success. Jumped to Google search engine. Current URL is: "
    try:
        page = get_webdriver_instance()
        page.goto("https://www.google.com")
        time.sleep(3)  # pause before reading the URL

        logger.info(success_prefix + page.url)
        return success_prefix + page.url
    except Exception as error:
        failure = "An error occurred: " + str(error)
        logger.error(failure)
        return failure
37 | 


--------------------------------------------------------------------------------
/src/tools/read_url.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from src.configs.logging.logging_config import setup_logging
 3 | from src.tools.utils.get_webdriver_instance import get_webdriver_instance
 4 | 
 5 | import logging
 6 | 
 7 | setup_logging()
 8 | logger = logging.getLogger()
 9 | 
10 | 
def read_url(url: str) -> str:
    """
    Navigates the browser page to a URL and returns the URL it ended up on.

    If the URL has no ``http://`` or ``https://`` scheme, ``https://`` is
    prepended. Surrounding whitespace is ignored.

    Args:
        url (str): The URL to open.

    Returns:
        str: A message containing the current URL after navigation.

    Raises:
        Exception: Any navigation error is logged and re-raised.
    """
    try:
        url = url.strip()
        # Only add a scheme when none is present; blindly prefixing would
        # turn "http://example.com" into "https://http://example.com".
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        logger.info(f"Reading URL: {url}")
        driver = get_webdriver_instance()
        driver.goto(url)

        return "Current URL is: " + driver.url + "\n"
    except Exception as e:
        logger.error(f"Failed to read URL {url}: {e}")
        raise
34 | 


--------------------------------------------------------------------------------
/src/tools/save_to_file.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | 
 3 | import logging
 4 | import os
 5 | import datetime
 6 | 
 7 | setup_logging()
 8 | logger = logging.getLogger()
 9 | 
10 | 
def save_to_file(data: str) -> None:
    """
    Saves the data to ``src/data/saved_data/<timestamp>/data.txt``.

    The timestamp folder is named after the current local time
    (``YYYY-MM-DD_HH-MM-SS``). Errors are logged, not raised.

    Args:
        data (str): The data to save to the file.

    Returns:
        None
    """
    try:
        base_folder = 'src/data/saved_data/'
        folder_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

        # Build the full path instead of os.chdir()-ing into the base folder:
        # changing the process working directory would break every other
        # relative path used afterwards (e.g. 'src/tools/utils/prompts.json').
        target_folder = os.path.join(base_folder, folder_name)
        # Creates the base folder too; deliberately fails if the timestamp
        # folder already exists so an earlier save is never overwritten.
        os.makedirs(target_folder)

        file_name = os.path.join(target_folder, "data.txt")
        # Explicit UTF-8 so non-ASCII text saves regardless of the platform default.
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(data)

        logger.info(f"Data saved to file: {file_name}")
    except Exception as e:
        logger.error(f"Error saving data to file: {str(e)}")
    return None
36 | 


--------------------------------------------------------------------------------
/src/tools/scroll.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from src.configs.logging.logging_config import setup_logging
 3 | from src.tools.utils.get_webdriver_instance import get_webdriver_instance
 4 | 
 5 | import logging
 6 | import time
 7 | from typing import Literal
 8 | 
 9 | setup_logging()
10 | logger = logging.getLogger()
11 | 
12 | 
def scroll(direction: Literal["up", "down"]) -> str:
    """
    Scroll the current page vertically by 500 pixels.

    Args:
        direction (Literal["up", "down"]): Which way to scroll.

    Returns:
        str: A message confirming the scroll action.

    Raises:
        ValueError: If ``direction`` is neither "up" nor "down".
        Exception: Any other failure is logged and re-raised.
    """
    try:
        if direction != "up" and direction != "down":
            raise ValueError("Direction must be either 'up' or 'down'.")

        page = get_webdriver_instance()

        scroll_amount = 500
        # A negative offset moves the viewport towards the top of the page.
        vertical_offset = -scroll_amount if direction == "up" else scroll_amount
        page.evaluate(f"window.scrollBy(0, {vertical_offset})")

        time.sleep(2)

        confirmation = f"Scrolled {direction} by {scroll_amount} pixels."
        logger.info(confirmation)
        return confirmation
    except Exception as error:
        logger.error(f"Failed to scroll {direction}: {error}")
        raise
44 | 


--------------------------------------------------------------------------------
/src/tools/utils/analyze_image.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | from src.tools.utils.openai_client import get_openai_client
 3 | 
 4 | import logging
 5 | 
 6 | setup_logging()
 7 | logger = logging.getLogger()
 8 | 
 9 | 
def analyze_image(message_history, **kwargs):
    """
    Analyzes an image using OpenAI's GPT-4 Vision model and returns the response message.

    Args:
        message_history (list): A list of message objects representing the conversation history.
        **kwargs: Extra keyword arguments forwarded to
            ``client.chat.completions.create``. They override the defaults
            used here (``model``, ``max_tokens``, ``temperature``).

    Returns:
        str: The response message if successful, else logs error and returns an error message.
    """

    try:
        logger.info("Getting OpenAI client.")
        client = get_openai_client()

        # Merge rather than pass the defaults and **kwargs side by side:
        # a caller supplying e.g. max_tokens would otherwise trigger
        # "got multiple values for keyword argument".
        request_params = {
            "model": "gpt-4-vision-preview",
            "max_tokens": 1024,
            "temperature": 0.1,
        }
        request_params.update(kwargs)
        # The conversation always comes from the positional argument.
        request_params["messages"] = message_history

        logger.info("Sending request to OpenAI's GPT-4 Vision model.")
        response = client.chat.completions.create(**request_params)

        message_text = response.choices[0].message.content

        logger.info("Image analysis completed successfully.")
        return message_text

    except Exception:
        logger.error(
            "Failed to analyze image with OpenAI's GPT-4 Vision model.", exc_info=True)
        return "An error occurred while analyzing the image. Please try again later."
44 | 


--------------------------------------------------------------------------------
/src/tools/utils/get_b64_screenshot.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | from src.tools.utils.save_screenshot import save_screenshot_to_file
 3 | import base64
 4 | import os
 5 | import logging
 6 | 
 7 | setup_logging()
 8 | logger = logging.getLogger()
 9 | 
10 | 
def get_b64_screenshot(driver, element=None) -> str:
    """
    Captures a screenshot of the current web page or a specific element and returns it as a base64 encoded string.

    The image bytes returned by Playwright's ``screenshot()`` are encoded
    directly; no temporary file is written. The encoded screenshot is also
    handed to ``save_screenshot_to_file``.

    Args:
        driver: The Playwright page object to capture (viewport only).
        element (optional): A Playwright element to capture instead of the
            page. Defaults to None.

    Returns:
        str: A base64 encoded string representing the screenshot, or an
        empty string if capturing failed.
    """
    try:
        logger.info("Capturing screenshot")
        if element is not None:
            # Element screenshots take no `full_page` option; passing it
            # raises a TypeError, so it is omitted here.
            image_bytes = element.screenshot()
        else:
            image_bytes = driver.screenshot(full_page=False)

        screenshot = base64.b64encode(image_bytes).decode("utf-8")

        _ = save_screenshot_to_file(screenshot)

        return screenshot
    except Exception as e:
        logger.error(f"Error occurred while capturing screenshot: {str(e)}")
        return ""
43 | 


--------------------------------------------------------------------------------
/src/tools/utils/get_webdriver_instance.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from src.configs.logging.logging_config import setup_logging
 3 | 
 4 | import logging
 5 | 
 6 | setup_logging()
 7 | logger = logging.getLogger()
 8 | 
 9 | 
def get_webdriver_instance():
    """
    Fetch the driver object held by the shared WebDriver wrapper.

    Returns:
        The object returned by ``WebDriver.getInstance().getDriver()``.

    Raises:
        Exception: Any error raised while obtaining the driver is logged
        and re-raised.
    """
    try:
        wrapper = WebDriver.getInstance()
        return wrapper.getDriver()
    except Exception as error:
        logger.error("Failed to get WebDriver instance: %s",
                     error, exc_info=True)
        raise
25 | 


--------------------------------------------------------------------------------
/src/tools/utils/highlight_elements.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from typing import Literal
 3 | import json
 4 | import logging
 5 | 
 6 | from src.configs.logging.logging_config import setup_logging
 7 | setup_logging()
 8 | logger = logging.getLogger()
 9 | 
10 | 
def format_description(elements: list) -> tuple:
    """
    Formats the description of page elements and their bounding box coordinates.

    Each element is labelled with its ``ariaLabel``; when that is missing or
    blank, its ``text`` is used instead. A missing ``text`` becomes an empty
    string so the resulting JSON never contains ``null`` labels.

    Args:
        elements (list): A list of element dictionaries with keys like 'ariaLabel', 'x', 'y', 'text'.

    Returns:
        tuple: ``(bbox_descriptions, bbox_coordinates)`` where
        ``bbox_descriptions`` is a JSON string encoding a list of
        ``{"<index>": "<label>"}`` objects and ``bbox_coordinates`` is a list
        of ``(x, y)`` tuples in the same order.
    """
    labels = []
    bbox_coordinates = []
    for index, element in enumerate(elements):
        bbox_coordinates.append((element.get("x"), element.get("y")))

        text = element.get("ariaLabel") or ""
        if not text.strip():
            # Fall back to the visible text; default to "" rather than None.
            text = element.get("text") or ""
        labels.append({str(index): text})

    bbox_descriptions = json.dumps(labels, indent=4)
    return bbox_descriptions, bbox_coordinates
32 | 
33 | 
def highlight_elements(driver, mark: Literal["click", "input", "all", "remove"]):
    """
    Highlights elements on a webpage based on the mark type.

    The contents of ``mark_page.js`` are injected into the page together
    with a call to the marking function that corresponds to ``mark``.

    Args:
        driver (WebDriver): An instance of the WebDriver.
        mark (Literal["click", "input", "all", "remove"]): The type of elements to mark.

    Returns:
        For "click", "input" and "all": a tuple
        ``(bbox_descriptions, bbox_coordinates, driver)``.
        For "remove": the driver itself after unmarking elements.
        In case of errors, the error is logged and its message is returned
        as a string.

    Raises:
        ValueError: If ``mark`` is not one of the supported values.
    """
    # JavaScript entry point in mark_page.js for every supported mark type.
    js_functions = {
        "click": "markClickableElements",
        "input": "markInputElements",
        "all": "markAllElements",
        "remove": "unmarkPage",
    }
    js_function = js_functions.get(mark) if isinstance(mark, str) else None
    if js_function is None:
        raise ValueError(
            "Mark must be either 'click', 'input', 'all' or 'remove'.")

    try:
        with open('src/tools/utils/mark_page.js', encoding='utf-8') as f:
            mark_page_script = f.read()

        result = driver.evaluate(f"""() => {{
                {mark_page_script}
                return {js_function}();
            }}""")

        if mark == "remove":
            return driver

        bbox_descriptions, bbox_coordinates = format_description(result)
        return bbox_descriptions, bbox_coordinates, driver
    except Exception as e:
        # Callers receive only the message text, so record the traceback
        # here; otherwise the root cause would be lost.
        logger.error("Failed to highlight elements (mark=%r): %s",
                     mark, e, exc_info=True)
        return str(e)
83 | 


--------------------------------------------------------------------------------
/src/tools/utils/load_context.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | 
 3 | import logging
 4 | import json
 5 | 
 6 | setup_logging()
 7 | logger = logging.getLogger()
 8 | 
 9 | 
def load_context(prompt_template: str) -> str:
    """
    Look up a prompt by name in ``src/tools/utils/prompts.json``.

    Args:
        prompt_template (str): The top-level key of the template whose
            ``"prompt"`` entry should be returned.

    Returns:
        str: The prompt stored under ``prompt_template``.

    Raises:
        Exception: Any error while reading or indexing the file is logged
        and re-raised.
    """
    try:
        logger.info("Loading the prompt from the JSON file.")
        with open('src/tools/utils/prompts.json', 'r') as file:
            templates = json.load(file)
        return templates[prompt_template]["prompt"]
    except Exception:
        logger.error(
            "Failed to load the prompt from the JSON file.", exc_info=True)
        raise
30 | 


--------------------------------------------------------------------------------
/src/tools/utils/mark_page.js:
--------------------------------------------------------------------------------
// Scrollbar styling (WebKit pseudo-elements) that is injected into the page.
const customCSS = `
    ::-webkit-scrollbar {
        width: 10px;
    }
    ::-webkit-scrollbar-track {
        background: #27272a;
    }
    ::-webkit-scrollbar-thumb {
        background: #888;
        border-radius: 0.375rem;
    }
    ::-webkit-scrollbar-thumb:hover {
        background: #555;
    }
`;

// Append the CSS above to the document head as a <style> element.
// NOTE(review): this runs every time the script is evaluated, so repeated
// injections presumably add one <style> tag each time. Confirm this is acceptable.
const styleTag = document.createElement("style");
styleTag.textContent = customCSS;
document.head.append(styleTag);

// Script-level label list. unmarkPage() declares its own local `labels`
// and therefore does not use this one.
let labels = [];
 22 | 
 23 | function unmarkPage() {
 24 |   var labels = document.querySelectorAll('div[data-label]');
 25 |   for (var i = 0; i < labels.length; i++) {
 26 |     document.body.removeChild(labels[i]);
 27 |   }
 28 | }
 29 | 
 30 | function markElements(includeCondition) {
 31 |   unmarkPage();
 32 | 
 33 |   var bodyRect = document.body.getBoundingClientRect();
 34 | 
 35 |   var items = Array.prototype.slice
 36 |     .call(document.querySelectorAll("*"))
 37 |     .map(function (element) {
 38 |       var vw = Math.max(
 39 |         document.documentElement.clientWidth || 0,
 40 |         window.innerWidth || 0
 41 |       );
 42 |       var vh = Math.max(
 43 |         document.documentElement.clientHeight || 0,
 44 |         window.innerHeight || 0
 45 |       );
 46 |       var textualContent = element.textContent.trim().replace(/\s{2,}/g, " ");
 47 |       var elementType = element.tagName.toLowerCase();
 48 |       var ariaLabel = element.getAttribute("aria-label") || "";
 49 | 
 50 |       var rects = [...element.getClientRects()]
 51 |         .filter((bb) => {
 52 |           var center_x = bb.left + bb.width / 2;
 53 |           var center_y = bb.top + bb.height / 2;
 54 |           var elAtCenter = document.elementFromPoint(center_x, center_y);
 55 | 
 56 |           return elAtCenter === element || element.contains(elAtCenter);
 57 |         })
 58 |         .map((bb) => {
 59 |           const rect = {
 60 |             left: Math.max(0, bb.left),
 61 |             top: Math.max(0, bb.top),
 62 |             right: Math.min(vw, bb.right),
 63 |             bottom: Math.min(vh, bb.bottom),
 64 |           };
 65 |           return {
 66 |             ...rect,
 67 |             width: rect.right - rect.left,
 68 |             height: rect.bottom - rect.top,
 69 |           };
 70 |         });
 71 | 
 72 |       var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);
 73 | 
 74 |       return {
 75 |         element: element,
 76 |         include: includeCondition(element),
 77 |         area,
 78 |         rects,
 79 |         text: textualContent,
 80 |         type: elementType,
 81 |         ariaLabel: ariaLabel,
 82 |         id: element.id,
 83 |       };
 84 |     })
 85 |     .filter((item) => item.include && item.area >= 20);
 86 | 
 87 |   items = items.filter(
 88 |     (x) => !items.some((y) => x.element.contains(y.element) && !(x == y))
 89 |   );
 90 | 
 91 |   function getRandomColor() {
 92 |     var letters = "0123456789ABCDEF";
 93 |     var color = "#";
 94 |     for (var i = 0; i < 6; i++) {
 95 |       color += letters[Math.floor(Math.random() * 16)];
 96 |     }
 97 |     return color;
 98 |   }
 99 | 
100 |   items.forEach(function (item, index) {
101 |     item.rects.forEach((bbox) => {
102 |       var newElement = document.createElement("div");
103 |       newElement.className = 'highlighted-element';
104 |       newElement.setAttribute('data-label', ''); 
105 |       newElement.setAttribute('data-text', item.text);
106 |       var borderColor = "#000";
107 |       newElement.style.outline = `2px dashed ${borderColor}`;
108 |       newElement.style.position = "fixed";
109 |       newElement.style.left = bbox.left + "px";
110 |       newElement.style.top = bbox.top + "px";
111 |       newElement.style.width = bbox.width + "px";
112 |       newElement.style.height = bbox.height + "px";
113 |       newElement.style.pointerEvents = "none";
114 |       newElement.style.boxSizing = "border-box";
115 |       newElement.style.zIndex = 2147483647;
116 |   
117 |       var label = document.createElement("span");
118 |       label.textContent = index;
119 |       label.style.position = "absolute";
120 |       label.style.top = "-19px";
121 |       label.style.left = "0px";
122 |       label.style.background = borderColor;
123 |       label.style.color = "white";
124 |       label.style.padding = "2px 4px";
125 |       label.style.fontSize = "12px";
126 |       label.style.borderRadius = "2px";
127 |       newElement.appendChild(label);
128 |   
129 |       document.body.appendChild(newElement);
130 |       labels.push(newElement);
131 |     });
132 |   });
133 | 
134 |   const coordinates = items.flatMap((item) =>
135 |     item.rects.map(({ left, top, width, height }) => ({
136 |       x: (left + left + width) / 2,
137 |       y: (top + top + height) / 2,
138 |       type: item.type,
139 |       text: item.text,
140 |       ariaLabel: item.ariaLabel,
141 |     }))
142 |   );
143 |   return coordinates;
144 | }
145 | 
146 | function markClickableElements() {
147 |   return markElements((element) => {
148 |     return element.tagName === "A" ||
149 |       element.tagName === "TEXTAREA" ||
150 |       element.tagName === "SELECT" ||
151 |       element.onclick != null ||
152 |       window.getComputedStyle(element).cursor == "pointer";
153 |   });
154 | }
155 | 
156 | function markInputElements() {
157 |   return markElements((element) => {
158 |     return element.tagName === "INPUT";
159 |   });
160 | }
161 | 
162 | function markAllElements() {
163 |   return markElements((element) => {
164 |     return element.tagName === "INPUT" ||
165 |       element.tagName === "TEXTAREA" ||
166 |       element.tagName === "SELECT" ||
167 |       element.tagName === "BUTTON" ||
168 |       element.tagName === "A" ||
169 |       element.onclick != null ||
170 |       window.getComputedStyle(element).cursor == "pointer" ||
171 |       element.tagName === "IFRAME" ||
172 |       element.tagName === "VIDEO";
173 |   });
174 | }


--------------------------------------------------------------------------------
/src/tools/utils/openai_client.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | import openai
 3 | from dotenv import load_dotenv
 4 | import threading
 5 | import instructor
 6 | import os
 7 | import logging
 8 | 
 9 | setup_logging()  # project logging setup (src.configs.logging); runs when this module is imported
10 | logger = logging.getLogger()  # no name passed, so this is the root logger
11 | 
12 | dotenv_path = os.path.normpath(os.path.join(
13 |     os.path.dirname(__file__), '..', '..', '..', '.env'))  # '.env' three levels above this file's directory
14 | 
15 | load_dotenv(dotenv_path)  # python-dotenv: read variables from that file, if present
16 | 
17 | client_lock = threading.Lock()  # held by get_openai_client() while it checks/creates the client
18 | client = None  # shared client; assigned by get_openai_client() on first successful creation
19 | 
20 | 
21 | def get_openai_client():
22 |     """
23 |     Returns an OpenAI client instance.
24 | 
25 |     Args:
26 |         None
27 | 
28 |     Returns:
29 |         openai.OpenAI: An OpenAI client instance.
30 |     """
31 |     global client
32 |     with client_lock:
33 |         if client is None:
34 |             try:
35 |                 logging.info("Creating OpenAI client")
36 |                 api_key = openai.api_key or os.getenv('OPENAI_API_KEY')
37 |                 if api_key is None:
38 |                     raise ValueError(
39 |                         "OpenAI API key is not set. Please set it using set_openai_key.")
40 |                 client = instructor.patch(openai.OpenAI(api_key=api_key,
41 |                                                         max_retries=5,))
42 |                 logging.info("OpenAI client created successfully.")
43 |             except Exception as e:
44 |                 logging.error(f"Error creating OpenAI client: {str(e)}")
45 |     return client
46 | 


--------------------------------------------------------------------------------
/src/tools/utils/prompts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "analyze_content": {
 3 |       "prompt": "As a web scraping tool, your primary task is to accurately extract and provide information in response to user queries based on webpage screenshots. When a user asks a question, analyze the provided screenshot of the webpage for relevant information. Your goal is to ensure relevant data retrieval from webpages. If some elements are obscured by pop ups, notify the user about how to close them. If there might be additional information on the page regarding the user's question by scrolling up or down, notify the user about it as well."
 4 |     },
 5 |     "click_template": {
 6 |       "prompt": "You are a robot browsing the web, just like humans. Your objective is to analyze a high-resolution screenshot of a webpage. The screenshot includes interactive elements, each framed by a bounding box made of dashed lines. These bounding boxes are clearly visible and labeled with numerical identifiers at the top left corner, allowing for each clickable element on the page to be uniquely recognized. \n\nA sequence number is displayed at the top left of each element's bounding box, ranging from 1 to n, where 'n' represents the total number of interactive elements depicted in the screenshot. \n\nWhen given a user's description of a target element, your duty is to thoroughly analyze the screenshot to find the element that accurately corresponds to this description. Identify the target element and report the sequence number found at the top left of its bounding box. The response should be solely the sequence number, with no additional characters or text. \n\nIf there is no element that matches the user's description in the screenshot, your response should be the word 'none'. This confirms that a detailed inspection was performed with no matching element located. \n\nIt is imperative to provide the precise sequence number for the identified element. The accuracy of this information is essential to enable proper interaction with the webpage as depicted in the screenshot."
 7 |     },
 8 |     "input_template": {
 9 |       "prompt": "You are a robot browsing the web, just like humans. Your objective is to analyze a high-resolution screenshot of a webpage. The screenshot includes input fields, each framed by a bounding box made of dashed lines. These bounding boxes are clearly visible and labeled with numerical identifiers at the top left corner, allowing for each clickable element on the page to be uniquely recognized. \n\nYour task is to analyze the screenshot, identify the input fields based on the user's description, and output the sequence numbers of these fields in JSON format, paired with the specified text. For instance, if the user's task involves entering an email and password, your output should be in the format: {'52': 'johndoe@gmail.com', '53': 'password123'}, where 52 and 53 are sequence numbers of the input fields. \n\nThe enter key will be pressed on the last element automatically. \n\nIf no element on the screenshot matches the user’s description, explain to the user what's on the page instead, and tell him where these elements are most likely to be located."
10 |     }
11 |   }
12 |   


--------------------------------------------------------------------------------
/src/tools/utils/save_screenshot.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | 
 3 | import logging
 4 | import os
 5 | import base64
 6 | 
 7 | setup_logging()  # project logging setup (src.configs.logging); runs when this module is imported
 8 | logger = logging.getLogger()  # no name passed, so this is the root logger
 9 | 
10 | 
11 | def save_screenshot_to_file(screenshot_b64: str, folder_path: str = "src/data/screenshots") -> str:
12 |     """
13 |     Saves a base64 encoded screenshot to a PNG file.
14 | 
15 |     Args:
16 |         screenshot_b64 (str): The base64 encoded screenshot.
17 |         folder_path (str): The folder path where the screenshot will be saved.
18 | 
19 |     Returns:
20 |         str: The path to the saved screenshot file.
21 |     """
22 |     try:
23 |         if not os.path.exists(folder_path):
24 |             logger.info(f"Creating directory {folder_path}.")
25 |             os.makedirs(folder_path)
26 | 
27 |         count = len(os.listdir(folder_path)) + 1
28 |         file_path = os.path.join(folder_path, f"{count}.png")
29 | 
30 |         with open(file_path, "wb") as file:
31 |             logger.info(f"Saving screenshot to {file_path}.")
32 |             file.write(base64.b64decode(screenshot_b64))
33 | 
34 |         return file_path
35 |     except Exception as e:
36 |         logger.error(f"Error saving screenshot: {str(e)}")
37 |         return ""
38 | 


--------------------------------------------------------------------------------
/src/tools/utils/vision_template.py:
--------------------------------------------------------------------------------
 1 | from src.configs.logging.logging_config import setup_logging
 2 | 
 3 | import logging
 4 | 
 5 | setup_logging()  # project logging setup (src.configs.logging); runs when this module is imported
 6 | logger = logging.getLogger()  # no name passed, so this is the root logger
 7 | 
 8 | 
 9 | def get_vision_template(context: str, screenshot: str, question: str) -> list:
10 |     """
11 |     Returns a message chat template for the vision model.
12 | 
13 |     Args:
14 |         context (str): The context of the message chat.
15 |         screenshot (str): A base64 encoded string representing the screenshot of the current web page.
16 |         question (str): The user's query string.
17 | 
18 |     Returns:
19 |         list: A list of dictionaries representing the message chat template for the vision model.
20 |     """
21 | 
22 |     try:
23 |         logger.info("Creating vision template")
24 |         message_chat = [
25 |             {
26 |                 "role": "system",
27 |                 "content": context,
28 |             },
29 |             {
30 |                 "role": "user",
31 |                 "content": [
32 |                     {
33 |                         "type": "image_url",
34 |                         "image_url": f"data:image/jpeg;base64,{screenshot}",
35 |                     },
36 |                     {
37 |                         "type": "text",
38 |                         "text": f"{question}",
39 |                     }
40 |                 ]
41 |             }
42 |         ]
43 |         return message_chat
44 |     except Exception as e:
45 |         logger.error(f"Error occurred in get_vision_template: {e}")
46 |         return []
47 | 


--------------------------------------------------------------------------------
/src/tools/wait.py:
--------------------------------------------------------------------------------
 1 | from src.webdriver.webdriver import WebDriver
 2 | from src.configs.logging.logging_config import setup_logging
 3 | from src.tools.utils.get_webdriver_instance import get_webdriver_instance
 4 | 
 5 | import logging
 6 | import time
 7 | 
 8 | setup_logging()  # project logging setup (src.configs.logging); runs when this module is imported
 9 | logger = logging.getLogger()  # no name passed, so this is the root logger
10 | 
11 | 
12 | def wait() -> str:
13 |     """
14 |     Waits for 5 seconds and returns a response string with the result of the action and the current URL.
15 | 
16 |     Args:
17 |         None
18 | 
19 |     Returns:
20 |         str: A response string stating the success of the action and the current URL.
21 |     """
22 | 
23 |     try:
24 |         driver = get_webdriver_instance()
25 | 
26 |         logger.info("Waiting 5 seconds...")
27 |         time.sleep(5)
28 | 
29 |         return "Success. Waited 5 seconds. Current URL is: " + driver.url
30 |     except Exception as e:
31 |         logger.error("An error occurred while waiting: %s", str(e))
32 |         return "Error occurred while waiting. Please check the logs for more details."
33 | 


--------------------------------------------------------------------------------
/src/webdriver/webdriver.py:
--------------------------------------------------------------------------------
  1 | from playwright.sync_api import sync_playwright
  2 | from src.configs.logging.logging_config import setup_logging
  3 | import locale
  4 | from tzlocal import get_localzone_name
  5 | import logging
  6 | 
  7 | setup_logging()  # project logging setup (src.configs.logging); runs when this module is imported
  8 | logger = logging.getLogger()  # no name passed, so this is the root logger
  9 | 
 10 | 
 11 | class WebDriver:
 12 |     """
 13 |     A singleton class representing a web driver instance.
 14 | 
 15 |     Methods:
 16 |         getInstance(*args, **kwargs) -> WebDriver:
 17 |             Returns the singleton instance of the WebDriver class.
 18 |         __init__(*args, **kwargs) -> None:
 19 |             Initializes the WebDriver class.
 20 |         createDriver(*args, **kwargs) -> None:
 21 |             Creates a new browser instance and sets up the page.
 22 |         getDriver() -> Page:
 23 |             Returns the current page instance.
 24 |         closeDriver() -> None:
 25 |             Closes the browser instance and stops Playwright.
 26 |         closeCurrentTab() -> None:
 27 |             Closes the current tab (page) without affecting the browser instance.
 28 |     """
 29 | 
 30 |     __instance = None
 31 | 
 32 |     @staticmethod
 33 |     def getInstance(*args, **kwargs):
 34 |         """
 35 |         Returns the singleton instance of the WebDriver class.
 36 | 
 37 |         Args:
 38 |             *args: Variable length argument list.
 39 |             **kwargs: Arbitrary keyword arguments.
 40 | 
 41 |         Returns:
 42 |             The singleton instance of the WebDriver class.
 43 |         """
 44 |         if WebDriver.__instance is None:
 45 |             WebDriver.__instance = WebDriver(*args, **kwargs)
 46 |         return WebDriver.__instance
 47 | 
 48 |     def __init__(self, *args, **kwargs):
 49 |         """
 50 |         Initializes the WebDriver class.
 51 | 
 52 |         Args:
 53 |             *args: Variable length argument list.
 54 |             **kwargs: Arbitrary keyword arguments.
 55 | 
 56 |         Returns:
 57 |             None
 58 |         """
 59 |         if WebDriver.__instance is not None:
 60 |             raise Exception("This class is a singleton!")
 61 |         else:
 62 |             WebDriver.__instance = self
 63 |             self.createDriver(*args, **kwargs)
 64 | 
 65 |     def createDriver(self, *args, **kwargs):
 66 |         """
 67 |         Creates a new browser instance and sets up the page.
 68 | 
 69 |         Args:
 70 |             *args: Variable length argument list.
 71 |             **kwargs: Arbitrary keyword arguments.
 72 | 
 73 |         Returns:
 74 |             None
 75 |         """
 76 |         timezone_id = get_localzone_name()
 77 |         system_locale = locale.getdefaultlocale()
 78 | 
 79 |         try:
 80 |             playwright = sync_playwright().start()
 81 |             browser = playwright.chromium.launch_persistent_context(
 82 |                 user_data_dir="src/data/chrome_profile",
 83 |                 headless=False,
 84 |                 args=[
 85 |                     "--disable-gpu",
 86 |                     "--disable-dev-shm-usage",
 87 |                     "--no-sandbox",
 88 |                     "--disable-web-security",
 89 |                     "--allow-running-insecure-content",
 90 |                 ],
 91 |                 locale=system_locale[0],
 92 |                 timezone_id=timezone_id,
 93 |             )
 94 |             self.playwright = playwright
 95 |             self.browser = browser
 96 |             self.page = browser.new_page()
 97 |             self.page.set_viewport_size({"width": 960, "height": 1080})
 98 |             logger.info("Browser instance created successfully.")
 99 |         except Exception as e:
100 |             logger.error("Failed to create browser instance.", exc_info=True)
101 |             raise e
102 | 
103 |     def getDriver(self):
104 |         """
105 |         Returns the current page instance.
106 | 
107 |         Args:
108 |             None
109 | 
110 |         Returns:
111 |             Page: The current page instance.
112 |         """
113 |         return self.page
114 | 
115 |     def closeDriver(self):
116 |         """
117 |         Closes the browser instance and stops Playwright.
118 | 
119 |         Args:
120 |             None
121 | 
122 |         Returns:
123 |             None
124 |         """
125 |         try:
126 |             self.browser.close()
127 |             self.playwright.stop()
128 |             logger.info("Browser instance closed successfully.")
129 |         except Exception as e:
130 |             logger.error("Failed to close browser instance.", exc_info=True)
131 |             raise e
132 | 
133 |     def closeCurrentTab(self):
134 |         """
135 |         Closes the current tab (page) without affecting the browser instance.
136 | 
137 |         Args:
138 |             None
139 | 
140 |         Returns:
141 |             None
142 |         """
143 |         if self.page and not self.page.is_closed():
144 |             try:
145 |                 self.page.close()
146 |                 self.page = self.browser.new_page()
147 |                 self.page.set_viewport_size({"width": 960, "height": 1080})
148 |                 logger.info("Current tab closed successfully.")
149 |             except Exception as e:
150 |                 logger.error("Failed to close current tab.", exc_info=True)
151 |                 raise e
152 | 


--------------------------------------------------------------------------------