├── .env example ├── .gitignore ├── LICENSE ├── README.md ├── poetry.lock ├── pyproject.toml └── src ├── __init__.py ├── autogen_configuration ├── autogen_config.py └── utils │ └── oai_config_list.json ├── configs └── logging │ ├── color_formatter.py │ ├── logging_config.json │ └── logging_config.py ├── create_oai_agent ├── oai_agent_creator.py ├── tool_definition_factory.py └── utils │ ├── oai_instructions.json │ └── tool_definitions.json ├── data └── assistant_id.json ├── oai_agent ├── oai_agent.py └── utils │ ├── create_oai_agent.py │ ├── load_assistant_id.py │ └── prompt.py ├── tests ├── test_autogen_config.py ├── test_logging_config.py └── test_webdriver.py ├── tools ├── analyze_content.py ├── click_element.py ├── go_back.py ├── input_text.py ├── jump_to_search_engine.py ├── read_url.py ├── save_to_file.py ├── scroll.py ├── utils │ ├── analyze_image.py │ ├── get_b64_screenshot.py │ ├── get_webdriver_instance.py │ ├── highlight_elements.py │ ├── load_context.py │ ├── mark_page.js │ ├── openai_client.py │ ├── prompts.json │ ├── save_screenshot.py │ └── vision_template.py └── wait.py └── webdriver └── webdriver.py /.env example: -------------------------------------------------------------------------------- 1 | PYTHONDONTWRITEBYTECODE=1 2 | OPENAI_API_KEY="" 3 | OPENAI_MODEL="gpt-4-turbo-preview" -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a 
template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | src/data/chrome_profile 163 | src/data/screenshots 164 | src/data/saved_data -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MultimodalWebAgent 2 | 3 | [![Watch the video](https://img.youtube.com/vi/jQ2Os682Ybs/0.jpg)](https://www.youtube.com/watch?v=jQ2Os682Ybs&t=0s) 4 | 5 | ## Roadmap 6 | 7 | I aim to develop an open-source variant inspired by [MultiOn AI](https://www.multion.ai/) that focuses on automating web workflows, complemented by a user-friendly interface. My current effort is towards crafting a streamlined yet sophisticated version of this WebAgent within the AutoGen Repository, specifically in the [multimodal_web_surfer branch](https://github.com/microsoft/autogen/tree/multimodal_web_surfer). If you're interested in collaborating on this project or want to create a product that makes a difference, I'm eager to connect. 
8 | 9 | ## Introduction 10 | 11 | This is a multimodal web agent that can understand and generate natural language and visual content, implemented using the [AutoGen](https://microsoft.github.io/autogen/) framework and the [Assistants API](https://platform.openai.com/docs/assistants/overview).\ 12 | It is based on the paper [WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models](https://arxiv.org/abs/2401.13919). 13 | 14 | ## Disclaimer 15 | 16 | This project is still in development and not yet ready for use.\ 17 | I managed to get the agent to work, but the results are not yet satisfactory. 18 | The prompt has to be carefully crafted to get good results. For example, the prompt should describe every step the agent has to take in detail, like "Go to the website, click on the button using tool X, scroll down using tool Y, click on the next button using tool Z, etc.". With this approach, the web agent works quite well. So feel free to try it out, give feedback and contribute. 19 | 20 | ## Installation 21 | 22 | 1. Rename the file '.env example' to '.env' and fill in your OpenAI API key. 23 | 2. Install the required packages using the following command: 24 | `poetry install` 25 | 26 | ## Run the agent 27 | 28 | 1. Craft a precise prompt for the agent. The prompt should describe every step the agent has to take in detail. 29 | You can find an example prompt in the file `src/oai_agent/utils/prompt.py`. 30 | 2.
Run the agent using the following command: 31 | `poetry run webagent` 32 | 33 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "multimodalwebagent" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["schauppi <46676413+schauppi@users.noreply.github.com>"] 6 | license = "Apache-2.0" 7 | readme = "README.md" 8 | packages = [ 9 | { include = "src", from = ".", format = "sdist" }, 10 | ] 11 | 12 | [tool.poetry.dependencies] 13 | python = ">=3.11,<3.13" 14 | python-dotenv = "^1.0.1" 15 | pyautogen = "^0.2.17" 16 | openai = "^1.13.3" 17 | colorama = "^0.4.6" 18 | tzlocal = "^5.2" 19 | playwright = "^1.42.0" 20 | pytest-playwright = "^0.4.4" 21 | instructor = "^0.6.4" 22 | 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | 28 | [tool.poetry.scripts] 29 | webagent = 'src.oai_agent.oai_agent:main' 30 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/schauppi/MultimodalWebAgent/eb57e9f7efc7718c558d614255cbe18d480f7d20/src/__init__.py -------------------------------------------------------------------------------- /src/autogen_configuration/autogen_config.py: -------------------------------------------------------------------------------- 1 | from src.configs.logging.logging_config import setup_logging 2 | import autogen 3 | from autogen import config_list_from_json 4 | import logging 5 | from dotenv import load_dotenv 6 | 7 | import os 8 | 9 | setup_logging() 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | dotenv_path = os.path.normpath(os.path.join( 14 | os.path.dirname(__file__), '..', '..', '.env')) 15 | 16 | try: 17 | load_dotenv(dotenv_path=dotenv_path) 18 | 
class GetConfig:
    """
    Load the AutoGen config list and inject the OpenAI API key into it.

    The API key is read from the ``OPENAI_API_KEY`` environment variable and
    the model used to filter the config list from ``OPENAI_MODEL``.

    Attributes:
        api_key (str): Value of ``OPENAI_API_KEY`` ('' when it is not set).
        config_list (dict): ``{'config_list': [...]}`` as returned by
            ``load_and_enrich_config_list``.

    Methods:
        __init__():
            Initialize with API key and config list.
        base_dir() -> str:
            Returns the base directory path.
        load_and_enrich_config_list() -> dict:
            Loads config list from a JSON file and enriches it with the API key.

    """

    def __init__(self) -> None:
        """
        Initialize with API key and config list.

        A missing API key is only logged as an error; the config list is
        still loaded and enriched with the empty key.

        Args:
            None
        """
        logger.info('Initializing GetConfig class')
        self.api_key = os.environ.get('OPENAI_API_KEY', '')
        if not self.api_key:
            logger.error('OPENAI_API_KEY not found in environment variables')
        self.config_list = self.load_and_enrich_config_list()

    @property
    def base_dir(self) -> str:
        """
        Returns the base directory path.

        The base directory is the parent of the directory that contains this
        module.

        Args:
            None

        Returns:
            str: The base directory path.
        """
        base_dir = os.path.dirname(os.path.dirname(__file__))
        logger.info(f'Retrieved base directory path: {base_dir}')
        return base_dir

    def load_and_enrich_config_list(self) -> dict:
        """
        Loads config list from a JSON file and enriches it with the API key.

        The JSON file is located relative to ``base_dir`` instead of the
        current working directory, so loading works regardless of where the
        process was started from.

        Args:
            None

        Returns:
            dict: ``{'config_list': [...]}``. The list is empty when loading
                or enriching fails (the failure is logged, not raised).
        """
        # Anchor the path on base_dir: a bare relative path only resolves
        # when the process is started from the repository root.
        config_path = os.path.join(
            self.base_dir, 'autogen_configuration', 'utils', 'oai_config_list.json')
        try:
            config_list = config_list_from_json(
                env_or_file=config_path,
                filter_dict={"model": os.environ.get('OPENAI_MODEL', '')}
            )
            logger.info('Config list loaded successfully')
            # Every entry gets the key from the environment, replacing
            # whatever placeholder the JSON file contains.
            for config in config_list:
                config['api_key'] = self.api_key
            logger.info('Config list enriched with API key')
        except Exception as e:
            logger.error(
                "Failed to load or enrich the config list.", exc_info=e)
            config_list = []
        return {'config_list': config_list}
def setup_logging(default_path: str = 'src/configs/logging/logging_config.json', default_level: int = logging.INFO) -> None:
    """
    Setup logging configuration.

    When the JSON file at ``default_path`` exists, it is applied with
    ``logging.config.dictConfig`` and every configured handler that is
    attached to the root logger and names a formatter gets a
    ``ColoredFormatter`` built from that formatter's ``format`` string.
    Otherwise ``logging.basicConfig`` is called with ``default_level``.

    Any exception raised during setup is caught and printed, so this
    function never raises.

    Args:
        default_path (str): The default path to the logging configuration file.
        default_level (int): The default logging level.

    Returns:
        None
    """
    path = default_path
    try:
        if os.path.exists(path):
            with open(path, 'rt') as f:
                config = json.load(f)

            logging.config.dictConfig(config)

            # dictConfig stores each handler's config key as its ``name``,
            # which lets us pair a root handler with its own config entry
            # instead of always touching handlers[0].
            root_handlers = {
                handler.name: handler
                for handler in logging.getLogger().handlers
            }
            formatters = config.get('formatters', {})
            for handler_name, handler_details in config.get('handlers', {}).items():
                handler = root_handlers.get(handler_name)
                formatter_name = handler_details.get('formatter')
                if handler is None or formatter_name is None:
                    # Not attached to the root logger, or no formatter
                    # configured: leave the handler untouched.
                    continue
                handler.setFormatter(ColoredFormatter(
                    fmt=formatters[formatter_name]['format']))
        else:
            logging.basicConfig(level=default_level)
    except Exception as e:
        print(f"Error occurred while setting up logging: {e}")
def load_environment(self) -> None:
    """
    Load environment variables from the .env file.

    Reads the file at ``self.dotenv_path``. Failures are logged, never
    raised. ``load_dotenv`` reports a missing or empty file through its
    return value rather than an exception, so that case is logged as a
    warning instead of being reported as a success.

    Args:
        None

    Returns:
        None
    """
    try:
        loaded = load_dotenv(dotenv_path=self.dotenv_path)
    except Exception:
        logger.error("Failed to load the .env file.", exc_info=True)
        return

    if loaded:
        logger.info("Environment variables loaded successfully.")
    else:
        logger.warning(
            "No environment variables were loaded from %s; "
            "the file may be missing or empty.", self.dotenv_path)
def create_assistant(self) -> str:
    """
    Create the OpenAI assistant and return its ID.

    Builds one tool definition per entry in the tool config at
    ``self.config_path``, loads the instructions, and creates the assistant
    through ``self.client`` using the model named in the ``OPENAI_MODEL``
    environment variable ('' when unset) and ``self.assistant_type`` as name.

    Args:
        None

    Returns:
        str: The ID of the created assistant.

    Raises:
        Exception: Any error raised while building the tools, loading the
            instructions or calling the API is logged and re-raised.
    """
    try:
        tool_factory = ToolDefinitionFactory(self.config_path)
        tool_definitions = []
        for tool_name in tool_factory.config:
            tool_definitions.append(
                tool_factory.create_tool_definition(tool_name))

        assistant_instructions = self.load_instructions()
        model_name = os.environ.get('OPENAI_MODEL', '')
        created = self.client.beta.assistants.create(
            model=model_name,
            instructions=assistant_instructions,
            tools=tool_definitions,
            name=self.assistant_type
        )
        logger.info(
            f"Assistant created successfully with ID: {created.id}")
        return created.id
    except Exception:
        logger.error("Failed to create assistant.", exc_info=True)
        raise
def run(self) -> None:
    """
    Execute the full assistant-creation workflow.

    Steps, in order: load the environment, initialize the OpenAI client,
    create the assistant, and save the assistant's ID. Exceptions raised by
    any step propagate to the caller.

    Args:
        None

    Returns:
        None
    """
    logger.info("Starting the OAIAssistantCreator process.")
    self.load_environment()
    self.initialize_openai_client()
    # Create first, then save the freshly returned ID.
    self.save_assistant_id(self.create_assistant())
    logger.info("OAIAssistantCreator process completed successfully.")
class ToolDefinitionFactory:
    """
    Factory class for creating tool definitions from a JSON configuration.

    Methods:
        load_config(path: str) -> dict:
            Load the configuration data from the configuration file.
        create_tool_definition(tool_type: str) -> dict:
            Create a tool definition based on the tool type.
    """

    def __init__(self, config_path: str) -> None:
        """
        Initialize the factory with the path to the configuration file.

        Args:
            config_path (str): The path to the configuration file.

        Raises:
            FileNotFoundError: If the configuration file does not exist.
            json.JSONDecodeError: If the configuration file is not valid JSON.
        """
        # Mapping of tool type -> raw tool configuration loaded from JSON.
        self.config = self.load_config(config_path)

    def load_config(self, path: str) -> dict:
        """
        Load the configuration data from the configuration file.

        Args:
            path (str): The path to the configuration file.

        Returns:
            dict: The loaded configuration data.

        Raises:
            FileNotFoundError: If no file exists at ``path``.
            json.JSONDecodeError: If the file content is not valid JSON.
        """
        try:
            # JSON is read as UTF-8 explicitly so the result does not depend
            # on the platform's default locale encoding.
            with open(path, 'r', encoding='utf-8') as file:
                return json.load(file)
        except FileNotFoundError:
            logger.error("Config file not found at path: %s", path)
            raise
        except json.JSONDecodeError:
            logger.error(
                "Failed to parse JSON from config file at path: %s", path)
            raise

    def create_tool_definition(self, tool_type: str) -> dict:
        """
        Create a tool definition based on the tool type.

        Args:
            tool_type (str): The type of the tool.

        Returns:
            dict: The tool definition, shaped as
                ``{"type": "function", "function": {...}}``.

        Raises:
            ValueError: If ``tool_type`` has no (non-empty) entry in the
                configuration.
            KeyError: If the entry lacks one of ``name``, ``description``,
                ``parameters`` or ``required``.
        """
        tool_config = self.config.get(tool_type)
        if not tool_config:
            logger.error("No tool definition found for type: %s", tool_type)
            raise ValueError(f"No tool definition found for type: {tool_type}")

        try:
            return {
                "type": 'function',
                "function": {
                    "name": tool_config['name'],
                    "description": tool_config['description'],
                    "parameters": {
                        "properties": tool_config['parameters'],
                        "required": tool_config['required'],
                        "type": "object"
                    }
                }
            }
        except KeyError as e:
            logger.error(
                "Key %s not found in tool config for type: %s", e, tool_type)
            raise
Use it to fill forms or conduct site-specific searches by entering text as described by users.\n\n- **Click Element Tool:** Allows you to click on described webpage elements, such as buttons, links, or checkboxes, facilitating site navigation and interaction.\n\n- **Jump to Search Engine Tool:** Directs you to a search engine (Google) for initiating searches or web navigation from a known point.\n\n- **Read URL Tool:** Lets you load webpages from specific URLs, granting access to any required site.\n\n- **Scroll Tool:** Enables vertical webpage scrolling to uncover content not immediately visible upon page load.\n\n- **Wait Tool:** Pauses operations for a specified duration, useful for allowing page elements to load or introducing delays between interactions.\n\n- **Analyze Content Tool:** Analyzes webpage content via screenshots, enabling comprehension and reporting on page information or layout.\n\n- **Go Back Tool:** Navigates back one page in the browser history, useful for retracing steps or revisiting previously viewed content efficiently.\n\n- **Save to File Tool:** Saves important information to a file for record-keeping or later use.\n\n##Objective:\n\nUtilize these tools to fulfill user-assigned tasks, ranging from locating specific webpage information, form submissions, to navigating through webpages for complex workflows. Your responses should be accurate and efficient, addressing the user's request while navigating the dynamic web environment.\n\n##Execution Steps:\n\n1. **Identify the Goal:** Understand the user's request and the desired outcome.\n2. **Select Appropriate Tools:** Choose the tools necessary for accomplishing the goal.\n3. **Execute the Task:** Employ the selected tools to interact with the web as required.\n4. 
**Report Back:** Provide the user with requested information, action confirmations, or findings explanations.\n\nEfficiency, accuracy in interpreting user needs, selecting the right tools, and effective navigation are key metrics of your performance." 3 | } -------------------------------------------------------------------------------- /src/create_oai_agent/utils/tool_definitions.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_text": { 3 | "name": "input_text", 4 | "description": "Interacts with a webpage by sending keys to input fields based on a provided query, and returns a response string with the result of the action or an error message. The process includes highlighting input elements, taking a screenshot for analysis, and then processing the inputs as per the query.", 5 | "parameters": { 6 | "query": { 7 | "description": "The query to be used for identifying the input fields and the keys to be sent.", 8 | "examples": [ 9 | "Type 'OpenAI' into the 'Search' input field.", 10 | "Type example@gmail.com into the 'Email' input field, and type 'securePassword!' into the 'Password' input field.", 11 | "Enter '123 Main St' into the address field and select 'United States' from the country dropdown." 12 | ], 13 | "title": "Query", 14 | "type": "string" 15 | } 16 | }, 17 | "required": ["query"] 18 | }, 19 | "click_element": { 20 | "name": "click_element", 21 | "description": "Clicks on a webpage element based on a user query.", 22 | "parameters": { 23 | "query": { 24 | "description": "A query string representing the user's description of the target element to click.", 25 | "examples": [ 26 | "Click the 'Submit' button.", 27 | "Click on the link titled 'Read More'.", 28 | "Select the checkbox next to 'I Agree'." 
29 | ], 30 | "title": "Query", 31 | "type": "string" 32 | } 33 | }, 34 | "required": ["query"] 35 | }, 36 | "jump_to_search_engine": { 37 | "name": "jump_to_search_engine", 38 | "description": "Navigates to the Google search engine using a WebDriver instance, and returns a response string with the result of the action and the current URL.", 39 | "parameters": {}, 40 | "required": [] 41 | }, 42 | "read_url": { 43 | "name": "read_url", 44 | "description": "Reads the content of a URL using a WebDriver instance adapted for Playwright, executes a script to remove popups, and returns the current URL.", 45 | "parameters": { 46 | "url": { 47 | "description": "The URL to read.", 48 | "examples": [ 49 | "https://www.example.com", 50 | "www.example.com" 51 | ], 52 | "title": "URL", 53 | "type": "string" 54 | } 55 | }, 56 | "required": ["url"] 57 | }, 58 | "scroll": { 59 | "name": "scroll", 60 | "description": "Scrolls the webpage in a specified direction ('up' or 'down') by a fixed amount of pixels and returns a response string with the result of the action.", 61 | "parameters": { 62 | "direction": { 63 | "description": "The direction to scroll the webpage in.", 64 | "enum": ["up", "down"], 65 | "examples": ["up", "down"], 66 | "title": "Direction", 67 | "type": "string" 68 | } 69 | }, 70 | "required": ["direction"] 71 | }, 72 | "wait": { 73 | "name": "wait", 74 | "description": "Waits for 5 seconds and returns a response string with the result of the action and the current URL.", 75 | "parameters": {}, 76 | "required": [] 77 | }, 78 | "analyze_content": { 79 | "name": "analyze_content", 80 | "description": "Analyzes the content of a webpage based on a screenshot and a user query, and returns a response string with insights and answers.", 81 | "parameters": { 82 | "query": { 83 | "description": "A query string representing the user's inquiry about the contents of the current web browser window.", 84 | "examples": [ 85 | "What are the main headlines in this news article 
screenshot?", 86 | "Is there any contact information available on this page?" 87 | ], 88 | "title": "Query", 89 | "type": "string" 90 | } 91 | }, 92 | "required": ["query"] 93 | }, 94 | "go_back": { 95 | "name": "go_back", 96 | "description": "Navigates back one page in the browser history using a WebDriver instance, and returns a response string with the result of the action and the current URL.", 97 | "parameters": {}, 98 | "required": [] 99 | }, 100 | "save_to_file": { 101 | "name": "save_to_file", 102 | "description": "Saves the data to a file.", 103 | "parameters": { 104 | "data": { 105 | "description": "The data to save to the file.", 106 | "examples": [ 107 | "This is some example data.", 108 | "{\"name\": \"John\", \"age\": 30}", 109 | "12345" 110 | ], 111 | "title": "Data", 112 | "type": "string" 113 | } 114 | }, 115 | "required": ["data"] 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/data/assistant_id.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "type": "oai_browsing_assistant", 4 | "id": "asst_gX0omAoCJ8A9vtbJkMLYtXd6", 5 | "date": "2024-03-10 11:03:05.827440" 6 | }, 7 | { 8 | "type": "oai_browsing_assistant", 9 | "id": "asst_WoIiFNMSnhKOgq6c36ET1wWb", 10 | "date": "2024-03-10 11:06:14.870218" 11 | }, 12 | { 13 | "type": "oai_browsing_assistant", 14 | "id": "asst_cvX7mpjg9oacOrUwGMD0RX5C", 15 | "date": "2024-03-10 11:08:08.545855" 16 | }, 17 | { 18 | "type": "oai_browsing_assistant", 19 | "id": "asst_6vU1tjrRqMf1QhB04DEDLobF", 20 | "date": "2024-03-11 13:15:11.977078" 21 | }, 22 | { 23 | "type": "oai_browsing_assistant", 24 | "id": "asst_gnT2lP147V1QCW4TFv6pDFim", 25 | "date": "2024-03-11 13:17:07.707864" 26 | }, 27 | { 28 | "type": "oai_browsing_assistant", 29 | "id": "asst_hZmZxOScHFufdhTsQOlw4dV0", 30 | "date": "2024-03-11 13:41:12.295331" 31 | }, 32 | { 33 | "type": "oai_browsing_assistant", 34 | "id": 
"asst_BzvWlaRSCb38ulMSCZXW3d2L", 35 | "date": "2024-03-13 18:35:25.562663" 36 | }, 37 | { 38 | "type": "BrowsingAgent", 39 | "id": "asst_YGjquz12EX0M1hgk8QR1sTKY", 40 | "date": "2024-03-13 18:38:23.557789" 41 | }, 42 | { 43 | "type": "BrowsingAgent", 44 | "id": "asst_prAKf2fRPin7eWyotdcaEVHh", 45 | "date": "2024-03-14 08:00:30.929771" 46 | } 47 | ] -------------------------------------------------------------------------------- /src/oai_agent/oai_agent.py: -------------------------------------------------------------------------------- 1 | from src.configs.logging.logging_config import setup_logging 2 | from src.oai_agent.utils.load_assistant_id import load_assistant_id 3 | from src.oai_agent.utils.create_oai_agent import create_agent 4 | from src.autogen_configuration.autogen_config import GetConfig 5 | from src.tools.read_url import read_url 6 | from src.tools.scroll import scroll 7 | from src.tools.jump_to_search_engine import jump_to_search_engine 8 | from src.tools.go_back import go_back 9 | from src.tools.wait import wait 10 | from src.tools.click_element import click_element 11 | from src.tools.input_text import input_text 12 | from src.tools.analyze_content import analyze_content 13 | from src.tools.save_to_file import save_to_file 14 | from src.oai_agent.utils.prompt import prompt 15 | 16 | import logging 17 | import autogen 18 | from autogen.agentchat import AssistantAgent 19 | from autogen.agentchat.contrib.gpt_assistant_agent import GPTAssistantAgent 20 | 21 | import openai 22 | 23 | setup_logging() 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def configure_agent(assistant_type: str) -> GPTAssistantAgent: 28 | """ 29 | Configure the GPT Assistant Agent with the specified tools and instructions. 30 | 31 | Args: 32 | None 33 | 34 | Returns: 35 | GPTAssistantAgent: An instance of the GPTAssistantAgent. 
36 | """ 37 | try: 38 | logger.info("Configuring GPT Assistant Agent...") 39 | assistant_id = load_assistant_id(assistant_type) 40 | llm_config = GetConfig().config_list 41 | oai_config = { 42 | "config_list": llm_config["config_list"], "assistant_id": assistant_id} 43 | gpt_assistant = GPTAssistantAgent( 44 | name=assistant_type, instructions=AssistantAgent.DEFAULT_SYSTEM_MESSAGE, llm_config=oai_config 45 | ) 46 | logger.info("GPT Assistant Agent configured.") 47 | return gpt_assistant 48 | except openai.NotFoundError: 49 | logger.warning("Assistant not found. Creating new assistant...") 50 | create_agent(assistant_type) 51 | return configure_agent() 52 | except Exception as e: 53 | logger.error(f"Unexpected error during agent configuration: {str(e)}") 54 | raise 55 | 56 | 57 | def register_functions(agent): 58 | """ 59 | Register the functions used by the GPT Assistant Agent. 60 | 61 | Args: 62 | agent (GPTAssistantAgent): An instance of the GPTAssistantAgent. 63 | 64 | Returns: 65 | None 66 | """ 67 | logger.info("Registering functions...") 68 | function_map = { 69 | "analyze_content": analyze_content, 70 | "click_element": click_element, 71 | "go_back": go_back, 72 | "input_text": input_text, 73 | "jump_to_search_engine": jump_to_search_engine, 74 | "read_url": read_url, 75 | "scroll": scroll, 76 | "wait": wait, 77 | "save_to_file": save_to_file, 78 | } 79 | agent.register_function(function_map=function_map) 80 | logger.info("Functions registered.") 81 | 82 | 83 | def create_user_proxy(): 84 | """ 85 | Create a User Proxy Agent. 86 | 87 | Args: 88 | None 89 | 90 | Returns: 91 | UserProxyAgent: An instance of the UserProxyAgent. 
def main():
    """
    Entry point: configure the browsing assistant, register its tools, and
    start a chat driven by the predefined prompt.

    Any exception raised along the way is logged and not propagated.

    Args:
        None

    Returns:
        None
    """
    try:
        assistant = configure_agent("BrowsingAgent")
        register_functions(assistant)
        # The proxy is created only after the assistant is fully set up.
        create_user_proxy().initiate_chat(assistant, message=prompt)
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
def load_assistant_id(assistant_type: str,
                      path: str = "src/data/assistant_id.json") -> "str | None":
    """
    Load the most recently saved assistant ID of the given type.

    The file is expected to hold a JSON list of entries with the keys
    ``type``, ``id`` and ``date``. Dates are parsed with
    ``datetime.fromisoformat`` so that timestamps written with or without
    a fractional-second part are both accepted.

    Args:
        - assistant_type (str): The type of assistant to load.
        - path (str): Location of the assistant ID file. Defaults to the
          project's ``src/data/assistant_id.json``.

    Returns:
        - assistant_id (str): The ID of the newest matching entry, or None
          when the file is missing, malformed, or has no entry of that type.
    """
    try:
        with open(path, "r", encoding="utf-8") as file:
            data = json.load(file)

        matching = [
            entry for entry in data if entry['type'] == assistant_type]
        if not matching:
            # Report the real cause instead of an opaque max() error.
            logger.error(
                "No assistant ID stored for type: %s", assistant_type)
            return None

        latest_entry = max(
            matching,
            key=lambda entry: datetime.datetime.fromisoformat(entry['date']))
        return latest_entry['id']
    except FileNotFoundError:
        logger.error("Assistant ID file not found.")
        return None
    except (json.JSONDecodeError, KeyError) as e:
        logger.error(f"Error loading assistant ID: {str(e)}")
        return None
    except Exception as e:
        logger.error(f"An error occurred while loading assistant ID: {str(e)}")
        return None
class TestGetConfig(unittest.TestCase):
    """Unit tests for GetConfig's config-list loading."""

    def setUp(self):
        """Create a fresh GetConfig instance for every test."""
        self.get_config = GetConfig()

    @patch('src.autogen_configuration.autogen_config.config_list_from_json')
    def test_load_and_enrich_config_list(self, mock_config_list_from_json):
        """Each loaded entry is expected to carry the configured API key."""
        model_names = ('model1', 'model2')
        mock_config_list_from_json.return_value = [
            {'model': model_name} for model_name in model_names]
        self.get_config.api_key = 'test_api_key'

        enriched = self.get_config.load_and_enrich_config_list()

        # Expected entries are built separately from the mocked input.
        wanted = [
            {'model': model_name, 'api_key': 'test_api_key'}
            for model_name in model_names]
        self.assertEqual(enriched['config_list'], wanted)
'src/configs/logging/logging_config.json') 18 | mock_file.assert_called_once_with( 19 | 'src/configs/logging/logging_config.json', 'rt') 20 | mock_dict_config.assert_called_once() 21 | 22 | @patch('os.path.exists', return_value=False) 23 | @patch('logging.basicConfig') 24 | def test_fallback_to_basic_config(self, mock_basic_config, mock_exists): 25 | setup_logging() 26 | mock_exists.assert_called_once_with( 27 | 'src/configs/logging/logging_config.json') 28 | mock_basic_config.assert_called_once_with(level=logging.INFO) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /src/tests/test_webdriver.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, MagicMock 3 | from src.webdriver.webdriver import WebDriver 4 | import locale 5 | from tzlocal import get_localzone_name 6 | 7 | 8 | class TestWebDriver(unittest.TestCase): 9 | 10 | def setUp(self): 11 | self.webdriver = WebDriver.getInstance() 12 | 13 | def tearDown(self): 14 | self.webdriver.closeDriver() 15 | 16 | @patch('src.webdriver.webdriver.sync_playwright') 17 | def test_createDriver(self, mock_sync_playwright): 18 | mock_playwright = MagicMock() 19 | mock_browser = MagicMock() 20 | mock_page = MagicMock() 21 | mock_sync_playwright.return_value.start.return_value = mock_playwright 22 | mock_playwright.chromium.launch_persistent_context.return_value = mock_browser 23 | mock_browser.new_page.return_value = mock_page 24 | 25 | self.webdriver.createDriver() 26 | 27 | mock_sync_playwright.assert_called_once() 28 | timezone_id = get_localzone_name() 29 | system_locale = locale.getlocale() 30 | mock_sync_playwright.return_value.start.assert_called_once() 31 | mock_playwright.chromium.launch_persistent_context.assert_called_once_with( 32 | user_data_dir="src/data/chrome_profile", 33 | headless=False, 34 | args=[ 35 | "--disable-gpu", 36 
def analyze_content(query: str) -> str:
    """
    Answer a question about the current page by analysing a screenshot of it.

    Args:
        query (str): The user's question about the contents of the current
            web browser window.

    Returns:
        str: The analysis result, or a fixed error message when any step
            fails (the failure is logged with its traceback).
    """
    try:
        logger.info("Initializing WebDriver to capture a webpage screenshot.")
        page = get_webdriver_instance()
        encoded_shot = get_b64_screenshot(page)
        system_context = load_context("analyze_content")

        logger.info("Generating message history for image analysis.")
        history = get_vision_template(system_context, encoded_shot, query)

        logger.info("Analyzing the webpage screenshot.")
        return analyze_image(history)
    except Exception:
        logger.error("Failed to analyze content.", exc_info=True)
        return "An error occurred while analyzing the webpage content. Please try again later."
def process_click(driver, message_history, bbox_coordinates, bbox_descriptions) -> str:
    """
    Processes the click action by analyzing the image and clicking the identified element.

    The image analysis is attempted up to three times. A reply containing
    "none" (case-insensitive) ends processing immediately. Otherwise the
    first integer in the reply is used as the element index.

    Args:
        driver (WebDriver): An instance of the WebDriver.
        message_history (str): A string representing the message history for image analysis.
        bbox_coordinates (list): A list of bounding box coordinates.
        bbox_descriptions (str): A JSON string of elements' descriptions.

    Returns:
        str: A response string with the result of the click action.
    """
    # Local import keeps this block self-contained.
    import re

    for attempt in range(3):
        try:
            logger.info("Analyzing image to identify the clickable element.")
            message = analyze_image(message_history)
            if "none" in message.lower():
                return "No element found matching the description."

            # Use the first integer only. Joining every digit in the reply
            # (e.g. "element 12 of 30" -> 1230) invents an index the model
            # never mentioned.
            index_match = re.search(r"\d+", message)
            if index_match is None:
                raise ValueError(
                    f"No element index found in response: {message!r}")
            element_index = int(index_match.group())

            bbox = bbox_coordinates[element_index]
            return click_field(driver, bbox, bbox_descriptions, element_index)
        except Exception as e:
            logger.warning("Attempt %d: %s", attempt + 1, e, exc_info=True)
            if attempt == 2:  # Last attempt
                return "Failed to click on the element after several attempts."
    # Defensive fallback; every loop iteration above returns or retries.
    return "Failed to process click action."
def go_back() -> str:
    """
    Step one page back in the browser history.

    Args:
        None

    Returns:
        str: A success message containing the current URL, or an error
            message when navigation fails.
    """
    try:
        page = get_webdriver_instance()
        logger.info("Navigating back 1 page...")
        page.go_back()

        # Pause for three seconds before reading the URL.
        time.sleep(3)

        success_prefix = "Success. Went back 1 page. Current URL is: "
        return success_prefix + page.url
    except Exception as e:
        logger.error("An error occurred while navigating back: %s", str(e))
        return "Error occurred while navigating back. Please check the logs for more details."
def input_text(query: str) -> str:
    """
    Fill input fields on the current page according to a natural-language query.

    The page's input elements are highlighted and numbered, a screenshot is
    taken, the highlights are removed again, and the screenshot plus the
    element descriptions are sent to the vision model via process_input().

    Args:
        query (str): Description of what should be typed and where.

    Returns:
        str: A message describing which elements were filled, or a failure message.
    """
    try:
        driver = get_webdriver_instance()
        logger.info("Highlighting input elements on the page.")
        highlighted = highlight_elements(driver, "input")
        # highlight_elements reports failures by returning the error text
        # instead of raising; surface that instead of a confusing unpack error.
        if isinstance(highlighted, str):
            raise RuntimeError(highlighted)
        bbox_descriptions, bbox_coordinates, driver = highlighted
        screenshot = get_b64_screenshot(driver)
        highlight_elements(driver, "remove")
        # get_b64_screenshot signals failure with an empty string; there is
        # no point in sending an empty image to the vision model.
        if not screenshot:
            raise RuntimeError("Screenshot capture returned no data.")
    except Exception as e:
        logger.error("Error highlighting input elements: %s", e, exc_info=True)
        return "Failed to highlight input elements. Please check the logs for more details."

    try:
        input_template = load_context("input_template")
        enriched_query = f"{query}.\n\nText on all visible input elements: {bbox_descriptions}"
        message_history = get_vision_template(
            input_template, screenshot, enriched_query)
        return process_input(driver, message_history, bbox_coordinates)
    except Exception as e:
        logger.error("Error processing input action: %s", e, exc_info=True)
        return "Failed to process input action. Please check the logs for more details."


def process_input(driver, message_history, bbox_coordinates) -> str:
    """
    Ask the vision model which fields to fill and fill them, retrying on failure.

    Args:
        driver: The page object used to click and type.
        message_history (list): Chat messages (as built by get_vision_template)
            that are passed to analyze_image().
        bbox_coordinates (list): (x, y) click targets indexed by element label.

    Returns:
        str: A message listing the filled element labels, a "no matching
        element" message, or a failure message after three failed attempts.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            logger.info("Analyzing the image to identify input fields.")
            message = analyze_image(message_history)
            # NOTE(review): this is a plain substring match, so any answer
            # containing "none" (even inside a value) counts as "no match".
            if "none" in message.lower():
                return "No matching element found. Use AnalyzeContent tool for further analysis."
            input_elements = extract_input_elements(message)
            fill_input_fields(driver, input_elements, bbox_coordinates)
            return f"Inserted text into the following elements: {list(input_elements.keys())}"
        except Exception as e:
            logger.warning(f"Attempt {attempt}: {e}", exc_info=True)
    return "Failed to input text after several attempts."


def extract_input_elements(message: str) -> dict:
    """
    Extract the label -> text mapping from the vision model's answer.

    The mapping is looked for, in order, inside a fenced code block
    (with or without a ``json`` tag), then in the outermost ``{...}`` span of
    the message. It is parsed as JSON first; because the input prompt shows a
    single-quoted example, a Python-literal dict is accepted as a fallback.

    Args:
        message (str): The raw answer returned by the image analysis.

    Returns:
        dict: Element labels mapped to the text to type into them.

    Raises:
        json.JSONDecodeError: If no parsable mapping is found.
        ValueError: If the parsed value is not a dictionary.
    """
    import ast
    import re

    logger.info("Extracting input elements from the message.")

    fenced = re.search(r"```(?:json)?\s*(.*?)```", message, re.DOTALL)
    if fenced:
        candidate = fenced.group(1).strip()
    else:
        first_brace = message.find("{")
        last_brace = message.rfind("}")
        if 0 <= first_brace < last_brace:
            candidate = message[first_brace:last_brace + 1]
        else:
            candidate = message.strip()

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as e:
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            logger.error("Error parsing JSON from message: %s", e, exc_info=True)
            raise e from None

    if not isinstance(parsed, dict):
        raise ValueError(
            f"Expected a mapping of element labels to text, got {type(parsed).__name__}.")
    return parsed


def fill_input_fields(driver, input_elements, bbox_coordinates) -> None:
    """
    Type each value into the input field identified by its label.

    Enter is pressed only after the last field, so a multi-field form is not
    submitted before every field has been filled.

    Args:
        driver: The page object used to click and type.
        input_elements (dict): Element labels mapped to the text to type.
        bbox_coordinates (list): (x, y) click targets indexed by element label.

    Returns:
        None

    Raises:
        Exception: Any error raised while resolving a label or typing is
            logged and re-raised (IndexError for a label outside the list).
    """
    logger.info("Filling input fields with provided values.")
    try:
        entries = list(input_elements.items())
        last_position = len(entries) - 1
        for position, (key, value) in enumerate(entries):
            index = int(key)
            # A negative label would silently wrap around to the end of the list.
            if not 0 <= index < len(bbox_coordinates):
                raise IndexError(f"No highlighted element with label {key}.")
            click_and_fill(driver, bbox_coordinates[index], str(value),
                           press_enter=position == last_position)
        logger.info("Input successfully completed.")
    except Exception as e:
        logger.error("Error during input: %s", e, exc_info=True)
        raise


def click_and_fill(driver, bbox, text, press_enter: bool = True):
    """
    Click an input field, clear it, and type the given text.

    Args:
        driver: The page object exposing ``mouse`` and ``keyboard``.
        bbox (tuple): The (x, y) coordinates to click.
        text (str): The text to type into the field.
        press_enter (bool): Whether to press Enter after typing. Defaults to
            True, which matches the previous behaviour of this function.

    Returns:
        None
    """
    driver.mouse.click(bbox[0], bbox[1])
    # Select-all uses the Command key on macOS and Control elsewhere.
    select_all_shortcut = "Meta+A" if platform.system() == "Darwin" else "Control+A"
    driver.keyboard.press(select_all_shortcut)
    time.sleep(1)
    driver.keyboard.press("Backspace")
    time.sleep(1)
    driver.keyboard.type(text)
    time.sleep(1)
    if press_enter:
        driver.keyboard.press("Enter")
        time.sleep(1)
    # NOTE(review): the typed text is logged verbatim; confirm this is
    # acceptable when the text is a credential.
    logger.info(f"Filled input field with text: {text}")
def jump_to_search_engine() -> str:
    """
    Open the Google start page in the shared browser page.

    Args:
        None

    Returns:
        str: A success message ending with the current URL, or an error
        message containing the exception text.
    """
    try:
        page = get_webdriver_instance()

        page.goto("https://www.google.com")

        # Fixed pause before the URL is read back.
        time.sleep(3)

        logger.info(
            "Success. Jumped to Google search engine. Current URL is: " + page.url)
        return "Success. Jumped to Google search engine. Current URL is: " + page.url

    except Exception as e:
        failure = "An error occurred: " + str(e)
        logger.error(failure)
        return failure


def _normalize_url(url: str) -> str:
    """Return *url* with ``https://`` prepended unless it already names a scheme."""
    import re

    url = url.strip()
    if re.match(r"[A-Za-z][A-Za-z0-9+.\-]*://", url):
        # Already absolute (http://, https://, file://, ...): leave untouched.
        return url
    if url.startswith("//"):
        # Protocol-relative URL: only the scheme itself is missing.
        return "https:" + url
    return "https://" + url


def read_url(url: str) -> str:
    """
    Navigate the shared browser page to a URL and report the resulting URL.

    A URL without a scheme gets ``https://`` prepended; a URL that already
    carries a scheme (for example ``http://``) is used as given.

    Args:
        url (str): The URL to open.

    Returns:
        str: "Current URL is: <url>" followed by a newline.

    Raises:
        Exception: Any error raised while navigating is logged and re-raised.
    """
    try:
        url = _normalize_url(url)

        logger.info(f"Reading URL: {url}")
        driver = get_webdriver_instance()
        driver.goto(url)

        return "Current URL is: " + driver.url + "\n"
    except Exception as e:
        logger.error(f"Failed to read URL {url}: {e}")
        raise
def save_to_file(data: str) -> None:
    """
    Write the data to ``src/data/saved_data/<timestamp>/data.txt``.

    The timestamp folder is named after the current local time
    (``%Y-%m-%d_%H-%M-%S``). The process working directory is left
    untouched, so other relative paths keep working after a save.

    Args:
        data (str): The text to store.

    Returns:
        None: Also on failure; errors are logged rather than raised.
    """
    try:
        base_folder = 'src/data/saved_data/'
        folder_name = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        target_folder = os.path.join(base_folder, folder_name)
        # Creates the base folder as well when it is missing. A second save
        # within the same second still fails here, as it did before.
        os.makedirs(target_folder)
        file_name = os.path.join(target_folder, "data.txt")
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(data)

        logger.info(f"Data saved to file: {file_name}")
    except Exception as e:
        logger.error(f"Error saving data to file: {str(e)}")
    return None


def scroll(direction: Literal["up", "down"]) -> str:
    """
    Scroll the current page vertically by 500 pixels.

    Args:
        direction (Literal["up", "down"]): Which way to scroll.

    Returns:
        str: A confirmation naming the direction and the distance.

    Raises:
        ValueError: If direction is neither "up" nor "down".
        Exception: Any error raised while scrolling is logged and re-raised.
    """
    try:
        if direction != "up" and direction != "down":
            raise ValueError("Direction must be either 'up' or 'down'.")

        page = get_webdriver_instance()

        scroll_amount = 500
        # Negative offsets move the viewport towards the top of the page.
        offset = -scroll_amount if direction == "up" else scroll_amount
        page.evaluate(f"window.scrollBy(0, {offset})")

        time.sleep(2)

        confirmation = f"Scrolled {direction} by {scroll_amount} pixels."
        logger.info(confirmation)
        return confirmation

    except Exception as e:
        logger.error(f"Failed to scroll {direction}: {e}")
        raise
def analyze_image(message_history, **kwargs):
    """
    Send a chat history containing an image to the vision model and return its answer.

    Args:
        message_history (list): Chat messages passed as ``messages`` to the
            chat-completions call.
        **kwargs: Extra keyword arguments for the chat-completions call. They
            take precedence over the defaults used here (``model``,
            ``max_tokens``, ``temperature``).

    Returns:
        str: The text of the first choice, or a fixed error message when the
        request could not be made or failed (the error is logged).
    """
    # Defaults first, caller overrides second: passing e.g. max_tokens through
    # kwargs used to fail with a duplicate-keyword TypeError.
    request_options = {
        "model": "gpt-4-vision-preview",
        "max_tokens": 1024,
        "temperature": 0.1,
    }
    request_options.update(kwargs)

    try:
        logger.info("Getting OpenAI client.")
        client = get_openai_client()
        if client is None:
            raise RuntimeError("OpenAI client is not available.")

        logger.info("Sending request to OpenAI's GPT-4 Vision model.")
        response = client.chat.completions.create(
            messages=message_history,
            **request_options)

        message_text = response.choices[0].message.content

        logger.info("Image analysis completed successfully.")
        return message_text

    except Exception:
        logger.error(
            "Failed to analyze image with OpenAI's GPT-4 Vision model.", exc_info=True)
        return "An error occurred while analyzing the image. Please try again later."
def get_b64_screenshot(driver, element=None) -> str:
    """
    Capture a screenshot of the page, or of one element, as a base64 string.

    The image bytes returned by the screenshot call are encoded directly; no
    temporary file is written. A copy of every screenshot is also stored via
    save_screenshot_to_file().

    Args:
        driver: The Playwright page to capture when no element is given.
        element (optional): A Playwright element to capture instead of the
            page. Defaults to None.

    Returns:
        str: The base64-encoded screenshot, or an empty string when capturing
        failed (the error is logged).
    """
    try:
        logger.info("Capturing screenshot")
        if element is not None:
            # Element screenshots have no full_page option; passing it makes
            # the call fail, so it is only used for the page capture below.
            image_bytes = element.screenshot()
        else:
            image_bytes = driver.screenshot(full_page=False)

        screenshot = base64.b64encode(image_bytes).decode("utf-8")

        _ = save_screenshot_to_file(screenshot)

        return screenshot
    except Exception as e:
        logger.error(f"Error occurred while capturing screenshot: {str(e)}")
        return ""
def get_webdriver_instance():
    """
    Fetch the page object held by the WebDriver singleton.

    Args:
        None

    Returns:
        The object returned by ``WebDriver.getInstance().getDriver()``.

    Raises:
        Exception: Any error from obtaining the singleton or its driver is
            logged with a traceback and re-raised.
    """
    try:
        singleton = WebDriver.getInstance()
        return singleton.getDriver()
    except Exception as e:
        logger.error("Failed to get WebDriver instance: %s", e, exc_info=True)
        raise


def format_description(elements: list) -> tuple:
    """
    Turn marked-element records into label descriptions and click coordinates.

    Each element is labelled with its position in the list. The label text is
    the element's 'ariaLabel' unless that is missing or blank, in which case
    its 'text' value is used.

    Args:
        elements (list): Element dictionaries with keys such as 'ariaLabel',
            'x', 'y' and 'text'.

    Returns:
        tuple: A JSON string (indent 4) holding one ``{"<index>": <label>}``
        object per element, and a list of ``(x, y)`` tuples in the same order.
    """
    bbox_coordinates = [(entry.get("x"), entry.get("y")) for entry in elements]

    labels = []
    for index, entry in enumerate(elements):
        caption = entry.get("ariaLabel") or ""
        if caption.strip() == "":
            # No usable aria label: fall back to the element's visible text.
            caption = entry.get("text")
        labels.append({str(index): caption})

    return json.dumps(labels, indent=4), bbox_coordinates
def highlight_elements(driver, mark: Literal["click", "input", "all", "remove"]):
    """
    Mark elements on the page with numbered boxes, or remove the marks again.

    The marking script (mark_page.js, located next to this module) is injected
    into the page together with a call to the function matching ``mark``.

    Args:
        driver: The page object; its ``evaluate`` method runs the script.
        mark (Literal["click", "input", "all", "remove"]): Which elements to
            mark, or "remove" to delete existing marks.

    Returns:
        For "click", "input" and "all": a tuple
        ``(bbox_descriptions, bbox_coordinates, driver)``.
        For "remove": the driver.
        On any error: the error text as a string (the error is also logged).

    Raises:
        ValueError: If ``mark`` is not one of the four supported values.
    """
    marker_functions = {
        "click": "markClickableElements",
        "input": "markInputElements",
        "all": "markAllElements",
        "remove": "unmarkPage",
    }
    if mark not in marker_functions:
        raise ValueError(
            "Mark must be either 'click', 'input', 'all' or 'remove'.")

    try:
        from pathlib import Path

        # Resolve the script next to this module so the lookup does not
        # depend on the process working directory.
        script_path = Path(__file__).resolve().parent / "mark_page.js"
        mark_page_script = script_path.read_text(encoding="utf-8")

        elements = driver.evaluate(f"""() => {{
            {mark_page_script}
            return {marker_functions[mark]}();
        }}""")

        if mark == "remove":
            return driver
        bbox_descriptions, bbox_coordinates = format_description(elements)
        return bbox_descriptions, bbox_coordinates, driver
    except Exception as e:
        logger.error("Failed to highlight elements (%s): %s", mark, e, exc_info=True)
        return str(e)


def load_context(prompt_template: str) -> str:
    """
    Load one prompt from prompts.json, located next to this module.

    Args:
        prompt_template (str): The top-level key of the wanted entry, for
            example "input_template".

    Returns:
        str: The entry's "prompt" text.

    Raises:
        Exception: Any error while reading the file or looking up the key is
            logged with a traceback and re-raised.
    """
    try:
        from pathlib import Path

        logger.info("Loading the prompt from the JSON file.")
        prompts_path = Path(__file__).resolve().parent / "prompts.json"
        # The prompts contain non-ASCII punctuation, so the encoding must not
        # be left to the platform default.
        with open(prompts_path, 'r', encoding="utf-8") as file:
            prompts = json.load(file)
        return prompts[prompt_template]["prompt"]
    except Exception:
        logger.error(
            "Failed to load the prompt from the JSON file.", exc_info=True)
        raise
// Page-marking helpers. This script is injected into the page in front of a
// call to one of markClickableElements / markInputElements / markAllElements /
// unmarkPage. Each mark* function draws numbered dashed boxes over the
// matching elements and returns one record per numbered element.

const STYLE_TAG_ID = "mark-page-scrollbar-style";

const customCSS = `
    ::-webkit-scrollbar {
        width: 10px;
    }
    ::-webkit-scrollbar-track {
        background: #27272a;
    }
    ::-webkit-scrollbar-thumb {
        background: #888;
        border-radius: 0.375rem;
    }
    ::-webkit-scrollbar-thumb:hover {
        background: #555;
    }
`;

// The script runs again on every injection; add the stylesheet only once.
if (!document.getElementById(STYLE_TAG_ID)) {
  const styleTag = document.createElement("style");
  styleTag.id = STYLE_TAG_ID;
  styleTag.textContent = customCSS;
  document.head.append(styleTag);
}

// Remove every overlay box previously added by markElements.
function unmarkPage() {
  document.querySelectorAll("div[data-label]").forEach(function (overlay) {
    overlay.remove();
  });
}

// Draw a numbered box over every element accepted by includeCondition and
// return one {x, y, type, text, ariaLabel} record per element. The position
// of a record in the returned array equals the number shown on its box.
function markElements(includeCondition) {
  unmarkPage();

  // Viewport size is the same for every element, so compute it once.
  var vw = Math.max(
    document.documentElement.clientWidth || 0,
    window.innerWidth || 0
  );
  var vh = Math.max(
    document.documentElement.clientHeight || 0,
    window.innerHeight || 0
  );

  var items = Array.prototype.slice
    .call(document.querySelectorAll("*"))
    .map(function (element) {
      var textualContent = element.textContent.trim().replace(/\s{2,}/g, " ");
      var elementType = element.tagName.toLowerCase();
      var ariaLabel = element.getAttribute("aria-label") || "";

      var rects = [...element.getClientRects()]
        .filter((bb) => {
          // Keep a rect only if the element (or a descendant) is what is
          // actually hit at the rect's centre, i.e. it is not covered.
          var center_x = bb.left + bb.width / 2;
          var center_y = bb.top + bb.height / 2;
          var elAtCenter = document.elementFromPoint(center_x, center_y);

          return elAtCenter === element || element.contains(elAtCenter);
        })
        .map((bb) => {
          // Clip the rect to the viewport.
          const rect = {
            left: Math.max(0, bb.left),
            top: Math.max(0, bb.top),
            right: Math.min(vw, bb.right),
            bottom: Math.min(vh, bb.bottom),
          };
          return {
            ...rect,
            width: rect.right - rect.left,
            height: rect.bottom - rect.top,
          };
        });

      var area = rects.reduce((acc, rect) => acc + rect.width * rect.height, 0);

      return {
        element: element,
        include: includeCondition(element),
        area,
        rects,
        text: textualContent,
        type: elementType,
        ariaLabel: ariaLabel,
        id: element.id,
      };
    })
    .filter((item) => item.include && item.area >= 20);

  // Keep only the innermost matches: drop any item that contains another item.
  items = items.filter(
    (x) => !items.some((y) => x.element.contains(y.element) && !(x == y))
  );

  items.forEach(function (item, index) {
    item.rects.forEach((bbox) => {
      var newElement = document.createElement("div");
      newElement.className = "highlighted-element";
      newElement.setAttribute("data-label", "");
      newElement.setAttribute("data-text", item.text);
      var borderColor = "#000";
      newElement.style.outline = `2px dashed ${borderColor}`;
      newElement.style.position = "fixed";
      newElement.style.left = bbox.left + "px";
      newElement.style.top = bbox.top + "px";
      newElement.style.width = bbox.width + "px";
      newElement.style.height = bbox.height + "px";
      newElement.style.pointerEvents = "none";
      newElement.style.boxSizing = "border-box";
      newElement.style.zIndex = 2147483647;

      var label = document.createElement("span");
      label.textContent = index;
      label.style.position = "absolute";
      label.style.top = "-19px";
      label.style.left = "0px";
      label.style.background = borderColor;
      label.style.color = "white";
      label.style.padding = "2px 4px";
      label.style.fontSize = "12px";
      label.style.borderRadius = "2px";
      newElement.appendChild(label);

      document.body.appendChild(newElement);
    });
  });

  // One record per numbered element (not per rect): an element that wraps
  // over several rects shows the same number on each box, so returning one
  // record per rect would shift every later index away from its label.
  // The click point is the centre of the element's largest visible rect.
  return items.map((item) => {
    const box = item.rects.reduce((best, rect) =>
      rect.width * rect.height > best.width * best.height ? rect : best
    );
    return {
      x: box.left + box.width / 2,
      y: box.top + box.height / 2,
      type: item.type,
      text: item.text,
      ariaLabel: item.ariaLabel,
    };
  });
}

// Mark links, textareas, selects and anything with a click handler or a
// pointer cursor.
function markClickableElements() {
  return markElements((element) => {
    return element.tagName === "A" ||
      element.tagName === "TEXTAREA" ||
      element.tagName === "SELECT" ||
      element.onclick != null ||
      window.getComputedStyle(element).cursor == "pointer";
  });
}

// Mark <input> elements only.
function markInputElements() {
  return markElements((element) => {
    return element.tagName === "INPUT";
  });
}

// Mark every interactive element type handled by this script.
function markAllElements() {
  return markElements((element) => {
    return element.tagName === "INPUT" ||
      element.tagName === "TEXTAREA" ||
      element.tagName === "SELECT" ||
      element.tagName === "BUTTON" ||
      element.tagName === "A" ||
      element.onclick != null ||
      window.getComputedStyle(element).cursor == "pointer" ||
      element.tagName === "IFRAME" ||
      element.tagName === "VIDEO";
  });
}
def get_openai_client():
    """
    Return the shared OpenAI client, creating it on first use.

    Creation happens under ``client_lock``. The API key is taken from
    ``openai.api_key`` or, failing that, the ``OPENAI_API_KEY`` environment
    variable; a missing or empty key counts as "not set".

    Args:
        None

    Returns:
        The instructor-patched ``openai.OpenAI`` client, or None when the
        client could not be created (the error is logged, not raised).
    """
    global client
    with client_lock:
        if client is None:
            try:
                logger.info("Creating OpenAI client")
                api_key = openai.api_key or os.getenv('OPENAI_API_KEY')
                # An empty string (as shipped in the example .env) is not a
                # usable key either, so test for falsiness rather than None.
                if not api_key:
                    raise ValueError(
                        "OpenAI API key is not set. Please set it using set_openai_key.")
                client = instructor.patch(openai.OpenAI(api_key=api_key,
                                                        max_retries=5))
                logger.info("OpenAI client created successfully.")
            except Exception as e:
                logger.error("Error creating OpenAI client: %s", e, exc_info=True)
    return client
If there might be additional information on the page regarding the user's question by scrolling up or down, notify the user about it as well." 4 | }, 5 | "click_template": { 6 | "prompt": "You are a robot browsing the web, just like humans. Your objective is to analyze a high-resolution screenshot of a webpage. The screenshot includes interactive elements, each framed by a bounding box made of dashed lines. These bounding boxes are clearly visible and labeled with numerical identifiers at the top left corner, allowing for each clickable element on the page to be uniquely recognized. \n\nA sequence number is displayed at the top left of each element's bounding box, ranging from 1 to n, where 'n' represents the total number of interactive elements depicted in the screenshot. \n\nWhen given a user's description of a target element, your duty is to thoroughly analyze the screenshot to find the element that accurately corresponds to this description. Identify the target element and report the sequence number found at the top left of its bounding box. The response should be solely the sequence number, with no additional characters or text. \n\nIf there is no element that matches the user's description in the screenshot, your response should be the word 'none'. This confirms that a detailed inspection was performed with no matching element located. \n\nIt is imperative to provide the precise sequence number for the identified element. The accuracy of this information is essential to enable proper interaction with the webpage as depicted in the screenshot." 7 | }, 8 | "input_template": { 9 | "prompt": "You are a robot browsing the web, just like humans. Your objective is to analyze a high-resolution screenshot of a webpage. The screenshot includes input fields, each framed by a bounding box made of dashed lines. 
def save_screenshot_to_file(screenshot_b64: str, folder_path: str = "src/data/screenshots") -> str:
    """
    Decode a base64 screenshot and store it as a numbered PNG file.

    The file is named ``<n>.png``, where n starts at the number of entries in
    the folder plus one and is increased until the name is unused, so an
    existing screenshot is never overwritten.

    Args:
        screenshot_b64 (str): The base64 encoded screenshot.
        folder_path (str): The folder the screenshot is written to; it is
            created when missing.

    Returns:
        str: The path of the written file, or an empty string on failure
        (the error is logged).
    """
    try:
        # Decode first so invalid input does not leave an empty file behind.
        image_bytes = base64.b64decode(screenshot_b64)

        if not os.path.exists(folder_path):
            logger.info(f"Creating directory {folder_path}.")
            os.makedirs(folder_path)

        count = len(os.listdir(folder_path)) + 1
        file_path = os.path.join(folder_path, f"{count}.png")
        # The entry count can point at a name that is already taken (after a
        # deletion, or when other files live in the folder); skip ahead.
        while os.path.exists(file_path):
            count += 1
            file_path = os.path.join(folder_path, f"{count}.png")

        with open(file_path, "wb") as file:
            logger.info(f"Saving screenshot to {file_path}.")
            file.write(image_bytes)

        return file_path
    except Exception as e:
        logger.error(f"Error saving screenshot: {str(e)}")
        return ""
def get_vision_template(context: str, screenshot: str, question: str) -> list:
    """
    Build the chat messages sent to the vision model.

    Args:
        context (str): The system prompt.
        screenshot (str): The base64 encoded PNG screenshot of the page.
        question (str): The user's query string.

    Returns:
        list: A system message holding the context and a user message holding
        the screenshot (as a data URL) followed by the question. An empty
        list is returned if building the messages fails.
    """
    try:
        logger.info("Creating vision template")
        # Screenshots are captured as PNG, so label the data URL accordingly.
        # NOTE(review): "image_url" is kept as a plain string as before;
        # confirm whether the API in use expects {"url": ...} instead.
        image_part = {
            "type": "image_url",
            "image_url": f"data:image/png;base64,{screenshot}",
        }
        text_part = {
            "type": "text",
            "text": f"{question}",
        }
        return [
            {
                "role": "system",
                "content": context,
            },
            {
                "role": "user",
                "content": [image_part, text_part],
            },
        ]
    except Exception as e:
        logger.error(f"Error occurred in get_vision_template: {e}")
        return []


def wait() -> str:
    """
    Pause for five seconds and report the page's current URL.

    Args:
        None

    Returns:
        str: A success message ending with the current URL, or a generic
        error message when obtaining the page or reading its URL failed.
    """
    try:
        page = get_webdriver_instance()

        logger.info("Waiting 5 seconds...")
        time.sleep(5)

        outcome = "Success. Waited 5 seconds. Current URL is: " + page.url
    except Exception as e:
        logger.error("An error occurred while waiting: %s", str(e))
        outcome = "Error occurred while waiting. Please check the logs for more details."
    return outcome
class WebDriver:
    """
    A singleton wrapping one Playwright browser context and its current page.

    Methods:
        getInstance(*args, **kwargs) -> WebDriver:
            Returns the singleton instance, creating it on first use.
        __init__(*args, **kwargs) -> None:
            Creates the browser and registers the instance as the singleton.
        createDriver(*args, **kwargs) -> None:
            Creates a new browser instance and sets up the page.
        getDriver() -> Page:
            Returns the current page instance.
        closeDriver() -> None:
            Closes the browser instance, stops Playwright and clears the singleton.
        closeCurrentTab() -> None:
            Replaces the current tab (page) with a fresh one.
    """

    __instance = None

    # Viewport applied to every page this class opens.
    _VIEWPORT = {"width": 960, "height": 1080}

    @staticmethod
    def getInstance(*args, **kwargs):
        """
        Returns the singleton instance of the WebDriver class.

        Args:
            *args: Forwarded to the constructor when the instance is created.
            **kwargs: Forwarded to the constructor when the instance is created.

        Returns:
            The singleton instance of the WebDriver class.
        """
        if WebDriver.__instance is None:
            WebDriver.__instance = WebDriver(*args, **kwargs)
        return WebDriver.__instance

    def __init__(self, *args, **kwargs):
        """
        Creates the browser and registers this object as the singleton.

        Args:
            *args: Forwarded to createDriver.
            **kwargs: Forwarded to createDriver.

        Returns:
            None

        Raises:
            Exception: If an instance already exists, or if the browser could
                not be created.
        """
        if WebDriver.__instance is not None:
            raise Exception("This class is a singleton!")
        # Register only after the browser exists: registering first would
        # leave a half-built instance behind whenever the launch fails, and
        # every later getInstance() call would hand it out.
        self.createDriver(*args, **kwargs)
        WebDriver.__instance = self

    def createDriver(self, *args, **kwargs):
        """
        Creates a new browser instance and sets up the page.

        A persistent Chromium context is launched (non-headless) with the
        system locale and local timezone, and one page is opened in it.

        Args:
            *args: Accepted for compatibility; not used.
            **kwargs: Accepted for compatibility; not used.

        Returns:
            None

        Raises:
            Exception: Any launch error is logged with a traceback and re-raised.
        """
        timezone_id = get_localzone_name()
        system_locale = locale.getdefaultlocale()

        try:
            playwright = sync_playwright().start()
            browser = playwright.chromium.launch_persistent_context(
                user_data_dir="src/data/chrome_profile",
                headless=False,
                args=[
                    "--disable-gpu",
                    "--disable-dev-shm-usage",
                    "--no-sandbox",
                    "--disable-web-security",
                    "--allow-running-insecure-content",
                ],
                locale=system_locale[0],
                timezone_id=timezone_id,
            )
            self.playwright = playwright
            self.browser = browser
            self.page = browser.new_page()
            self.page.set_viewport_size(dict(self._VIEWPORT))
            logger.info("Browser instance created successfully.")
        except Exception:
            logger.error("Failed to create browser instance.", exc_info=True)
            raise

    def getDriver(self):
        """
        Returns the current page instance.

        Args:
            None

        Returns:
            Page: The current page instance.
        """
        return self.page

    def closeDriver(self):
        """
        Closes the browser instance and stops Playwright.

        Playwright is stopped even when closing the browser fails, and the
        singleton slot is cleared so that a later getInstance() call creates
        a fresh browser instead of returning this closed one.

        Args:
            None

        Returns:
            None

        Raises:
            Exception: Any shutdown error is logged with a traceback and re-raised.
        """
        try:
            try:
                self.browser.close()
            finally:
                self.playwright.stop()
            logger.info("Browser instance closed successfully.")
        except Exception:
            logger.error("Failed to close browser instance.", exc_info=True)
            raise
        finally:
            if WebDriver.__instance is self:
                WebDriver.__instance = None

    def closeCurrentTab(self):
        """
        Closes the current tab (page) and opens a fresh one in the same browser.

        Nothing happens when there is no page or it is already closed.

        Args:
            None

        Returns:
            None

        Raises:
            Exception: Any error is logged with a traceback and re-raised.
        """
        if self.page and not self.page.is_closed():
            try:
                self.page.close()
                self.page = self.browser.new_page()
                self.page.set_viewport_size(dict(self._VIEWPORT))
                logger.info("Current tab closed successfully.")
            except Exception:
                logger.error("Failed to close current tab.", exc_info=True)
                raise