├── autowing ├── __init__.py ├── core │ ├── __init__.py │ ├── llm │ │ ├── __init__.py │ │ ├── client │ │ │ ├── __init__.py │ │ │ ├── qwen.py │ │ │ ├── deepseek.py │ │ │ ├── doubao.py │ │ │ └── openai.py │ │ ├── base.py │ │ └── factory.py │ ├── ai_context.py │ ├── cache │ │ └── cache_manager.py │ └── ai_fixture_base.py ├── utils │ └── transition.py ├── appium │ ├── actions.py │ └── fixture.py ├── selenium │ └── fixture.py └── playwright │ └── fixture.py ├── wechat.jpg ├── auto-wing.png ├── docs ├── image │ ├── working.png │ └── element_list.png └── how_to_work.md ├── examples ├── .env ├── test_selenium_iframes.py ├── test_playwright_iframes.py ├── test_selenium_pytest.py ├── test_selenium_unittest.py ├── test_playwright_pytest.py ├── test_appium_unittest.py ├── test_appium_pytest.py └── test_playwright_unittest.py ├── pyproject.toml ├── CHANGES.md ├── .gitignore ├── README.md └── LICENSE /autowing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autowing/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autowing/core/llm/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autowing/core/llm/client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wechat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeldomQA/auto-wing/HEAD/wechat.jpg -------------------------------------------------------------------------------- /auto-wing.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeldomQA/auto-wing/HEAD/auto-wing.png -------------------------------------------------------------------------------- /docs/image/working.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeldomQA/auto-wing/HEAD/docs/image/working.png -------------------------------------------------------------------------------- /docs/image/element_list.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeldomQA/auto-wing/HEAD/docs/image/element_list.png -------------------------------------------------------------------------------- /examples/.env: -------------------------------------------------------------------------------- 1 | AUTOWING_MODEL_PROVIDER=deepseek 2 | DEEPSEEK_API_KEY=sk-abdefghijklmnopqrstwvwxyz0123456789 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core>=1.0.0"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [project] 6 | name = "autowing" 7 | version = "0.6.1" 8 | description = "auto-wing is a tool that uses LLM to assist automated testing." 
9 | readme = "README.md" 10 | authors = [ 11 | { name = "defnngj", email = "defnngj@gmail.com" } 12 | ] 13 | requires-python = ">=3.9" 14 | 15 | dependencies = [ 16 | "openai>=1.60.1,<2.0.0", 17 | "dashscope>=1.22.1,<2.0.0", 18 | "pytest-playwright>=0.6.2,<0.7.0", 19 | "appium-python-client>=5.1.0,<6.0.0", 20 | "python-dotenv>=1.0.1,<2.0.0", 21 | "loguru>=0.7.3,<0.8.0", 22 | ] 23 | 24 | [project.urls] 25 | repository = "https://github.com/SeldomQA/auto-wing" 26 | homepage = "https://github.com/SeldomQA/auto-wing" 27 | -------------------------------------------------------------------------------- /autowing/utils/transition.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def selector_to_locator(selector: str) -> str: 5 | """ 6 | selector to playwright locator 7 | :param selector: 8 | :return: 9 | """ 10 | if '[text()=' in selector: 11 | return re.sub( 12 | r'\[text\(\)\s*=\s*(?P[\'"])(?P.*?)(?P=quote)\]', 13 | lambda m: f':has-text({m.group("quote")}{m.group("content")}{m.group("quote")})', 14 | selector 15 | ) 16 | 17 | return selector 18 | 19 | 20 | def selector_to_selenium(selector: str) -> str: 21 | """ 22 | selector to selenium 23 | :param selector: 24 | :return: 25 | """ 26 | if '[text()=' in selector: 27 | pattern = re.compile(r'\[text\(\)\s*=\s*(?P[\'"])(?P.*?)(?P=quote)\]') 28 | return pattern.sub(r'[contains(text(),\g\g\g)]', selector) 29 | 30 | return selector 31 | -------------------------------------------------------------------------------- /CHANGES.md: -------------------------------------------------------------------------------- 1 | ### 0.6.1 2 | 3 | * 支持AI操作文本链接。 4 | * appium升级`>5.1`。 5 | * 更新qwen默认模型,使用最新`qwen3`。 6 | 7 | ### 0.6.0 8 | 9 | * 增加默认缓存功能,减少不必要的LLM调用,增加速度。 10 | * 移动端支持iOS❕。 11 | * 更新qwen默认模型,使用最新`qwen2.5`。 12 | 13 | ### 0.5.1 14 | 15 | * 识别更多的页面元素。 16 | * CSS选择器优化提示词,用于识别包含`$`符号的ID属性。 17 | * `playwright`/`selenium` 分别支持表单操作。 18 | * 移除`prompt`中无效信息,节省`tokens`使用。 19 | * 
LLM客户端代码优化。 20 | 21 | ### 0.5.0 22 | 23 | * 功能:增加 `ai_function_case()`, 识别页面元素生成功能用例。 24 | * 功能:增加appium依赖,支持App端的AI操作。 25 | 26 | ### 0.4.0 27 | 28 | * 功能:增加 `doubao`支持。 29 | 30 | ### 0.3.0 31 | 32 | * 增加日志功能,调用相关API显示日志。 33 | * 优化fixture相关代码。 34 | * python版本要求`>=3.9`(最新selenium版本要求)。 35 | 36 | ### 0.2.2 37 | 38 | * 优化:`ai_query()`、`ai_assert()`识别速度和格式兼容性。 39 | 40 | ### 0.2.1 41 | 42 | * 优化:python版本要求改为`>=3.8`。 43 | 44 | ### 0.2.0 45 | 46 | * 功能:增加 `openai`支持。 47 | 48 | ### 0.1.0 49 | 50 | * 功能: 51 | * 支持LLM: `qwen`、`deepseek`。 52 | * 提供操作:`ai_action()`、`ai_query()`、`ai_assert()`。 53 | * 支持测试库: `playwright`、`selenium`等。 54 | -------------------------------------------------------------------------------- /examples/test_selenium_iframes.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import pytest 4 | from dotenv import load_dotenv 5 | from selenium import webdriver 6 | from selenium.webdriver.common.by import By 7 | 8 | from autowing.selenium.fixture import create_fixture 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def driver(): 13 | """ 14 | Create and configure Edge WebDriver instance. 15 | """ 16 | 17 | load_dotenv() 18 | 19 | driver = webdriver.Edge() 20 | 21 | yield driver 22 | 23 | driver.quit() 24 | 25 | 26 | @pytest.fixture 27 | def ai(driver): 28 | """ 29 | Create an AI-powered Selenium fixture. 
30 | """ 31 | ai_fixture = create_fixture() 32 | return ai_fixture(driver) 33 | 34 | 35 | def test_iframes(ai, driver): 36 | driver.get("https://sahitest.com/demo/iframesTest.htm") 37 | 38 | iframe = driver.find_element(By.XPATH, "/html/body/iframe") 39 | driver.switch_to.frame(iframe) 40 | 41 | ai.ai_action('点击"Link Test"链接') 42 | 43 | time.sleep(2) 44 | 45 | ai.ai_query('检查页面是否包含"linkByContent"字符串') 46 | 47 | 48 | if __name__ == '__main__': 49 | pytest.main(["test_selenium_iframes.py", "-s"]) 50 | -------------------------------------------------------------------------------- /examples/test_playwright_iframes.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from dotenv import load_dotenv 3 | from playwright.sync_api import Page, sync_playwright 4 | 5 | from autowing.playwright.fixture import create_fixture 6 | 7 | 8 | @pytest.fixture(scope="session") 9 | def page(): 10 | """playwright fixture""" 11 | 12 | load_dotenv() 13 | 14 | with sync_playwright() as p: 15 | browser = p.chromium.launch(headless=False) 16 | context = browser.new_context() 17 | page = context.new_page() 18 | 19 | yield page 20 | 21 | context.close() 22 | browser.close() 23 | 24 | 25 | @pytest.fixture 26 | def ai(page): 27 | """ai fixture""" 28 | ai_fixture = create_fixture() 29 | return ai_fixture(page) 30 | 31 | 32 | def test_baidu_search(page: Page, ai): 33 | page.goto("https://sahitest.com/demo/iframesTest.htm") 34 | 35 | iframe = page.frame_locator("body > iframe") 36 | 37 | ai.ai_action('点击"Link Test"链接', iframe) 38 | 39 | page.wait_for_timeout(2000) 40 | 41 | ai.ai_query('检查页面是否包含"linkByContent"字符串') 42 | 43 | 44 | if __name__ == '__main__': 45 | pytest.main(["test_playwright_iframes.py", "-s"]) 46 | -------------------------------------------------------------------------------- /autowing/core/ai_context.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, Optional 
2 | import json 3 | 4 | 5 | class AiContext: 6 | """ 7 | A class for managing AI context data. 8 | Provides storage and retrieval of context information used in AI operations. 9 | """ 10 | 11 | def __init__(self): 12 | """ 13 | Initialize an empty context storage. 14 | """ 15 | self._context: Dict[str, Any] = {} 16 | 17 | def set_context(self, key: str, value: Any) -> None: 18 | """ 19 | Store a value in the context. 20 | 21 | Args: 22 | key (str): The key under which to store the value 23 | value (Any): The value to store 24 | """ 25 | self._context[key] = value 26 | 27 | def get_context(self, key: str) -> Optional[Any]: 28 | """ 29 | Retrieve a value from the context. 30 | 31 | Args: 32 | key (str): The key of the value to retrieve 33 | 34 | Returns: 35 | Optional[Any]: The stored value, or None if the key doesn't exist 36 | """ 37 | return self._context.get(key) 38 | 39 | def to_json(self) -> str: 40 | """ 41 | Convert the context to a JSON string. 42 | 43 | Returns: 44 | str: JSON string representation of the context 45 | """ 46 | return json.dumps(self._context) 47 | -------------------------------------------------------------------------------- /autowing/appium/actions.py: -------------------------------------------------------------------------------- 1 | from time import sleep as sys_sleep 2 | 3 | from loguru import logger 4 | from selenium.webdriver.common.action_chains import ActionChains 5 | from selenium.webdriver.common.actions import interaction 6 | from selenium.webdriver.common.actions.action_builder import ActionBuilder 7 | from selenium.webdriver.common.actions.pointer_input import PointerInput 8 | 9 | 10 | class Action: 11 | """ 12 | Encapsulate basic actions: tap, etc 13 | """ 14 | 15 | def __init__(self, driver=None): 16 | self.driver = driver 17 | 18 | def tap(self, x: int, y: int, pause: float = 0.1, sleep: float = 1) -> None: 19 | """ 20 | Tap on the coordinates 21 | :param x: x coordinates 22 | :param y: y coordinates 23 | :param pause: 
pause time 24 | :param sleep: sleep time 25 | :return: 26 | """ 27 | logger.info(f"👆 top x={x},y={y}.") 28 | actions = ActionChains(self.driver) 29 | actions.w3c_actions = ActionBuilder(self.driver, mouse=PointerInput(interaction.POINTER_TOUCH, "touch")) 30 | actions.w3c_actions.pointer_action.move_to_location(x, y) 31 | actions.w3c_actions.pointer_action.pointer_down() 32 | actions.w3c_actions.pointer_action.pause(pause) 33 | actions.w3c_actions.pointer_action.release() 34 | actions.perform() 35 | sys_sleep(sleep) 36 | -------------------------------------------------------------------------------- /examples/test_selenium_pytest.py: -------------------------------------------------------------------------------- 1 | """ 2 | pytest example for Selenium with AI automation. 3 | """ 4 | import time 5 | import pytest 6 | from selenium import webdriver 7 | from autowing.selenium.fixture import create_fixture 8 | 9 | from dotenv import load_dotenv 10 | 11 | 12 | @pytest.fixture(scope="session") 13 | def driver(): 14 | """ 15 | Create and configure Edge WebDriver instance. 16 | """ 17 | # loading .env file 18 | load_dotenv() 19 | 20 | driver = webdriver.Edge() 21 | 22 | yield driver 23 | 24 | driver.quit() 25 | 26 | 27 | @pytest.fixture 28 | def ai(driver): 29 | """ 30 | Create an AI-powered Selenium fixture. 31 | """ 32 | ai_fixture = create_fixture() 33 | return ai_fixture(driver) 34 | 35 | 36 | def test_bing_search(ai, driver): 37 | """ 38 | Test Bing search functionality using AI-driven automation. 39 | 40 | This test demonstrates: 41 | 1. Navigating to Bing 42 | 2. Performing a search 43 | 3. 
Verifying search results 44 | """ 45 | # Navigate to Bing 46 | driver.get("https://cn.bing.com") 47 | 48 | ai.ai_action('搜索输入框输入"playwright"关键字,并回车') 49 | time.sleep(3) 50 | 51 | items = ai.ai_query('string[], 搜索结果列表中包含"playwright"相关的标题') 52 | assert len(items) > 1 53 | 54 | # 使用AI断言 55 | assert ai.ai_assert('检查搜索结果列表第一条标题是否包含"playwright"字符串') 56 | 57 | 58 | if __name__ == '__main__': 59 | pytest.main(["test_selenium_pytest.py", "-s"]) 60 | -------------------------------------------------------------------------------- /examples/test_selenium_unittest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unittest example for Selenium with AI automation. 3 | """ 4 | import unittest 5 | import time 6 | from selenium import webdriver 7 | from autowing.selenium.fixture import create_fixture 8 | from dotenv import load_dotenv 9 | 10 | 11 | class TestBingSearch(unittest.TestCase): 12 | 13 | @classmethod 14 | def setUpClass(cls): 15 | # load .env file 16 | load_dotenv() 17 | # Initialize Edge WebDriver 18 | cls.driver = webdriver.Edge() 19 | # Create AI fixture 20 | ai_fixture = create_fixture() 21 | cls.ai = ai_fixture(cls.driver) 22 | 23 | @classmethod 24 | def tearDownClass(cls): 25 | cls.driver.quit() 26 | 27 | def test_01_bing_search(self): 28 | """ 29 | Test Bing search functionality using AI-driven automation. 30 | 31 | This test demonstrates: 32 | 1. Navigating to Bing 33 | 2. Performing a search 34 | 3. 
Verifying search results 35 | """ 36 | self.driver.get("https://cn.bing.com") 37 | 38 | self.ai.ai_action('搜索输入框输入"playwright"关键字,并回车') 39 | time.sleep(3) 40 | 41 | items = self.ai.ai_query('string[], 搜索结果列表中包含"playwright"相关的标题') 42 | 43 | self.assertGreater(len(items), 1) 44 | 45 | self.assertTrue( 46 | self.ai.ai_assert('检查搜索结果列表第一条标题是否包含"playwright"字符串') 47 | ) 48 | 49 | 50 | if __name__ == '__main__': 51 | unittest.main() 52 | -------------------------------------------------------------------------------- /examples/test_playwright_pytest.py: -------------------------------------------------------------------------------- 1 | """ 2 | pytest example for Playwright with AI automation. 3 | """ 4 | import pytest 5 | from playwright.sync_api import Page, sync_playwright 6 | from autowing.playwright.fixture import create_fixture 7 | 8 | from dotenv import load_dotenv 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def page(): 13 | """ 14 | playwright fixture 15 | """ 16 | # loading .env file 17 | load_dotenv() 18 | with sync_playwright() as p: 19 | browser = p.chromium.launch(headless=False) 20 | context = browser.new_context() 21 | page = context.new_page() 22 | yield page 23 | context.close() 24 | browser.close() 25 | 26 | 27 | @pytest.fixture 28 | def ai(page): 29 | """ 30 | ai fixture 31 | """ 32 | ai_fixture = create_fixture() 33 | return ai_fixture(page) 34 | 35 | 36 | def test_bing_search(page: Page, ai): 37 | """ 38 | Test Bing search functionality using AI-driven automation. 39 | This test demonstrates: 40 | 1. Navigating to Bing 41 | 2. Performing a search 42 | 3. 
Verifying search results 43 | """ 44 | page.goto("https://cn.bing.com") 45 | 46 | ai.ai_action('搜索输入框输入"playwright"关键字,并回车') 47 | page.wait_for_timeout(3000) 48 | 49 | items = ai.ai_query('string[], 搜索结果列表中包含"playwright"相关的标题') 50 | 51 | assert len(items) > 1 52 | 53 | print("assert") 54 | assert ai.ai_assert('检查搜索结果列表第一条标题是否包含"playwright"字符串') 55 | 56 | 57 | if __name__ == '__main__': 58 | pytest.main(["test_playwright_pytest.py", "-s"]) 59 | -------------------------------------------------------------------------------- /examples/test_appium_unittest.py: -------------------------------------------------------------------------------- 1 | import time 2 | import unittest 3 | 4 | from appium import webdriver 5 | from appium.options.android import UiAutomator2Options 6 | from dotenv import load_dotenv 7 | 8 | from autowing.appium.fixture import create_fixture 9 | 10 | 11 | class TestBingApp(unittest.TestCase): 12 | """ 13 | Test Bing APP 14 | """ 15 | 16 | @classmethod 17 | def setUpClass(cls): 18 | load_dotenv() 19 | 20 | def setUp(self): 21 | capabilities = { 22 | 'deviceName': 'MDX0220413011925', 23 | 'automationName': 'UiAutomator2', 24 | 'platformName': 'Android', 25 | 'appPackage': 'com.microsoft.bing', 26 | 'appActivity': 'com.microsoft.sapphire.app.main.MainSapphireActivity', 27 | 'noReset': True, 28 | } 29 | options = UiAutomator2Options().load_capabilities(capabilities) 30 | self.driver = webdriver.Remote(command_executor="http://127.0.0.1:4723", options=options) 31 | 32 | ai_fixture = create_fixture() 33 | self.ai = ai_fixture(self.driver) 34 | 35 | def tearDown(self): 36 | self.driver.quit() 37 | 38 | def test_bing_search(self): 39 | """ 40 | test bing App search 41 | """ 42 | self.ai.ai_action('点击搜索框,然后输入"auto-wing"关键字,然后回车搜索') 43 | time.sleep(3) 44 | 45 | items = self.ai.ai_query('string[], 搜索结果列表中包含"auto-wing"相关的标题') 46 | assert len(items) > 1 47 | 48 | self.ai.ai_assert('检查搜索结果列表第一条标题是否包含"auto-wing"字符串') 49 | 50 | 51 | if __name__ == '__main__': 52 | 
unittest.main() 53 | -------------------------------------------------------------------------------- /examples/test_appium_pytest.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import pytest 4 | from appium import webdriver 5 | from appium.options.android import UiAutomator2Options 6 | from dotenv import load_dotenv 7 | 8 | from autowing.appium.fixture import create_fixture 9 | 10 | 11 | @pytest.fixture(scope="function") 12 | def driver(): 13 | """ 14 | Create and configure Edge WebDriver instance. 15 | """ 16 | # loading .env file 17 | load_dotenv() 18 | 19 | capabilities = { 20 | 'deviceName': 'MDX0220413011925', 21 | 'automationName': 'UiAutomator2', 22 | 'platformName': 'Android', 23 | 'appPackage': 'com.microsoft.bing', 24 | 'appActivity': 'com.microsoft.sapphire.app.main.MainSapphireActivity', 25 | 'noReset': True, 26 | } 27 | options = UiAutomator2Options().load_capabilities(capabilities) 28 | driver = webdriver.Remote(command_executor="http://127.0.0.1:4723", options=options) 29 | 30 | yield driver 31 | 32 | driver.quit() 33 | 34 | 35 | @pytest.fixture 36 | def ai(driver): 37 | """ 38 | Create an AI-powered Selenium fixture. 39 | """ 40 | ai_fixture = create_fixture() 41 | return ai_fixture(driver, "Android") 42 | 43 | 44 | def test_bing_search(ai, driver): 45 | """ 46 | test bing App search 47 | """ 48 | ai.ai_action('点击搜索框,然后输入"auto-wing"关键字,然后回车搜索') 49 | time.sleep(3) 50 | 51 | items = ai.ai_query('string[], 搜索结果列表中包含"auto-wing"相关的标题') 52 | assert len(items) > 1 53 | 54 | ai.ai_assert('检查搜索结果列表第一条标题是否包含"auto-wing"字符串') 55 | 56 | 57 | if __name__ == '__main__': 58 | pytest.main(["test_appium_pytest.py", "-s"]) 59 | -------------------------------------------------------------------------------- /examples/test_playwright_unittest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unittest example for Playwright with AI automation. 
3 | """ 4 | import unittest 5 | from playwright.sync_api import sync_playwright 6 | from autowing.playwright.fixture import create_fixture 7 | from dotenv import load_dotenv 8 | 9 | 10 | class TestBingSearch(unittest.TestCase): 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | # loading .env file 15 | load_dotenv() 16 | # Initialize browser 17 | cls.playwright = sync_playwright().start() 18 | cls.browser = cls.playwright.chromium.launch(headless=False) 19 | cls.context = cls.browser.new_context() 20 | cls.page = cls.context.new_page() 21 | # Create AI fixture 22 | ai_fixture = create_fixture() 23 | cls.ai = ai_fixture(cls.page) 24 | 25 | @classmethod 26 | def tearDownClass(cls): 27 | cls.context.close() 28 | cls.browser.close() 29 | cls.playwright.stop() 30 | 31 | def test_01_bing_search(self): 32 | """ 33 | Test Bing search functionality using AI-driven automation. 34 | This test demonstrates: 35 | 1. Navigating to Bing 36 | 2. Performing a search 37 | 3. Verifying search results 38 | """ 39 | self.page.goto("https://cn.bing.com") 40 | 41 | self.ai.ai_action('搜索输入框输入"playwright"关键字,并回车') 42 | self.page.wait_for_timeout(3000) 43 | 44 | items = self.ai.ai_query('string[], 搜索结果列表中包含"playwright"相关的标题') 45 | 46 | self.assertGreater(len(items), 1) 47 | 48 | self.assertTrue( 49 | self.ai.ai_assert('检查搜索结果列表第一条标题是否包含"playwright"字符串') 50 | ) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /autowing/core/llm/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Dict, Any, Optional 3 | 4 | 5 | class BaseLLMClient(ABC): 6 | """ 7 | Abstract base class for Language Model clients. 8 | Defines the interface that all LLM clients must implement. 
9 | """ 10 | 11 | @abstractmethod 12 | def complete(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> str: 13 | """ 14 | Generate a completion for the given prompt with optional context. 15 | 16 | Args: 17 | prompt (str): The input text to generate completion for 18 | context (Optional[Dict[str, Any]]): Additional context information for the completion 19 | 20 | Returns: 21 | str: The generated completion text 22 | 23 | Raises: 24 | NotImplementedError: If the subclass doesn't implement this method 25 | """ 26 | pass 27 | 28 | @abstractmethod 29 | def complete_with_vision(self, prompt: Dict[str, Any]) -> str: 30 | """ 31 | Generate a completion for vision-based tasks. 32 | 33 | Args: 34 | prompt (Dict[str, Any]): A dictionary containing the prompt and image data 35 | in the format required by the specific model 36 | 37 | Returns: 38 | str: The generated completion text 39 | 40 | Raises: 41 | NotImplementedError: If the subclass doesn't implement this method 42 | """ 43 | pass 44 | 45 | @classmethod 46 | def get_model_name(cls) -> str: 47 | """ 48 | Get the standardized name of the model. 49 | 50 | Returns: 51 | str: The model name in lowercase, with 'client' suffix removed 52 | """ 53 | return cls.__name__.lower().replace('client', '') 54 | -------------------------------------------------------------------------------- /autowing/core/llm/factory.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Type 3 | 4 | from loguru import logger 5 | 6 | from autowing.core.llm.base import BaseLLMClient 7 | from autowing.core.llm.client.deepseek import DeepSeekClient 8 | from autowing.core.llm.client.doubao import DoubaoClient 9 | from autowing.core.llm.client.openai import OpenAIClient 10 | from autowing.core.llm.client.qwen import QwenClient 11 | 12 | 13 | class LLMFactory: 14 | """ 15 | Factory class for creating Language Model clients. 
16 | Provides centralized management of different LLM implementations. 17 | """ 18 | 19 | _models = { 20 | 'openai': OpenAIClient, 21 | 'qwen': QwenClient, 22 | 'deepseek': DeepSeekClient, 23 | 'doubao': DoubaoClient 24 | } 25 | 26 | @classmethod 27 | def create(cls) -> BaseLLMClient: 28 | """ 29 | Create an instance of the configured LLM client. 30 | 31 | Returns: 32 | BaseLLMClient: An instance of the specified LLM client 33 | 34 | Raises: 35 | ValueError: If the specified model provider is not supported 36 | """ 37 | model_name = os.getenv("AUTOWING_MODEL_PROVIDER", "deepseek").lower() 38 | if model_name not in cls._models: 39 | raise ValueError(f"Unsupported model provider: {model_name}") 40 | 41 | logger.info(f"🤖 AUTOWING_MODEL_PROVIDER={model_name}") 42 | 43 | model_class = cls._models[model_name] 44 | return model_class() 45 | 46 | @classmethod 47 | def register_model(cls, name: str, model_class: Type[BaseLLMClient]) -> None: 48 | """ 49 | Register a new LLM client implementation. 50 | 51 | Args: 52 | name (str): The name to register the model under 53 | model_class (Type[BaseLLMClient]): The class implementing the BaseLLMClient interface 54 | """ 55 | cls._models[name.lower()] = model_class 56 | -------------------------------------------------------------------------------- /autowing/core/cache/cache_manager.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | from datetime import datetime, timedelta 5 | from typing import Any, Optional 6 | 7 | 8 | class CacheManager: 9 | """ 10 | Manages caching of AI responses to improve performance. 11 | """ 12 | 13 | def __init__(self, cache_dir: str = ".auto-wing/cache", ttl_days: int = 7): 14 | """ 15 | Initialize the cache manager. 
16 | 17 | Args: 18 | cache_dir: Directory to store cache files 19 | ttl_days: Number of days to keep cache entries 20 | """ 21 | self.cache_dir = cache_dir 22 | self.ttl_days = ttl_days 23 | os.makedirs(cache_dir, exist_ok=True) 24 | 25 | def _generate_cache_key(self, prompt: str, context: dict) -> str: 26 | """Generate a unique cache key based on prompt and context.""" 27 | # Create a string combining prompt and relevant context 28 | cache_str = f"{prompt}:{json.dumps(context, sort_keys=True)}" 29 | return hashlib.md5(cache_str.encode()).hexdigest() 30 | 31 | def _get_cache_path(self, cache_key: str) -> str: 32 | """Get the file path for a cache entry.""" 33 | return os.path.join(self.cache_dir, f"{cache_key}.json") 34 | 35 | def get(self, prompt: str, context: dict) -> Optional[Any]: 36 | """ 37 | Get a cached response if available and not expired. 38 | """ 39 | cache_key = self._generate_cache_key(prompt, context) 40 | cache_path = self._get_cache_path(cache_key) 41 | 42 | if not os.path.exists(cache_path): 43 | return None 44 | 45 | try: 46 | with open(cache_path, 'r', encoding='utf-8') as f: 47 | cache_data = json.load(f) 48 | 49 | # Check if cache has expired 50 | cached_time = datetime.fromisoformat(cache_data['timestamp']) 51 | if datetime.now() - cached_time > timedelta(days=self.ttl_days): 52 | os.remove(cache_path) 53 | return None 54 | 55 | return cache_data['response'] 56 | except (json.JSONDecodeError, KeyError, ValueError): 57 | return None 58 | 59 | def set(self, prompt: str, context: dict, response: Any) -> None: 60 | """ 61 | Cache a response. 
62 | """ 63 | cache_key = self._generate_cache_key(prompt, context) 64 | cache_path = self._get_cache_path(cache_key) 65 | 66 | cache_data = { 67 | 'timestamp': datetime.now().isoformat(), 68 | 'prompt': prompt, 69 | 'context': context, 70 | 'response': response 71 | } 72 | 73 | with open(cache_path, 'w', encoding='utf-8') as f: 74 | json.dump(cache_data, f, ensure_ascii=False, indent=2) 75 | 76 | def clear(self, days: Optional[int] = None) -> None: 77 | """ 78 | Clear expired cache entries. 79 | 80 | Args: 81 | days: Optional number of days, defaults to ttl_days 82 | """ 83 | if days is None: 84 | days = self.ttl_days 85 | 86 | for filename in os.listdir(self.cache_dir): 87 | if not filename.endswith('.json'): 88 | continue 89 | 90 | filepath = os.path.join(self.cache_dir, filename) 91 | try: 92 | with open(filepath, 'r', encoding='utf-8') as f: 93 | cache_data = json.load(f) 94 | 95 | cached_time = datetime.fromisoformat(cache_data['timestamp']) 96 | if datetime.now() - cached_time > timedelta(days=days): 97 | os.remove(filepath) 98 | except (json.JSONDecodeError, KeyError, ValueError): 99 | # Remove invalid cache files 100 | os.remove(filepath) 101 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
class AiFixtureBase:
    """
    Base class for AI fixtures.

    Contains the response-processing logic shared by the Playwright,
    Selenium, and Appium fixtures: shrinking element payloads, stripping
    markdown fences from LLM replies, validating result formats, and
    caching prompt/response pairs.
    """

    def __init__(self):
        """Initialize the base fixture with cache support."""
        # CacheManager persists prompt/context -> response pairs so that a
        # repeated identical AI task can skip the LLM round-trip entirely.
        self.cache_manager = CacheManager()

    def _remove_empty_keys(self, dict_list: list) -> list:
        """
        Drop empty-string and None values from each element dictionary.

        This shrinks the page-context payload sent to the LLM and therefore
        reduces token usage.

        Args:
            dict_list (list): List of element-attribute dictionaries.

        Returns:
            list: A new list of dictionaries without empty values. Returns
            an empty list for falsy input.
        """
        if not dict_list:
            return []
        # Only '' and None count as empty; falsy-but-meaningful values such
        # as 0 and False are kept.
        return [
            {k: v for k, v in d.items() if v != '' and v is not None}
            for d in dict_list
        ]

    def _clean_response(self, response: str) -> str:
        """
        Clean the response text by stripping markdown formatting.

        Handles ```json fenced blocks, plain ``` fences, and a leading
        "json"/"python" language tag left over after fence removal.

        Args:
            response (str): Raw response from LLM.

        Returns:
            str: Cleaned response text.
        """
        response = response.strip()
        if '```' in response:
            # Prioritize handling ```json fences so the language tag is not
            # mistaken for payload content.
            if '```json' in response:
                response = response.split('```json')[1].split('```')[0].strip()
            else:
                response = response.split('```')[1].split('```')[0].strip()
        # If the cleaned response still starts with "json" or "python",
        # drop that first descriptive line.
        if response.startswith(('json', 'python')):
            parts = response.split('\n', 1)
            if len(parts) > 1:
                response = parts[1].strip()
        return response

    def _validate_result_format(self, result: Any, format_hint: str) -> Any:
        """
        Validate and convert the result to match the requested format.

        Args:
            result: The parsed result from the AI response.
            format_hint: The requested format (e.g., 'string[]', 'number[]',
                'object[]'). An empty hint returns the result unchanged.

        Returns:
            The validated and possibly converted result.

        Raises:
            ValueError: If the result cannot be coerced to the requested format.
        """
        if not format_hint:
            return result

        if format_hint == 'string[]':
            # Scalars are wrapped, then every item is stringified.
            if not isinstance(result, list):
                result = [str(result)]
            return [str(item) for item in result]

        if format_hint == 'number[]':
            if not isinstance(result, list):
                result = [result]
            try:
                return [float(item) for item in result]
            except (ValueError, TypeError):
                raise ValueError(f"Cannot convert results to numbers: {result}")

        if format_hint == 'object[]':
            if not isinstance(result, list):
                result = [result]
            if not all(isinstance(item, dict) for item in result):
                raise ValueError(f"Not all items are objects: {result}")
            return result

        # Unknown hints are passed through untouched.
        return result

    def _get_cached_or_compute(self, prompt: str, context: dict, compute_func) -> Any:
        """
        Get a response from the cache or compute it with the given function.

        Args:
            prompt: The prompt used to generate the cache key.
            context: The context used to generate the cache key.
            compute_func: Zero-argument callable that produces the response
                when there is no cache hit.

        Returns:
            The cached or freshly computed response.
        """
        # Try the cache first so identical tasks avoid another LLM call.
        cached_response = self.cache_manager.get(prompt, context)
        if cached_response is not None:
            return cached_response

        # Compute response if not cached.
        response = compute_func()

        # Cache the computed response for subsequent runs.
        self.cache_manager.set(prompt, context, response)

        return response
6 | 7 | ## 工作原理 8 | 9 | * __流程图__ 10 | 11 | ![](./image/working.png) 12 | 13 | 14 | ## 实现过程 15 | 16 | ### 1. 抓取当前页面所有元素 17 | 18 | ```js 19 | const getVisibleElements = () => { 20 | const elements = []; 21 | const selectors = [ 22 | 'input', 'button', 'a', '[role="button"]', 23 | '[role="link"]', '[role="searchbox"]', 'textarea' 24 | ]; 25 | 26 | for (const selector of selectors) { 27 | document.querySelectorAll(selector).forEach(el => { 28 | if (el.offsetWidth > 0 && el.offsetHeight > 0) { 29 | elements.push({ 30 | tag: el.tagName.toLowerCase(), 31 | type: el.getAttribute('type'), 32 | placeholder: el.getAttribute('placeholder'), 33 | value: el.value, 34 | text: el.textContent?.trim(), 35 | aria: el.getAttribute('aria-label'), 36 | id: el.id, 37 | name: el.getAttribute('name'), 38 | class: el.className 39 | }); 40 | } 41 | }); 42 | } 43 | return elements; 44 | }; 45 | ``` 46 | 47 | 获取当前页面的元素信息: 48 | 49 | ![](./image/element_list.png) 50 | 51 | 52 | ### 2.根据描述分析元素定位和动作 53 | 54 | __2.1 大模型的调用__ 55 | 56 | * openai SDK(推荐) 57 | 58 | ```python 59 | import os 60 | from openai import OpenAI 61 | 62 | client = OpenAI( 63 | # 若没有配置环境变量,请用百炼API Key将下行替换为:api_key="sk-xxx", 64 | api_key=os.getenv("DASHSCOPE_API_KEY"), 65 | base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", 66 | ) 67 | completion = client.chat.completions.create( 68 | model="qwen-plus", # 此处以qwen-plus为例,可按需更换模型名称。模型列表:https://help.aliyun.com/zh/model-studio/getting-started/models 69 | messages=[ 70 | {'role': 'system', 'content': 'You are a helpful assistant.'}, 71 | {'role': 'user', 'content': '你是谁?'}], 72 | ) 73 | 74 | print(completion.model_dump_json()) 75 | ``` 76 | 77 | * Request调用 78 | 79 | ```python 80 | import requests 81 | import os 82 | 83 | # 设置 API 密钥 84 | DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY") # 从环境变量中获取 API 密钥 85 | 86 | # 请求的 URL 87 | url = "https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions" 88 | 89 | # 请求头 90 | headers = { 91 | "Authorization": f"Bearer 
{DASHSCOPE_API_KEY}", 92 | "Content-Type": "application/json" 93 | } 94 | 95 | # 请求体 96 | data = { 97 | "model": "qwen-plus", 98 | "messages": [ 99 | { 100 | "role": "system", 101 | "content": "You are a helpful assistant." 102 | }, 103 | { 104 | "role": "user", 105 | "content": "你是谁?" 106 | } 107 | ] 108 | } 109 | 110 | # 发送 POST 请求 111 | response = requests.post(url, headers=headers, json=data) 112 | 113 | # 打印响应结果 114 | print(response.status_code) 115 | print(response.json()) 116 | ``` 117 | 118 | __2.2 提示词__ 119 | 120 | ```python 121 | 122 | action_prompt= """You are a web automation assistant. Based on the following page context, provide instructions for the requested action. 123 | 124 | Current page context: 125 | URL: {context['url']} 126 | Title: {context['title']} 127 | 128 | Available elements: 129 | {json.dumps(context['elements'], indent=2)} 130 | 131 | User request: {prompt} 132 | 133 | Return ONLY a JSON object with the following structure, no other text: 134 | {{ 135 | "selector": "CSS selector or XPath to locate the element", 136 | "action": "fill", 137 | "value": "text to input", 138 | "key": "key to press if needed" 139 | }} 140 | 141 | Example response: 142 | {{ 143 | "selector": "#search-input", 144 | "action": "fill", 145 | "value": "search text", 146 | "key": "Enter" 147 | }} 148 | """ 149 | 150 | ``` 151 | 152 | 识别并返回的数据: 153 | 154 | ```json 155 | { 156 | "selector": "#sb_form_q", 157 | "action": "fill", 158 | "value": "playwright", 159 | "key": "Enter" 160 | } 161 | ``` 162 | 163 | ### 3. 根据返回,转化动作。 164 | 165 | 转化为自动化工具的动作执行: 166 | 167 | ```python 168 | ... 
169 | element = self.page.locator(selector) 170 | if action == 'click': 171 | element.click() 172 | elif action == 'fill': 173 | element.fill(instruction.get('value', '')) 174 | if instruction.get('key'): 175 | element.press(instruction.get('key')) 176 | elif action == 'press': 177 | element.press(instruction.get('key', 'Enter')) 178 | else: 179 | raise ValueError(f"Unsupported action: {action}") 180 | ``` 181 | 182 | ## 代码设计 183 | 184 | 1. 如何支持更多的模型。 185 | 2. 如何支持多个测试库。 186 | 3. App测试有哪些不同。 187 | 4. 更多的基于AI的功能探索和尝试。 188 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # auto-wing 2 | 3 | ![](auto-wing.png) 4 | 5 | > auto-wing is a tool that uses LLM to assist automated testing, give your automated testing wings. 6 | 7 | auto-wing是一个利用LLM辅助自动化测试的工具, 为你的自动化测试插上翅膀。 8 | 9 | ### Features 10 | 11 | ⭐ 集成 `playwright`、`selenium`、`appium`,支持`Web UI`和`App UI`的`AI`操作。 12 | 13 | ⭐ 支持多模型:`openai`、`deepseek`、`qwen` 和 `doubao`。 14 | 15 | ⭐ 支持多种操作:`ai_action`、`ai_query`、`ai_assert`。 16 | 17 | ⭐ 默认支持缓存:首次执行AI任务会被缓存,后续执行相同的任务可以提升效率。 18 | 19 | ⭐ 无痛的集成到现有自动化项目(`pytest`、`unittest`)中。 20 | 21 | ## Install 22 | 23 | * 支持pip安装,`python >= 3.9`。 24 | 25 | ```shell 26 | pip install autowing 27 | ``` 28 | 29 | ## Setting Env 30 | 31 | __方法一__ 32 | 33 | 申请LLM需要的key,在项目的根目录下创建`.env`文件。推荐`qwen`和 `deepseek`,一是便宜,二是方便。 34 | 35 | * openai: https://platform.openai.com/ 36 | 37 | ```ini 38 | #.env 39 | AUTOWING_MODEL_PROVIDER = openai 40 | OPENAI_API_KEY = sk-proj-abdefghijklmnopqrstwvwxyz0123456789 41 | ``` 42 | 43 | * DeepSeek: https://platform.deepseek.com/ 44 | 45 | ```ini 46 | #.env 47 | AUTOWING_MODEL_PROVIDER = deepseek 48 | DEEPSEEK_API_KEY = sk-abdefghijklmnopqrstwvwxyz0123456789 49 | ``` 50 | 51 | * 阿里云百练(千问):https://bailian.console.aliyun.com/ 52 | 53 | ```ini 54 | #.env 55 | AUTOWING_MODEL_PROVIDER = qwen 56 | DASHSCOPE_API_KEY = 
sk-abdefghijklmnopqrstwvwxyz0123456789 57 | ``` 58 | 59 | * 火山方舟(豆包):https://console.volcengine.com/ 60 | 61 | ```ini 62 | #.env 63 | AUTOWING_MODEL_PROVIDER = doubao 64 | ARK_API_KEY = f61d2846-xxx-xxx-xxxx-xxxxxxxxxxxxx 65 | DOUBAO_MODEL_NAME = ep-20250207200649-xxx 66 | ``` 67 | 68 | __方法二__ 69 | 70 | > 如果不想使用python-dotenv配置环境变量,可以直接配置环境变量。 71 | 72 | ```shell 73 | export AUTOWING_MODEL_PROVIDER=deepseek 74 | export DEEPSEEK_API_KEY=sk-abdefghijklmnopqrstwvwxyz0123456789 75 | ``` 76 | 77 | > 其他LLM模型环境变量同样的方式配置。 78 | 79 | ## Examples 80 | 81 | 👉 [查看 examples](./examples) 82 | 83 | ```python 84 | import pytest 85 | from playwright.sync_api import Page, sync_playwright 86 | from autowing.playwright.fixture import create_fixture 87 | from dotenv import load_dotenv 88 | 89 | 90 | @pytest.fixture(scope="session") 91 | def page(): 92 | """playwright page fixture""" 93 | # load .env file config 94 | load_dotenv() 95 | with sync_playwright() as p: 96 | browser = p.chromium.launch(headless=False) 97 | context = browser.new_context() 98 | page = context.new_page() 99 | yield page 100 | context.close() 101 | browser.close() 102 | 103 | 104 | @pytest.fixture 105 | def ai(page): 106 | """ai fixture""" 107 | ai_fixture = create_fixture() 108 | return ai_fixture(page) 109 | 110 | 111 | def test_bing_search(page: Page, ai): 112 | # 访问必应 113 | page.goto("https://cn.bing.com") 114 | 115 | # 使用AI执行搜索 116 | ai.ai_action('搜索输入框输入"playwright"关键字,并回车') 117 | page.wait_for_timeout(3000) 118 | 119 | # 使用AI查询搜索结果 120 | items = ai.ai_query('string[], 搜索结果列表中包含"playwright"相关的标题') 121 | 122 | # 验证结果 123 | assert len(items) > 1 124 | 125 | # 使用AI断言 126 | assert ai.ai_assert('检查搜索结果列表第一条标题是否包含"playwright"字符串') 127 | ``` 128 | 129 | * 运行日志: 130 | 131 | ```shell 132 | > pytest test_playwright_pytest.py -s 133 | ================================================= test session starts ================================================= 134 | platform win32 -- Python 3.12.3, pytest-8.3.4, pluggy-1.5.0 
135 | rootdir: D:\github\seldomQA\auto-wing 136 | configfile: pyproject.toml 137 | plugins: base-url-2.1.0, playwright-0.6.2 138 | collected 1 item 139 | 140 | test_playwright_pytest.py 2025-02-04 10:00:30.961 | INFO | autowing.playwright.fixture:ai_action:88 - 🪽 AI Action: 搜索输入框输入"playwright"关键字,并回车 141 | 2025-02-04 10:00:40.070 | INFO | autowing.playwright.fixture:ai_query:162 - 🪽 AI Query: string[], 搜索结果列表中包 含"playwright"相关的标题 142 | 2025-02-04 10:00:48.954 | DEBUG | autowing.playwright.fixture:ai_query:218 - 📄 Query: ['Playwright 官方文档 | Playwright', 'Playwright - 快速、可靠的端到端测试框架', 'Playwright 中文文档 | Playwright', 'Playwright 入门指南 | Playwright', 'Playwright 测试框架 | Playwright', 'Playwright 教程 | Playwright', 'Playwright 使用指南 | Playwright', 'Playwright 自动化测试工具 | Playwright', 'Playwright 安装与配置 | Playwright', 'Playwright 示例代码 | Playwright'] 143 | 2025-02-04 10:00:48.954 | INFO | autowing.playwright.fixture:ai_assert:267 - 🪽 AI Assert: 检查搜索结果列表第一条标 题是否包含"playwright"字符串 144 | . 145 | 146 | ================================================= 1 passed in 27.99s ================================================== 147 | ``` 148 | 149 | ## Prompting Tips 150 | 151 | __1.提供更详细的描述以及样例__ 152 | 153 | 提供详细描述和示例一直是非常有用的提示词技巧。 154 | 155 | 错误示例 ❌: `"搜'耳机'"` 156 | 157 | 正确示例 ✅: `"找到搜索框(搜索框的上方应该有区域切换按钮,如 '国内', '国际'),输入'耳机',敲回车"` 158 | 159 | 错误示例 ❌: `"断言:外卖服务正在正常运行"` 160 | 161 | 正确示例 ✅: `"断言:界面上有个“外卖服务”的板块,并且标识着“正常”"` 162 | 163 | __2.一个 Prompt (指令)只做一件事__ 164 | 165 | 尽管 auto-wing 有自动重规划能力,但仍应保持指令简洁。否则,LLM 的输出可能会变得混乱。指令的长度对 token 消耗的影响几乎可以忽略不计。 166 | 167 | 错误示例 ❌:`"点击登录按钮,然后点击注册按钮,在表单中输入'test@test.com'作为邮箱,'test'作为密码,然后点击注册按钮"` 168 | 169 | 正确示例 170 | ✅: `将任务分解为三个步骤:"点击登录按钮" "点击注册按钮" "在表单中输入'test@test.com'作为邮箱,'test'作为密码,然后点击注册按钮"` 171 | 172 | __3.从界面做推断,而不是 DOM 属性或者浏览器状态__ 173 | 174 | 所有传递给 LLM 的数据都是截图和元素坐标。DOM和浏览器 对 LLM 来说几乎是不可见的。因此,务必确保你想提取的信息都在截图中有所体现且能被 175 | LLM “看到”。 176 | 177 | 正确示例 ✅:`标题是蓝色的` 178 | 179 | 错误实例 ❌:`标题有个 test-id-size 属性` 180 | 181 | 错误实例 ❌:`浏览器有两个 tab 开着` 182 | 183 | 错误实例 
❌:`异步请求已经结束了` 184 | 185 | __4.中、英文提示词无影响__ 186 | 187 | 由于大多数 AI 模型可以理解多种语言,所以请随意用你喜欢的语言撰写提示指令。即使提示语言与页面语言不同,通常也是可行的。 188 | 189 | ### 交流 190 | 191 | > 欢迎添加微信,交流和反馈问题。 192 | 193 |
194 |

微信

195 |
class QwenClient(BaseLLMClient):
    """
    Qwen (DashScope) API client implementation.

    Provides access to Alibaba Cloud's Qwen language models through the
    OpenAI-compatible interface exposed by DashScope.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the Qwen client.

        Args:
            api_key (Optional[str]): DashScope API key. If not provided, will
                try to get from the DASHSCOPE_API_KEY env var

        Raises:
            ValueError: If no API key is provided or found in environment variables
        """
        self.api_key = api_key or os.getenv("DASHSCOPE_API_KEY")
        if not self.api_key:
            raise ValueError("Please set the env variable `DASHSCOPE_API_KEY`")

        # Prefer provider-specific variables. The previous names
        # (OPENAI_BASE_URL / MIDSCENE_MODEL_NAME) are kept as fallbacks for
        # backward compatibility; OPENAI_BASE_URL in particular collides with
        # the OpenAI client's configuration when both providers are in use.
        self.base_url = os.getenv(
            "DASHSCOPE_BASE_URL",
            os.getenv("OPENAI_BASE_URL", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
        )
        self.model_name = os.getenv(
            "QWEN_MODEL_NAME",
            os.getenv("MIDSCENE_MODEL_NAME", "qwen3-max"),
        )

        self.client = OpenAI(
            api_key=self.api_key,
            base_url=self.base_url
        )

    def _truncate_text(self, text: str, max_length: int = 30000) -> str:
        """
        Truncate text to fit within model's length limits.

        Args:
            text (str): The input text to truncate
            max_length (int): Maximum allowed length for the text. Defaults to 30000

        Returns:
            str: Truncated text with ellipsis if needed
        """
        if len(text) > max_length:
            return text[:max_length] + "..."
        return text

    def _format_messages(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> List[Dict[str, str]]:
        """
        Format messages for the Qwen API.

        Args:
            prompt (str): The main prompt text
            context (Optional[Dict[str, Any]]): Additional context information

        Returns:
            List[Dict[str, str]]: Formatted messages list ready for API submission
        """
        # Add system message
        messages = [{
            "role": "system",
            "content": (
                "You are a web automation assistant. "
                "Analyze the page structure and provide precise element locators. "
                "Return responses in the requested format."
            )
        }]

        # Add context (if any); serialized as JSON and truncated to the limit.
        if context:
            context_str = json.dumps(context, ensure_ascii=False)
            messages.append({
                "role": "user",
                "content": f"Page context: {self._truncate_text(context_str)}"
            })

        # Add main prompt
        messages.append({
            "role": "user",
            "content": self._truncate_text(prompt)
        })

        return messages

    def complete(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> str:
        """
        Generate a completion using a Qwen model.

        Args:
            prompt (str): The text prompt to complete
            context (Optional[Dict[str, Any]]): Additional context for the completion

        Returns:
            str: The model's response text

        Raises:
            Exception: If there's an error communicating with the Qwen API
        """
        try:
            messages = self._format_messages(prompt, context)

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                temperature=0.7,
                max_tokens=2000
            )

            return response.choices[0].message.content
        except Exception as e:
            raise Exception(f"Qwen API error: {str(e)}")

    def complete_with_vision(self, prompt: Dict[str, Any]) -> str:
        """
        Generate a completion for vision tasks using a Qwen-VL model.

        Args:
            prompt (Dict[str, Any]): A dictionary containing messages and image data
                in the format required by the Qwen-VL API

        Returns:
            str: The model's response text

        Raises:
            Exception: If there's an error communicating with the Qwen API
        """
        try:
            # Make sure every text field is within the length limit.
            messages = prompt["messages"]
            for msg in messages:
                if isinstance(msg.get("content"), str):
                    msg["content"] = self._truncate_text(msg["content"])
                elif isinstance(msg.get("content"), list):
                    for item in msg["content"]:
                        if isinstance(item.get("text"), str):
                            item["text"] = self._truncate_text(item["text"])

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                temperature=0.7,
                max_tokens=2000
            )

            return response.choices[0].message.content
        except Exception as e:
            raise Exception(f"Qwen API error: {str(e)}")
26 | """ 27 | self.api_key = api_key or os.getenv("DEEPSEEK_API_KEY") 28 | if not self.api_key: 29 | raise ValueError("DeepSeek API key is required") 30 | 31 | self.base_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com") 32 | self.model_name = os.getenv("DEEPSEEK_MODEL_NAME", "deepseek-chat") 33 | 34 | self.client = OpenAI( 35 | api_key=self.api_key, 36 | base_url=self.base_url 37 | ) 38 | 39 | def _truncate_text(self, text: str, max_length: int = 30000) -> str: 40 | """ 41 | Truncate text to fit within model's length limits. 42 | 43 | Args: 44 | text (str): The input text to truncate 45 | max_length (int): Maximum allowed length for the text. Defaults to 30000. 46 | 47 | Returns: 48 | str: Truncated text with ellipsis if needed 49 | """ 50 | if len(text) > max_length: 51 | return text[:max_length] + "..." 52 | return text 53 | 54 | def _format_messages(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> List[Dict[str, str]]: 55 | """ 56 | Format messages for the DeepSeek API. 57 | 58 | Args: 59 | prompt (str): The main prompt text 60 | context (Optional[Dict[str, Any]]): Additional context information 61 | 62 | Returns: 63 | List[Dict[str, str]]: Formatted messages list ready for API submission 64 | """ 65 | # Add system message 66 | messages = [{ 67 | "role": "system", 68 | "content": ( 69 | "You are a web automation assistant. " 70 | "Analyze the page structure and provide precise element locators. " 71 | "Return responses in the requested format." 
72 | ) 73 | }] 74 | 75 | # Add context (if any) 76 | if context: 77 | context_str = json.dumps(context, ensure_ascii=False) 78 | messages.append({ 79 | "role": "user", 80 | "content": f"Page context: {self._truncate_text(context_str)}" 81 | }) 82 | 83 | # Add main prompt 84 | messages.append({ 85 | "role": "user", 86 | "content": self._truncate_text(prompt) 87 | }) 88 | 89 | return messages 90 | 91 | def complete(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> str: 92 | """ 93 | Send a completion request to the DeepSeek API. 94 | 95 | Args: 96 | prompt (str): The text prompt to complete 97 | context (Optional[Dict[str, Any]]): Additional context for the completion 98 | 99 | Returns: 100 | str: The model's response text 101 | 102 | Raises: 103 | Exception: If there's an error communicating with the DeepSeek API 104 | """ 105 | try: 106 | messages = self._format_messages(prompt, context) 107 | 108 | response = self.client.chat.completions.create( 109 | model=self.model_name, 110 | messages=messages, 111 | temperature=0.7, 112 | max_tokens=2000 113 | ) 114 | 115 | return response.choices[0].message.content 116 | except Exception as e: 117 | raise Exception(f"DeepSeek API error: {str(e)}") 118 | 119 | def complete_with_vision(self, prompt: Dict[str, Any]) -> str: 120 | """ 121 | Send a vision-based completion request to the DeepSeek API. 
122 | 123 | Args: 124 | prompt (Dict[str, Any]): A dictionary containing messages and image data 125 | in the format expected by the API 126 | 127 | Returns: 128 | str: The model's response text 129 | 130 | Raises: 131 | Exception: If there's an error communicating with the DeepSeek API 132 | """ 133 | try: 134 | # Make sure the message length is within the limit 135 | messages = prompt["messages"] 136 | for msg in messages: 137 | if isinstance(msg.get("content"), str): 138 | msg["content"] = self._truncate_text(msg["content"]) 139 | elif isinstance(msg.get("content"), list): 140 | for item in msg["content"]: 141 | if isinstance(item.get("text"), str): 142 | item["text"] = self._truncate_text(item["text"]) 143 | 144 | response = self.client.chat.completions.create( 145 | model=self.model_name, 146 | messages=messages, 147 | temperature=0.7, 148 | max_tokens=2000 149 | ) 150 | 151 | return response.choices[0].message.content 152 | except Exception as e: 153 | raise Exception(f"DeepSeek API error: {str(e)}") 154 | -------------------------------------------------------------------------------- /autowing/core/llm/client/doubao.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Optional, Dict, Any, List 4 | 5 | from openai import OpenAI 6 | 7 | from autowing.core.llm.base import BaseLLMClient 8 | 9 | 10 | class DoubaoClient(BaseLLMClient): 11 | """ 12 | Doubao API client implementation. 13 | Provides access to Doubao's LLM models. 14 | """ 15 | 16 | def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None): 17 | """ 18 | Initialize the Doubao client. 19 | 20 | Args: 21 | api_key (Optional[str]): Doubao API key. 
class DoubaoClient(BaseLLMClient):
    """
    Doubao API client implementation.

    Provides access to Doubao's LLM models via the Volcengine Ark
    OpenAI-compatible endpoint.
    """

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None):
        """
        Initialize the Doubao client.

        Args:
            api_key (Optional[str]): Doubao API key. If not provided, will try
                to get from the ARK_API_KEY env var.
            base_url (Optional[str]): Custom base URL for API requests.

        Raises:
            ValueError: If no API key is provided or found in environment
                variables, or if no model name is configured.
        """
        self.api_key = api_key or os.getenv("ARK_API_KEY")
        if not self.api_key:
            raise ValueError("Doubao API key is required")

        self.base_url = base_url or os.getenv("DOUBAO_BASE_URL", "https://ark.cn-beijing.volces.com/api/v3")

        # Doubao has no usable default model id; the endpoint id must be
        # supplied explicitly via DOUBAO_MODEL_NAME.
        self.model_name = os.getenv("DOUBAO_MODEL_NAME")
        if not self.model_name:
            raise ValueError("Doubao model name is null, For example: ep-20250207200649-xxx")

        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def _truncate_text(self, text: str, max_length: int = 30000) -> str:
        """
        Clamp text to the model's input-length budget.

        Args:
            text (str): The input text.
            max_length (int): Maximum allowed length. Defaults to 30000.

        Returns:
            str: The original text, or a truncated copy ending in "...".
        """
        if len(text) <= max_length:
            return text
        return text[:max_length] + "..."

    def _format_messages(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> List[Dict[str, str]]:
        """
        Build the chat-message list for the Doubao API.

        Args:
            prompt (str): The main prompt text.
            context (Optional[Dict[str, Any]]): Extra page context to attach.

        Returns:
            List[Dict[str, str]]: Messages ready for API submission.
        """
        conversation: List[Dict[str, str]] = []

        # System instruction shared by all requests.
        conversation.append({
            "role": "system",
            "content": (
                "You are a web automation assistant. "
                "Analyze the page structure and provide precise element locators. "
                "Return responses in the requested format."
            )
        })

        # Optional page context, serialized and clamped to the length limit.
        if context:
            rendered = json.dumps(context, ensure_ascii=False)
            conversation.append({
                "role": "user",
                "content": f"Page context: {self._truncate_text(rendered)}"
            })

        # The user's actual request goes last.
        conversation.append({
            "role": "user",
            "content": self._truncate_text(prompt)
        })
        return conversation

    def complete(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> str:
        """
        Generate a completion using a Doubao LLM.

        Args:
            prompt (str): The text prompt to complete.
            context (Optional[Dict[str, Any]]): Additional context for the completion.

        Returns:
            str: The model's response text.

        Raises:
            Exception: If there's an error communicating with the Doubao API.
        """
        try:
            result = self.client.chat.completions.create(
                model=self.model_name,
                messages=self._format_messages(prompt, context),
                temperature=0.7,
                max_tokens=2000
            )
            return result.choices[0].message.content
        except Exception as e:
            raise Exception(f"Doubao API error: {str(e)}")

    def complete_with_vision(self, prompt: Dict[str, Any]) -> str:
        """
        Generate a completion for vision tasks using Doubao Vision.

        Args:
            prompt (Dict[str, Any]): A dictionary containing messages and image
                data in the format required by the Doubao Vision API.

        Returns:
            str: The model's response text.

        Raises:
            Exception: If there's an error communicating with the Doubao Vision API.
        """
        try:
            # Clamp every text field so the request stays within limits.
            conversation = prompt["messages"]
            for message in conversation:
                payload = message.get("content")
                if isinstance(payload, str):
                    message["content"] = self._truncate_text(payload)
                elif isinstance(payload, list):
                    for part in payload:
                        if isinstance(part.get("text"), str):
                            part["text"] = self._truncate_text(part["text"])

            result = self.client.chat.completions.create(
                model=self.model_name,
                messages=conversation,
                temperature=0.7,
                max_tokens=2000
            )
            return result.choices[0].message.content
        except Exception as e:
            raise Exception(f"Doubao Vision API error: {str(e)}")
class OpenAIClient(BaseLLMClient):
    """
    OpenAI API client implementation.

    Provides access to OpenAI's GPT and vision models.
    """

    def __init__(self, api_key: Optional[str] = None, base_url: Optional[str] = None):
        """
        Initialize the OpenAI client.

        Args:
            api_key (Optional[str]): OpenAI API key. If not provided, will try
                to get from the OPENAI_API_KEY env var
            base_url (Optional[str]): Custom base URL for API requests

        Raises:
            ValueError: If no API key is provided or found in environment variables
        """
        self.api_key = api_key or os.getenv("OPENAI_API_KEY")
        if not self.api_key:
            raise ValueError("OpenAI API key is required")

        self.base_url = base_url or os.getenv("OPENAI_BASE_URL", "https://api.openai.com/v1")

        # Prefer the provider-specific OPENAI_MODEL_NAME; MIDSCENE_MODEL_NAME
        # is kept as a fallback for backward compatibility.
        self.model_name = os.getenv(
            "OPENAI_MODEL_NAME",
            os.getenv("MIDSCENE_MODEL_NAME", "gpt-4o-2024-08-06"),
        )

        # base_url always has a value here (the env lookup above supplies a
        # default), so pass it directly instead of building kwargs conditionally.
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def _truncate_text(self, text: str, max_length: int = 30000) -> str:
        """
        Truncate text to fit within model's length limits.

        Args:
            text (str): The input text to truncate
            max_length (int): Maximum allowed length for the text. Defaults to 30000

        Returns:
            str: Truncated text with ellipsis if needed
        """
        if len(text) > max_length:
            return text[:max_length] + "..."
        return text

    def _format_messages(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> List[Dict[str, str]]:
        """
        Format messages for the OpenAI API.

        Args:
            prompt (str): The main prompt text
            context (Optional[Dict[str, Any]]): Additional context information

        Returns:
            List[Dict[str, str]]: Formatted messages list ready for API submission
        """
        # Add system message
        messages = [{
            "role": "system",
            "content": (
                "You are a web automation assistant. "
                "Analyze the page structure and provide precise element locators. "
                "Return responses in the requested format."
            )
        }]

        # Add context (if any); serialized as JSON and truncated to the limit.
        if context:
            context_str = json.dumps(context, ensure_ascii=False)
            messages.append({
                "role": "user",
                "content": f"Page context: {self._truncate_text(context_str)}"
            })

        # Add main prompt
        messages.append({
            "role": "user",
            "content": self._truncate_text(prompt)
        })

        return messages

    def complete(self, prompt: str, context: Optional[Dict[str, Any]] = None) -> str:
        """
        Generate a completion using the configured GPT model.

        Args:
            prompt (str): The text prompt to complete
            context (Optional[Dict[str, Any]]): Additional context for the completion

        Returns:
            str: The model's response text

        Raises:
            Exception: If there's an error communicating with the OpenAI API
        """
        try:
            messages = self._format_messages(prompt, context)

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                temperature=0.7,
                max_tokens=2000
            )
            return response.choices[0].message.content
        except Exception as e:
            raise Exception(f"OpenAI API error: {str(e)}")

    def complete_with_vision(self, prompt: Dict[str, Any]) -> str:
        """
        Generate a completion for vision tasks using a GPT vision model.

        Args:
            prompt (Dict[str, Any]): A dictionary containing messages and image data
                in the format required by the vision API

        Returns:
            str: The model's response text

        Raises:
            Exception: If there's an error communicating with the OpenAI Vision API
        """
        try:
            # Make sure every text field is within the length limit.
            messages = prompt["messages"]
            for msg in messages:
                if isinstance(msg.get("content"), str):
                    msg["content"] = self._truncate_text(msg["content"])
                elif isinstance(msg.get("content"), list):
                    for item in msg["content"]:
                        if isinstance(item.get("text"), str):
                            item["text"] = self._truncate_text(item["text"])

            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=messages,
                temperature=0.7,
                max_tokens=2000
            )
            return response.choices[0].message.content
        except Exception as e:
            raise Exception(f"OpenAI Vision API error: {str(e)}")
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
def bounds(x, y, width, height) -> list:
    """
    Compute an element's bounding box from its origin and size.

    Args:
        x: Left edge of the element (anything `int()` accepts, e.g. str/int).
        y: Top edge of the element.
        width: Element width.
        height: Element height.

    Returns:
        list: ``[[x_start, x_end], [y_start, y_end]]`` — the horizontal and
        vertical pixel ranges covered by the element.
    """
    # Appium attribute values arrive as strings, so coerce everything to int.
    x_start = int(x)
    y_start = int(y)
    x_end = x_start + int(width)
    y_end = y_start + int(height)
    return [[x_start, x_end], [y_start, y_end]]
34 | Provides AI-driven interaction with mobile apps using various LLM providers. 35 | """ 36 | 37 | def __init__(self, driver: WebDriver, platform: str = "Android"): 38 | """ 39 | Initialize the AI-powered Appium fixture. 40 | 41 | Args: 42 | driver (WebDriver): The Appium WebDriver instance to automate 43 | platform: Mobile operating system platform 44 | """ 45 | self.driver = driver 46 | self.platform = platform 47 | self.llm_client = LLMFactory.create() 48 | self.wait = WebDriverWait(self.driver, 10) # Default timeout of 10 seconds 49 | 50 | def _get_page_context(self) -> Dict[str, Any]: 51 | """ 52 | Extract context information from the current screen of the mobile app. 53 | Collects information about visible elements and screen metadata. 54 | 55 | Returns: 56 | Dict[str, Any]: A dictionary containing screen information and visible interactive elements 57 | """ 58 | # Get basic screen info 59 | basic_info = { 60 | "activity": self.driver.current_activity, 61 | "package": self.driver.current_package 62 | } 63 | 64 | # Get key elements info using Appium 65 | elements_info = [] 66 | if self.platform == "Android": 67 | elements = self.driver.find_elements(AppiumBy.XPATH, "//*") 68 | for el in elements: 69 | if el.is_displayed(): 70 | elements_info.append({ 71 | "tag": el.tag_name, 72 | "text": el.text, 73 | "resource_id": el.get_attribute("resource-id"), 74 | "content_desc": el.get_attribute("content-desc"), 75 | "class": el.get_attribute("class"), 76 | "bounds": el.get_attribute("bounds") 77 | }) 78 | elif self.platform == "iOS": 79 | elements = self.driver.find_elements(AppiumBy.IOS_PREDICATE, "type == '*'") 80 | for el in elements: 81 | if el.is_displayed(): 82 | elements_info.append({ 83 | "tag": el.tag_name, 84 | "text": el.text, 85 | "type": el.get_attribute("type"), 86 | "name": el.get_attribute("name"), 87 | "label": el.get_attribute("label"), 88 | "enabled": el.get_attribute("enabled"), 89 | "visible": el.get_attribute("visible"), 90 | "bounds": 
bounds(el.get_attribute("x"), 91 | el.get_attribute("y"), 92 | el.get_attribute("width"), 93 | el.get_attribute("height")), 94 | }) 95 | else: 96 | raise NameError(f"Unsupported {self.platform} platform.") 97 | 98 | return { 99 | **basic_info, 100 | "elements": elements_info 101 | } 102 | 103 | def ai_action(self, prompt: str) -> None: 104 | """ 105 | Execute an AI-driven action on the screen based on the given prompt. 106 | 107 | Args: 108 | prompt (str): Natural language description of the action to perform 109 | 110 | Raises: 111 | ValueError: If the AI response cannot be parsed or contains invalid instructions 112 | TimeoutException: If the element cannot be found or interacted with 113 | """ 114 | logger.info(f"🪽 AI Action: {prompt}") 115 | context = self._get_page_context() 116 | 117 | action_prompt = f""" 118 | Extract element locator and action from the request. Return ONLY a JSON object. 119 | 120 | Activity: {context['activity']} 121 | Package: {context['package']} 122 | Elements: {context['elements']} 123 | Request: {prompt} 124 | 125 | Return list format: 126 | [{{ 127 | "bounds": "coordinates of the element in the format [x1,y1][x2,y2] (notice, x1,y1 and x2,y2 are replaced by concrete coordinates.)", 128 | "action": "click/fill/press", 129 | "value": "text to input if needed", 130 | "key": "key to press if needed" 131 | }}] 132 | 133 | No other text or explanation. 
134 | """ 135 | 136 | response = self.llm_client.complete(action_prompt) 137 | cleaned_response = self._clean_response(response) 138 | instruction = json.loads(cleaned_response) 139 | 140 | if isinstance(instruction, list) is False: 141 | raise ValueError("Invalid instruction format") 142 | 143 | for step in instruction: 144 | bounds = step.get('bounds') 145 | action = step.get('action') 146 | 147 | if not bounds or not action: 148 | raise ValueError("Invalid instruction format") 149 | 150 | # Extract coordinates from bounds 151 | coord = re.findall(r'\d+', bounds) 152 | x1, y1, x2, y2 = map(int, coord) 153 | x_center = (x1 + x2) // 2 154 | y_center = (y1 + y2) // 2 155 | 156 | # Execute the action 157 | if action == 'click': 158 | action = Action(self.driver) 159 | action.tap(x=x_center, y=y_center) 160 | elif action == 'fill': 161 | fill_text = step.get('value', '') 162 | logger.info(f"⌨️ fill text: {fill_text}.") 163 | self.driver.execute_script('mobile: type', {'text': fill_text}) 164 | elif action == 'press': 165 | logger.info("🔍 keyboard search key.") 166 | self.driver.execute_script('mobile: performEditorAction', {'action': 'search'}) 167 | else: 168 | raise ValueError(f"Unsupported action: {action}") 169 | 170 | def ai_query(self, prompt: str) -> Any: 171 | """ 172 | Query information from the screen using AI analysis. 173 | 174 | Args: 175 | prompt (str): Natural language query about the screen content. 176 | Can include format hints like 'string[]' or 'number'. 
177 | 178 | Returns: 179 | Any: The query results in the requested format 180 | 181 | Raises: 182 | ValueError: If the AI response cannot be parsed into the requested format 183 | """ 184 | logger.info(f"🪽 AI Query: {prompt}") 185 | context = self._get_page_context() 186 | 187 | # Parse the requested data format 188 | format_hint = "" 189 | if prompt.startswith(('string[]', 'number[]', 'object[]')): 190 | format_hint = prompt.split(',')[0].strip() 191 | prompt = ','.join(prompt.split(',')[1:]).strip() 192 | 193 | # Provide different prompts based on the format 194 | if format_hint == 'string[]': 195 | query_prompt = f""" 196 | Extract text content matching the query. Return ONLY a JSON array of strings. 197 | 198 | Activity: {context['activity']} 199 | Package: {context['package']} 200 | Elements: {context['elements']} 201 | Query: {prompt} 202 | 203 | Return format example: ["result1", "result2"], (notice: Gets value data from labels and text keys) 204 | No other text or explanation. 205 | """ 206 | elif format_hint == 'number[]': 207 | query_prompt = f""" 208 | Extract numeric values matching the query. Return ONLY a JSON array of numbers. 209 | 210 | Activity: {context['activity']} 211 | Package: {context['package']} 212 | Elements: {context['elements']} 213 | Query: {prompt} 214 | 215 | Return format example: [1, 2, 3], (notice: Gets value data from labels and text keys) 216 | No other text or explanation. 217 | """ 218 | else: 219 | query_prompt = f""" 220 | Extract information matching the query. Return ONLY in valid JSON format. 221 | 222 | Activity: {context['activity']} 223 | Package: {context['package']} 224 | Elements: {context['elements']} 225 | Query: {prompt} 226 | 227 | Return format: 228 | - For arrays: ["item1", "item2"] 229 | - For objects: {{"key": "value"}} 230 | - For single value: "text" or number 231 | (notice: Gets value data from labels and text keys) 232 | 233 | No other text or explanation. 
234 | """ 235 | 236 | response = self.llm_client.complete(query_prompt) 237 | cleaned_response = self._clean_response(response) 238 | try: 239 | result = json.loads(cleaned_response) 240 | query_info = self._validate_result_format(result, format_hint) 241 | logger.debug(f"📄 Query: {query_info}") 242 | return query_info 243 | except json.JSONDecodeError: 244 | # If it's a string array format, try extracting from text 245 | if format_hint == 'string[]': 246 | lines = [line.strip() for line in cleaned_response.split('\n') 247 | if line.strip() and not line.startswith(('-', '*', '#'))] 248 | 249 | query_terms = [term.lower() for term in prompt.split() 250 | if len(term) > 2 and term.lower() not in ['the', 'and', 'for']] 251 | 252 | results = [] 253 | for line in lines: 254 | if any(term in line.lower() for term in query_terms): 255 | text = line.strip('`"\'- ,') 256 | if ':' in text: 257 | text = text.split(':', 1)[1].strip() 258 | if text: 259 | results.append(text) 260 | 261 | if results: 262 | seen = set() 263 | query_info = [x for x in results if not (x in seen or seen.add(x))] 264 | logger.debug(f"📄 Query: {query_info}") 265 | return query_info 266 | 267 | raise ValueError(f"Failed to parse response as JSON: {cleaned_response[:100]}...") 268 | 269 | def ai_assert(self, prompt: str) -> bool: 270 | """ 271 | Verify a condition on the screen using AI analysis. 272 | 273 | Args: 274 | prompt (str): Natural language description of the condition to verify 275 | 276 | Returns: 277 | bool: True if the condition is met, False otherwise 278 | 279 | Raises: 280 | ValueError: If the AI response cannot be parsed as a boolean value 281 | """ 282 | logger.info(f"🪽 AI Assert: {prompt}") 283 | context = self._get_page_context() 284 | 285 | assert_prompt = f""" 286 | You are a web automation assistant. Verify the following assertion and return ONLY a boolean value. 
287 | 288 | Activity: {context['activity']} 289 | Package: {context['package']} 290 | Elements: {context['elements']} 291 | Assertion: {prompt} 292 | 293 | (notice: Gets value data from labels and text keys) 294 | 295 | IMPORTANT: Return ONLY the word 'true' or 'false' (lowercase). No other text, no explanation. 296 | """ 297 | 298 | response = self.llm_client.complete(assert_prompt) 299 | cleaned_response = self._clean_response(response).lower() 300 | 301 | # Directly match true or false 302 | if cleaned_response == 'true': 303 | return True 304 | if cleaned_response == 'false': 305 | return False 306 | 307 | # If response contains other content, try extracting boolean 308 | if 'true' in cleaned_response.split(): 309 | return True 310 | if 'false' in cleaned_response.split(): 311 | return False 312 | 313 | raise ValueError("Response must be 'true' or 'false'") 314 | 315 | 316 | def create_fixture(): 317 | """ 318 | Create an AppiumAiFixture factory. 319 | """ 320 | return AppiumAiFixture 321 | -------------------------------------------------------------------------------- /autowing/selenium/fixture.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from loguru import logger 5 | from selenium.common.exceptions import TimeoutException 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.common.keys import Keys 8 | from selenium.webdriver.remote.webdriver import WebDriver 9 | from selenium.webdriver.support import expected_conditions as EC 10 | from selenium.webdriver.support.ui import WebDriverWait 11 | 12 | from autowing.core.ai_fixture_base import AiFixtureBase 13 | from autowing.core.llm.factory import LLMFactory 14 | from autowing.utils.transition import selector_to_selenium 15 | 16 | 17 | class SeleniumAiFixture(AiFixtureBase): 18 | """ 19 | A fixture class that combines Selenium with AI capabilities for web automation. 
20 | Provides AI-driven interaction with web pages using various LLM providers. 21 | Maintains API compatibility with PlaywrightAiFixture. 22 | """ 23 | 24 | def __init__(self, driver: WebDriver): 25 | """ 26 | Initialize the AI-powered Selenium fixture. 27 | 28 | Args: 29 | driver (WebDriver): The Selenium WebDriver instance to automate 30 | """ 31 | super().__init__() 32 | self.driver = driver 33 | self.llm_client = LLMFactory.create() 34 | self.wait = WebDriverWait(self.driver, 10) # Default timeout of 10 seconds 35 | 36 | def _get_page_context(self) -> Dict[str, Any]: 37 | """ 38 | Extract context information from the current page. 39 | Collects information about visible elements and page metadata. 40 | 41 | Returns: 42 | Dict[str, Any]: A dictionary containing page URL, title, and information about 43 | visible interactive elements 44 | """ 45 | # Get basic page info 46 | basic_info = { 47 | "url": self.driver.current_url, 48 | "title": self.driver.title 49 | } 50 | 51 | # Get key elements info using JavaScript 52 | elements_info = self.driver.execute_script(""" 53 | const getVisibleElements = () => { 54 | const elements = []; 55 | const selectors = [ 56 | 'input', // input 57 | 'textarea', // input 58 | 'select', // input/click 59 | 'button', // click 60 | 'a', // click 61 | '[role="button"]', // click 62 | '[role="link"]', // click 63 | '[role="checkbox"]', // click 64 | '[role="radio"]', // click 65 | '[role="searchbox"]', // input 66 | 'summary', // click(
) 67 | '[draggable="true"]' // draggable 68 | ]; 69 | 70 | for (const selector of selectors) { 71 | document.querySelectorAll(selector).forEach(el => { 72 | if (el.offsetWidth > 0 && el.offsetHeight > 0) { 73 | elements.push({ 74 | tag: el.tagName.toLowerCase(), 75 | type: el.getAttribute('type') || null, 76 | placeholder: el.getAttribute('placeholder') || null, 77 | value: el.value || null, 78 | text: el.textContent?.trim() || '', 79 | aria: el.getAttribute('aria-label') || null, 80 | id: el.id || '', 81 | name: el.getAttribute('name') || null, 82 | class: el.className || '', 83 | draggable: el.getAttribute('draggable') || null 84 | }); 85 | } 86 | }); 87 | } 88 | return elements; 89 | }; 90 | return getVisibleElements(); 91 | """) 92 | 93 | return { 94 | **basic_info, 95 | "elements": elements_info 96 | } 97 | 98 | def ai_action(self, prompt: str) -> None: 99 | """ 100 | Execute an AI-driven action on the page based on the given prompt. 101 | 102 | Args: 103 | prompt (str): Natural language description of the action to perform 104 | 105 | Raises: 106 | ValueError: If the AI response cannot be parsed or contains invalid instructions 107 | TimeoutException: If the element cannot be found or interacted with 108 | """ 109 | logger.info(f"🪽 AI Action: {prompt}") 110 | context = self._get_page_context() 111 | context["elements"] = self._remove_empty_keys(context.get("elements", [])) 112 | 113 | def compute_action(): 114 | action_prompt = f""" 115 | Extract element locator and action from the request. Return ONLY a JSON object. 
116 | 117 | Page: {context['url']} 118 | Title: {context['title']} 119 | Request: {prompt} 120 | 121 | Return format: 122 | {{ 123 | "selector": "XPATH selector to locate the element", 124 | "action": "click/fill/press", 125 | "value": "text to input if needed", 126 | "key": "key to press if needed" 127 | }} 128 | Note: selector is used for a selenium location, for example:find_element(By.XPATH, selector) 129 | 130 | Example response: 131 | {{ 132 | "selector": "//input[@id='search-input']", 133 | "action": "fill", 134 | "value": "search text", 135 | "key": "Enter" 136 | }} 137 | """ 138 | 139 | response = self.llm_client.complete(action_prompt) 140 | cleaned_response = self._clean_response(response) 141 | return json.loads(cleaned_response) 142 | 143 | # Use cache manager to get or compute the instruction 144 | instruction = self._get_cached_or_compute(prompt, context, compute_action) 145 | # Execute the action using the instruction 146 | selector = instruction.get('selector') 147 | action = instruction.get('action') 148 | 149 | if not selector or not action: 150 | raise ValueError("Invalid instruction format") 151 | 152 | # Execute the action 153 | selector = selector_to_selenium(selector) 154 | try: 155 | element = self.wait.until(EC.presence_of_element_located((By.XPATH, selector))) 156 | except TimeoutException: 157 | element = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector))) 158 | 159 | if action == 'click': 160 | element.click() 161 | elif action == 'fill': 162 | element.clear() 163 | element.send_keys(instruction.get('value', '')) 164 | if instruction.get('key'): 165 | key_attr = getattr(Keys, instruction['key'].upper(), None) 166 | if key_attr: 167 | element.send_keys(key_attr) 168 | elif action == 'press': 169 | key_attr = getattr(Keys, instruction.get('key', 'ENTER').upper()) 170 | element.send_keys(key_attr) 171 | else: 172 | raise ValueError(f"Unsupported action: {action}") 173 | 174 | def ai_query(self, prompt: str) -> Any: 
175 | """ 176 | Query information from the page using AI analysis. 177 | 178 | Args: 179 | prompt (str): Natural language query about the page content. 180 | Can include format hints like 'string[]' or 'number'. 181 | 182 | Returns: 183 | Any: The query results in the requested format 184 | 185 | Raises: 186 | ValueError: If the AI response cannot be parsed into the requested format 187 | """ 188 | logger.info(f"🪽 AI Query: {prompt}") 189 | context = self._get_page_context() 190 | context["elements"] = self._remove_empty_keys(context.get("elements", [])) 191 | 192 | # Parse the requested data format 193 | format_hint = "" 194 | if prompt.startswith(('string[]', 'number[]', 'object[]')): 195 | format_hint = prompt.split(',')[0].strip() 196 | prompt = ','.join(prompt.split(',')[1:]).strip() 197 | 198 | # Provide different prompts based on the format 199 | if format_hint == 'string[]': 200 | query_prompt = f""" 201 | Extract text content matching the query. Return ONLY a JSON array of strings. 202 | 203 | Page: {context['url']} 204 | Title: {context['title']} 205 | Query: {prompt} 206 | 207 | Return format example: ["result1", "result2"] 208 | No other text or explanation. 209 | """ 210 | elif format_hint == 'number[]': 211 | query_prompt = f""" 212 | Extract numeric values matching the query. Return ONLY a JSON array of numbers. 213 | 214 | Page: {context['url']} 215 | Title: {context['title']} 216 | Query: {prompt} 217 | 218 | Return format example: [1, 2, 3] 219 | No other text or explanation. 220 | """ 221 | else: 222 | query_prompt = f""" 223 | Extract information matching the query. Return ONLY in valid JSON format. 224 | 225 | Page: {context['url']} 226 | Title: {context['title']} 227 | Query: {prompt} 228 | 229 | Return format: 230 | - For arrays: ["item1", "item2"] 231 | - For objects: {{"key": "value"}} 232 | - For single value: "text" or number 233 | 234 | No other text or explanation. 
235 | """ 236 | 237 | response = self.llm_client.complete(query_prompt) 238 | cleaned_response = self._clean_response(response) 239 | try: 240 | result = json.loads(cleaned_response) 241 | query_info = self._validate_result_format(result, format_hint) 242 | logger.debug(f"📄 Query: {query_info}") 243 | return query_info 244 | except json.JSONDecodeError: 245 | # If it's a string array format, try extracting from text 246 | if format_hint == 'string[]': 247 | lines = [line.strip() for line in cleaned_response.split('\n') 248 | if line.strip() and not line.startswith(('-', '*', '#'))] 249 | 250 | query_terms = [term.lower() for term in prompt.split() 251 | if len(term) > 2 and term.lower() not in ['the', 'and', 'for']] 252 | 253 | results = [] 254 | for line in lines: 255 | if any(term in line.lower() for term in query_terms): 256 | text = line.strip('`"\'- ,') 257 | if ':' in text: 258 | text = text.split(':', 1)[1].strip() 259 | if text: 260 | results.append(text) 261 | 262 | if results: 263 | seen = set() 264 | query_info = [x for x in results if not (x in seen or seen.add(x))] 265 | logger.debug(f"📄 Query: {query_info}") 266 | return query_info 267 | 268 | raise ValueError(f"Failed to parse response as JSON: {cleaned_response[:100]}...") 269 | 270 | def ai_assert(self, prompt: str) -> bool: 271 | """ 272 | Verify a condition on the page using AI analysis. 273 | 274 | Args: 275 | prompt (str): Natural language description of the condition to verify 276 | 277 | Returns: 278 | bool: True if the condition is met, False otherwise 279 | 280 | Raises: 281 | ValueError: If the AI response cannot be parsed as a boolean value 282 | """ 283 | logger.info(f"🪽 AI Assert: {prompt}") 284 | context = self._get_page_context() 285 | context["elements"] = self._remove_empty_keys(context.get("elements", [])) 286 | 287 | assert_prompt = f""" 288 | You are a web automation assistant. Verify the following assertion and return ONLY a boolean value. 
289 | 290 | Page URL: {context['url']} 291 | Page Title: {context['title']} 292 | 293 | Assertion: {prompt} 294 | 295 | IMPORTANT: Return ONLY the word 'true' or 'false' (lowercase). No other text, no explanation. 296 | """ 297 | 298 | response = self.llm_client.complete(assert_prompt) 299 | cleaned_response = self._clean_response(response).lower() 300 | 301 | # Directly match true or false 302 | if cleaned_response == 'true': 303 | return True 304 | if cleaned_response == 'false': 305 | return False 306 | 307 | # If response contains other content, try extracting boolean 308 | if 'true' in cleaned_response.split(): 309 | return True 310 | if 'false' in cleaned_response.split(): 311 | return False 312 | 313 | raise ValueError("Response must be 'true' or 'false'") 314 | 315 | def ai_function_cases(self, prompt: str, language: str = "Chinese") -> str: 316 | """ 317 | Generate functional test cases based on the given prompt. 318 | 319 | Args: 320 | prompt (str): Natural language description of the functionality to test 321 | language (str): Language in which the test cases should be generated 322 | 323 | Returns: 324 | str: Generated test cases in a standard format 325 | 326 | Raises: 327 | ValueError: If the AI response cannot be parsed or contains invalid instructions 328 | """ 329 | logger.info(f"🪽 AI Function Case: {prompt}") 330 | context = self._get_page_context() 331 | 332 | format_hint = "" 333 | if prompt.startswith(('json[]', 'markdown[]')): 334 | format_hint = prompt.split(',')[0].strip() 335 | prompt = ','.join(prompt.split(',')[1:]).strip() 336 | 337 | # Provide different prompts based on the format 338 | if format_hint == 'json[]': 339 | # Construct the prompt for generating test cases 340 | case_prompt = f""" 341 | You are a web automation assistant. Based on the following page context, generate functional test cases. 
342 | 343 | Current page context: 344 | URL: {context['url']} 345 | Title: {context['title']} 346 | 347 | Available elements: 348 | {json.dumps(context['elements'], indent=2)} 349 | 350 | User request: {prompt} 351 | 352 | Return ONLY the test cases in the following format, no other text: 353 | [ 354 | {{ 355 | "Test Case ID": "001", 356 | "Steps": "Describe the steps to perform the test without mentioning element locators.", 357 | "Expected Result": "Describe the expected result." 358 | }}, 359 | {{ 360 | "Test Case ID": "002", 361 | "Steps": "Describe the steps to perform the test without mentioning element locators.", 362 | "Expected Result": "Describe the expected result." 363 | }} 364 | ] 365 | ... 366 | 367 | Finally, the output result is required to be in {language} 368 | """ 369 | elif format_hint == 'markdown[]': 370 | case_prompt = f""" 371 | You are a web automation assistant. Based on the following page context, generate functional test cases. 372 | 373 | Current page context: 374 | URL: {context['url']} 375 | Title: {context['title']} 376 | 377 | Available elements: 378 | {json.dumps(context['elements'], indent=2)} 379 | 380 | User request: {prompt} 381 | 382 | Return ONLY the test cases in the following format, no other text: 383 | | Test Case ID | Steps | Expected Result | 384 | |--------------|---------------------------------------------------|-------------------------------| 385 | | 001 | Describe the steps to perform the test without mentioning element locators. | Describe the expected result. | 386 | | 002 | Describe the steps to perform the test without mentioning element locators. | Describe the expected result. | 387 | ... 388 | 389 | Finally, the output result is required to be in {language} 390 | """ 391 | else: 392 | case_prompt = f""" 393 | You are a web automation assistant. Based on the following page context, generate functional test cases. 
394 | 395 | Current page context: 396 | URL: {context['url']} 397 | Title: {context['title']} 398 | 399 | Available elements: 400 | {json.dumps(context['elements'], indent=2)} 401 | 402 | User request: {prompt} 403 | 404 | Return ONLY the test cases in the following format, no other text: 405 | Test Case ID: 001 406 | Steps: Describe the steps to perform the test without mentioning element locators. 407 | Expected Result: Describe the expected result. 408 | 409 | Test Case ID: 002 410 | Steps: Describe the steps to perform the test without mentioning element locators. 411 | Expected Result: Describe the expected result. 412 | 413 | ... 414 | 415 | Finally, the output result is required to be in {language} 416 | """ 417 | 418 | try: 419 | response = self.llm_client.complete(case_prompt) 420 | cleaned_response = self._clean_response(response) 421 | 422 | logger.debug(f"""📄 Function Cases:\n {cleaned_response}""") 423 | return cleaned_response 424 | except Exception as e: 425 | raise ValueError(f"Failed to generate test cases. Error: {str(e)}\nResponse: {cleaned_response[:100]}...") 426 | 427 | 428 | def create_fixture(): 429 | """ 430 | Create a SeleniumAiFixture factory. 431 | """ 432 | return SeleniumAiFixture 433 | -------------------------------------------------------------------------------- /autowing/playwright/fixture.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict 3 | 4 | from loguru import logger 5 | from playwright.sync_api import Page 6 | 7 | from autowing.core.ai_fixture_base import AiFixtureBase 8 | from autowing.core.llm.factory import LLMFactory 9 | from autowing.utils.transition import selector_to_locator 10 | 11 | 12 | class PlaywrightAiFixture(AiFixtureBase): 13 | """ 14 | A fixture class that combines Playwright with AI capabilities for web automation. 15 | Provides AI-driven interaction with web pages using various LLM providers. 
16 | """ 17 | 18 | def __init__(self, page: Page): 19 | """ 20 | Initialize the AI-powered Playwright fixture. 21 | 22 | Args: 23 | page (Page): The Playwright page object to automate 24 | """ 25 | super().__init__() 26 | self.page = page 27 | self.llm_client = LLMFactory.create() 28 | 29 | def _get_page_context(self) -> Dict[str, Any]: 30 | """ 31 | Extract context information from the current page. 32 | Collects information about visible elements and page metadata. 33 | 34 | Returns: 35 | Dict[str, Any]: A dictionary containing page URL, title, and information about 36 | visible interactive elements 37 | """ 38 | # Get basic page info 39 | basic_info = { 40 | "url": self.page.url, 41 | "title": self.page.title() 42 | } 43 | 44 | # Get key elements info 45 | elements_info = self.page.evaluate("""() => { 46 | const getVisibleElements = () => { 47 | const elements = []; 48 | const selectors = [ 49 | 'input', // input 50 | 'textarea', // input 51 | 'select', // input/click 52 | 'button', // click 53 | 'a', // click 54 | '[role="button"]', // click 55 | '[role="link"]', // click 56 | '[role="checkbox"]', // click 57 | '[role="radio"]', // click 58 | '[role="searchbox"]', // input 59 | 'summary', // click(
) 60 | '[draggable="true"]' // draggable 61 | ]; 62 | 63 | for (const selector of selectors) { 64 | document.querySelectorAll(selector).forEach(el => { 65 | if (el.offsetWidth > 0 && el.offsetHeight > 0) { 66 | elements.push({ 67 | tag: el.tagName.toLowerCase(), 68 | type: el.getAttribute('type') || null, 69 | placeholder: el.getAttribute('placeholder') || null, 70 | value: el.value || null, 71 | text: el.textContent?.trim() || '', 72 | aria: el.getAttribute('aria-label') || null, 73 | id: el.id || '', 74 | name: el.getAttribute('name') || null, 75 | class: el.className || '', 76 | draggable: el.getAttribute('draggable') || null 77 | }); 78 | } 79 | }); 80 | } 81 | return elements; 82 | }; 83 | return getVisibleElements(); 84 | }""") 85 | 86 | return { 87 | **basic_info, 88 | "elements": elements_info 89 | } 90 | 91 | def ai_action(self, prompt: str, iframe=None) -> None: 92 | """ 93 | Execute an AI-driven action on the page based on the given prompt. 94 | The AI will analyze the page context and perform the requested action. 95 | 96 | Args: 97 | prompt (str): Natural language description of the action to perform 98 | iframe: FrameLocator object 99 | 100 | Raises: 101 | ValueError: If the AI response cannot be parsed or contains invalid instructions 102 | Exception: If the requested action cannot be performed 103 | """ 104 | logger.info(f"🪽 AI Action: {prompt}") 105 | context = self._get_page_context() 106 | context["elements"] = self._remove_empty_keys(context.get("elements", [])) 107 | 108 | def compute_action(): 109 | action_prompt = f""" 110 | You are a web automation assistant. Based on the following page context, provide instructions for the requested action. 
111 | 112 | Current page context: 113 | URL: {context['url']} 114 | Title: {context['title']} 115 | 116 | Available elements: 117 | {json.dumps(context['elements'], indent=2)} 118 | 119 | User request: {prompt} 120 | 121 | Return ONLY a JSON object with the following structure, no other text: 122 | {{ 123 | "selector": "CSS selector or XPath to locate the element", 124 | "action": "fill", 125 | "value": "text to input", 126 | "key": "key to press if needed" 127 | }} 128 | Note: selector is used for a playwright location, for example:page.locator(selector) 129 | 130 | Example response: 131 | {{ 132 | "selector": "//input[id='search-input']", 133 | "action": "fill", 134 | "value": "search text", 135 | "key": "Enter" 136 | }} 137 | Note: The CSS selector the tag name (input/button/select...). 138 | """ 139 | response = self.llm_client.complete(action_prompt) 140 | cleaned_response = self._clean_response(response) 141 | return json.loads(cleaned_response) 142 | 143 | # Use cache manager to get or compute the instruction 144 | instruction = self._get_cached_or_compute(prompt, context, compute_action) 145 | # Execute the action using the instruction 146 | selector = instruction.get('selector') 147 | action = instruction.get('action') 148 | 149 | if not selector or not action: 150 | raise ValueError("Invalid instruction format") 151 | 152 | # Perform the action 153 | selector = selector_to_locator(selector) 154 | element = self.page.locator(selector) 155 | if iframe is not None: 156 | element = iframe.locator(selector) 157 | 158 | if action == 'click': 159 | element.click() 160 | elif action == 'fill': 161 | element.fill(instruction.get('value', '')) 162 | if instruction.get('key'): 163 | element.press(instruction.get('key')) 164 | elif action == 'press': 165 | element.press(instruction.get('key', 'Enter')) 166 | else: 167 | raise ValueError(f"Unsupported action: {action}") 168 | 169 | def ai_query(self, prompt: str) -> Any: 170 | """ 171 | Query information from the page 
using AI analysis. 172 | Supports various data formats including arrays, objects, and primitive types. 173 | 174 | Args: 175 | prompt (str): Natural language query about the page content. 176 | Can include format hints like 'string[]' or 'number'. 177 | 178 | Returns: 179 | Any: The query results in the requested format 180 | 181 | Raises: 182 | ValueError: If the AI response cannot be parsed into the requested format 183 | """ 184 | logger.info(f"🪽 AI Query: {prompt}") 185 | context = self._get_page_context() 186 | context["elements"] = self._remove_empty_keys(context.get("elements", [])) 187 | 188 | # Parse the requested data format 189 | format_hint = "" 190 | if prompt.startswith(('string[]', 'number[]', 'object[]')): 191 | format_hint = prompt.split(',')[0].strip() 192 | prompt = ','.join(prompt.split(',')[1:]).strip() 193 | 194 | # Provide different prompts based on the format 195 | if format_hint == 'string[]': 196 | query_prompt = f""" 197 | Extract text content matching the query. Return ONLY a JSON array of strings. 198 | 199 | Page: {context['url']} 200 | Title: {context['title']} 201 | Query: {prompt} 202 | 203 | Return format example: ["result1", "result2"] 204 | No other text or explanation. 205 | """ 206 | elif format_hint == 'number[]': 207 | query_prompt = f""" 208 | Extract numeric values matching the query. Return ONLY a JSON array of numbers. 209 | 210 | Page: {context['url']} 211 | Title: {context['title']} 212 | Query: {prompt} 213 | 214 | Return format example: [1, 2, 3] 215 | No other text or explanation. 216 | """ 217 | else: 218 | # Default prompt 219 | query_prompt = f""" 220 | Extract information matching the query. Return ONLY in valid JSON format. 221 | 222 | Page: {context['url']} 223 | Title: {context['title']} 224 | Query: {prompt} 225 | 226 | Return format: 227 | - For arrays: ["item1", "item2"] 228 | - For objects: {{"key": "value"}} 229 | - For single value: "text" or number 230 | 231 | No other text or explanation. 
232 | """ 233 | 234 | response = self.llm_client.complete(query_prompt) 235 | 236 | try: 237 | cleaned_response = self._clean_response(response) 238 | try: 239 | result = json.loads(cleaned_response) 240 | query_info = self._validate_result_format(result, format_hint) 241 | logger.debug(f"📄 Query: {query_info}") 242 | return query_info 243 | except json.JSONDecodeError: 244 | # If it's a string array format, try extracting from text 245 | if format_hint == 'string[]': 246 | # Split and clean text 247 | lines = [line.strip() for line in cleaned_response.split('\n') 248 | if line.strip() and not line.startswith(('-', '*', '#'))] 249 | 250 | # Extract lines containing query terms 251 | query_terms = [term.lower() for term in prompt.split() 252 | if len(term) > 2 and term.lower() not in ['the', 'and', 'for']] 253 | 254 | results = [] 255 | for line in lines: 256 | # Check if line contains query terms 257 | if any(term in line.lower() for term in query_terms): 258 | # Clean text 259 | text = line.strip('`"\'- ,') 260 | if ':' in text: 261 | text = text.split(':', 1)[1].strip() 262 | if text: 263 | results.append(text) 264 | 265 | if results: 266 | # Remove duplicates while preserving order 267 | seen = set() 268 | query_info = [x for x in results if not (x in seen or seen.add(x))] 269 | logger.debug(f"📄 Query: {query_info}") 270 | return query_info 271 | 272 | raise ValueError(f"Failed to parse response as JSON: {cleaned_response[:100]}...") 273 | 274 | except Exception as e: 275 | raise ValueError(f"Query failed. Error: {str(e)}\nResponse: {cleaned_response[:100]}...") 276 | 277 | def ai_assert(self, prompt: str) -> bool: 278 | """ 279 | Verify a condition on the page using AI analysis. 
280 | 281 | Args: 282 | prompt (str): Natural language description of the condition to verify 283 | 284 | Returns: 285 | bool: True if the condition is met, False otherwise 286 | 287 | Raises: 288 | ValueError: If the AI response cannot be parsed as a boolean value 289 | """ 290 | logger.info(f"🪽 AI Assert: {prompt}") 291 | context = self._get_page_context() 292 | context["elements"] = self._remove_empty_keys(context.get("elements", [])) 293 | 294 | # Optimize the prompt to be concise and explicitly require a boolean return 295 | assert_prompt = f""" 296 | You are a web automation assistant. Verify the following assertion and return ONLY a boolean value. 297 | 298 | Page URL: {context['url']} 299 | Page Title: {context['title']} 300 | 301 | Assertion: {prompt} 302 | 303 | IMPORTANT: Return ONLY the word 'true' or 'false' (lowercase). No other text, no explanation. 304 | """ 305 | 306 | response = self.llm_client.complete(assert_prompt) 307 | cleaned_response = self._clean_response(response).lower() 308 | 309 | try: 310 | # Directly match true or false 311 | if cleaned_response == 'true': 312 | return True 313 | if cleaned_response == 'false': 314 | return False 315 | 316 | # If response contains other content, try extracting boolean 317 | if 'true' in cleaned_response.split(): 318 | return True 319 | if 'false' in cleaned_response.split(): 320 | return False 321 | 322 | raise ValueError("Response must be 'true' or 'false'") 323 | 324 | except Exception as e: 325 | # Provide more useful error information 326 | raise ValueError( 327 | f"Failed to parse assertion result. Response: {cleaned_response[:100]}... " 328 | f"Error: {str(e)}" 329 | ) 330 | 331 | def ai_function_cases(self, prompt: str, language: str = "Chinese") -> str: 332 | """ 333 | Generate functional test cases based on the given prompt. 
334 | 335 | Args: 336 | prompt (str): Natural language description of the functionality to test 337 | language (str): Natural language description of the functionality to test 338 | 339 | Returns: 340 | str: Generated test cases in a standard format 341 | 342 | Raises: 343 | ValueError: If the AI response cannot be parsed or contains invalid instructions 344 | """ 345 | logger.info(f"🪽 AI Function Case: {prompt}") 346 | context = self._get_page_context() 347 | 348 | format_hint = "" 349 | if prompt.startswith(('json[]', 'markdown[]')): 350 | format_hint = prompt.split(',')[0].strip() 351 | prompt = ','.join(prompt.split(',')[1:]).strip() 352 | 353 | # Provide different prompts based on the format 354 | if format_hint == 'json[]': 355 | # Construct the prompt for generating test cases 356 | case_prompt = f""" 357 | You are a web automation assistant. Based on the following page context, generate functional test cases. 358 | 359 | Current page context: 360 | URL: {context['url']} 361 | Title: {context['title']} 362 | 363 | Available elements: 364 | {json.dumps(context['elements'], indent=2)} 365 | 366 | User request: {prompt} 367 | 368 | Return ONLY the test cases in the following format, no other text: 369 | [ 370 | {{ 371 | "Test Case ID": "001", 372 | "Steps": "Describe the steps to perform the test without mentioning element locators.", 373 | "Expected Result": "Describe the expected result." 374 | }}, 375 | {{ 376 | "Test Case ID": "002", 377 | "Steps": "Describe the steps to perform the test without mentioning element locators.", 378 | "Expected Result": "Describe the expected result." 379 | }} 380 | ] 381 | ... 382 | 383 | Finally, the output result is required to be in {language} 384 | """ 385 | elif format_hint == 'markdown[]': 386 | case_prompt = f""" 387 | You are a web automation assistant. Based on the following page context, generate functional test cases. 
388 | 389 | Current page context: 390 | URL: {context['url']} 391 | Title: {context['title']} 392 | 393 | Available elements: 394 | {json.dumps(context['elements'], indent=2)} 395 | 396 | User request: {prompt} 397 | 398 | Return ONLY the test cases in the following format, no other text: 399 | | Test Case ID | Steps | Expected Result | 400 | |--------------|---------------------------------------------------|-------------------------------| 401 | | 001 | Describe the steps to perform the test without mentioning element locators. | Describe the expected result. | 402 | | 002 | Describe the steps to perform the test without mentioning element locators. | Describe the expected result. | 403 | ... 404 | 405 | Finally, the output result is required to be in {language} 406 | """ 407 | else: 408 | case_prompt = f""" 409 | You are a web automation assistant. Based on the following page context, generate functional test cases. 410 | 411 | Current page context: 412 | URL: {context['url']} 413 | Title: {context['title']} 414 | 415 | Available elements: 416 | {json.dumps(context['elements'], indent=2)} 417 | 418 | User request: {prompt} 419 | 420 | Return ONLY the test cases in the following format, no other text: 421 | Test Case ID: 001 422 | Steps: Describe the steps to perform the test without mentioning element locators. 423 | Expected Result: Describe the expected result. 424 | 425 | Test Case ID: 002 426 | Steps: Describe the steps to perform the test without mentioning element locators. 427 | Expected Result: Describe the expected result. 428 | 429 | ... 430 | 431 | Finally, the output result is required to be in {language} 432 | """ 433 | 434 | try: 435 | response = self.llm_client.complete(case_prompt) 436 | cleaned_response = self._clean_response(response) 437 | 438 | logger.debug(f"""📄 Function Cases:\n {cleaned_response}""") 439 | return cleaned_response 440 | except Exception as e: 441 | raise ValueError(f"Failed to generate test cases. 
Error: {str(e)}\nResponse: {cleaned_response[:100]}...") 442 | 443 | 444 | def create_fixture(): 445 | """ 446 | Create a PlaywrightAiFixture factory. 447 | 448 | Returns: 449 | Callable[[Page], PlaywrightAiFixture]: A factory function that creates 450 | PlaywrightAiFixture instances 451 | """ 452 | return PlaywrightAiFixture 453 | --------------------------------------------------------------------------------