├── .env ├── LICENSE ├── README.md ├── agent ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── graph.cpython-310.pyc │ ├── prompt.cpython-310.pyc │ └── tools.cpython-310.pyc ├── graph.py ├── prompt.py └── tools.py ├── api_serve.py ├── engines ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── baidusearch.cpython-310.pyc │ ├── bingsearch.cpython-310.pyc │ ├── quarksearch.cpython-310.pyc │ └── sougousearch.cpython-310.pyc ├── baidusearch.py ├── bingsearch.py ├── quarksearch.py └── sougousearch.py ├── extension ├── README.md └── img │ ├── 1.png │ ├── 2.png │ ├── 3.png │ ├── 4.png │ └── 5.png ├── gradio_demo.py ├── img ├── framework.png ├── gradio.png ├── gradio1.png ├── gradio2.png └── workflow.png ├── pools ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── crawler_pool.cpython-310.pyc │ └── engine_pool.cpython-310.pyc ├── crawler_pool.py └── engine_pool.py ├── requirements.txt └── reranker ├── __init__.py ├── __pycache__ ├── __init__.cpython-310.pyc ├── base.cpython-310.pyc └── chunker.cpython-310.pyc ├── base.py └── chunker.py /.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=xxx 2 | OPENAI_BASE_URL=https://ark.cn-beijing.volces.com/api/v3 3 | MODEL_NAME=deepseek-v3-250324 4 | 5 | EMBEDDING_MODEL_NAME=doubao-embedding-large-text-240915 6 | EMBEDDING_API_KEY=xxx 7 | EMBEDDING_BASE_URL=https://ark.cn-beijing.volces.com/api/v3 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 itshyao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🧠 无需代理的LLM网络搜索引擎 2 | 3 | 一个无需代理的多搜索引擎 LLM 网络检索工具,支持 URL 内容解析和网页爬取,结合 LangGraph 实现模块化智能体链路。专为大语言模型的外部知识调用场景而设计,支持 **Playwright + Crawl4AI** 网页获取与解析,支持异步并发、内容切片与重排过滤。 4 | 5 | ## ✨ 特性一览 6 | 7 | - 🌐 **无需代理**:通过 Playwright 配置国内浏览器支持,无需代理也能进行网络搜索。 8 | - 🔍 **多搜索引擎支持**:支持 Bing、夸克、百度、搜狗 等主流搜索引擎,增强信息来源多样性。 9 | - 🤖 **意图识别**:系统能够根据用户的输入内容,自动判断是进行网络搜索还是解析 URL。 10 | - 🔄 **查询分解**:根据用户的搜索意图,自动将查询分解为多个子任务,并依次执行,从而提升搜索的相关性与效率。 11 | - ⚙️ **智能体架构**:基于 **LangGraph** 封装的**「web_search」**与**「link_parser」**。 12 | - 🏃‍♂️ **异步并发任务处理**:支持异步并发任务处理,可高效处理多个搜索任务。 13 | - 📝 **内容处理优化**: 14 | 15 | - ✂️ **内容切片**:将网页长内容按段切分。 16 | 17 | - 🔄 **内容重排**:智能重排序,提高信息相关性。 18 | 19 | - 🚫 **内容过滤**:自动剔除无关或重复内容。 20 | - 🌐 **多端支持**: 21 | 22 | - 🖥️ 提供 FastAPI 后端接口,可集成到任意系统中。 23 | 24 | - 🌍 提供 Gradio Web UI,可快速部署成可视化应用。 25 | 26 | - 🧩[ **浏览器插件支持**](https://github.com/itshyao/proxyless-llm-websearch/tree/main/extension):支持 Edge ,提供智能 URL 解析插件,直接在浏览器中发起网页解析与内容提取请求。 27 | 28 | 29 | ![workflow](img/workflow.png) 30 | 31 | ![framework](img/framework.png) 32 | 33 | ## ⚡ 快速开始 34 | 35 | ### 1. 克隆仓库 36 | 37 | ```bash 38 | git clone https://github.com/itshyao/proxyless-llm-websearch.git 39 | cd proxyless-llm-websearch 40 | ``` 41 | 42 | ### 2. 安装依赖 43 | 44 | ``` 45 | pip install -r requirements.txt 46 | python -m playwright install 47 | ``` 48 | 49 | ### 3. 快速开始 50 | 51 | #### 环境变量配置 52 | 53 | ``` 54 | OPENAI_API_KEY=xxx 55 | OPENAI_BASE_URL=https://ark.cn-beijing.volces.com/api/v3 56 | MODEL_NAME=deepseek-v3-250324 57 | 58 | EMBEDDING_MODEL_NAME=doubao-embedding-large-text-240915 59 | EMBEDDING_API_KEY=xxx 60 | EMBEDDING_BASE_URL=https://ark.cn-beijing.volces.com/api/v3 61 | ``` 62 | 63 | #### demo 64 | 65 | ```python 66 | ''' 67 | python demo.py 68 | ''' 69 | 70 | from pools import BrowserPool, CrawlerPool 71 | from agent import ToolsGraph 72 | import asyncio 73 | 74 | async def main(): 75 | browser_pool = BrowserPool(pool_size=1) 76 | crawler_pool = CrawlerPool(pool_size=1) 77 | 78 | graph = ToolsGraph(browser_pool, crawler_pool, engine="bing") 79 | 80 | await browser_pool._create_browser_instance(headless=True) 81 | await crawler_pool._get_instance() 82 | 83 | result = await graph.run("广州今日天气") 84 | 85 | await browser_pool.cleanup() 86 | await crawler_pool.cleanup() 87 | 88 | print(result) 89 | 90 | if __name__ == "__main__": 91 | asyncio.run(main()) 92 | ``` 93 | 94 | #### 后端api 95 | 96 | ```python 97 | ''' 98 | python api_serve.py 99 | ''' 100 | import requests 101 | 102 | url = "http://localhost:8000/search" 103 | 104 | data = { 105 | "question": "广州今日天气" 106 | } 107 | 108 | try: 109 | response = requests.post( 110 | url, 111 | json=data 112 | ) 113 | 114 | if response.status_code == 200: 115 | print("✅ 请求成功!") 116 | print("响应内容:", response.json()) 117 | else: 118 | print(f"❌ 请求失败,状态码:{response.status_code}") 119 | print("错误信息:", response.text) 120 | 121 | except requests.exceptions.RequestException as e: 122 | print(f"⚠️ 请求异常:{str(e)}") 123 | ``` 124 | 125 | #### gradio_demo 126 | 127 | ``` 128 | python gradio_demo.py 129 | ``` 130 | 131 | ![gradio](img/gradio1.png) 132 | 133 | ![gradio](img/gradio2.png) 134 | 135 | ## 🔍 与线上网络检索测试对比 136 | 137 | 我们将项目与一些主流的在线 API 进行对比,评估了其在复杂问题下的表现。 138 | 139 | ### 🔥 数据集 140 | 141 | - 数据集来自阿里发布的 [WebWalkerQA](https://huggingface.co/datasets/callanwu/WebWalkerQA),包含了 680 个高难度问题,覆盖教育、学术会议、游戏等多个领域。 142 | - 数据集包括中英文问题。 143 | 144 | ### 🧑‍🏫 
对比结果 145 | 146 | | 搜索引擎/系统 | ✅ Correct | ❌ Incorrect | ⚠️ Partially Correct | 147 | | -------------- | --------- | ----------- | ------------------- | 148 | | **火山方舟** | 5.00% | 72.21% | 22.79% | 149 | | **百炼** | 9.85% | 62.79% | 27.35% | 150 | | **Our** | 19.85% | 47.94% | 32.06% | 151 | 152 | ## 🙏 致谢 153 | 154 | 本项目部分功能得益于以下开源项目的支持与启发,特此致谢: 155 | 156 | - 🧠 [LangGraph](https://github.com/langchain-ai/langgraph):用于构建模块化智能体链路框架。 157 | - 🕷 [Crawl4AI](https://github.com/unclecode/crawl4ai):强大的网页内容解析工具。 -------------------------------------------------------------------------------- /agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .graph import ToolsGraph -------------------------------------------------------------------------------- /agent/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/agent/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /agent/__pycache__/graph.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/agent/__pycache__/graph.cpython-310.pyc -------------------------------------------------------------------------------- /agent/__pycache__/prompt.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/agent/__pycache__/prompt.cpython-310.pyc -------------------------------------------------------------------------------- /agent/__pycache__/tools.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/agent/__pycache__/tools.cpython-310.pyc -------------------------------------------------------------------------------- /agent/graph.py: -------------------------------------------------------------------------------- 1 | from langchain_core.messages import HumanMessage, SystemMessage 2 | from langchain_openai import ChatOpenAI 3 | from langgraph.checkpoint.memory import MemorySaver 4 | from langgraph.graph import END, START, StateGraph, MessagesState 5 | from langgraph.prebuilt import ToolNode 6 | from typing import Literal 7 | from datetime import datetime 8 | from dotenv import load_dotenv 9 | import os 10 | from agent.tools import WebTools 11 | from .prompt import prompts 12 | load_dotenv() 13 | def get_datetime_str(): 14 | now = datetime.now() 15 | datetime_str = now.strftime("%Y-%m-%d %H:%M") 16 | return datetime_str 17 | 18 | class ToolsGraph: 19 | 20 | def __init__(self, browser_pool, crawler_pool, engine): 21 | self.browser_pool = browser_pool 22 | self.crawler_pool = crawler_pool 23 | self.engine = engine 24 | self.ts_manage = WebTools(browser_pool=self.browser_pool, crawler_pool=crawler_pool, engine=self.engine) 25 | self.tools = [self.ts_manage.web_search, self.ts_manage.link_parser] 26 | self.tool_node = ToolNode(self.tools) 27 | self.llm = ChatOpenAI( 28 | model=os.getenv("MODEL_NAME"), 29 | openai_api_key=os.getenv("OPENAI_API_KEY"), 30 | base_url=os.getenv("OPENAI_BASE_URL"), 31 | streaming=False, 32 | temperature=0, 33 
| ).bind_tools(self.tools) 34 | workflow = StateGraph(MessagesState) 35 | workflow.add_node("agent", self.call_model) 36 | workflow.add_node("tools", self.tool_node) 37 | # 设定入口为 agent 38 | workflow.add_edge(START, "agent") 39 | # 条件边:决定是否继续调用工具 40 | workflow.add_conditional_edges("agent", self.should_continue) 41 | # 设置普通边:agent 到 agent 42 | workflow.add_edge("tools", "agent") 43 | self.graph = workflow.compile() 44 | 45 | 46 | def should_continue(self, state: MessagesState) -> Literal["tools", END]: 47 | messages = state['messages'] 48 | last = messages[-1] 49 | return "tools" if last.tool_calls else END 50 | 51 | async def call_model(self, state: MessagesState): 52 | messages = state["messages"] 53 | print(messages) 54 | response = await self.llm.ainvoke(messages) 55 | return {"messages": [response]} 56 | 57 | async def run(self, question): 58 | inputs = {"messages": [SystemMessage(content=prompts["web_prompt"] + f"\n当前时间:{get_datetime_str()}"),HumanMessage(content=question)]} 59 | final_state = await self.graph.ainvoke(inputs) 60 | for i in final_state["messages"]: 61 | print(i) 62 | return final_state["messages"][-1].content 63 | 64 | -------------------------------------------------------------------------------- /agent/prompt.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | prompts: dict[str, Any] = {} 3 | 4 | prompts["web_prompt"] = """ 5 | 你是一个智能问题处理器,能够识别用户意图、拆解问题、生成搜索友好的查询,并根据输入类型动态选择合适的工具进行处理。你可以使用两个工具: 6 | 7 | - `web_search`:用于搜索与用户问题相关的信息。 8 | - `link_parser`:用于解析用户给出的网页内容。 9 | 10 | 你始终以 Markdown 格式输出结果,并遵循以下逻辑: 11 | 12 | --- 13 | 14 | ### 一、判断输入类型和处理方式 15 | 16 | - **如果用户输入为一个或多个 URL**(以 http:// 或 https:// 开头): 17 | - 不进行搜索,直接使用 `link_parser` 工具解析网页。 18 | 19 | - **如果用户输入为自然语言问题或话题**: 20 | - 继续进行“问题分解”和“关键词提取”,并使用 `web_search` 工具。 21 | - 之后对搜索结果中最相关的页面使用 `link_parser` 进一步解析内容。 22 | 23 | --- 24 | 25 | ### 二、问题分解与关键词提取(针对自然语言输入) 26 | 27 | - 将问题拆解为 2–5 个子问题,覆盖核心维度。 28 | - 为每个子问题生成适合搜索引擎的关键词表达(避免“请问”“我想知道”等无效前缀)。 29 | 30 | 31 | **输出与引用规则:** 32 | - 你的回答必须使用 **Markdown 格式**。 33 | - 如引用具体网页内容,请在段落或句子末尾加上**引用标注**,使用 Markdown 的 `[数字](链接)` 格式,例如:[1](https://example.com)。 34 | - 所有信息必须来自工具返回的真实结果,禁止编造或凭空猜测。 35 | 36 | **额外注意事项:** 37 | - 输出内容应简洁、有条理、结论明确。 38 | - 若找不到答案,应坦诚说明并建议用户尝试其他表达方式或搜索方式。 39 | - 优先引用权威、可靠来源(如官网、主流媒体等)。 40 | """ -------------------------------------------------------------------------------- /agent/tools.py: -------------------------------------------------------------------------------- 1 | from engines import BingSearch, QuarkSearch, BaiduSearch, SougouSearch 2 | from reranker import OpenAIEmbeddingReranker, Chunker 3 | from typing import List, Optional 4 | from langchain_core.tools import StructuredTool 5 | from pydantic import BaseModel 6 | 7 | reranker = OpenAIEmbeddingReranker() 8 | 9 | 10 | class WebSearchArgsSchema(BaseModel): 11 | questions: List[str] 12 | 13 | class QuarkSearchArgsSchema(BaseModel): 14 | questions: List[str] 15 | 16 | class LinkParserArgsSchema(BaseModel): 17 | urls: List[str] 18 | 19 | class WebTools(): 20 | 21 | def __init__(self, browser_pool, crawler_pool, engine): 22 | self.browser_pool = browser_pool 23 | self.crawler_pool = crawler_pool 24 | self.engine = engine 25 | self.web_search = StructuredTool( 26 | name='web_search', 27 | description='网络搜索功能,模拟搜索引擎,专门解决实时类问题的查询。', 28 | args_schema=WebSearchArgsSchema, 29 | coroutine=self.web_search_function # 协程函数 30 | ) 31 | 32 | self.link_parser = StructuredTool( 33 | name='link_parser', 34 | 
description='用于解析url,获取网页链接的内容,其中urls为需要解析的链接列表。',
35 |             args_schema=LinkParserArgsSchema,
36 |             coroutine=self.link_parser_function
37 |         )
38 | 
39 |     async def web_search_function(self, questions: list) -> dict:
40 |         if self.engine == "bing":
41 |             search = BingSearch(browser_pool=self.browser_pool)
42 |         elif self.engine == "quark":
43 |             search = QuarkSearch(browser_pool=self.browser_pool)
44 |         elif self.engine == "baidu":
45 |             search = BaiduSearch(browser_pool=self.browser_pool)
46 |         elif self.engine == "sougou":
47 |             search = SougouSearch(browser_pool=self.browser_pool)
48 |         else:
49 |             raise ValueError(f"不支持的搜索引擎: {self.engine}")
50 |         result = await search.response(questions)
51 |         return result
52 | 
53 |     async def link_parser_function(self, urls: list, query: Optional[str] = None) -> list:
54 |         try:
55 |             async with self.crawler_pool.get_crawler() as crawler:
56 |                 results = await crawler.run(urls)
57 |                 if query:
58 |                     results = await split_and_reranker(query, results)
59 |                     print(results)
60 |                     return results
61 |                 else:
62 |                     return results
63 |         except Exception:
64 |             return []  # 爬取或重排失败时返回空列表,避免引用未定义的 results
65 | 
66 | async def split_and_reranker(query, contents):
67 |     results = []
68 |     for content in contents:
69 |         splitter = Chunker()
70 |         final_splits = splitter.split_text(content["content"])
71 |         final_splits = [chunk for chunk in final_splits]
72 |         reranker_results = await reranker.get_reranked_documents(query, final_splits, top_k=10)
73 |         content["content"] = "\n".join(reranker_results)
74 |         if content["content"]:
75 |             results.append(content)
76 |     return results
--------------------------------------------------------------------------------
/api_serve.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | from fastapi import FastAPI
3 | from fastapi.middleware.cors import CORSMiddleware
4 | from contextlib import asynccontextmanager
5 | from agent import ToolsGraph
6 | from pools import BrowserPool, CrawlerPool
7 | 
8 | browser_pool = BrowserPool(pool_size=1)
9 | crawler_pool = CrawlerPool(pool_size=1)
10 | graph = ToolsGraph(browser_pool, crawler_pool, engine="sougou")
11 | 
12 | 
13 | @asynccontextmanager
14 | async def lifespan(app: FastAPI):
15 |     # startup:可选预热
16 |     await browser_pool._create_browser_instance(headless=True)
17 |     await crawler_pool._get_instance()
18 |     print("✅ Browser pool initialized.")
19 | 
20 |     yield  # 应用运行中,等待请求
21 | 
22 |     # shutdown:清理资源
23 |     await browser_pool.cleanup()
24 |     await crawler_pool.cleanup()
25 |     print("✅ Browser pool cleaned up.")
26 | 
27 | app = FastAPI(lifespan=lifespan)
28 | 
29 | app.add_middleware(
30 |     CORSMiddleware,
31 |     allow_origins=["*"],
32 |     allow_credentials=True,
33 |     allow_methods=["*"],
34 |     allow_headers=["*"],
35 | )
36 | 
37 | class QueryRequest(BaseModel):
38 |     question: str
39 | 
40 | 
41 | @app.post("/search")
42 | async def search(query: QueryRequest):
43 |     result = await graph.run(query.question)
44 |     return {"data": result}
45 | 
46 | if __name__ == "__main__":
47 |     import uvicorn
48 |     port = 8000
49 |     uvicorn.run(app, host="0.0.0.0", port=port, workers=1)
50 | 
--------------------------------------------------------------------------------
/engines/__init__.py:
--------------------------------------------------------------------------------
1 | from .bingsearch import BingSearch
2 | from .quarksearch import QuarkSearch
3 | from .baidusearch import BaiduSearch
4 | from .sougousearch import SougouSearch
5 | 
6 | 
7 | __all__ = ["BingSearch", "QuarkSearch", "BaiduSearch", "SougouSearch"]
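8 | 
9 | # 可选辅助(示意实现,仅作演示;当前仓库其他模块尚未引用):
10 | # 按名称查找搜索引擎类,便于在 agent/tools.py 等处统一分发,未知名称时给出明确报错。
11 | ENGINES = {
12 |     "bing": BingSearch,
13 |     "quark": QuarkSearch,
14 |     "baidu": BaiduSearch,
15 |     "sougou": SougouSearch,
16 | }
17 | 
18 | 
19 | def get_engine(name: str):
20 |     """根据引擎名称返回对应的搜索引擎类;名称未注册时抛出 ValueError。"""
21 |     try:
22 |         return ENGINES[name]
23 |     except KeyError:
24 |         raise ValueError(f"不支持的搜索引擎: {name},可选值: {sorted(ENGINES)}")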
-------------------------------------------------------------------------------- /engines/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/engines/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /engines/__pycache__/baidusearch.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/engines/__pycache__/baidusearch.cpython-310.pyc -------------------------------------------------------------------------------- /engines/__pycache__/bingsearch.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/engines/__pycache__/bingsearch.cpython-310.pyc -------------------------------------------------------------------------------- /engines/__pycache__/quarksearch.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/engines/__pycache__/quarksearch.cpython-310.pyc -------------------------------------------------------------------------------- /engines/__pycache__/sougousearch.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/engines/__pycache__/sougousearch.cpython-310.pyc -------------------------------------------------------------------------------- /engines/baidusearch.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from bs4 import BeautifulSoup 3 | from pools import BrowserPool, BrowserPlaywright 4 | import json 5 | class BaiduSearch: 6 | 7 | def __init__(self, browser_pool: BrowserPool): 8 | self.browser_pool = browser_pool 9 | self.base_url = "https://www.baidu.com/" 10 | 11 | async def response(self, questions: Optional[List[str]]) -> Optional[dict]: 12 | results = {} 13 | async with self.browser_pool.get_browser() as browser: 14 | for question in questions: 15 | html = await self.run(browser=browser, question=question) 16 | result = self.parsing(html) 17 | if result: 18 | results[question] = result 19 | 20 | return results 21 | 22 | async def run(self, browser: BrowserPlaywright, question: Optional[str]): 23 | context = await browser.browser.new_context() 24 | page = await context.new_page() 25 | await page.goto(self.base_url) 26 | 27 | await page.fill('input[name="wd"]', question) 28 | await page.wait_for_timeout(1000) 29 | await page.click('input#su') 30 | await page.wait_for_selector('div.c-container') # 等待搜索结果加载完成 31 | await page.wait_for_timeout(1000) 32 | html = await page.content() 33 | await page.close() 34 | await context.close() 35 | return html 36 | 37 | def parsing(self, html: Optional[str]) -> Optional[List[dict]]: 38 | soup = BeautifulSoup(html, "lxml") 39 | items = soup.find_all("div", class_="c-container") 40 | results = [] 41 | for item in items: 42 | title_tag = item.find('h3', class_='c-title t t tts-title') 43 | title = title_tag.get_text(strip=True) if 
title_tag else ''
44 | 
45 |             publisher_tag = item.find('a', class_='siteLink_9TPP3')
46 |             publisher = publisher_tag.get_text(strip=True) if publisher_tag else ''
47 | 
48 |             url_tag = item.find('a', class_='siteLink_9TPP3')
49 |             url = url_tag['href'] if url_tag else ''
50 | 
51 |             summary_tag = item.find('span', class_='content-right_2s-H4')
52 |             summary = summary_tag.get_text(strip=True) if summary_tag else ''
53 | 
54 |             time_tag = item.find("span", class_="c-color-gray2")
55 |             time = time_tag.get_text(strip=True) if time_tag else ''
56 | 
57 |             data = {
58 |                 "title": title,
59 |                 "publisher": publisher,
60 |                 "url": url,
61 |                 "summary": summary,
62 |                 "time": time
63 |             }
64 |             if url:
65 |                 results.append(data)
66 |         results = [json.loads(x) for x in set(json.dumps(d, sort_keys=True) for d in results)]
67 |         return results
68 | 
69 | 
--------------------------------------------------------------------------------
/engines/bingsearch.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | import unicodedata
3 | from bs4 import BeautifulSoup
4 | from pools import BrowserPool, BrowserPlaywright
5 | 
6 | class BingSearch:
7 | 
8 |     def __init__(self, browser_pool: BrowserPool):
9 |         self.browser_pool = browser_pool
10 |         self.base_url = "https://cn.bing.com"
11 | 
12 |     async def response(self, questions: Optional[List[str]]) -> Optional[dict]:
13 |         results = {}
14 |         async with self.browser_pool.get_browser() as browser:
15 |             for question in questions:
16 |                 html = await self.run(browser=browser, question=question)
17 |                 result = self.parsing(html)
18 |                 if result:
19 |                     results[question] = result
20 | 
21 |         return results
22 | 
23 |     async def run(self, browser: BrowserPlaywright, question: Optional[str]):
24 |         context = await browser.browser.new_context()
25 |         page = await context.new_page()
26 |         await page.goto(self.base_url)
27 | 
28 |         # 输入搜索内容并执行搜索
29 |         await page.fill('input#sb_form_q', question)
30 |         await page.wait_for_timeout(500)
31 |         await page.keyboard.press('Enter')
32 |         await page.wait_for_selector('li.b_algo')  # 等待搜索结果加载完成
33 |         await page.wait_for_timeout(2000)
34 |         html = await page.content()
35 |         await page.close()
36 |         await context.close()
37 |         return html
38 | 
39 |     def parsing(self, html: Optional[str]) -> Optional[List[dict]]:
40 |         soup = BeautifulSoup(html, "lxml")
41 |         items = soup.find_all("li", class_=lambda x: x and "b_algo" in x)
42 |         results = []
43 |         for item in items:
44 |             publisher_tag = item.find("a", class_="tilk")
45 |             publisher = publisher_tag.get("aria-label") if publisher_tag else ""
46 |             url = publisher_tag.get("href") if publisher_tag else ""
47 | 
48 |             if item.find("p"):
49 |                 content = unicodedata.normalize("NFKC", item.find("p").get_text(strip=True))
50 |                 content_list = content.split(" · ")
51 |                 if len(content_list) == 2:
52 |                     time = content_list[0]
53 |                     summary = content_list[1]
54 |                 else:
55 |                     time = "UNKNOWN"
56 |                     summary = content_list[0]
57 |             else:
58 |                 time = ""
59 |                 summary = ""
60 | 
61 |             title_tag = item.find("h2")
62 |             title = title_tag.get_text(strip=True) if title_tag else ""
63 | 
64 |             data = {
65 |                 "title": title,
66 |                 "publisher": publisher,
67 |                 "url": url,
68 |                 "summary": summary,
69 |                 "time": time
70 |             }
71 |             results.append(data)
72 | 
73 |         return results
74 | 
75 | 
--------------------------------------------------------------------------------
/engines/quarksearch.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List
2 | from playwright.async_api import async_playwright
3 | from bs4 import 
BeautifulSoup 4 | from pools import BrowserPool, BrowserPlaywright 5 | 6 | class QuarkSearch: 7 | def __init__(self, browser_pool: BrowserPool): 8 | self.browser_pool = browser_pool 9 | self.base_url = "https://ai.quark.cn/" 10 | 11 | async def response(self, questions: Optional[List[str]]) -> Optional[dict]: 12 | results = {} 13 | async with self.browser_pool.get_browser() as browser: 14 | for question in questions: 15 | html = await self.run(browser=browser, question=question) 16 | result = self.parsing(html) 17 | if result: 18 | results[question] = result 19 | return results 20 | 21 | async def run(self, browser: BrowserPlaywright, question: Optional[str]): 22 | context = await browser.browser.new_context() 23 | page = await context.new_page() 24 | await page.goto(self.base_url) 25 | 26 | await page.fill('textarea[placeholder="搜资料、提问题、找答案"]', question) 27 | await page.wait_for_timeout(1000) 28 | await page.wait_for_selector("span.input-keywords-highlight", timeout=5000) 29 | await page.click("span.input-keywords-highlight") 30 | 31 | 32 | await page.wait_for_selector("section.sc.sc_structure_template_normal") 33 | 34 | await page.wait_for_function('document.body !== null') 35 | await page.evaluate('window.scrollTo(0, document.body.scrollHeight)') 36 | 37 | # await page.screenshot(path="quark_weather.png") 38 | 39 | 40 | html = await page.content() 41 | await page.close() 42 | await context.close() 43 | return html 44 | 45 | def parsing(self, html): 46 | soup = BeautifulSoup(html, "lxml") 47 | items = soup.find_all("section", class_="sc sc_structure_template_normal") 48 | results = [] 49 | for item in items: 50 | # 提取标题 51 | title_tag = item.find("div", class_="qk-title-text") 52 | title = title_tag.get_text(strip=True) if title_tag else "" 53 | 54 | # 提取发布者 55 | tags = item.find_all("span", class_="qk-source-item qk-clamp-1") 56 | publisher = tags[0].get_text(strip=True) if tags else "" 57 | if len(tags) == 2: 58 | time = tags[1].get_text(strip=True) 59 | else: 60 | time = "" 61 | 62 | url_tag = item.find("a", class_="qk-link-wrapper") 63 | url = url_tag["href"] if url_tag else "" 64 | 65 | summary_tag = item.find("div", class_="qk-paragraph-text") 66 | summary = summary_tag.get_text(strip=True) if summary_tag else "" 67 | 68 | # 输出解析后的信息 69 | result = { 70 | "title": title, 71 | "publisher": publisher, 72 | "url": url, 73 | "summary": summary, 74 | "time": time 75 | } 76 | results.append(result) 77 | return results 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /engines/sougousearch.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from bs4 import BeautifulSoup 3 | from pools import BrowserPool, BrowserPlaywright 4 | 5 | class SougouSearch: 6 | 7 | def __init__(self, browser_pool: BrowserPool): 8 | self.browser_pool = browser_pool 9 | self.base_url = "https://www.sogou.com" 10 | 11 | async def response(self, questions: Optional[List[str]]) -> Optional[dict]: 12 | results = {} 13 | async with self.browser_pool.get_browser() as browser: 14 | for question in questions: 15 | html = await self.run(browser=browser, question=question) 16 | result = self.parsing(html) 17 | if result: 18 | results[question] = result 19 | 20 | return results 21 | 22 | async def run(self, browser: BrowserPlaywright, question: Optional[str]): 23 | context = await browser.browser.new_context(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/135.0.0.0 Safari/537.36") 24 | page = await context.new_page() 25 | await page.goto(self.base_url) 26 | await page.wait_for_timeout(1000) 27 | # await page.screenshot(path="sougou.png") 28 | await page.fill('input#query', question) 29 | await page.wait_for_timeout(1000) 30 | await page.click('input#stb') 31 | await page.wait_for_timeout(1000) 32 | await page.wait_for_selector('div.vrwrap') 33 | 34 | html = await page.content() 35 | await page.close() 36 | await context.close() 37 | return html 38 | 39 | def parsing(self, html: Optional[str]) -> Optional[List[dict]]: 40 | soup = BeautifulSoup(html, "lxml") 41 | items = soup.find_all("div", class_="vrwrap") 42 | 43 | results = [] 44 | for item in items: 45 | title_tag = item.select_one("h3.vr-title a") 46 | title = title_tag.get_text(strip=True) if title_tag else "" 47 | url = title_tag.get("href", "") if title_tag else "" 48 | 49 | if url.startswith("/link?url="): 50 | url = f"{self.base_url}{url}" 51 | 52 | summary_tag = item.select_one("div.text-layout p.star-wiki") 53 | if summary_tag: 54 | summary = summary_tag.get_text(strip=True) 55 | else: 56 | alt_summary_tag = item.select_one("div.fz-mid.space-txt") 57 | summary = alt_summary_tag.get_text(strip=True) if alt_summary_tag else "" 58 | 59 | publisher_tag = item.find("div", class_="citeurl") 60 | publisher = publisher_tag.get_text(strip=True) if publisher_tag else "" 61 | 62 | time_tag = summary.split("-") 63 | if len(time_tag) == 2: 64 | time = time_tag[0] 65 | else: 66 | time = "" 67 | if title and url: 68 | data = { 69 | "title": title, 70 | "publisher": publisher, 71 | "url": url, 72 | "summary": summary, 73 | "time": time 74 | } 75 | results.append(data) 76 | return results 77 | 78 | 79 | -------------------------------------------------------------------------------- /extension/README.md: -------------------------------------------------------------------------------- 1 | # 🔗 智能 URL 内容解析插件 2 | 3 | 一个功能强大的浏览器插件,支持对网页 URL 进行智能化解析,支持当前页面、多标签页、自定义 URL 的灵活操作。可配置后端接口,实现多用途内容提取,适用于大语言模型、网页摘要等场景。 4 | 5 | **目前插件只支持edge** 6 | 7 | ## ✨ 插件功能 8 | 9 | - 🔍 当前页面解析:一键解析当前访问页面的 URL 内容。 10 | - 🗂 多标签页解析:支持解析浏览器中多个打开的标签页。 11 | - ✏️ 自定义 URL 解析:手动输入任意 URL 进行解析,支持批量输入。 12 | - 💾 结果持久缓存:解析结果将本地缓存,避免重复请求,提升效率。 13 | - ⚙️ 接口地址可配置:可根据需求自定义请求发送的后端 API 地址,灵活接入不同解析服务。 14 | 15 | ## 📦 插件安装指南(以上线edge商城) 16 | 17 | 1. **打开浏览器扩展管理页** 18 | - Edge 地址栏输入:`edge://extensions/` 19 | 然后按回车打开。 20 | 21 | 2. **启用开发者模式** 22 | 23 | 3. 
**打开**,[点击获取即可使用](https://microsoftedge.microsoft.com/addons/detail/hobgcfnjjalpdmlcjgknkklkohbhoijd) 24 | 25 | ## 🛠 使用须知 26 | 27 | 要使用此插件,请 先部署后端解析服务,如: 28 | 29 | 官方推荐后端:[proxyless-llm-websearch](https://github.com/itshyao/proxyless-llm-websearch) 30 | 31 | 或接入你自己的 API 接口,具体要求查看 32 | 33 | ```json 34 | { 35 | "question": str // 输入结构:json 36 | } 37 | ``` 38 | 39 | ```json 40 | { 41 | "data": str // 输出结构:json 42 | } 43 | ``` 44 | 45 | ![1](img/1.png) 46 | 47 | ![2](img/2.png) 48 | 49 | ![3](img/3.png) 50 | 51 | ![4](img/4.png) 52 | 53 | ![5](img/5.png) -------------------------------------------------------------------------------- /extension/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/extension/img/1.png -------------------------------------------------------------------------------- /extension/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/extension/img/2.png -------------------------------------------------------------------------------- /extension/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/extension/img/3.png -------------------------------------------------------------------------------- /extension/img/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/extension/img/4.png -------------------------------------------------------------------------------- /extension/img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/extension/img/5.png -------------------------------------------------------------------------------- /gradio_demo.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import asyncio 3 | 4 | from agent import ToolsGraph 5 | from pools import BrowserPool, CrawlerPool 6 | 7 | 8 | async def search_answer(question: str, engine: str): 9 | browser_pool = BrowserPool(pool_size=1) 10 | crawler_pool = CrawlerPool(pool_size=1) 11 | graph = ToolsGraph(browser_pool, crawler_pool, engine=engine) 12 | result = await graph.run(question) 13 | return result 14 | 15 | 16 | # 用 sync wrapper 包装 async 函数(Gradio 不直接支持 async) 17 | def sync_search_answer(question, engine): 18 | return asyncio.run(search_answer(question, engine)) 19 | 20 | # 启动界面 21 | with gr.Blocks() as demo: 22 | gr.Markdown("# 🔍 多引擎搜索问答") 23 | question = gr.Textbox(label="请输入你的问题") 24 | engine = gr.Radio(["bing", "quark", "baidu", "sougou"], value="bing", label="选择搜索引擎") 25 | output = gr.Textbox(label="答案") 26 | 27 | btn = gr.Button("提交查询") 28 | btn.click(fn=sync_search_answer, inputs=[question, engine], outputs=output) 29 | 30 | if __name__ == "__main__": 31 | demo.launch() 32 | -------------------------------------------------------------------------------- /img/framework.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/img/framework.png -------------------------------------------------------------------------------- /img/gradio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/img/gradio.png -------------------------------------------------------------------------------- /img/gradio1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/img/gradio1.png -------------------------------------------------------------------------------- /img/gradio2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/img/gradio2.png -------------------------------------------------------------------------------- /img/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/img/workflow.png -------------------------------------------------------------------------------- /pools/__init__.py: -------------------------------------------------------------------------------- 1 | from .crawler_pool import CrawlerPool 2 | from .engine_pool import BrowserPool, BrowserPlaywright 3 | 4 | __all__ = ["CrawlerPool", "BrowserPool", "BrowserPlaywright"] -------------------------------------------------------------------------------- /pools/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/pools/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /pools/__pycache__/crawler_pool.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/pools/__pycache__/crawler_pool.cpython-310.pyc -------------------------------------------------------------------------------- /pools/__pycache__/engine_pool.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/pools/__pycache__/engine_pool.cpython-310.pyc -------------------------------------------------------------------------------- /pools/crawler_pool.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import atexit 3 | from asyncio import Queue, Semaphore 4 | from contextlib import asynccontextmanager 5 | from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode 6 | 7 | 8 | class CrawlerInstance: 9 | def __init__(self): 10 | self.browser_config = BrowserConfig(headless=True, verbose=False) 11 | self.run_config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED, stream=False) 12 | self.crawler = None 13 | 14 | async def __aenter__(self): 15 | self.crawler = AsyncWebCrawler(config=self.browser_config) 16 | return self 17 | 18 | async def 
__aexit__(self, exc_type, exc_val, exc_tb): 19 | if self.crawler: 20 | await self.crawler.close() 21 | 22 | async def run(self, urls: list[str]) -> list[dict]: 23 | responses = await self.crawler.arun_many(urls=urls, config=self.run_config) 24 | 25 | results = [] 26 | for r in responses: 27 | if r.success: 28 | results.append({"url": r.url, "content": r.markdown}) 29 | return results 30 | 31 | class CrawlerPool: 32 | def __init__(self, pool_size): 33 | self.pool_size = pool_size 34 | self.pool = Queue(maxsize=pool_size) 35 | self.lock = Semaphore(pool_size) 36 | self.instances = [] 37 | atexit.register(lambda: asyncio.run(self.cleanup())) 38 | 39 | @asynccontextmanager 40 | async def get_crawler(self): 41 | async with self.lock: 42 | crawler = await self._get_instance() 43 | try: 44 | yield crawler 45 | finally: 46 | await self._release_instance(crawler) 47 | 48 | async def _get_instance(self): 49 | if self.pool.empty(): 50 | crawler = await CrawlerInstance().__aenter__() 51 | self.instances.append(crawler) 52 | else: 53 | crawler = await self.pool.get() 54 | return crawler 55 | 56 | async def _release_instance(self, crawler: CrawlerInstance): 57 | if self.pool.qsize() < self.pool_size: 58 | await self.pool.put(crawler) 59 | 60 | async def cleanup(self): 61 | await asyncio.gather(*[ 62 | crawler.__aexit__(None, None, None) 63 | for crawler in self.instances 64 | ]) 65 | -------------------------------------------------------------------------------- /pools/engine_pool.py: -------------------------------------------------------------------------------- 1 | from contextlib import asynccontextmanager 2 | from asyncio import Queue, Semaphore 3 | from playwright.async_api import async_playwright 4 | import atexit 5 | import asyncio 6 | 7 | class BrowserPlaywright: 8 | def __init__(self, headless): 9 | self.playwright = None 10 | self.browser = None 11 | self.headless = headless 12 | 13 | async def __aenter__(self): 14 | # 启动 Playwright 只需启动一次 15 | if not self.playwright: 16 | self.playwright = await async_playwright().start() 17 | # 启动浏览器只需启动一次 18 | if not self.browser: 19 | self.browser = await self.playwright.chromium.launch(headless=self.headless) 20 | return self 21 | 22 | async def __aexit__(self, exc_type, exc_val, exc_tb): 23 | # Only close the browser when we're done with all tasks 24 | if self.browser: 25 | await self.browser.close() 26 | if self.playwright: 27 | await self.playwright.stop() 28 | 29 | async def new_page(self): 30 | # 创建新页面 31 | context = await self.browser.new_context() 32 | return await context.new_page() 33 | 34 | 35 | class BrowserPool: 36 | def __init__(self, pool_size: int): 37 | self.pool_size = pool_size 38 | self.pool = Queue(maxsize=pool_size) # 设置队列的最大长度为 pool_size 39 | self.lock = Semaphore(pool_size) # 控制并发 40 | self.browser_instances = [] # 用来保存浏览器实例 41 | # 注册退出时清理资源的函数 42 | atexit.register(lambda: asyncio.run(self.cleanup())) 43 | 44 | @asynccontextmanager 45 | async def get_browser(self): 46 | async with self.lock: 47 | browser_instance = await self._get_browser_instance() 48 | try: 49 | yield browser_instance 50 | finally: 51 | await self._release_browser_instance(browser_instance) 52 | 53 | async def _get_browser_instance(self): 54 | # 如果池为空,创建新实例 55 | if self.pool.empty(): 56 | browser_instance = await self._create_browser_instance() 57 | else: 58 | browser_instance = await self.pool.get() 59 | return browser_instance 60 | 61 | async def _create_browser_instance(self, headless=True): 62 | # 创建一个新的浏览器实例并返回 63 | browser_instance = await 
BrowserPlaywright(headless).__aenter__()
64 |         self.browser_instances.append(browser_instance)  # 保存实例
65 |         return browser_instance
66 | 
67 |     async def _release_browser_instance(self, browser_instance: BrowserPlaywright):
68 |         if self.pool.qsize() < self.pool_size:
69 |             await self.pool.put(browser_instance)
70 | 
71 |     async def cleanup(self):
72 |         loop = asyncio.get_event_loop()
73 |         if loop.is_closed():
74 |             loop = asyncio.new_event_loop()
75 |             asyncio.set_event_loop(loop)
76 | 
77 |         print("Cleaning up all browser instances.")
78 | 
79 |         # 并发清理所有实例
80 |         await asyncio.gather(
81 |             *(browser.__aexit__(None, None, None) for browser in self.browser_instances)
82 |         )
83 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | playwright
3 | lxml
4 | httpx
5 | loguru
6 | fastapi
7 | uvicorn
8 | openai
9 | langchain_openai
10 | langgraph
11 | langchain_community
12 | langchain-text-splitters
13 | torch
14 | crawl4ai
15 | python-dotenv
16 | gradio
--------------------------------------------------------------------------------
/reranker/__init__.py:
--------------------------------------------------------------------------------
1 | from .chunker import MarkdownSplitter, Chunker
2 | from .base import OpenAIEmbeddingReranker
3 | 
4 | __all__ = ["MarkdownSplitter", "OpenAIEmbeddingReranker", "Chunker"]
--------------------------------------------------------------------------------
/reranker/__pycache__/__init__.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/reranker/__pycache__/__init__.cpython-310.pyc
--------------------------------------------------------------------------------
/reranker/__pycache__/base.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/reranker/__pycache__/base.cpython-310.pyc
--------------------------------------------------------------------------------
/reranker/__pycache__/chunker.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/itshyao/proxyless-llm-websearch/70df3a2521bdd0a801028cd1d1a44294935c9565/reranker/__pycache__/chunker.cpython-310.pyc
--------------------------------------------------------------------------------
/reranker/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List, Dict, Union, Optional
3 | from openai import AsyncOpenAI
4 | import torch
5 | import os
6 | from dotenv import load_dotenv
7 | 
8 | 
9 | class BaseSemanticSearcher(ABC):
10 |     """
11 |     Abstract base class for semantic search implementations.
12 | """ 13 | 14 | @abstractmethod 15 | def _get_embeddings(self, texts: List[str]) -> torch.Tensor: 16 | pass 17 | 18 | async def calculate_scores( 19 | self, 20 | queries: List[str], 21 | documents: List[str], 22 | ) -> torch.Tensor: 23 | query_embeddings = await self._get_embeddings(queries) 24 | doc_embeddings = await self._get_embeddings(documents) 25 | scores = query_embeddings @ doc_embeddings.T 26 | scores = torch.softmax(scores, dim=-1) 27 | return scores 28 | 29 | async def rerank( 30 | self, 31 | query: Union[str, List[str]], 32 | documents: List[str], 33 | top_k: int = 5, 34 | ) -> List[Dict[str, Union[str, float]]]: 35 | queries = [query] if isinstance(query, str) else query 36 | scores = await self.calculate_scores(queries, documents) 37 | 38 | results = [] 39 | for query_scores in scores: 40 | top_indices = torch.topk(query_scores, min(top_k, len(documents)), dim=0) 41 | query_results = [ 42 | { 43 | "document": documents[idx.item()], 44 | "score": score.item() 45 | } 46 | for score, idx in zip(top_indices.values, top_indices.indices) 47 | ] 48 | results.append(query_results) 49 | 50 | return results[0] if isinstance(query, str) else results 51 | 52 | async def get_reranked_documents( 53 | self, 54 | query: Union[str, List[str]], 55 | documents: List[str], 56 | top_k: int = 5 57 | ) -> Union[List[str], List[List[str]]]: 58 | results = await self.rerank(query, documents, top_k) 59 | if isinstance(query, str): 60 | return [x['document'].strip() for x in results] 61 | return [[x['document'].strip() for x in r] for r in results] 62 | 63 | 64 | class OpenAIEmbeddingReranker(BaseSemanticSearcher): 65 | def __init__(self, base_url: Optional[str] = None, api_key: Optional[str] = None, model: Optional[str] = None): 66 | load_dotenv() 67 | self.api_key = api_key or os.getenv("EMBEDDING_API_KEY") 68 | self.base_url = base_url or os.getenv("EMBEDDING_BASE_URL") 69 | self.model = model or os.getenv("EMBEDDING_MODEL_NAME") 70 | if not self.api_key: 71 | raise ValueError("No OpenAI API key provided") 72 | self.client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url) 73 | self.model = model 74 | 75 | async def _get_embeddings(self, texts: List[str]) -> torch.Tensor: 76 | response = await self.client.embeddings.create( 77 | model=self.model, 78 | input=texts 79 | ) 80 | embeddings = [e.embedding for e in response.data] 81 | return torch.tensor(embeddings) 82 | 83 | -------------------------------------------------------------------------------- /reranker/chunker.py: -------------------------------------------------------------------------------- 1 | from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter 2 | from typing import List, Optional 3 | 4 | class MarkdownSplitter: 5 | def __init__(self, markdown_text: str, chunk_size: int = 150, chunk_overlap: int = 50): 6 | self.markdown_text = markdown_text 7 | self.chunk_size = chunk_size 8 | self.chunk_overlap = chunk_overlap 9 | # 定义标题层级 (支持三级标题) 10 | self.headers_to_split_on = [ 11 | ("#", "Header 1"), 12 | ("##", "Header 2"), 13 | ("###", "Header 3") 14 | ] 15 | 16 | def split_by_headers(self): 17 | # 使用 MarkdownHeaderTextSplitter 进行按标题切分 18 | markdown_splitter = MarkdownHeaderTextSplitter( 19 | headers_to_split_on=self.headers_to_split_on, 20 | strip_headers=True # 保留标题 21 | ) 22 | md_header_splits = markdown_splitter.split_text(self.markdown_text) 23 | return md_header_splits 24 | 25 | def split_by_characters(self, text_blocks): 26 | # 使用 RecursiveCharacterTextSplitter 按字符数进一步切分每个块 27 | 
text_splitter = RecursiveCharacterTextSplitter( 28 | chunk_size=self.chunk_size, 29 | chunk_overlap=self.chunk_overlap 30 | ) 31 | splits = text_splitter.split_documents(text_blocks) 32 | return splits 33 | 34 | def split(self): 35 | # 先按标题切分,再按字符切分 36 | header_splits = self.split_by_headers() 37 | final_splits = self.split_by_characters(header_splits) 38 | return final_splits 39 | 40 | 41 | 42 | class Chunker: 43 | """A modular text chunking class that splits text into smaller, overlapping segments. 44 | 45 | This class provides a flexible way to break down large texts into smaller chunks 46 | while maintaining context through configurable overlap. It uses RecursiveCharacterTextSplitter 47 | from langchain under the hood. 48 | 49 | Attributes: 50 | chunk_size (int): The target size for each text chunk. 51 | chunk_overlap (int): The number of characters to overlap between chunks. 52 | separators (List[str]): List of separators to use for splitting, in order of preference. 53 | length_function (callable): Function to measure text length (default: len). 54 | """ 55 | 56 | def __init__( 57 | self, 58 | chunk_size: int = 512, 59 | chunk_overlap: int = 128, 60 | separators: Optional[List[str]] = None, 61 | length_function: callable = len 62 | ): 63 | """Initialize the Chunker with specified parameters. 64 | 65 | Args: 66 | chunk_size (int, optional): Target size for each chunk. Defaults to 250. 67 | chunk_overlap (int, optional): Number of characters to overlap. Defaults to 50. 68 | separators (List[str], optional): Custom separators for splitting. 69 | Defaults to ["\n\n", "\n", " "]. 70 | length_function (callable, optional): Function to measure text length. 71 | Defaults to len. 72 | """ 73 | self.chunk_size = chunk_size 74 | self.chunk_overlap = chunk_overlap 75 | self.separators = separators or ["\n\n", "\n"] 76 | self.length_function = length_function 77 | 78 | self.splitter = RecursiveCharacterTextSplitter( 79 | separators=self.separators, 80 | chunk_size=self.chunk_size, 81 | chunk_overlap=self.chunk_overlap, 82 | length_function=self.length_function 83 | ) 84 | 85 | def split_text(self, text: str) -> List[str]: 86 | """Split a single text into chunks. 87 | 88 | Args: 89 | text (str): The input text to be split into chunks. 90 | 91 | Returns: 92 | List[str]: A list of text chunks. 93 | """ 94 | return self.splitter.split_text(text) 95 | 96 | def split_texts(self, texts: List[str]) -> List[List[str]]: 97 | """Split multiple texts into chunks. 98 | 99 | Args: 100 | texts (List[str]): A list of input texts to be split into chunks. 101 | 102 | Returns: 103 | List[List[str]]: A list of lists, where each inner list contains 104 | the chunks for one input text. 105 | """ 106 | return [self.split_text(text) for text in texts] 107 | 108 | 109 | if __name__ == '__main__': 110 | 111 | # 示例的 Markdown 文本(从 HTML 转换而来) 112 | markdown_document = """ 113 | # Intro 114 | 115 | ## History 116 | Markdown[9] is a lightweight markup language for creating formatted text using a plain-text editor. John Gruber created Markdown in 2004 as a markup language that is appealing to human readers in its source code form.[9] 117 | 118 | Markdown is widely used in blogging, instant messaging, online forums, collaborative software, documentation pages, and readme files. 119 | 120 | ## Rise and divergence 121 | As Markdown popularity grew rapidly, many Markdown implementations appeared, driven mostly by the need for additional features such as tables, footnotes, definition lists,[note 1] and Markdown inside HTML blocks. 
122 | 123 | ### Standardization 124 | From 2012, a group of people, including Jeff Atwood and John MacFarlane, launched what Atwood characterised as a standardisation effort. 125 | 126 | ## Implementations 127 | Implementations of Markdown are available for over a dozen programming languages. 128 | """ 129 | 130 | # 使用 MarkdownSplitter 类 131 | splitter = MarkdownSplitter(markdown_document) 132 | final_splits = splitter.split() 133 | 134 | # 打印最终的分割结果 135 | print("\nFinal Text Splits:") 136 | for idx, split in enumerate(final_splits): 137 | print(f"Chunk {idx + 1}:\n{split}\n") 138 | --------------------------------------------------------------------------------