├── we_chat_project
├── QR.png
├── __pycache__
│ └── models.cpython-39.pyc
├── requirements.txt
├── models.py
├── bot.py
└── templates
│ └── index.html
├── LLM联网搜索插件
├── run_server.py
├── __pycache__
│ ├── app.cpython-39.pyc
│ ├── test_utils.cpython-39.pyc
│ ├── search_engine.cpython-39.pyc
│ ├── llm_client_example.cpython-39.pyc
│ └── response_processor.cpython-39.pyc
├── requirements.txt
├── interactive_test.py
├── README.md
├── response_processor.py
├── llm_client_example.py
├── test_utils.py
├── templates
│ ├── llm.html
│ └── config.html
└── search_engine.py
├── deepseek_r1_deploy
├── requirements.txt
├── web_ui.py
└── run_model.py
├── Document_upload_assistant
├── requirements.txt
├── file_analyzer_ui.py
└── file_processor.py
└── README.md
/we_chat_project/QR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1692775560/deepseek_project/HEAD/we_chat_project/QR.png
--------------------------------------------------------------------------------
/LLM联网搜索插件/run_server.py:
--------------------------------------------------------------------------------
import os

from app import app

if __name__ == "__main__":
    # The README documents PORT and DEBUG as configurable via the environment
    # (.env file); previously both were hard-coded here, so the documented
    # override silently had no effect.
    port = int(os.environ.get("PORT", "5005"))
    debug = os.environ.get("DEBUG", "true").strip().lower() in ("1", "true", "yes")
    app.run(host='0.0.0.0', port=port, debug=debug)
--------------------------------------------------------------------------------
/LLM联网搜索插件/__pycache__/app.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1692775560/deepseek_project/HEAD/LLM联网搜索插件/__pycache__/app.cpython-39.pyc
--------------------------------------------------------------------------------
/LLM联网搜索插件/__pycache__/test_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1692775560/deepseek_project/HEAD/LLM联网搜索插件/__pycache__/test_utils.cpython-39.pyc
--------------------------------------------------------------------------------
/LLM联网搜索插件/__pycache__/search_engine.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1692775560/deepseek_project/HEAD/LLM联网搜索插件/__pycache__/search_engine.cpython-39.pyc
--------------------------------------------------------------------------------
/we_chat_project/__pycache__/models.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1692775560/deepseek_project/HEAD/we_chat_project/__pycache__/models.cpython-39.pyc
--------------------------------------------------------------------------------
/LLM联网搜索插件/__pycache__/llm_client_example.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1692775560/deepseek_project/HEAD/LLM联网搜索插件/__pycache__/llm_client_example.cpython-39.pyc
--------------------------------------------------------------------------------
/LLM联网搜索插件/__pycache__/response_processor.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/1692775560/deepseek_project/HEAD/LLM联网搜索插件/__pycache__/response_processor.cpython-39.pyc
--------------------------------------------------------------------------------
/we_chat_project/requirements.txt:
--------------------------------------------------------------------------------
1 | wechaty==0.8.17
2 | wechaty-puppet-service==0.8.1
3 | pyee==8.2.2
4 | flask
5 | flask-cors
6 | sqlalchemy
7 | requests
8 | itchat-uos==1.5.0.dev0
--------------------------------------------------------------------------------
/deepseek_r1_deploy/requirements.txt:
--------------------------------------------------------------------------------
1 | torch>=2.0.0
2 | transformers>=4.37.0
3 | modelscope>=1.9.5
4 | gradio>=4.14.0
5 | accelerate>=0.25.0
6 | sentencepiece>=0.1.99
7 | protobuf>=4.25.1
--------------------------------------------------------------------------------
/Document_upload_assistant/requirements.txt:
--------------------------------------------------------------------------------
1 | gradio>=4.0.0
2 | httpx>=0.24.0
3 | tenacity>=8.2.0
4 | PyMuPDF>=1.22.0
5 | python-docx>=0.8.11
6 | pandas>=2.0.0
7 | openpyxl>=3.1.0
8 | baidu-aip>=4.16.0
--------------------------------------------------------------------------------
/LLM联网搜索插件/requirements.txt:
--------------------------------------------------------------------------------
1 | # 基础依赖
2 | requests==2.31.0
3 | flask==2.3.3
4 | beautifulsoup4==4.12.2
5 | lxml==4.9.3
6 | python-dotenv==1.0.0
7 |
8 | # 命令行参数解析
9 | argparse>=1.4.0
10 |
11 | # 搜索引擎比较工具依赖
12 | tabulate>=0.9.0
13 | colorama>=0.4.6
14 |
15 | # 可选依赖 - 取消注释以启用特定功能
16 | # llama-cpp-python>=0.2.0 # 如果使用llama.cpp本地模型
17 |
--------------------------------------------------------------------------------
/we_chat_project/models.py:
--------------------------------------------------------------------------------
from datetime import datetime
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

# Declarative base class for all ORM models in this module.
Base = declarative_base()

# SQLite database stored next to the application.
engine = create_engine('sqlite:///chat_history.db')

# Session factory bound to the engine; call Session() for a new session.
Session = sessionmaker(bind=engine)


class ChatMessage(Base):
    """One chat exchange: an incoming WeChat message and the bot's reply."""

    __tablename__ = 'chat_messages'

    id = Column(Integer, primary_key=True)
    sender_id = Column(String(100))    # sender's WeChat ID
    sender_name = Column(String(100))  # sender's nickname
    message = Column(Text)             # message text received
    reply = Column(Text)               # bot's reply text
    created_at = Column(DateTime, default=datetime.now)

    def __repr__(self) -> str:
        # Debug-friendly representation; omits the potentially long text fields.
        return (f"ChatMessage(id={self.id!r}, sender_id={self.sender_id!r}, "
                f"created_at={self.created_at!r})")


# Create the tables at import time so the database is ready before the bot starts.
Base.metadata.create_all(engine)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DeepSeek Project
3 |
4 | 
5 |
6 | ## 安装指南
7 |
8 | ```bash
9 | # 第一步:安装依赖
10 | pip install -r requirements.txt
11 | ```
12 |
13 | ## 项目概览
14 |
15 | ### 🤖 WeChat Assistant Project
16 |
**项目名称**: `we_chat_project`
18 | **项目描述**:
19 | 通过对接DeepSeek API与微信接口实现的智能聊天机器人,支持自动化消息响应。
20 | *A WeChat chatbot integrated with DeepSeek's API for automated message replies.*
21 |
22 | **核心功能**:
23 | ✅ 微信消息实时监听
24 | ✅ DeepSeek多轮对话接口调用
25 | ✅ 上下文敏感型回复生成
26 | ✅ 异常流量熔断机制
27 |
28 | ---
29 |
30 | ### 📁 Document Upload Assistant
31 |
32 | **项目名称**: `Document_upload_assistant`
33 | **背景说明**:
34 | 为解决DeepSeek平台未开放文件上传API的技术限制,开发的本地化文件处理解决方案。
35 | *Localized file processing solution addressing DeepSeek's lack of file upload API.*
36 |
37 | ---
38 |
39 | ### 🚀 Deepseek_r1_deploy
40 |
41 | **项目名称**: `deepseek_r1_deploy`
42 | **项目描述**:
43 | 快速使用魔搭社区部署deepseek蒸馏模型,服务器本地都可以运行,包含前端界面
44 | *Quickly deploy Deepseek distillation model using the ModelScope community, which can run locally on the server and includes a front-end interface.*
45 |
46 | ---
47 |
48 | ### 联网搜索插件
49 | ## 📖 简介 (Introduction)
50 |
51 | 这个项目是一个为本地部署的大语言模型(LLM)提供联网搜索功能的插件。由于本地部署的大模型通常无法直接联网搜索,这个插件可以帮助模型获取最新的互联网信息,从而提供更准确和及时的回答。
52 |
53 | This project is a plugin that provides web search capabilities for locally deployed Large Language Models (LLMs). Since locally deployed LLMs typically cannot directly search the internet, this plugin helps models obtain the latest internet information, enabling more accurate and timely responses.
54 |
55 | ## ✨ 功能特点 (Features)
56 |
57 | - 🔍 支持多种搜索引擎(目前支持 Google、Bing 和百度)
58 | - 📝 可以获取搜索结果摘要
59 | - 📄 可以抓取网页详细内容
60 | - 🤖 自动格式化搜索结果为适合 LLM 处理的提示词
61 | - 🔌 提供简单的 API 接口,易于与各种 LLM 集成
62 | - 📚 包含示例客户端代码,展示如何与本地 LLM 集成
63 | - 🇨🇳 针对中文搜索优化,特别是使用百度搜索引擎
64 | - ⏰ 支持获取实时时间信息
65 | - 🛠️ 提供可配置的 Web 界面,方便调整各项参数
66 |
67 | ## 项目演进
68 |
69 | [](https://star-history.com/#1692775560/deepseek_project&Timeline)
70 |
71 | ```
72 |
--------------------------------------------------------------------------------
/deepseek_r1_deploy/web_ui.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from run_model import generate_response
3 | import logging
4 |
5 | # 设置日志
6 | logging.basicConfig(level=logging.INFO)
7 | logger = logging.getLogger(__name__)
8 |
def chat(message, history):
    """Handle one chat turn.

    Rebuilds the conversation context from *history* (a list of
    (user, assistant) pairs), queries the model, appends the new pair to
    *history* in place and returns it. On failure, returns a fresh list
    with an error reply appended instead.
    """
    try:
        turns = [f"Human: {user}\nAssistant: {assistant}\n" for user, assistant in history]
        context = "让我们进行一次友好的对话。\n\n" + "".join(turns) + f"Human: {message}\n"

        reply = generate_response(context)

        # Tuple-pair format expected by gr.Chatbot; mutate and return the same list.
        history.append((message, reply))
        return history

    except Exception as e:
        logger.error(f"生成回复时发生错误: {str(e)}")
        return history + [(message, f"抱歉,发生了错误: {str(e)}")]
26 |
def create_ui():
    """Assemble and return the Gradio Blocks app for the chat UI.

    Statement order below is significant: Gradio lays components out in the
    order they are created inside each context manager.
    """
    with gr.Blocks(title="DeepSeek Chat", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""# DeepSeek Chat\n欢迎使用 DeepSeek Chat 聊天机器人!""")

        chatbot = gr.Chatbot(
            height=600,
            show_copy_button=True,
            bubble_full_width=False,
            avatar_images=("🧑", "🤖")  # user / assistant avatars
        )

        with gr.Row():
            msg = gr.Textbox(
                placeholder="在这里输入您的问题...",
                show_label=False,
                container=False,
                scale=8
            )
            submit = gr.Button("发送", variant="primary", scale=1, min_width=100)

        with gr.Row():
            clear = gr.Button("清空对话", variant="secondary")

        # Pressing Enter in the textbox sends the message, then clears the box.
        submit_click = msg.submit(
            chat,
            inputs=[msg, chatbot],
            outputs=chatbot,
            show_progress=True
        ).then(
            lambda: "",
            None,
            msg,
            show_progress=False,
        )

        # The send button mirrors the Enter-key behavior.
        submit_event = submit.click(
            chat,
            inputs=[msg, chatbot],
            outputs=chatbot,
            show_progress=True
        ).then(
            lambda: "",
            None,
            msg,
            show_progress=False,
        )

        # Reset the chat history to an empty list.
        clear.click(lambda: [], None, chatbot, queue=False)

        # Canned example prompts shown under the input box.
        gr.Examples(
            examples=[
                "你好,请介绍一下你自己",
                "请帮我写一个Python的Hello World程序",
                "解释一下什么是人工智能",
            ],
            inputs=msg,
        )

    return demo
88 |
if __name__ == "__main__":
    # Build the UI and serve it; queue() enables Gradio's request queue so
    # concurrent generations are serialized.
    demo = create_ui()
    demo.queue()
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        server_port=7860,
        share=False,  # public share link disabled to avoid frpc-related errors
        inbrowser=True
    )
--------------------------------------------------------------------------------
/deepseek_r1_deploy/run_model.py:
--------------------------------------------------------------------------------
1 | from modelscope import snapshot_download, AutoModelForCausalLM, AutoTokenizer
2 | import torch
3 | import logging
4 |
5 | # 设置日志
6 | logging.basicConfig(level=logging.INFO)
7 | logger = logging.getLogger(__name__)
8 |
9 | # 全局变量存储模型和分词器
10 | model = None
11 | tokenizer = None
12 |
def initialize_model():
    """Load the tokenizer and model into the module-level globals (idempotent).

    Downloads the DeepSeek-R1 distilled Qwen-1.5B checkpoint from the
    ModelScope hub on first use; later calls are no-ops because the globals
    are already populated.

    Raises:
        Exception: re-raises whatever download/loading error occurred.
    """
    global model, tokenizer

    if model is None or tokenizer is None:
        try:
            # Logged for diagnostics only; actual placement is done by
            # device_map="auto" below.
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logger.info(f"使用设备: {device}")

            # Download the checkpoint from the ModelScope community hub.
            logger.info("开始下载模型...")
            model_id = "deepseek-ai/deepseek-r1-distill-qwen-1.5b"
            model_dir = snapshot_download(model_id)

            logger.info("加载分词器...")
            tokenizer = AutoTokenizer.from_pretrained(
                model_dir,
                trust_remote_code=True,
                use_fast=False
            )

            logger.info("加载模型...")
            model = AutoModelForCausalLM.from_pretrained(
                model_dir,
                trust_remote_code=True,
                torch_dtype=torch.float16,  # half precision to reduce memory
                device_map="auto",          # let accelerate place layers
                low_cpu_mem_usage=True
            )
            logger.info("模型加载完成!")

        except Exception as e:
            logger.error(f"初始化模型时发生错误: {str(e)}")
            raise e
50 |
def generate_response(prompt):
    """Generate a reply for *prompt* using the lazily-initialized global model.

    Returns the decoded continuation text; on failure, returns a Chinese
    error-message string (callers display whatever string comes back).
    """
    global model, tokenizer

    try:
        # Lazily load the model/tokenizer on first call.
        if model is None or tokenizer is None:
            initialize_model()

        # Prepend the system prompt.
        system_prompt = "你是一个有用的AI助手。请用简洁、专业的方式回答问题。"
        full_prompt = f"{system_prompt}\n\n{prompt}\nAssistant: "

        inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            # FIX: max_new_tokens bounds only the generated continuation.
            # The previous max_length=2048 counted the prompt as well, so a
            # long conversation context could leave no room to generate.
            max_new_tokens=1024,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # Decode only the newly generated tokens (skip the prompt portion).
        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        return response.strip()

    except Exception as e:
        logger.error(f"生成回复时发生错误: {str(e)}")
        return f"抱歉,发生了错误: {str(e)}"
84 |
if __name__ == "__main__":
    # FIX: this tail of the file was a merge artifact — it contained a second,
    # duplicated copy of the model-loading code, a shadowing redefinition of
    # generate_response(), and a second __main__ guard. Collapsed into a single
    # guard that smoke-tests the canonical generate_response() (which lazily
    # loads the model via initialize_model) and then runs an interactive loop.
    test_prompt = "你好,请介绍一下你自己"
    response = generate_response(test_prompt)
    print(f"测试回复: {response}")

    logger.info("开始运行交互式对话...")
    while True:
        try:
            user_input = input("\n请输入您的问题 (输入 'quit' 退出): ")
            if user_input.lower() == 'quit':
                break

            response = generate_response(user_input)
            print(f"\n模型回复: {response}")

        except KeyboardInterrupt:
            logger.info("用户中断程序")
            break
        except Exception as e:
            logger.error(f"发生未预期的错误: {str(e)}")
--------------------------------------------------------------------------------
/Document_upload_assistant/file_analyzer_ui.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | from file_processor import FileProcessor
3 | import os
4 |
class FileAnalyzerUI:
    """Gradio front-end for FileProcessor.

    Lets the user upload a document or image, optionally supply an analysis
    prompt, and view both the extracted text and the AI analysis result.
    """

    def __init__(self):
        self.processor = FileProcessor()
        # Extensions the UI accepts; must stay in sync with FileProcessor's
        # extension dispatch in process_file().
        self.supported_formats = [
            ".pdf", ".docx", ".xlsx", ".xls",
            ".png", ".jpg", ".jpeg", ".bmp"
        ]

    def process_file(
        self,
        file_obj,
        prompt: str,
        progress: gr.Progress = None
    ) -> tuple[str, str]:
        """Validate the upload and return (extracted_text, analysis_result).

        On any failure the first element carries a Chinese error message and
        the second is empty, so the UI always has something to display.
        """
        try:
            if file_obj is None:
                return "请先上传文件", ""

            # Dispatch on the lower-cased file extension of the upload's path.
            file_extension = os.path.splitext(file_obj.name)[1].lower()

            # Reject formats FileProcessor cannot handle.
            if file_extension not in self.supported_formats:
                return f"不支持的文件格式: {file_extension}", ""

            if progress:
                progress(0.3, desc="正在提取文本...")
            # Extract text and run the DeepSeek analysis in one step.
            extracted_text, result = self.processor.process_and_analyze(file_obj, prompt)

            if progress:
                progress(1.0, desc="处理完成")
            return extracted_text, result

        except Exception as e:
            return f"处理过程中出现错误: {str(e)}", ""

    def create_ui(self):
        """Assemble and return the Gradio Blocks app.

        Component creation order below is significant: Gradio lays components
        out in the order they are created inside each context manager.
        """
        # Custom CSS for the page layout.
        css = """
        .container {max-width: 900px; margin: auto; padding: 20px;}
        .header {
            text-align: center;
            padding: 20px;
            margin-bottom: 30px;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            border-radius: 15px;
            color: white;
        }
        .file-upload {
            border: 2px dashed #ddd;
            padding: 20px;
            border-radius: 10px;
            background: #f8f9fa;
        }
        .output-box {
            border: 1px solid #eee;
            border-radius: 10px;
            padding: 15px;
            margin-top: 20px;
            background: white;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
        }
        """

        # Build the Gradio interface.
        with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
            with gr.Column(elem_classes="container"):
                # Title banner.
                with gr.Column(elem_classes="header"):
                    gr.Markdown("""
                    # 📄 文件分析助手
                    支持PDF、Word、Excel和图片文件的智能分析
                    """)

                # File-upload area.
                with gr.Column(elem_classes="file-upload"):
                    file_input = gr.File(
                        label="上传文件",
                        file_types=self.supported_formats
                    )

                    prompt_input = gr.Textbox(
                        label="分析提示(可选)",
                        placeholder="请输入您想要分析的具体方向,例如:'请总结文档的主要观点'",
                        lines=2
                    )

                    analyze_btn = gr.Button(
                        "开始分析",
                        variant="primary"
                    )

                # Output area: extracted text and analysis on separate tabs.
                with gr.Column(elem_classes="output-box"):
                    with gr.Tab("提取文本"):
                        text_output = gr.Textbox(
                            label="提取的文本内容",
                            lines=10,
                            interactive=False
                        )

                    with gr.Tab("分析结果"):
                        result_output = gr.Textbox(
                            label="AI分析结果",
                            lines=10,
                            interactive=False
                        )

                # Collapsible usage instructions.
                with gr.Accordion("使用说明", open=False):
                    gr.Markdown("""
                    ### 支持的文件格式:
                    - PDF文件 (.pdf)
                    - Word文档 (.docx)
                    - Excel表格 (.xlsx, .xls)
                    - 图片文件 (.png, .jpg, .jpeg, .bmp)

                    ### 使用步骤:
                    1. 上传需要分析的文件
                    2. 可选:输入具体的分析提示
                    3. 点击"开始分析"按钮
                    4. 等待处理完成

                    ### 注意事项:
                    - 文件大小限制:50MB
                    - 处理时间可能因文件大小而异
                    - 图片文件将使用OCR技术识别文字
                    """)

                # Wire the analyze button to the handler.
                analyze_btn.click(
                    fn=self.process_file,
                    inputs=[file_input, prompt_input],
                    outputs=[text_output, result_output],
                    show_progress=True
                )

        return demo
145 |
def main():
    """Build the analyzer UI and serve it."""
    demo = FileAnalyzerUI().create_ui()
    demo.launch(
        server_port=7861,  # fixed port for the analyzer UI
        share=True,        # expose a public share link
        inbrowser=True,    # open the browser automatically
        debug=True,        # verbose errors during development
    )


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/Document_upload_assistant/file_processor.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from aip import AipOcr
3 | import os
4 | import json
5 | from typing import Optional, Dict, Any, List
6 | import fitz # PyMuPDF
7 | from docx import Document
8 | import pandas as pd
9 | import httpx
10 | import time
11 | from tenacity import retry, stop_after_attempt, wait_exponential
12 |
class FileProcessor:
    """Extract text from PDF/Word/Excel/image files and analyze it with DeepSeek.

    Extraction is dispatched on the file extension; images go through Baidu
    OCR. `process_and_analyze` is the high-level entry point used by the UI.
    """

    def __init__(self):
        # Baidu OCR credentials (placeholders — supply your own).
        self.APP_ID = '配置你自己的'
        self.API_KEY = "配置你自己的"
        self.SECRET_KEY = "配置你自己的"
        # BUG FIX: the OCR client used to be stored as self.client and was then
        # clobbered by the httpx client below, so ocr_image()'s self.client_ocr
        # lookup raised AttributeError. Bind it to the name ocr_image() uses.
        self.client_ocr = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)

        # DeepSeek API configuration (placeholder key — supply your own).
        self.DEEPSEEK_API_KEY = "your api key"
        self.DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

        # Reusable HTTP client for DeepSeek calls; closed in __del__.
        self.client = httpx.Client(timeout=30.0)

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Return all page text of a PDF, or a Chinese error message."""
        try:
            text = ""
            with fitz.open(file_path) as pdf:
                for page in pdf:
                    text += page.get_text()
            return text
        except Exception as e:
            return f"PDF处理错误: {str(e)}"

    def extract_text_from_docx(self, file_path: str) -> str:
        """Return the paragraph text of a Word document, one paragraph per line."""
        try:
            doc = Document(file_path)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            return f"Word文档处理错误: {str(e)}"

    def extract_text_from_excel(self, file_path: str) -> str:
        """Return the first sheet of an Excel workbook rendered as plain text."""
        try:
            df = pd.read_excel(file_path)
            return df.to_string()
        except Exception as e:
            return f"Excel处理错误: {str(e)}"

    def ocr_image(self, image_path: str) -> str:
        """Run Baidu general OCR on an image and return the recognized lines."""
        try:
            with open(image_path, 'rb') as fp:
                image = fp.read()

            options = {
                "language_type": "CHN_ENG",
                "detect_direction": "true",
                "detect_language": "true",
                "probability": "true"
            }

            result = self.client_ocr.basicGeneral(image, options)

            if 'words_result' in result:
                return "\n".join(item['words'] for item in result['words_result'])
            return ""
        except Exception as e:
            return f"图片OCR处理错误: {str(e)}"

    def process_file(self, file_path: str) -> str:
        """Dispatch on the file extension and return the extracted text.

        Returns a Chinese error message string when the file is missing, the
        format is unsupported, or extraction fails.
        """
        try:
            file_extension = os.path.splitext(file_path)[1].lower()

            # Bail out early for paths that do not exist.
            if not os.path.exists(file_path):
                return "文件不存在"

            if file_extension == '.pdf':
                return self.extract_text_from_pdf(file_path)
            elif file_extension == '.docx':
                return self.extract_text_from_docx(file_path)
            elif file_extension in ['.xlsx', '.xls']:
                return self.extract_text_from_excel(file_path)
            elif file_extension in ['.png', '.jpg', '.jpeg', '.bmp']:
                return self.ocr_image(file_path)
            else:
                return "不支持的文件格式"
        except Exception as e:
            return f"文件处理错误: {str(e)}"

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10)
    )
    def call_deepseek_api(self, text: str, prompt: str = "") -> str:
        """Send *text* (optionally prefixed by *prompt*) to the DeepSeek chat API.

        Decorated to retry up to 3 times with exponential backoff on raised
        errors; returns the model reply or a human-readable Chinese error string.
        """
        try:
            # Round-trip through UTF-8 to surface encoding problems early.
            if prompt:
                message = f"{prompt}\n\n{text}".encode('utf-8').decode('utf-8')
            else:
                message = text.encode('utf-8').decode('utf-8')

            headers = {
                "Authorization": f"Bearer {self.DEEPSEEK_API_KEY}",
                "Content-Type": "application/json; charset=utf-8"
            }

            data = {
                "model": "deepseek-chat",
                "messages": [{"role": "user", "content": message}],
                "temperature": 0.7,
                "max_tokens": 800
            }

            # httpx serializes `json=` itself and handles non-ASCII correctly.
            response = self.client.post(
                self.DEEPSEEK_API_URL,
                headers=headers,
                json=data,
                timeout=30.0
            )
            response.raise_for_status()

            result = response.json()
            if "choices" in result and len(result["choices"]) > 0:
                return result["choices"][0]["message"]["content"]
            return "API返回结果格式异常"

        except httpx.HTTPStatusError as e:
            if e.response.status_code == 401:
                return "API密钥无效或已过期"
            elif e.response.status_code == 429:
                return "API调用次数超限"
            else:
                return f"API调用失败: HTTP {e.response.status_code}"

        except httpx.RequestError as e:
            return f"网络请求错误: {str(e)}"

        except Exception as e:
            return f"API调用错误: {str(e)}"

    def process_and_analyze(self, file_obj, prompt: str = "") -> tuple[str, str]:
        """Extract text from an upload and analyze it; returns (text, analysis).

        GENERALIZED: accepts either a Gradio upload object (with a `.name`
        path attribute) or a plain path string — main() passes a str, which
        previously failed on the `.name` lookup.
        """
        try:
            if file_obj is None:
                return "请上传文件", ""

            # Tolerate plain string paths as well as upload objects.
            file_path = file_obj if isinstance(file_obj, str) else file_obj.name

            extracted_text = self.process_file(file_path)

            if not extracted_text or extracted_text.startswith("文件处理错误"):
                return extracted_text, ""

            analysis_result = self.call_deepseek_api(extracted_text, prompt)

            return extracted_text, analysis_result

        except Exception as e:
            return f"处理错误: {str(e)}", ""

    def __del__(self):
        """Best-effort close of the shared HTTP client."""
        if hasattr(self, 'client'):
            self.client.close()
181 |
def main():
    """Minimal command-line example of driving FileProcessor."""
    processor = FileProcessor()

    # Example inputs — replace with a real file and your own prompt.
    file_path = "path/to/your/file.pdf"
    prompt = "请分析这份文档的主要观点"

    result = processor.process_and_analyze(file_path, prompt)
    print(result)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/LLM联网搜索插件/interactive_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | LLM联网搜索插件交互式测试工具
6 | """
7 |
8 | from test_utils import test_search_engine, test_llm_response, compare_search_engines
9 | import os
10 | import sys
11 |
def clear_screen():
    """Clear the terminal (Windows uses `cls`, everything else `clear`)."""
    command = 'cls' if os.name == 'nt' else 'clear'
    os.system(command)
15 |
def print_header():
    """Print the tool's banner."""
    bar = "=" * 50
    print("\n" + bar)
    print(" LLM联网搜索插件交互式测试工具")
    print(bar)
21 |
def interactive_menu():
    """Show the menu once and dispatch on the user's choice.

    Returns False when the user chooses to quit, True when the caller
    should show the menu again.
    """
    clear_screen()
    print_header()
    print("\n请选择功能:")
    print("1. 测试搜索引擎 - 测试不同搜索引擎的搜索结果")
    print("2. 测试LLM响应 - 测试LLM对搜索结果的处理能力")
    print("3. 比较搜索引擎 - 比较不同搜索引擎的结果差异")
    print("0. 退出程序")

    # Map menu choices to their handlers.
    actions = {
        "1": test_search_engine_interactive,
        "2": test_llm_response_interactive,
        "3": compare_search_engines_interactive,
    }

    try:
        choice = input("\n请输入选项 (0-3): ")

        if choice == "0":
            print("\n谢谢使用,再见!")
            return False

        handler = actions.get(choice)
        if handler is not None:
            handler()
        else:
            print("\n无效选择,请输入0-3之间的数字")
            input("\n按Enter键继续...")
    except KeyboardInterrupt:
        print("\n\n操作已取消")
        input("\n按Enter键继续...")
    except Exception as e:
        print(f"\n发生错误: {str(e)}")
        input("\n按Enter键继续...")

    return True
55 |
def test_search_engine_interactive():
    """Interactively collect parameters and run the search-engine test."""
    clear_screen()
    print_header()
    print("\n=== 测试搜索引擎 ===")

    try:
        query = input("\n请输入搜索查询 (默认: 量子计算最新进展): ").strip() or "量子计算最新进展"

        print("\n可选搜索引擎:")
        print("1. baidu - 百度搜索")
        print("2. google - 谷歌搜索")
        print("3. bing - 必应搜索")
        engine_choice = input("\n请选择搜索引擎 (1-3, 默认: 1): ").strip() or "1"

        # Unrecognized choices silently fall back to baidu.
        search_engine = {"1": "baidu", "2": "google", "3": "bing"}.get(engine_choice, "baidu")

        raw_count = input("\n请输入结果数量 (默认: 3): ").strip()
        num_results = int(raw_count) if raw_count.isdigit() else 3

        fetch_content = input("\n是否获取详细内容 (y/n, 默认: y): ").lower() != "n"
        verbose = input("\n是否显示详细信息 (y/n, 默认: n): ").lower() == "y"

        print("\n开始测试搜索引擎...")
        test_search_engine(
            query=query,
            search_engine=search_engine,
            num_results=num_results,
            fetch_content=fetch_content,
            verbose=verbose,
        )
    except KeyboardInterrupt:
        print("\n\n操作已取消")
    except Exception as e:
        print(f"\n测试搜索引擎时出错: {str(e)}")
92 |
def test_llm_response_interactive():
    """Interactively collect parameters and test the LLM's handling of search results.

    Every prompt falls back to a sensible default when the user just presses
    Enter; non-numeric numeric answers also fall back to their defaults.
    """
    clear_screen()
    print_header()
    print("\n=== 测试LLM响应 ===")

    try:
        query = input("\n请输入搜索查询 (默认: 量子计算最新进展): ").strip() or "量子计算最新进展"

        print("\n可选搜索引擎:")
        print("1. baidu - 百度搜索")
        print("2. google - 谷歌搜索")
        print("3. bing - 必应搜索")
        engine_choice = input("\n请选择搜索引擎 (1-3, 默认: 1): ").strip() or "1"

        # Unrecognized choices silently fall back to baidu.
        search_engine_map = {"1": "baidu", "2": "google", "3": "bing"}
        search_engine = search_engine_map.get(engine_choice, "baidu")

        num_results = input("\n请输入结果数量 (默认: 3): ").strip()
        num_results = int(num_results) if num_results.isdigit() else 3

        use_mock_data = input("\n是否使用模拟数据 (y/n, 默认: n): ").lower() == "y"

        print("\n可选LLM模型:")
        print("1. deepseek-r1:1.5b - 轻量级模型")
        print("2. llama3:8b - 中等大小模型")
        print("3. qwen:14b - 较大模型")
        print("4. 自定义模型")
        model_choice = input("\n请选择LLM模型 (1-4, 默认: 1): ").strip() or "1"

        model_map = {
            "1": "deepseek-r1:1.5b",
            "2": "llama3:8b",
            "3": "qwen:14b"
        }

        if model_choice in model_map:
            model = model_map[model_choice]
        else:
            # Any answer other than 1/2/3 (including "4") means a custom model name.
            model = input("\n请输入自定义模型名称: ").strip() or "deepseek-r1:1.5b"

        api_url = input("\n请输入API URL (默认: http://localhost:5003): ").strip() or "http://localhost:5003"

        # NOTE(review): float() raises ValueError on bad input; that is caught by
        # the generic handler below rather than falling back to 0.7 — confirm intended.
        temperature = input("\n请输入温度参数 (0.0-1.0, 默认: 0.7): ").strip() or "0.7"
        temperature = float(temperature)

        max_tokens = input("\n请输入最大生成长度 (默认: 2048): ").strip()
        max_tokens = int(max_tokens) if max_tokens.isdigit() else 2048

        verbose = input("\n是否显示详细信息 (y/n, 默认: n): ").lower() == "y"

        print("\n开始测试LLM响应...")
        test_llm_response(
            query=query,
            search_engine=search_engine,
            num_results=num_results,
            use_mock_data=use_mock_data,
            model=model,
            api_url=api_url,
            temperature=temperature,
            max_tokens=max_tokens,
            verbose=verbose
        )
    except KeyboardInterrupt:
        print("\n\n操作已取消")
    except Exception as e:
        print(f"\n测试LLM响应时出错: {str(e)}")
160 |
def compare_search_engines_interactive():
    """Interactively compare result quality across several search engines."""
    clear_screen()
    print_header()
    print("\n=== 比较搜索引擎 ===")

    try:
        query = input("\n请输入搜索查询 (默认: 量子计算最新进展): ").strip() or "量子计算最新进展"

        print("\n可选搜索引擎:")
        print("1. google - 谷歌搜索")
        print("2. bing - 必应搜索")
        print("3. baidu - 百度搜索")

        selection = input("\n请选择要比较的搜索引擎编号,用空格分隔 (默认: 1 2 3): ").strip() or "1 2 3"

        # Map each chosen number to its engine; unknown tokens fall back to baidu.
        engine_map = {"1": "google", "2": "bing", "3": "baidu"}
        engines = [engine_map.get(token, "baidu") for token in selection.split()]

        raw_count = input("\n请输入每个引擎的结果数量 (默认: 3): ").strip()
        num_results = int(raw_count) if raw_count.isdigit() else 3

        print("\n开始比较搜索引擎...")
        compare_search_engines(
            query=query,
            engines=engines,
            num_results=num_results,
        )
    except KeyboardInterrupt:
        print("\n\n操作已取消")
    except Exception as e:
        print(f"\n比较搜索引擎时出错: {str(e)}")
194 |
if __name__ == "__main__":
    try:
        print("欢迎使用LLM联网搜索插件交互式测试工具!")

        # Keep showing the menu until it reports the user quit.
        while interactive_menu():
            pass
    except KeyboardInterrupt:
        print("\n\n程序被用户中断")
        sys.exit(0)
    except Exception as e:
        print(f"\n程序发生错误: {str(e)}")
        sys.exit(1)
--------------------------------------------------------------------------------
/LLM联网搜索插件/README.md:
--------------------------------------------------------------------------------
1 | # LLM 联网搜索插件 (LLM Web Search Plugin)
2 |
3 |
8 |
9 | ## 📖 简介 (Introduction)
10 |
11 | 这个项目是一个为本地部署的大语言模型(LLM)提供联网搜索功能的插件。由于本地部署的大模型通常无法直接联网搜索,这个插件可以帮助模型获取最新的互联网信息,从而提供更准确和及时的回答。
12 |
13 | This project is a plugin that provides web search capabilities for locally deployed Large Language Models (LLMs). Since locally deployed LLMs typically cannot directly search the internet, this plugin helps models obtain the latest internet information, enabling more accurate and timely responses.
14 |
15 | ## ✨ 功能特点 (Features)
16 |
17 | - 🔍 支持多种搜索引擎(目前支持 Google、Bing 和百度)
18 | - 📝 可以获取搜索结果摘要
19 | - 📄 可以抓取网页详细内容
20 | - 🤖 自动格式化搜索结果为适合 LLM 处理的提示词
21 | - 🔌 提供简单的 API 接口,易于与各种 LLM 集成
22 | - 📚 包含示例客户端代码,展示如何与本地 LLM 集成
23 | - 🇨🇳 针对中文搜索优化,特别是使用百度搜索引擎
24 | - ⏰ 支持获取实时时间信息
25 | - 🛠️ 提供可配置的 Web 界面,方便调整各项参数
26 |
27 | ## 🔧 系统要求 (System Requirements)
28 |
29 | - Python 3.7+
30 | - 网络连接
31 | - 本地部署的 LLM(推荐使用 Ollama、llama.cpp 等)
32 |
33 | ## 📦 安装步骤 (Installation)
34 |
35 | 1. 克隆或下载本仓库
36 |
```bash
git clone https://github.com/1692775560/deepseek_project.git
cd deepseek_project/LLM联网搜索插件
```
39 |
40 | 2. 安装依赖包
41 |
42 | ```bash
43 | pip install -r requirements.txt
44 | ```
45 |
46 | 3. 创建 `.env` 文件(可选)
47 |
48 | ```
49 | DEBUG=True
50 | PORT=5005
51 | SEARCH_ENGINE=google # 可选值: google, bing, baidu
52 | ```
53 |
54 | ## 🚀 使用方法 (Usage)
55 |
56 | ### 启动搜索 API 服务
57 |
58 | ```bash
59 | python run_server.py
60 | ```
61 |
62 | 服务将在 http://localhost:5005 启动(除非在 .env 文件中指定了其他端口)。
63 |
64 | 也可以通过环境变量指定搜索引擎:
65 |
66 | ```bash
67 | SEARCH_ENGINE=baidu python run_server.py
68 | ```
69 |
70 | ### 访问 Web 界面
71 |
72 | 启动服务后,可以通过浏览器访问以下页面:
73 |
74 | - 主页: http://localhost:5005/
75 | - LLM 交互页面: http://localhost:5005/llm
76 | - 配置页面: http://localhost:5005/config
77 |
78 | ### API 端点
79 |
80 | #### POST /search
81 |
82 | 执行网络搜索并返回格式化的结果。
83 |
84 | 请求示例:
85 |
86 | ```json
87 | {
88 | "query": "你的搜索查询",
89 | "num_results": 5,
90 | "fetch_content": false,
91 | "search_engine": "baidu",
92 | "llm_model": "deepseek-r1:1.5b",
93 | "temperature": 0.7,
94 | "max_tokens": 2048
95 | }
96 | ```
97 |
98 | 参数说明:
99 | - `query`: 搜索查询(必需)
100 | - `num_results`: 返回结果数量(可选,默认为 5)
101 | - `fetch_content`: 是否获取详细网页内容(可选,默认为 false)
102 | - `search_engine`: 使用的搜索引擎,"google"、"bing" 或 "baidu"(可选,默认为 "google")
103 | - `llm_model`: 使用的 LLM 模型(可选)
104 | - `temperature`: 生成温度(可选)
105 | - `max_tokens`: 最大生成 token 数(可选)
106 |
107 | 响应示例:
108 |
109 | ```json
110 | {
111 | "query": "你的搜索查询",
112 | "search_results": [
113 | {
114 | "title": "结果标题",
115 | "link": "https://example.com",
116 | "snippet": "结果摘要..."
117 | },
118 | ...
119 | ],
120 | "detailed_content": {
121 | "https://example.com": "网页内容..."
122 | },
123 | "formatted_response": "格式化后的提示词,可直接发送给 LLM",
124 | "llm_config": {
125 | "model": "deepseek-r1:1.5b",
126 | "temperature": 0.7,
127 | "max_tokens": 2048
128 | }
129 | }
130 | ```
131 |
132 | #### GET /current_time
133 |
134 | 获取当前时间信息。
135 |
136 | 响应示例:
137 |
138 | ```json
139 | {
140 | "time": "2025-03-11 17:00:55",
141 | "timezone": "Asia/Shanghai",
142 | "source": "system"
143 | }
144 | ```
145 |
146 | ## 🔄 与本地 LLM 集成 (Integration with Local LLMs)
147 |
148 | `llm_client_example.py` 文件提供了一个示例客户端,已经内置支持 Ollama、llama.cpp 等多种本地模型。您可以直接使用命令行运行客户端,也可以在自己的代码中导入并使用客户端类。
149 |
150 | ### 命令行运行示例
151 |
152 | ```bash
153 | # 使用默认设置(Ollama 和 llama3 模型)
154 | python llm_client_example.py
155 |
156 | # 指定不同的模型
157 | python llm_client_example.py --model-name qwen:7b
158 |
159 | # 指定不同的搜索引擎
160 | python llm_client_example.py --search-engine baidu
161 |
162 | # 指定不同的温度参数
163 | python llm_client_example.py --temperature 0.5
164 | ```
165 |
166 | ### 在自己的代码中使用示例
167 |
168 | ```python
169 | from llm_client_example import LLMWebSearchClient
170 |
171 | # 初始化客户端(默认使用 Ollama 和 llama3 模型)
172 | client = LLMWebSearchClient()
173 |
174 | # 或者指定不同的模型
175 | # client = LLMWebSearchClient(llm_type="ollama", model_name="qwen:7b")
176 |
177 | # 使用网络搜索回答问题
178 | result = client.answer_with_web_search("最新的 AI 技术进展是什么?")
179 |
180 | # 打印 LLM 的回答
181 | print(result["llm_response"])
182 | ```
183 |
184 | ## 🧪 测试工具 (Testing Tools)
185 |
186 | 项目中包含一个综合测试工具 `test_utils.py`,提供了多种测试功能:
187 |
188 | 1. 测试搜索引擎的搜索结果
189 | 2. 测试LLM对搜索结果的利用情况
190 | 3. 比较不同搜索引擎的结果差异
191 |
192 | ### 使用方法
193 |
194 | ```bash
195 | # 测试搜索引擎
196 | python test_utils.py --mode search --query "量子计算" --search-engine baidu
197 |
198 | # 测试LLM响应
199 | python test_utils.py --mode llm --query "人工智能应用" --model "qwen:7b" --temperature 0.5
200 |
201 | # 比较搜索引擎
202 | python test_utils.py --mode compare --query "深度学习框架对比"
203 | ```
204 |
205 | ### 参数说明
206 |
207 | - `--mode`: 测试模式,可选值为 "search"(测试搜索引擎)、"llm"(测试LLM响应)、"compare"(比较搜索引擎)
208 | - `--query`: 要测试的查询(如果不提供,将使用默认查询"量子计算最新进展")
209 | - `--verbose`: 显示详细信息,包括完整提示词
210 | - `--search-engine`: 使用的搜索引擎,可选值为 "google"、"bing" 或 "baidu"(默认为 "baidu")
211 | - `--engines`: 比较模式下要比较的搜索引擎列表(默认为 "google bing baidu")
212 | - `--mock`: 使用模拟搜索数据,不进行实际搜索
213 | - `--model`: 指定使用的 LLM 模型名称(默认为 "deepseek-r1:1.5b")
214 | - `--api-url`: 指定搜索 API 的 URL(默认为 "http://localhost:5005/search")
215 | - `--temperature`: LLM 生成的温度参数(默认为 0.7)
216 | - `--num-results`: 搜索结果数量(默认为 5)
217 | - `--fetch-content`: 获取详细网页内容
218 |
219 | ## 🌐 支持的 LLM 模型 (Supported LLM Models)
220 |
221 | 最新版本的客户端已经内置支持多种本地模型,包括:
222 |
223 | ### Ollama 支持的模型
224 |
225 | - llama3 (推荐)
226 | - deepseek-r1:1.5b / 7b / 671b
227 | - qwen:7b / 14b / 72b
228 | - yi:34b
229 | - gemma:7b / 2b
230 | - mistral:7b
231 | - mixtral:8x7b
232 | - ...以及其他 Ollama 支持的模型
233 |
234 | ### 如何使用 Ollama
235 |
236 | Ollama 是一个流行的本地模型部署工具,可以轻松运行各种开源大语言模型。我们的插件默认支持 Ollama。
237 |
238 | 1. 首先,确保您已经安装了 Ollama,安装指南可以在 [Ollama 官方网站](https://ollama.ai) 找到。
239 |
240 | 2. 下载您想要使用的模型,例如:
241 |
242 | ```bash
243 | ollama pull llama3
244 | # 或者其他模型,如
245 | # ollama pull qwen:7b
246 | # ollama pull gemma:7b
247 | ```
248 |
249 | 3. 使用我们的客户端连接到 Ollama:
250 |
251 | ```bash
252 | python llm_client_example.py --llm-type ollama --model-name llama3
253 | ```
254 |
255 | ## ⚙️ 配置选项 (Configuration Options)
256 |
257 | 通过访问配置页面 (http://localhost:5005/config),您可以调整以下配置选项:
258 |
259 | ### 搜索设置
260 | - 默认搜索引擎 (Google, Bing, Baidu)
261 | - 默认结果数量
262 | - 是否默认获取详细内容
263 | - 最大内容长度
264 |
265 | ### 时间获取设置
266 | - 默认时区
267 | - 时间源 URL
268 |
269 | ### LLM 模型设置
270 | - 默认 LLM 模型
271 | - 默认温度参数
272 | - 默认最大 token 数
273 |
274 | ### 高级设置
275 | - User Agent
276 | - 是否启用详细日志记录
277 |
278 | ## 🤝 贡献 (Contributing)
279 |
280 | 欢迎贡献代码、报告问题或提出改进建议!您可以通过以下方式参与:
281 |
282 | 1. Fork 本仓库
283 | 2. 创建您的特性分支 (`git checkout -b feature/amazing-feature`)
284 | 3. 提交您的更改 (`git commit -m 'Add some amazing feature'`)
285 | 4. 推送到分支 (`git push origin feature/amazing-feature`)
286 | 5. 开启一个 Pull Request
287 |
288 | ## 📄 许可证 (License)
289 |
290 | 本项目采用 MIT 许可证 - 详情请参阅 [LICENSE](LICENSE) 文件。
291 |
292 | ## 📞 联系方式 (Contact)
293 |
294 | 如有任何问题或建议,请通过以下方式联系我们:
295 |
296 | - 项目主页: [GitHub 仓库](https://github.com/yourusername/llm-web-search-plugin)
297 | - 电子邮件: 1692775560@qq.com
298 |
299 | ---
300 |
301 |
302 |
Made with ❤️ for LLM enthusiasts
303 |
304 |
--------------------------------------------------------------------------------
/we_chat_project/bot.py:
--------------------------------------------------------------------------------
1 | import itchat
2 | from itchat.content import TEXT
3 | import requests
4 | import json
5 | import logging
6 | from datetime import datetime
7 | from flask import Flask, render_template
8 | from models import Session, ChatMessage
9 | import threading
10 | import time
11 | import os
12 | import webbrowser
13 | from flask_cors import CORS
14 |
15 | # 配置日志
16 | logging.basicConfig(
17 | level=logging.INFO,
18 | format='%(asctime)s - %(levelname)s - %(message)s'
19 | )
20 | logger = logging.getLogger(__name__)
21 |
22 | # DeepSeek API 配置
23 | DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"
24 | DEEPSEEK_API_KEY = 'your api key'
25 |
26 | # 创建Flask应用
27 | app = Flask(__name__, static_folder='static')
28 | CORS(app)
29 |
30 | # 全局变量存储上下文
31 | chat_contexts = {}
32 |
def save_message(sender_id, sender_name, message, reply):
    """Persist one chat exchange (incoming message + bot reply) to the database.

    Args:
        sender_id: WeChat user identifier of the sender.
        sender_name: Display name of the sender.
        message: Text the sender wrote.
        reply: The bot's reply text.

    Errors are logged and swallowed so a DB failure never breaks message handling.
    """
    session = Session()
    try:
        chat_message = ChatMessage(
            sender_id=sender_id,
            sender_name=sender_name,
            message=message,
            reply=reply
        )
        session.add(chat_message)
        session.commit()
    except Exception as e:
        # Roll back so the session is not left mid-transaction.
        session.rollback()
        logger.error(f"保存消息失败: {str(e)}")
    finally:
        # BUG FIX: the original leaked the session when add()/commit() raised;
        # close it unconditionally.
        session.close()
48 |
def get_deepseek_response(message, user_id):
    """Call the DeepSeek chat API and return the reply text.

    Maintains a short rolling conversation context per user in the
    module-level `chat_contexts` dict so follow-up questions have history.

    Args:
        message: The user's new message text.
        user_id: Key identifying the user's conversation context.

    Returns:
        The assistant's reply, or a fixed apology string on any failure.
    """
    context = chat_contexts.setdefault(user_id, [])
    context.append({"role": "user", "content": message})

    try:
        headers = {
            "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
            "Content-Type": "application/json"
        }

        data = {
            "model": "deepseek-chat",
            "messages": [
                {"role": "system", "content": "你是一个友好的助手,请用简短的语言回答问题,每次回复不要超过50个字。"},
                *context[-5:]  # send at most the 5 most recent turns
            ]
        }

        response = requests.post(DEEPSEEK_API_URL, headers=headers, json=data, timeout=30)
        response.raise_for_status()
        reply = response.json()['choices'][0]['message']['content']

        context.append({"role": "assistant", "content": reply})
        # BUG FIX: trim AFTER appending the assistant reply; the original
        # trimmed before appending, letting the stored context reach 6 entries.
        chat_contexts[user_id] = context[-5:]

        return reply
    except Exception as e:
        # BUG FIX: drop the unanswered user turn so a failed API call does
        # not pollute future context with a dangling message.
        if context and context[-1].get("role") == "user":
            context.pop()
        logger.error(f"调用 DeepSeek API 失败: {str(e)}")
        return "抱歉,我现在无法回复,请稍后再试。"
87 |
@itchat.msg_register([TEXT])
def handle_text(msg):
    """Handle an incoming WeChat text message and return the reply text."""
    try:
        user_id = msg['FromUserName']
        text = msg['Text']

        # Resolve a human-readable nickname; fall back to the raw user id.
        friend = itchat.search_friends(userName=user_id)
        nickname = friend['NickName'] if friend else user_id

        logger.info(f"收到消息 - 发送者: {nickname}, 内容: {text}")

        # Ask the LLM, persist the exchange, then reply.
        answer = get_deepseek_response(text, user_id)
        save_message(user_id, nickname, text, answer)

        logger.info(f"回复 {nickname}: {answer}")
        return answer

    except Exception as e:
        logger.error(f"处理消息失败: {str(e)}")
        return "抱歉,我遇到了一些问题,请稍后再试。"
115 |
116 | # Flask路由
@app.route('/')
def index():
    """Render the monitoring dashboard page (templates/index.html)."""
    return render_template('index.html')
121 |
@app.route('/messages')
def get_messages():
    """Return all chat records as JSON, newest first.

    Response shape: {"messages": [{id, sender_name, message, reply,
    created_at}, ...]}. CORS headers are added globally by flask_cors,
    not here.
    """
    session = Session()
    try:
        messages = session.query(ChatMessage).order_by(ChatMessage.created_at.desc()).all()
        result = [{
            'id': msg.id,
            'sender_name': msg.sender_name,
            'message': msg.message,
            'reply': msg.reply,
            'created_at': msg.created_at.strftime('%Y-%m-%d %H:%M:%S')
        } for msg in messages]
        return {'messages': result}
    finally:
        # BUG FIX: the original leaked the session if the query raised;
        # close it unconditionally.
        session.close()
137 |
def run_flask():
    """Start the Flask monitoring server (blocking; run in a daemon thread)."""
    app.config.update(
        SECRET_KEY='your-secret-key-here',   # session signing key
        TEMPLATES_AUTO_RELOAD=True,          # pick up template edits without restart
    )
    # Local-only, no debugger, threaded request handling.
    app.run(host='127.0.0.1', port=5000, debug=False, threaded=True)
148 |
def open_dashboard():
    """Open the monitoring dashboard in the default browser."""
    time.sleep(2)  # wait for the Flask server to start
    webbrowser.open('http://127.0.0.1:5000')
153 |
def login_wechat():
    """Log in to WeChat via itchat and verify the session.

    Returns:
        True when login succeeds (and the dashboard is opened), else False.
    """
    try:
        # Start clean: drop any stale hot-reload state file.
        if os.path.exists('itchat.pkl'):
            os.remove('itchat.pkl')
            logger.info("删除旧的登录状态文件")

        itchat.auto_login(
            hotReload=False,
            enableCmdQR=-2,  # terminal QR code; on Windows use -1 instead
            statusStorageDir='itchat.pkl',
            loginCallback=lambda: logger.info("登录成功"),
            exitCallback=lambda: logger.info("微信退出")
        )

        # Give itchat a moment to finish initialising the session.
        time.sleep(3)

        # Sanity-check the session by fetching the friend list.
        friends = itchat.get_friends()
        if not friends:
            logger.error("登录验证失败")
            return False

        logger.info(f"登录验证成功,共有 {len(friends)} 个好友")
        # Login confirmed: bring up the monitoring page.
        open_dashboard()
        return True

    except Exception as e:
        logger.error(f"登录过程出错: {str(e)}")
        return False
188 |
def main():
    """Entry point: start the Flask monitor in a background thread, then run
    the WeChat bot, retrying login up to 3 times on failure."""
    try:
        # Start the dashboard server as a daemon thread so it dies with us.
        flask_thread = threading.Thread(target=run_flask, daemon=True)
        flask_thread.start()
        logger.info("监控服务器已启动")

        # (The browser is opened from login_wechat on success, not here.)
        # Try to log in to WeChat, with retries.
        retry_count = 0
        max_retries = 3

        while retry_count < max_retries:
            try:
                if login_wechat():  # opens the dashboard on success
                    # Register the text-message handler.
                    @itchat.msg_register([TEXT])
                    def text_reply(msg):
                        return handle_text(msg)

                    # Run the bot; blocks until WeChat logs out.
                    logger.info("开始运行微信机器人...")
                    itchat.run(debug=True)
                    break
                else:
                    retry_count += 1
                    if retry_count < max_retries:
                        logger.info(f"等待 10 秒后进行第 {retry_count + 1} 次重试")
                        time.sleep(10)
            except Exception as e:
                # A crash during login/run also counts as a failed attempt.
                logger.error(f"运行出错: {str(e)}")
                retry_count += 1
                if retry_count < max_retries:
                    logger.info(f"等待 10 秒后进行第 {retry_count + 1} 次重试")
                    time.sleep(10)

        if retry_count >= max_retries:
            logger.error("多次尝试登录失败,程序退出")

    except Exception as e:
        logger.error(f"程序运行错误: {str(e)}")
    finally:
        logger.info("程序退出")
233 |
if __name__ == '__main__':
    try:
        # Warn when itchat-uos looks older than 1.5.0.
        # BUG FIX: the original compared version STRINGS lexicographically,
        # so e.g. "1.10.0" < "1.5.0" evaluated True; compare numeric tuples.
        version = getattr(itchat, '__version__', None)
        try:
            outdated = version is None or tuple(
                int(part) for part in version.split('.')[:3]
            ) < (1, 5, 0)
        except ValueError:
            outdated = True  # unparseable version string: play safe and warn
        if outdated:
            logger.warning("建议更新 itchat-uos 到最新版本")
        main()
    except KeyboardInterrupt:
        logger.info("程序被用户中断")
    except Exception as e:
        logger.error(f"程序异常退出: {str(e)}")
--------------------------------------------------------------------------------
/LLM联网搜索插件/response_processor.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import textwrap
4 | from datetime import datetime
5 | from typing import List, Dict, Optional, Any
6 |
class ResponseProcessor:
    """Process and format web search results for LLM consumption.

    Turns raw search hits (title/link/snippet dicts) plus optional fetched
    page content into a markdown prompt the LLM can answer from.
    """

    def __init__(self, max_tokens=4000, max_content_per_source=1500):
        # Soft token budget for the final prompt (informational; not enforced here).
        self.max_tokens = max_tokens
        # Hard character cap applied to each source's detailed content.
        self.max_content_per_source = max_content_per_source

    def format_search_results(self, query: str, search_results: List[Dict[str, Any]],
                              detailed_content: Optional[Dict[str, str]] = None) -> str:
        """
        Format search results into a structured markdown response for the LLM.

        Args:
            query: The original search query.
            search_results: List of search result dicts (keys "title",
                "link", "snippet" — all tolerated as missing).
            detailed_content: Optional mapping of URL -> fetched page text.

        Returns:
            A markdown document with summaries, optional detailed content,
            and closing instructions for the LLM.
        """
        # Timestamp the search so the LLM knows how fresh the data is.
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        response = f"# Search Results for: \"{query}\"\n"
        response += f"*Search performed at: {current_time}*\n\n"

        response += "## Search Result Summaries\n\n"

        if not search_results:
            response += "*No search results found*\n\n"
        else:
            for i, result in enumerate(search_results, 1):
                # ROBUSTNESS FIX: the original indexed result['title'] etc.
                # and raised KeyError on partial results; use defaults instead.
                title = result.get('title', 'Untitled')
                link = result.get('link', '')
                snippet = result.get('snippet', '')
                response += f"### {i}. {title}\n"
                response += f"**Source**: [{link}]({link})\n"
                response += f"**Summary**: {snippet}\n\n"

        # Add detailed page content when available.
        if detailed_content:
            response += "## Detailed Content\n\n"

            for url, content in detailed_content.items():
                # Reuse the matching search result's title when we have one.
                title = next((r.get('title', 'Content') for r in search_results
                              if r.get('link') == url), "Content")

                cleaned_content = self._clean_content(content)
                formatted_content = self._format_content_extract(cleaned_content)

                response += f"### {title}\n"
                response += f"**Source**: [{url}]({url})\n"
                response += f"**Content**:\n```\n{formatted_content}\n```\n\n"

        # Closing instructions telling the LLM how to use the material above.
        response += "## Instructions for LLM\n\n"
        response += "Based on the search results above, please provide a comprehensive answer to the query. "
        response += "Include relevant information from the search results and cite sources appropriately using the source numbers. "
        response += "If the search results don't contain sufficient information to answer the query, "
        response += "please acknowledge the limitations and provide the best possible answer based on available information."

        return response

    def _clean_content(self, content) -> str:
        """Clean and normalize raw text scraped from web pages."""
        if not content:
            return ""

        # Coerce non-string input (e.g. parser objects) to str.
        if not isinstance(content, str):
            try:
                content = str(content)
            except Exception:  # FIX: narrowed from a bare `except:`
                return "无法处理的内容类型"

        # Collapse runs of whitespace.
        content = re.sub(r'\s+', ' ', content)

        # Strip common web-page boilerplate.
        content = re.sub(r'Cookie Policy|Privacy Policy|Terms of Service|\d+ comments', '', content)

        # Redact email addresses.
        content = re.sub(r'[\w.+-]+@[\w-]+\.[\w.-]+', '[EMAIL]', content)

        return content.strip()

    def _format_content_extract(self, content: str) -> str:
        """Truncate and wrap a content extract to a readable length."""
        if not content:
            return "No content available"

        if len(content) > self.max_content_per_source:
            # Prefer truncating at a sentence boundary near the limit.
            truncation_point = content[:self.max_content_per_source].rfind('.')
            if truncation_point == -1 or truncation_point < self.max_content_per_source * 0.8:
                # No suitable boundary found; hard-truncate at the limit.
                truncated_content = content[:self.max_content_per_source]
            else:
                truncated_content = content[:truncation_point + 1]

            formatted_content = truncated_content + "\n[Content truncated...]"
        else:
            formatted_content = content

        # Re-wrap long lines for readability.
        formatted_content = '\n'.join(textwrap.wrap(formatted_content, width=100,
                                                    break_long_words=False,
                                                    replace_whitespace=False))

        return formatted_content

    def extract_key_points(self, content: str, max_points: int = 5) -> List[str]:
        """Extract up to `max_points` representative sentences from `content`."""
        if not content:
            return []

        # Split into sentences on terminal punctuation.
        sentences = re.split(r'(?<=[.!?])\s+', content)

        # Keep only sentences with real content.
        valid_sentences = [s for s in sentences if len(s) > 20 and re.search(r'\w', s)]

        key_points = []
        if valid_sentences:
            # Sample sentences evenly across the whole text
            # (simple heuristic; could be enhanced).
            step = max(1, len(valid_sentences) // max_points)
            key_points = [valid_sentences[i] for i in range(0, len(valid_sentences), step)][:max_points]

        return key_points

    def create_prompt_with_search_results(self, user_query: str, search_results: List[Dict[str, Any]],
                                          detailed_content: Optional[Dict[str, str]] = None,
                                          system_prompt: Optional[str] = None) -> str:
        """
        Build a full LLM prompt combining the user's query with search results.

        Args:
            user_query: The user's original question.
            search_results: List of search result dictionaries.
            detailed_content: Optional mapping of URL -> fetched page text.
            system_prompt: Optional custom system prompt; a default is used
                when None or empty.

        Returns:
            The complete prompt string to send to the LLM.
        """
        formatted_results = self.format_search_results(user_query, search_results, detailed_content)

        # Default system prompt if none provided.
        if not system_prompt:
            system_prompt = (
                "You are an AI assistant with access to web search results. "
                "You specialize in providing accurate information based on recent web content. "
                "When responding, always cite your sources by referring to the search result numbers. "
                "If the search results contain contradictory information, acknowledge this and explain why. "
                "If the search results don't provide sufficient information to fully answer the query, be transparent about these limitations."
            )

        prompt = (
            f"{system_prompt}\n\n"
            f"The user asked: \"{user_query}\"\n\n"
            "I've searched the web and found the following information to help answer this question:\n\n"
            f"{formatted_results}\n\n"
            "Based on these search results, provide a comprehensive, accurate, and helpful response to the user's question. "
            "Cite specific sources by their numbers when drawing information from them. "
            "Format your response in a clear, structured way with appropriate headings and lists where helpful."
        )

        return prompt
--------------------------------------------------------------------------------
/we_chat_project/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 微信机器人监控面板
5 |
6 |
7 |
8 |
9 |
165 |
166 |
167 |
168 |
189 |
190 |
191 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
336 |
337 |
--------------------------------------------------------------------------------
/LLM联网搜索插件/llm_client_example.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | import os
4 | import time
5 | from datetime import datetime
6 | from dotenv import load_dotenv
7 | from typing import Dict, List, Any, Optional, Union
8 |
9 | # 加载环境变量
10 | load_dotenv()
11 |
class LLMWebSearchClient:
    """
    Client that connects a local LLM to the web-search plugin.
    This is a sample implementation that can be adapted to different LLM APIs.
    """

    def __init__(self, llm_api_url=None, search_api_url=None, model_name=None, temperature=0.7, max_tokens=2048, llm_type="ollama"):
        """Initialise the LLM web-search client.

        Args:
            llm_api_url: OpenAI-style chat endpoint (used when llm_type == "api").
            search_api_url: URL of the search plugin's /search endpoint.
            model_name: LLM model name; None auto-detects the best Ollama model.
            temperature: Sampling temperature forwarded to the LLM.
            max_tokens: Generation limit forwarded to the LLM.
            llm_type: "ollama" (default) or "api".
        """
        # Default API URLs, overridable via environment variables.
        self.llm_api_url = llm_api_url or os.environ.get("LLM_API_URL", "http://localhost:5000/api/llm")
        self.search_api_url = search_api_url or os.environ.get("SEARCH_API_URL", "http://localhost:5005/search")
        self.ollama_api_url = os.environ.get("OLLAMA_API_URL", "http://localhost:11434")

        # LLM parameters
        self.llm_type = llm_type  # either "api" or "ollama"
        self.temperature = temperature
        self.max_tokens = max_tokens

        # Auto-detect a suitable model when none was specified.
        if model_name is None:
            self.model_name = self._detect_best_model()
        else:
            self.model_name = model_name

        print(f"使用模型: {self.model_name}")

    def _detect_best_model(self):
        """Pick the best available model.

        Queries Ollama's /api/tags and chooses from a preference list;
        falls back to "deepseek-r1:1.5b" whenever detection fails.
        """
        if self.llm_type != "ollama":
            return "deepseek-r1:1.5b"  # default model for non-Ollama mode

        try:
            # Fetch the list of locally installed Ollama models.
            response = requests.get(f"{self.ollama_api_url}/api/tags")
            if response.status_code != 200:
                print("无法获取Ollama模型列表,使用默认模型")
                return "deepseek-r1:1.5b"

            models = response.json().get("models", [])

            # Model preference list (highest priority first).
            preferred_models = [
                # 7B-class models first
                "deepseek-r1:7b", "qwen:7b", "llama3", "gemma:7b", "mistral:7b",
                # then other sizes
                "deepseek-r1:67b", "qwen:14b", "qwen:72b", "yi:34b", "mixtral:8x7b",
                # small models last
                "deepseek-r1:1.5b", "gemma:2b"
            ]

            # Check whether any preferred model is installed.
            available_models = [model["name"] for model in models]

            for preferred_model in preferred_models:
                # Exact match
                if preferred_model in available_models:
                    return preferred_model

                # Substring match (e.g. "qwen:7b-chat" also matches "qwen:7b")
                for available_model in available_models:
                    if preferred_model in available_model:
                        return available_model

            # No preferred model found, but other models are installed.
            if available_models:
                # Prefer any model whose name mentions 7b.
                for model in available_models:
                    if "7b" in model.lower():
                        return model
                # Otherwise take the first available model.
                return available_models[0]

            # No models installed at all: return the default.
            return "deepseek-r1:1.5b"

        except Exception as e:
            print(f"检测模型时出错: {e}")
            return "deepseek-r1:1.5b"

    def search_web(self, query, num_results=5, fetch_content=True, search_engine="google"):
        """Run a web search via the plugin and return its results.

        Returns:
            {"search_results": [...]} on success, with an extra "error" key
            (and an empty result list) on any failure.
        """
        try:
            # Build the search request payload.
            search_request = {
                "query": query,
                "num_results": num_results,
                "fetch_content": fetch_content,
                "search_engine": search_engine
            }

            print(f"发送搜索请求到: {self.search_api_url}")
            # Send the search request.
            response = requests.post(
                self.search_api_url,
                json=search_request,
                headers={"Content-Type": "application/json"},
                timeout=30  # guard against a hung search service
            )

            # Validate the response.
            response.raise_for_status()  # raises for non-200 status codes
            search_data = response.json()

            # Extract the result list from the response body.
            search_results = search_data.get("search_results", [])
            return {"search_results": search_results}

        except requests.exceptions.ConnectionError as e:
            error_msg = f"无法连接到搜索服务 ({self.search_api_url}): {str(e)}"
            print(error_msg)
            return {"search_results": [], "error": error_msg}
        except requests.exceptions.Timeout as e:
            error_msg = f"搜索请求超时: {str(e)}"
            print(error_msg)
            return {"search_results": [], "error": error_msg}
        except requests.exceptions.RequestException as e:
            error_msg = f"搜索请求失败: {str(e)}"
            print(error_msg)
            return {"search_results": [], "error": error_msg}
        except json.JSONDecodeError as e:
            error_msg = f"无法解析搜索响应: {str(e)}"
            print(error_msg)
            return {"search_results": [], "error": error_msg}
        except Exception as e:
            error_msg = f"执行网络搜索时出错: {str(e)}"
            print(error_msg)
            return {"search_results": [], "error": error_msg}

    def query_llm(self, prompt, model=None, temperature=None, max_tokens=None):
        """Send `prompt` straight to the LLM and return its text response.

        Falls back to the instance defaults for model/temperature/max_tokens;
        on any failure returns a human-readable error string instead of raising.
        """
        try:
            # Use supplied parameters or fall back to instance defaults.
            model = model or self.model_name
            temperature = temperature if temperature is not None else self.temperature
            max_tokens = max_tokens if max_tokens is not None else self.max_tokens

            # Dispatch on the configured LLM backend.
            if self.llm_type == "ollama":
                # Ollama /api/generate call
                try:
                    response = requests.post(
                        f"{self.ollama_api_url}/api/generate",
                        json={
                            "model": model,
                            "prompt": prompt,
                            "stream": False,
                            "options": {
                                "temperature": temperature,
                                "num_predict": max_tokens
                            }
                        },
                        timeout=60
                    )
                    response.raise_for_status()  # raises for non-200 status codes
                    return response.json().get("response", "无法获取LLM回答")
                except requests.exceptions.RequestException as e:
                    if "404" in str(e):
                        return "抱歉,无法连接到Ollama服务。请确保Ollama已安装并运行在端口11434上。\n\n错误详情: 404 Not Found - Ollama服务未找到。"
                    else:
                        return f"抱歉,连接到Ollama服务时出错: {str(e)}"
            else:  # default: OpenAI-style chat API
                response = requests.post(
                    self.llm_api_url,
                    json={
                        "model": model,
                        "messages": [{"role": "user", "content": prompt}],
                        "temperature": temperature,
                        "max_tokens": max_tokens
                    },
                    timeout=60
                )
                response.raise_for_status()
                return response.json().get("choices", [{}])[0].get("message", {}).get("content", "无法获取LLM回答")
        except Exception as e:
            return f"查询LLM时出错: {str(e)}"

    def answer_with_web_search(self, query, num_results=5, fetch_content=True, search_engine="google"):
        """Answer `query` with the LLM, grounded in fresh web-search results.

        Returns:
            {"answer": <LLM reply or error text>, "search_results": [...]}.
        """
        try:
            # Run the web search first.
            search_result = self.search_web(query, num_results, fetch_content, search_engine)

            # Surface search-layer errors to the caller as a friendly answer.
            if "error" in search_result:
                error_message = search_result.get("error", "未知错误")
                return {
                    "answer": f"抱歉,无法获取搜索结果。请检查网络连接或稍后再试。\n\n技术详情: {error_message}",
                    "search_results": []
                }

            # Bail out when the search returned nothing.
            if not search_result["search_results"]:
                return {
                    "answer": "抱歉,无法获取搜索结果。请检查网络连接或稍后再试。",
                    "search_results": []
                }

            # Build a numbered context block from the search hits.
            search_context = "\n".join([
                f"[{i}] {result.get('title', '无标题')}\n"
                f"链接: {result.get('url', result.get('link', '无链接'))}\n"
                f"摘要: {result.get('snippet', '无摘要')}\n"
                for i, result in enumerate(search_result["search_results"], 1)
            ])

            # Assemble the final grounded prompt.
            prompt = f"""请基于以下搜索结果回答问题。在回答中引用相关信息的来源,使用[数字]格式引用(例如[1],[2]等)。

问题: {query}

搜索结果:
{search_context}

请提供详细、准确的回答,并确保引用相关信息的来源。如果搜索结果中没有足够的信息来回答问题,请说明这一点。"""

            # Ask the LLM.
            answer = self.query_llm(prompt)

            # Return both the answer and the raw search hits.
            return {
                "answer": answer,
                "search_results": search_result["search_results"]
            }
        except Exception as e:
            error_message = str(e)
            print(f"使用网络搜索回答时出错: {error_message}")
            return {
                "answer": f"抱歉,处理您的请求时出错: {error_message}",
                "search_results": []
            }
242 |
def main():
    """Example command-line usage of LLMWebSearchClient."""
    import argparse
    import time

    # Command-line options.
    parser = argparse.ArgumentParser(description='LLM网络搜索客户端')
    parser.add_argument('--llm-type', type=str, default='ollama', choices=['ollama', 'api'],
                        help='LLM类型: ollama, api')
    parser.add_argument('--model-name', type=str, default=None,
                        help='模型名称,如deepseek-r1:7b, llama3等。不指定则自动检测最佳模型')
    parser.add_argument('--api-url', type=str, default=None,
                        help='LLM API URL (仅在llm-type为api时使用)')
    parser.add_argument('--search-api-url', type=str, default='http://localhost:5005/search',
                        help='搜索API URL')
    parser.add_argument('--search-engine', type=str, default='google', choices=['google', 'bing', 'baidu'],
                        help='搜索引擎: google, bing, baidu')
    parser.add_argument('--temperature', type=float, default=0.7,
                        help='生成文本的温度,控制随机性,值越高结果越多样')
    parser.add_argument('--max-tokens', type=int, default=2048,
                        help='生成的最大token数量')
    parser.add_argument('--interactive', action='store_true',
                        help='启用交互模式,可以连续提问')

    args = parser.parse_args()

    # Build the client from the parsed options.
    client = LLMWebSearchClient(
        search_api_url=args.search_api_url,
        llm_api_url=args.api_url,
        llm_type=args.llm_type,
        model_name=args.model_name,
        temperature=args.temperature,
        max_tokens=args.max_tokens
    )

    def process_query(question):
        # Search the web, then ask the LLM, timing the whole round trip.
        print(f"\n正在搜索网络并查询LLM ({args.llm_type}:{args.model_name})...")
        started = time.time()
        outcome = client.answer_with_web_search(question, search_engine=args.search_engine)
        elapsed = time.time() - started

        if "error" in outcome:
            print(f"错误: {outcome['error']}")
            return

        print("\n=== 搜索结果 ===")
        for idx, hit in enumerate(outcome["search_results"], 1):
            print(f"{idx}. {hit.get('title', '无标题')}")
            print(f"   {hit.get('link', '无链接')}")
            summary = hit.get('snippet', '无摘要')
            if len(summary) > 100:
                summary = summary[:100] + "..."
            print(f"   {summary}")

        print(f"\n=== LLM回答 (处理时间: {elapsed:.2f}秒) ===")
        print(outcome["answer"])

    if not args.interactive:
        # One-shot mode: read a single question and answer it.
        process_query(input("请输入您的问题: "))
        return

    # Interactive loop until the user asks to quit.
    print(f"进入交互模式 (使用 {args.llm_type}:{args.model_name}),输入'exit'或'quit'退出")
    while True:
        question = input("\n请输入您的问题: ")
        if question.lower() in ['exit', 'quit', '退出']:
            break
        process_query(question)
314 |
315 |
316 | if __name__ == "__main__":
317 | main()
318 |
--------------------------------------------------------------------------------
/LLM联网搜索插件/test_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """
5 | LLM联网搜索插件测试工具集
6 |
7 | 该文件整合了多个测试功能,包括:
8 | 1. 测试不同搜索引擎的搜索结果
9 | 2. 测试LLM对搜索结果的利用情况
10 | 3. 比较不同搜索引擎的结果差异
11 |
12 | 使用方法:
13 | python test_utils.py --mode [search|llm|compare] --query "你的查询" --search-engine baidu
14 |
15 | 示例:
16 | # 测试搜索引擎
17 | python test_utils.py --mode search --query "量子计算" --search-engine baidu
18 |
19 | # 测试LLM响应
20 | python test_utils.py --mode llm --query "人工智能应用" --model "qwen:7b" --temperature 0.5
21 |
22 | # 比较搜索引擎
23 | python test_utils.py --mode compare --query "深度学习框架对比"
24 | """
25 |
26 | import os
27 | import sys
28 | import json
29 | import argparse
30 | import time
31 | from datetime import datetime
32 | import pytz
33 | import random
34 | import requests
35 | from llm_client_example import LLMWebSearchClient
36 | from search_engine import WebSearch
37 | from response_processor import ResponseProcessor
38 |
def is_time_query(query):
    """Return True when `query` asks for the current time or date."""
    lowered = query.lower()

    # Keywords suggesting a time/date question.
    time_keywords = ["几点", "时间", "日期", "current time", "what time", "date"]

    # Phrases that contain time words but are NOT current-time questions.
    exclude_keywords = ["时间管理", "时间复杂度", "时间序列", "时间轴", "时间旅行", "时间简史",
                       "时间规划", "时间胶囊", "日期选择", "日期格式", "日期范围"]

    # Must mention a time keyword and none of the exclusions.
    if not any(keyword in lowered for keyword in time_keywords):
        return False
    if any(keyword in lowered for keyword in exclude_keywords):
        return False

    # Explicit "what time is it now"-style phrasings are definitely time queries.
    explicit_time_queries = ["现在几点", "现在是几点", "现在时间", "当前时间", "现在日期", "今天日期",
                            "what time is it", "current time", "what is the time now"]
    if any(phrase in lowered for phrase in explicit_time_queries):
        return True

    # Very short queries that mention a time word also count as time queries.
    return len(query) < 15 and ("时间" in query or "几点" in query or "日期" in query)
68 |
def get_current_time():
    """Return the current Beijing time as a human-readable string.

    Format: "YYYY年MM月DD日 HH:MM:SS 星期X".
    """
    # All user-facing times in this tool are Beijing time (Asia/Shanghai).
    now = datetime.now(pytz.timezone('Asia/Shanghai'))

    # datetime.weekday(): Monday == 0 ... Sunday == 6.
    weekdays = ("星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日")
    stamp = now.strftime("%Y年%m月%d日 %H:%M:%S")
    return f"{stamp} {weekdays[now.weekday()]}"
89 |
def generate_mock_search_results(query):
    """Build a mock search-result payload for *query* without any network I/O.

    Weather-related queries get realistic weather-style fixtures; everything
    else gets generic placeholders.  The returned dict mirrors the shape of a
    real search response: "query", "search_results", "detailed_content" and a
    ready-to-use "formatted_prompt".
    """
    # Detect weather-related queries so richer fixtures can be returned.
    weather_keywords = ["天气", "气温", "下雨", "晴天", "阴天", "雨天", "雪", "温度", "气候"]
    is_weather_query = any(keyword in query for keyword in weather_keywords)

    if is_weather_query:
        # Stamp the fixtures with today's date so they look current.
        from datetime import datetime
        current_date = datetime.now().strftime("%Y年%m月%d日")

        # More realistic mock data for weather queries.
        mock_results = {
            "query": query,
            "search_results": [
                {
                    "title": f"【上海天气预报】上海今日天气 {current_date}",
                    "link": "https://example.com/shanghai-weather",
                    "snippet": f"{current_date} 上海天气:晴转多云,气温18-25℃,东南风3-4级,空气质量良好,紫外线强度中等。建议穿着薄外套或长袖衬衫。"
                },
                {
                    "title": f"全国天气预报_中国天气网",
                    "link": "https://example.com/china-weather",
                    "snippet": f"今日全国天气:北方地区多云转晴,南方地区有小到中雨。华北、东北地区气温回升,西南地区有强对流天气,注意防范。"
                },
                {
                    "title": f"天气预报查询_未来一周天气预报",
                    "link": "https://example.com/weather-forecast",
                    "snippet": f"未来一周天气预报:周三至周四全国大部地区天气晴好,周五开始南方将有一次降水过程,华南地区有中到大雨,局部暴雨。"
                }
            ],
            "detailed_content": {
                "https://example.com/shanghai-weather": f"""
{current_date} 上海天气详情:
今日天气:晴转多云
气温:18-25℃
风向风力:东南风3-4级
空气质量:良好,AQI 65
紫外线强度:中等
生活指数:
- 穿衣指数:建议穿薄外套或长袖衬衫
- 洗车指数:较适宜
- 感冒指数:低发期,无明显降温
- 运动指数:适宜户外运动
未来三天预报:
- 明天:多云,19-26℃
- 后天:多云转小雨,17-23℃
- 大后天:小雨,16-21℃
""",
                "https://example.com/china-weather": f"""
{current_date} 全国天气概况:
北方地区:
- 华北:晴到多云,14-25℃,空气质量良
- 东北:多云,早晚温差大,8-20℃
- 西北:晴,气温回升,12-28℃

南方地区:
- 华东:多云有阵雨,18-26℃
- 华南:小到中雨,局部大雨,22-29℃
- 西南:多云转阴,有阵雨或雷雨,15-24℃

主要城市天气:
- 北京:晴,15-27℃
- 上海:晴转多云,18-25℃
- 广州:小雨,23-28℃
- 深圳:中雨,22-27℃
- 成都:多云,16-22℃
- 武汉:多云,17-25℃
""",
                "https://example.com/weather-forecast": f"""
未来一周全国天气预报({current_date}起):

第1天:全国大部地区天气晴好,华北、东北气温回升,西南地区多云。
第2天:华北、东北继续晴好,华南地区云量增多。
第3天:南方将有一次降水过程开始,华南地区有小到中雨。
第4天:降水范围扩大,华南、华东南部有中到大雨,局部暴雨。
第5天:降水减弱,华南仍有小到中雨,其他地区多云。
第6天:全国大部地区转为多云或晴,气温回升。
第7天:新一轮冷空气将影响北方地区,带来降温和大风天气。

温馨提示:
1. 南方地区公众需关注强降水天气,注意防范城市内涝和山洪地质灾害。
2. 北方地区公众需关注气温变化,适时调整着装。
3. 雷雨天气出行请携带雨具,注意交通安全。
"""
            }
        }
    else:
        # Generic mock data for non-weather queries.
        mock_results = {
            "query": query,
            "search_results": [
                {
                    "title": f"关于 {query} 的最新研究",
                    "link": "https://example.com/research",
                    "snippet": f"这是关于 {query} 的最新研究成果,包含了最新的进展和发现..."
                },
                {
                    "title": f"{query} 的基本概念和应用",
                    "link": "https://example.com/concepts",
                    "snippet": f"本文介绍了 {query} 的基本概念、原理以及在各个领域的应用..."
                },
                {
                    "title": f"{query} 的历史发展",
                    "link": "https://example.com/history",
                    "snippet": f"{query} 的发展历程可以追溯到几十年前,经历了多个重要的里程碑..."
                }
            ],
            "detailed_content": {
                "https://example.com/research": f"这是一篇关于 {query} 的详细研究报告,包含了最新的研究方法、数据分析和结论...",
                "https://example.com/concepts": f"本文详细介绍了 {query} 的核心概念、基本原理、技术实现以及在不同行业的应用案例...",
                "https://example.com/history": f"本文回顾了 {query} 的完整发展历程,从早期的理论构想到现代的实际应用,包括关键人物、重要事件和技术突破..."
            }
        }

    # Format the mock data into an LLM prompt exactly like the real path does.
    processor = ResponseProcessor()
    formatted_prompt = processor.format_search_results(
        query=query,
        search_results=mock_results["search_results"],
        detailed_content=mock_results["detailed_content"]
    )
    mock_results["formatted_prompt"] = formatted_prompt

    return mock_results
215 |
def test_search_engine(query=None, search_engine="baidu", num_results=5, fetch_content=True, verbose=False):
    """Run a query against one search engine and print/format its results.

    Args:
        query: Search query; a default demo query is used when None.
        search_engine: One of "google", "bing", "baidu".
        num_results: Number of results to request.
        fetch_content: When True, also download page content for the top hits.
        verbose: When True, print the full formatted LLM prompt.

    Returns:
        dict with keys "query", "search_results", "detailed_content" and
        "formatted_prompt".
    """
    if query is None:
        query = "量子计算最新进展"

    print(f"查询: {query}")
    print(f"\n搜索引擎: {search_engine}")

    engine = WebSearch(search_engine=search_engine)

    print("\n正在搜索...\n")
    search_results = engine.search(query, num_results=num_results)

    print("=== 搜索结果 ===")
    for i, result in enumerate(search_results, 1):
        print(f"{i}. {result['title']}")
        print(f"   {result['link']}")
        # Truncate long snippets for readability.
        print(f"   {result['snippet'][:100]}...\n" if len(result['snippet']) > 100 else f"   {result['snippet']}\n")

    # Optionally download full page content, but only for the first two hits
    # to keep the run fast and avoid hammering target sites.
    detailed_content = {}
    if fetch_content:
        print("\n正在获取详细内容...\n")
        try:
            for result in search_results[:2]:
                try:
                    content = engine.fetch_content(result['link'])
                    if content:
                        # (The old unused `content_preview` variable was removed.)
                        detailed_content[result['link']] = content
                        print(f"获取内容成功: {result['link']}")
                except Exception as e:
                    # A page that fails to download is simply omitted.
                    print(f"Error fetching content from {result['link']}: {str(e)}")
        except Exception as e:
            print(f"获取详细内容时出错: {str(e)}")

    # Turn the raw results into an LLM-ready prompt.
    processor = ResponseProcessor()
    formatted_prompt = processor.format_search_results(
        query=query,
        search_results=search_results,
        detailed_content=detailed_content
    )

    if verbose:
        print("\n=== 格式化的提示词 ===")
        print(formatted_prompt)

    return {
        "query": query,
        "search_results": search_results,
        "detailed_content": detailed_content,
        "formatted_prompt": formatted_prompt
    }
277 |
def test_llm_response(query=None, search_engine="baidu", num_results=5, fetch_content=False,
                     model=None, api_url="http://localhost:5005/search",
                     temperature=0.7, max_tokens=2048, verbose=False, mock=False):
    """Feed search results (real or mocked) for *query* to the LLM and report.

    Args:
        query: User question; falls back to a demo query when None.
        search_engine: Engine used for the real-search path.
        num_results: Number of search results to request.
        fetch_content: Whether to download full page content.
        model: LLM model name; None lets the client auto-detect.
        api_url: URL of the search API the LLM client talks to.
        temperature: LLM sampling temperature.
        max_tokens: Maximum generation length.
        verbose: Print the full formatted prompt when True.
        mock: Use generate_mock_search_results() instead of a real search.

    Returns:
        dict describing the run (query, search data, LLM response, timing),
        an error dict on failure, or None for time queries answered locally.
    """
    # Guard against a missing query: is_time_query(None) would crash.
    if query is None:
        query = "量子计算最新进展"

    # Time questions are answered locally -- no search or LLM call needed.
    if is_time_query(query):
        current_time = datetime.now(pytz.timezone('Asia/Shanghai')).strftime("%Y-%m-%d %H:%M:%S")
        print(f"\n这是一个时间查询。当前时间是: {current_time}")
        return

    print(f"\n正在测试LLM对搜索结果的利用情况...")
    print(f"查询: {query}")
    print(f"搜索引擎: {search_engine}")
    print(f"结果数量: {num_results}")
    if model:
        print(f"使用指定模型: {model}")
    else:
        print("使用自动检测的最佳模型")

    # Build the LLM client up front so configuration errors surface early.
    client = LLMWebSearchClient(
        search_api_url=api_url,
        model_name=model,
        temperature=temperature,
        max_tokens=max_tokens
    )

    if mock:
        print("\n使用模拟搜索数据...\n")
        search_data = generate_mock_search_results(query)
    else:
        print("\n正在进行网络搜索...\n")
        try:
            # Reuse test_search_engine() so both modes share one code path.
            search_data = test_search_engine(
                query=query,
                search_engine=search_engine,
                num_results=num_results,
                fetch_content=fetch_content,
                verbose=False
            )

            if "error" in search_data:
                print(f"搜索出错: {search_data['error']}")
                return {
                    "query": query,
                    "error": search_data['error'],
                    "llm_response": f"搜索出错: {search_data['error']}"
                }
        except Exception as e:
            error_msg = f"搜索出错: {str(e)}"
            print(error_msg)
            return {
                "query": query,
                "error": error_msg,
                "llm_response": f"搜索出错: {str(e)}"
            }

    print("=== 搜索结果 ===")
    for i, result in enumerate(search_data["search_results"], 1):
        print(f"{i}. {result['title']}")
        print(f"   {result['link']}")
        if "snippet" in result:
            snippet = result["snippet"]
            print(f"   {snippet[:100]}..." if len(snippet) > 100 else f"   {snippet}")
        print()  # Blank line for readability.

    if verbose:
        print("\n=== 提示词 ===")
        print(search_data["formatted_prompt"])
    else:
        print("\n=== 提示词 ===")
        print("(使用 --verbose 参数查看完整提示词)")

    print("\n正在查询LLM...\n")
    try:
        # Time the LLM round-trip.  (`time` is imported at module level; the
        # old dead top-of-function timer and local re-import were removed.)
        start_time = time.time()
        llm_response = client.query_local_llm(search_data["formatted_prompt"])
        end_time = time.time()

        print("=== LLM回答 ===")
        print(llm_response)
        print(f"\n响应时间: {end_time - start_time:.2f}秒")

        return {
            "query": query,
            "search_data": search_data,
            "llm_response": llm_response,
            "response_time": end_time - start_time
        }
    except Exception as e:
        error_msg = f"查询LLM时出错: {str(e)}"
        print(error_msg)
        print("=== LLM回答 ===")
        print(f"错误: {str(e)}")
        print("\n可能的原因:")
        print("1. LLM服务器未启动或无法访问")
        print("2. API地址不正确")
        print("3. 模型名称不正确或模型未下载")
        print("4. 网络连接问题")
        print("\n建议:")
        print(f"- 确认LLM服务器已启动并运行在 {api_url}")
        print(f"- 确认模型 '{model}' 已正确安装")
        print("- 检查网络连接")

        return {
            "query": query,
            "search_data": search_data,
            "error": error_msg,
            "llm_response": f"错误: {str(e)}"
        }
399 |
def compare_search_engines(query=None, engines=None, num_results=5):
    """Run the same query on several engines and summarise the differences.

    Args:
        query: Search query; a default demo query is used when None.
        engines: Engine names to compare (default: google, bing, baidu).
        num_results: Number of results requested from each engine.

    Returns:
        dict with the query and a per-engine mapping of results: a list on
        success, or {"error": ...} when that engine failed.
    """
    if query is None:
        query = "量子计算最新进展"

    if engines is None:
        engines = ["google", "bing", "baidu"]

    print(f"查询: {query}")
    print(f"比较搜索引擎: {', '.join(engines)}")

    results = {}

    for engine in engines:
        print(f"\n正在使用 {engine} 搜索...\n")
        try:
            search_engine = WebSearch(search_engine=engine)
            search_results = search_engine.search(query, num_results=num_results)

            results[engine] = search_results

            print(f"=== {engine.capitalize()} 搜索结果 ===")
            for i, result in enumerate(search_results, 1):
                print(f"{i}. {result['title']}")
                print(f"   {result['link']}")
                # Truncate long snippets only; short ones are printed as-is
                # (consistent with the other test helpers in this file).
                snippet = result['snippet']
                print(f"   {snippet[:100]}...\n" if len(snippet) > 100 else f"   {snippet}\n")
        except Exception as e:
            # One failing engine should not abort the whole comparison.
            print(f"{engine} 搜索出错: {str(e)}")
            results[engine] = {"error": str(e)}

    print("\n=== 结果比较 ===")
    print(f"查询: {query}")

    for engine in engines:
        if isinstance(results[engine], list):
            print(f"{engine.capitalize()}: 返回 {len(results[engine])} 个结果")
        else:
            print(f"{engine.capitalize()}: 搜索出错 - {results[engine]['error']}")

    return {
        "query": query,
        "results": results
    }
444 |
if __name__ == "__main__":
    # CLI entry point: build the argument parser, then dispatch to the
    # selected test mode (search / llm / compare).
    parser = argparse.ArgumentParser(description="LLM联网搜索插件测试工具")
    parser.add_argument("--mode", type=str, default="search", choices=["search", "llm", "compare"],
                        help="测试模式: search(测试搜索引擎), llm(测试LLM响应), compare(比较搜索引擎)")
    parser.add_argument("--query", type=str, default="量子计算最新进展",
                        help="要测试的查询 (默认: 量子计算最新进展)")
    parser.add_argument("--verbose", action="store_true", help="显示详细信息,包括完整提示词")
    parser.add_argument("--search-engine", type=str, default="baidu", choices=["google", "bing", "baidu"],
                        help="使用的搜索引擎 (默认: baidu)")
    parser.add_argument("--engines", type=str, nargs="+", default=["google", "bing", "baidu"],
                        help="比较模式下要比较的搜索引擎列表 (默认: google bing baidu)")
    parser.add_argument("--mock", action="store_true", help="使用模拟搜索数据,不进行实际搜索")
    parser.add_argument("--model", type=str, default=None,
                        help="指定使用的LLM模型名称 (默认: 自动检测最佳模型)")
    parser.add_argument("--api-url", type=str, default="http://localhost:5005/search",
                        help="指定搜索API的URL (默认: http://localhost:5005/search)")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="LLM生成的温度参数 (默认: 0.7)")
    parser.add_argument("--num-results", type=int, default=5,
                        help="搜索结果数量 (默认: 5)")
    parser.add_argument("--fetch-content", action="store_true", help="获取详细网页内容")
    parser.add_argument("--max-tokens", type=int, default=2048,
                        help="LLM生成的最大长度 (默认: 2048)")
    args = parser.parse_args()

    # Dispatch on the selected mode; each helper prints its own report.
    if args.mode == "search":
        test_search_engine(
            query=args.query,
            search_engine=args.search_engine,
            num_results=args.num_results,
            fetch_content=args.fetch_content,
            verbose=args.verbose
        )
    elif args.mode == "llm":
        test_llm_response(
            query=args.query,
            search_engine=args.search_engine,
            num_results=args.num_results,
            fetch_content=args.fetch_content,
            model=args.model,
            api_url=args.api_url,
            temperature=args.temperature,
            max_tokens=args.max_tokens,
            verbose=args.verbose,
            mock=args.mock
        )
    elif args.mode == "compare":
        compare_search_engines(
            query=args.query,
            engines=args.engines,
            num_results=args.num_results
        )
497 |
--------------------------------------------------------------------------------
/LLM联网搜索插件/templates/llm.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | LLM模型交互 | LLM Model Interaction
7 |
8 |
334 |
335 |
336 |
350 |
351 |
352 |
353 |
LLM模型交互 | LLM Model Interaction
354 |
355 | {% if error_message %}
356 |
357 | {{ error_message }}
358 |
359 | {% endif %}
360 |
361 |
438 |
439 | {% if response_text %}
440 |
441 |
445 |
446 | {% if search_results %}
447 |
448 |
搜索结果 | Search Results
449 |
450 | {% for result in search_results %}
451 |
452 |
{{ loop.index }}. {{ result.title }}
453 |
{{ result.url }}
454 |
{{ result.snippet }}
455 |
456 | {% endfor %}
457 |
458 |
459 | {% endif %}
460 |
461 |
462 |
{{ response_text|safe }}
463 |
464 |
465 | {% endif %}
466 |
467 |
468 |
469 |
474 |
475 |
576 |
577 |
578 |
--------------------------------------------------------------------------------
/LLM联网搜索插件/templates/config.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | LLM Web Search Plugin - Configuration
8 |
9 |
10 |
11 |
12 |
452 |
453 |
454 |
469 |
470 |
471 |
472 |
473 |
476 |
Customize your LLM Web Search Plugin settings below to enhance your search experience.
477 |
478 |
613 |
614 |
615 |
616 |
617 |
623 |
624 |
625 |
--------------------------------------------------------------------------------
/LLM联网搜索插件/search_engine.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | import os
5 | import re
6 | import nltk
7 | from urllib.parse import quote_plus, urlparse
8 | import time
9 | import random
10 | from typing import List, Dict, Any, Optional, Tuple
11 | from collections import Counter
12 | from datetime import datetime
13 |
# Download required NLTK resources on first import; each resource is only
# fetched when it is not already present locally.
for _resource_path, _package in (('tokenizers/punkt', 'punkt'), ('stopwords', 'stopwords')):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package, quiet=True)
24 |
25 | class WebSearch:
26 | """
27 | 提供互联网搜索功能的类。
28 | Class that provides internet search capabilities.
29 | """
30 |
31 | def __init__(self, search_engine="google", timeout=10):
32 | """
33 | 初始化 WebSearch 类。
34 | Initialize the WebSearch class.
35 |
36 | 参数 | Args:
37 | search_engine (str): 要使用的搜索引擎 ("google", "bing", "baidu") | Search engine to use ("google", "bing", "baidu")
38 | timeout (int): 请求超时时间(秒) | Request timeout in seconds
39 | """
40 | self.search_engine = search_engine.lower()
41 | self.timeout = timeout
42 |
43 | if self.search_engine not in ["google", "bing", "baidu"]:
44 | raise ValueError(f"不支持的搜索引擎: {search_engine}。支持的引擎: google, bing, baidu")
45 |
46 | # 设置默认请求头
47 | self.headers = {
48 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
49 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
50 | "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
51 | "Accept-Encoding": "gzip, deflate, br",
52 | "Connection": "keep-alive",
53 | "Upgrade-Insecure-Requests": "1",
54 | "Sec-Fetch-Dest": "document",
55 | "Sec-Fetch-Mode": "navigate",
56 | "Sec-Fetch-Site": "none",
57 | "Sec-Fetch-User": "?1",
58 | "Cache-Control": "max-age=0"
59 | }
60 |
61 | def search(self, query, num_results=5):
62 | """
63 | 执行给定查询的网络搜索。
64 | Perform a web search for the given query.
65 |
66 | 参数 | Args:
67 | query (str): 搜索查询 | The search query
68 | num_results (int): 返回结果的数量 | Number of results to return
69 |
70 | 返回 | Returns:
71 | list: 包含搜索结果的字典列表 | List of dictionaries containing search results
72 | """
73 | if self.search_engine == "google":
74 | return self._google_search(query, num_results)
75 | elif self.search_engine == "bing":
76 | return self._bing_search(query, num_results)
77 | elif self.search_engine == "baidu":
78 | return self._baidu_search(query, num_results)
79 | else:
80 | raise ValueError(f"Unsupported search engine: {self.search_engine}")
81 |
82 | def _google_search(self, query, num_results=5):
83 | """
84 | 执行Google搜索。
85 | Perform a Google search.
86 |
87 | 注意:这是一个简单实现,由于Google的反爬取措施,可能不能可靠地工作。
88 | 在生产使用中,考虑使用官方的Google搜索API。
89 | Note: This is a simple implementation and might not work reliably due to Google's
90 | anti-scraping measures. For production use, consider using official Google Search API.
91 | """
92 | search_url = f"https://www.google.com/search?q={quote_plus(query)}&num={num_results}&hl=zh-CN"
93 |
94 | # 尝试不同的用户代理
95 | user_agents = [
96 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
97 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
98 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
99 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0"
100 | ]
101 |
102 | # 最多尝试3次
103 | max_retries = 3
104 | for retry in range(max_retries):
105 | try:
106 | # 每次尝试使用不同的用户代理
107 | current_headers = self.headers.copy()
108 | current_headers["User-Agent"] = user_agents[retry % len(user_agents)]
109 |
110 | print(f"尝试搜索 (尝试 {retry+1}/{max_retries}): {search_url}")
111 | print(f"使用用户代理: {current_headers['User-Agent'][:30]}...")
112 |
113 | response = requests.get(search_url, headers=current_headers, timeout=self.timeout)
114 | response.raise_for_status()
115 |
116 | soup = BeautifulSoup(response.text, 'html.parser')
117 | search_results = []
118 |
119 | # 保存HTML以便调试
120 | debug_file = f"google_search_debug_{retry+1}.html"
121 | with open(debug_file, "w", encoding="utf-8") as f:
122 | f.write(response.text)
123 | print(f"已保存响应HTML到 {debug_file}")
124 |
125 | # 更全面的选择器列表
126 | selectors = [
127 | 'div.g', # 传统选择器
128 | 'div.Gx5Zad', # 新版选择器
129 | 'div.tF2Cxc', # 另一种可能的选择器
130 | 'div[jscontroller]', # 更通用的选择器
131 | 'div.MjjYud', # 2023年版选择器
132 | 'div.v7W49e', # 另一个可能的容器
133 | 'div.srKDX', # 2024年版可能的选择器
134 | 'div.N54PNb' # 另一个可能的容器
135 | ]
136 |
137 | # 标题选择器列表
138 | title_selectors = [
139 | 'h3',
140 | 'h3.LC20lb',
141 | 'div.vvjwJb',
142 | 'div.DKV0Md',
143 | 'h3.zBAuLc',
144 | 'h3.DKV0Md'
145 | ]
146 |
147 | # 链接选择器列表
148 | link_selectors = [
149 | 'a',
150 | 'a[href]',
151 | 'div.yuRUbf > a',
152 | 'div.Z26q7c > a',
153 | 'div.eKjLze > div > div > a'
154 | ]
155 |
156 | # 摘要选择器列表
157 | snippet_selectors = [
158 | 'div.VwiC3b',
159 | 'div.lEBKkf',
160 | 'span.aCOpRe',
161 | 'div.s3v9rd',
162 | 'div.VwiC3b.yXK7lf',
163 | 'span.s3v9rd'
164 | ]
165 |
166 | results_found = False
167 |
168 | # 首先尝试使用选择器找到结果容器
169 | for selector in selectors:
170 | results = soup.select(selector)
171 | if results:
172 | print(f"找到结果使用选择器: {selector}, 数量: {len(results)}")
173 | results_found = True
174 |
175 | for result in results:
176 | # 尝试找到标题
177 | title_element = None
178 | for title_selector in title_selectors:
179 | title_element = result.select_one(title_selector)
180 | if title_element:
181 | break
182 |
183 | # 尝试找到链接
184 | link_element = None
185 | for link_selector in link_selectors:
186 | link_element = result.select_one(link_selector)
187 | if link_element and link_element.has_attr('href'):
188 | break
189 |
190 | # 尝试找到摘要
191 | snippet_element = None
192 | for snippet_selector in snippet_selectors:
193 | snippet_element = result.select_one(snippet_selector)
194 | if snippet_element:
195 | break
196 |
197 | if title_element and link_element:
198 | title = title_element.get_text().strip()
199 | link = link_element['href']
200 | if link.startswith('/url?q='):
201 | link = link.split('/url?q=')[1].split('&')[0]
202 |
203 | # 如果找不到摘要,使用默认文本
204 | snippet = snippet_element.get_text().strip() if snippet_element else "未找到摘要"
205 |
206 | # 过滤掉不相关的结果
207 | if not any(x in link for x in ['google.com/search', 'accounts.google', 'support.google']):
208 | search_results.append({
209 | 'title': title,
210 | 'link': link,
211 | 'snippet': snippet
212 | })
213 |
214 | # 只有当我们收集了足够多的结果时才退出循环
215 | if len(search_results) >= num_results:
216 | break
217 |
218 | if search_results:
219 | break
220 |
221 | # 如果找到了搜索结果,返回它们
222 | if search_results:
223 | print(f"成功找到 {len(search_results)} 个搜索结果")
224 | # 确保只返回请求的结果数量
225 | return search_results[:num_results]
226 |
227 | # 如果没有找到结果,尝试下一次重试
228 | print("未找到搜索结果,尝试不同的方法...")
229 |
230 | except Exception as e:
231 | print(f"搜索时出错 (尝试 {retry+1}/{max_retries}): {e}")
232 | # 如果不是最后一次尝试,继续下一次
233 | if retry < max_retries - 1:
234 | print("将在1秒后重试...")
235 | import time
236 | time.sleep(1)
237 |
238 | # 如果所有尝试都失败,使用模拟结果
239 | print("所有搜索尝试均失败,使用模拟结果")
240 | return self._mock_search_results(query, num_results)
241 |
242 | def _mock_search_results(self, query, num_results=5):
243 | """
244 | 当实际搜索失败时,生成模拟搜索结果。
245 | Generate mock search results when actual search fails.
246 | """
247 | current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
248 |
249 | # 基本模拟结果
250 | mock_results = [
251 | {
252 | 'title': f'关于 "{query}" 的搜索结果 - 模拟数据',
253 | 'link': 'https://example.com/search-results',
254 | 'snippet': f'这是一个模拟的搜索结果。由于无法连接到搜索引擎,系统生成了这个占位符。当前时间: {current_time}'
255 | },
256 | {
257 | 'title': '搜索功能暂时不可用',
258 | 'link': 'https://example.com/search-unavailable',
259 | 'snippet': '搜索引擎可能暂时阻止了来自此IP的请求,或者网络连接存在问题。请稍后再试。'
260 | }
261 | ]
262 |
263 | # 根据查询类型添加特定的模拟结果
264 |
265 | # 时间相关查询
266 | time_keywords = ["时间", "日期", "几点", "what time", "current time", "date", "today", "now", "当前时间"]
267 | if any(keyword in query.lower() for keyword in time_keywords):
268 | weekday_cn = ["一", "二", "三", "四", "五", "六", "日"][datetime.now().weekday()]
269 | weekday_en = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"][datetime.now().weekday()]
270 |
271 | mock_results.append({
272 | 'title': '当前时间信息',
273 | 'link': 'https://example.com/current-time',
274 | 'snippet': f'当前系统时间是 {current_time},星期{weekday_cn} ({weekday_en})。这是由系统生成的时间信息。'
275 | })
276 |
277 | # 天气相关查询
278 | weather_keywords = ["天气", "气温", "weather", "temperature", "forecast", "雨", "雪", "晴", "阴"]
279 | location_keywords = ["北京", "上海", "广州", "深圳", "杭州", "成都", "重庆", "武汉", "西安", "南京",
280 | "beijing", "shanghai", "guangzhou", "shenzhen"]
281 |
282 | if any(keyword in query.lower() for keyword in weather_keywords):
283 | # 检查是否包含位置信息
284 | location = "未知位置"
285 | for loc in location_keywords:
286 | if loc in query.lower():
287 | location = loc
288 | break
289 |
290 | mock_results.append({
291 | 'title': f'{location}天气信息 - 模拟数据',
292 | 'link': 'https://example.com/weather-unavailable',
293 | 'snippet': f'由于无法连接到天气服务,无法获取{location}的实时天气信息。这是一个模拟的天气信息占位符。'
294 | })
295 |
296 | # 新闻相关查询
297 | news_keywords = ["新闻", "资讯", "头条", "news", "headlines", "最新消息", "报道"]
298 | if any(keyword in query.lower() for keyword in news_keywords):
299 | mock_results.append({
300 | 'title': '最新新闻 - 模拟数据',
301 | 'link': 'https://example.com/news',
302 | 'snippet': f'由于无法连接到新闻服务,无法获取关于"{query}"的最新新闻。这是一个模拟的新闻信息占位符。'
303 | })
304 |
305 | # 返回请求数量的结果
306 | return mock_results[:num_results]
307 |
308 | def _baidu_search(self, query, num_results=5):
309 | """
310 | 执行百度搜索。
311 | Perform a Baidu search.
312 |
313 | 注意:这是一个简单实现。在生产使用中,考虑使用官方的百度搜索API。
314 | Note: This is a simple implementation. For production use, consider using
315 | official Baidu Search API.
316 | """
317 | search_url = f"https://www.baidu.com/s?wd={quote_plus(query)}&rn={num_results}"
318 |
319 | # 尝试不同的用户代理
320 | user_agents = [
321 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
322 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
323 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
324 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0"
325 | ]
326 |
327 | # 最多尝试3次
328 | max_retries = 3
329 | for retry in range(max_retries):
330 | try:
331 | # 每次尝试使用不同的用户代理
332 | current_headers = self.headers.copy()
333 | current_headers["User-Agent"] = user_agents[retry % len(user_agents)]
334 |
335 | print(f"尝试百度搜索 (尝试 {retry+1}/{max_retries}): {search_url}")
336 | print(f"使用用户代理: {current_headers['User-Agent'][:30]}...")
337 |
338 | response = requests.get(search_url, headers=current_headers, timeout=self.timeout)
339 | response.raise_for_status()
340 |
341 | soup = BeautifulSoup(response.text, 'html.parser')
342 | search_results = []
343 |
344 | # 保存HTML以便调试
345 | debug_file = f"baidu_search_debug_{retry+1}.html"
346 | with open(debug_file, "w", encoding="utf-8") as f:
347 | f.write(response.text)
348 | print(f"已保存响应HTML到 {debug_file}")
349 |
350 | # 百度搜索结果容器选择器
351 | result_containers = soup.select('div.result.c-container')
352 | if not result_containers:
353 | result_containers = soup.select('div.result-op.c-container')
354 | if not result_containers:
355 | result_containers = soup.select('div.c-container')
356 |
357 | if result_containers:
358 | print(f"找到 {len(result_containers)} 个百度搜索结果")
359 |
360 | for container in result_containers:
361 | # 提取标题
362 | title_element = container.select_one('h3.t') or container.select_one('h3.c-title')
363 | if not title_element:
364 | continue
365 |
366 | title = title_element.get_text().strip()
367 |
368 | # 提取链接
369 | link_element = title_element.select_one('a')
370 | if not link_element or not link_element.has_attr('href'):
371 | continue
372 |
373 | link = link_element['href']
374 |
375 | # 百度搜索结果链接通常是重定向链接,需要进一步处理
376 | if link.startswith('http'):
377 | pass # 已经是完整URL
378 | else:
379 | # 如果是相对链接,转换为绝对链接
380 | link = f"https://www.baidu.com{link}"
381 |
382 | # 提取摘要 - 尝试多种选择器
383 | snippet = ""
384 |
385 | # 尝试方法1:查找内容类
386 | snippet_element = container.select_one('div.c-abstract') or container.select_one('div.c-span-last')
387 | if snippet_element:
388 | snippet = snippet_element.get_text().strip()
389 |
390 | # 尝试方法2:查找内容包装器
391 | if not snippet:
392 | content_wrappers = container.select('.pure-test-wrap_T03sY .content-right_1THTn')
393 | if content_wrappers:
394 | snippet = content_wrappers[0].get_text().strip()
395 |
396 | # 尝试方法3:查找任何文本内容
397 | if not snippet:
398 | # 排除标题和链接元素
399 | for text_element in container.find_all(text=True, recursive=True):
400 | parent = text_element.parent
401 | if parent and parent.name not in ['h3', 'a', 'script', 'style']:
402 | text = text_element.strip()
403 | if text and len(text) > 20: # 只考虑较长的文本
404 | snippet = text
405 | break
406 |
407 | # 如果仍然没有找到摘要,使用占位符
408 | if not snippet:
409 | snippet = "百度搜索结果摘要不可用"
410 |
411 | search_results.append({
412 | 'title': title,
413 | 'link': link,
414 | 'snippet': snippet
415 | })
416 |
417 | if len(search_results) >= num_results:
418 | break
419 |
420 | if search_results:
421 | print(f"成功找到 {len(search_results)} 个百度搜索结果")
422 | # 确保只返回请求的结果数量
423 | return search_results[:num_results]
424 |
425 | print("未找到百度搜索结果,尝试不同的方法...")
426 |
427 | except Exception as e:
428 | print(f"百度搜索时出错 (尝试 {retry+1}/{max_retries}): {e}")
429 | if retry < max_retries - 1:
430 | print("将在1秒后重试...")
431 | import time
432 | time.sleep(1)
433 |
434 | # 如果所有尝试都失败,使用模拟结果
435 | print("所有百度搜索尝试均失败,使用模拟结果")
436 | return self._mock_search_results(query, num_results)
437 |
438 | def _bing_search(self, query, num_results=5):
439 | """
440 | 执行Bing搜索。
441 | Perform a Bing search.
442 |
443 | 注意:这是一个简单实现。在生产使用中,考虑使用官方的Bing搜索API。
444 | Note: This is a simple implementation. For production use, consider using
445 | official Bing Search API.
446 | """
447 | search_url = f"https://www.bing.com/search?q={quote_plus(query)}&count={num_results}"
448 |
449 | try:
450 | response = requests.get(search_url, headers=self.headers)
451 | response.raise_for_status()
452 |
453 | soup = BeautifulSoup(response.text, 'html.parser')
454 | search_results = []
455 |
456 | # 提取搜索结果 | Extract search results
457 | for result in soup.select('li.b_algo'):
458 | title_element = result.select_one('h2 a')
459 | snippet_element = result.select_one('div.b_caption p')
460 |
461 | if title_element and snippet_element:
462 | title = title_element.get_text()
463 | link = title_element['href']
464 | snippet = snippet_element.get_text()
465 |
466 | search_results.append({
467 | 'title': title,
468 | 'link': link,
469 | 'snippet': snippet
470 | })
471 |
472 | if len(search_results) >= num_results:
473 | break
474 |
475 | # 确保只返回请求的结果数量
476 | return search_results[:num_results]
477 |
478 | except Exception as e:
479 | print(f"Error during Bing search: {e}")
480 | return []
481 |
482 | def fetch_content(self, url: str, summarize: bool = False, max_length: int = 5000) -> Dict[str, Any]:
483 | """
484 | 获取并提取网页的主要内容,可选择生成摘要。
485 | Fetch and extract the main content from a webpage with optional summarization.
486 |
487 | 参数 | Args:
488 | url: 要获取的网页URL | URL of the webpage to fetch
489 | summarize: 是否生成内容摘要 | Whether to generate a summary of the content
490 | max_length: 返回内容的最大长度 | Maximum length of the content to return
491 |
492 | 返回 | Returns:
493 | 包含从网页提取的内容和元数据的字典 | Dictionary containing extracted content and metadata from the webpage
494 | """
495 | try:
496 | # 添加小延迟以避免速率限制 | Add a small delay to avoid rate limiting
497 | time.sleep(random.uniform(0.5, 1.5))
498 |
499 | # 获取域名以供后续使用 | Get the domain for later use
500 | domain = urlparse(url).netloc
501 |
502 | response = requests.get(url, headers=self.headers, timeout=self.timeout)
503 | response.raise_for_status()
504 |
505 | # 尝试检测编码 | Try to detect the encoding
506 | if 'charset' in response.headers.get('Content-Type', ''):
507 | response.encoding = response.headers.get_content_charset()
508 | else:
509 | # BeautifulSoup可以帮助检测编码 | BeautifulSoup can help with encoding detection
510 | soup = BeautifulSoup(response.content, 'html.parser')
511 | meta_charset = soup.find('meta', charset=True)
512 | if meta_charset:
513 | response.encoding = meta_charset.get('charset')
514 |
515 | soup = BeautifulSoup(response.text, 'html.parser')
516 |
517 | # 尝试获取标题 | Try to get title
518 | title = self._extract_title(soup)
519 |
520 | # 尝试提取发布日期 | Try to extract publish date
521 | publish_date = self._extract_publish_date(soup)
522 |
523 | # 尝试提取作者 | Try to extract author
524 | author = self._extract_author(soup)
525 |
526 | # 移除不需要的元素 | Remove unwanted elements
527 | for element in soup.select('nav, footer, header, aside, .ad, .ads, .advert, .cookie, .sidebar, .comments, .related'):
528 | element.extract()
529 |
530 | # 移除脚本和样式元素 | Remove script and style elements
531 | for script in soup(["script", "style", "svg", "noscript", "iframe"]):
532 | script.extract()
533 |
534 | # Focus on main content area if possible
535 | main_content = None
536 | for selector in ['main', 'article', '.post-content', '.article-content', '.entry-content', '#content', '.content']:
537 | main = soup.select_one(selector)
538 | if main and len(main.get_text(strip=True)) > 200:
539 | main_content = main
540 | break
541 |
542 | # If no main content area was found, use the body
543 | if not main_content:
544 | main_content = soup.body if soup.body else soup
545 |
546 | # Get text
547 | text = main_content.get_text(' ', strip=True)
548 |
549 | # Clean up the text
550 | text = self._clean_text(text)
551 |
552 | # Create a result dictionary
553 | result = {
554 | "url": url,
555 | "domain": domain,
556 | "title": title,
557 | "author": author,
558 | "publish_date": publish_date,
559 | "content": text[:max_length] + "..." if len(text) > max_length else text,
560 | "content_length": len(text)
561 | }
562 |
563 | # Generate a summary if requested
564 | if summarize and text:
565 | summary = self._generate_summary(text)
566 | key_points = self._extract_key_points(text)
567 | result["summary"] = summary
568 | result["key_points"] = key_points
569 |
570 | return result
571 |
572 | except Exception as e:
573 | print(f"Error fetching content from {url}: {e}")
574 | return {
575 | "url": url,
576 | "domain": urlparse(url).netloc,
577 | "error": str(e),
578 | "content": f"Failed to fetch content from {url}: {str(e)}",
579 | "content_length": 0
580 | }
581 |
582 | def _extract_title(self, soup: BeautifulSoup) -> str:
583 | """Extract the title of the webpage."""
584 | # Try to get title from og:title
585 | og_title = soup.find('meta', property='og:title')
586 | if og_title and og_title.get('content'):
587 | return og_title['content']
588 |
589 | # Try to get title from twitter:title
590 | twitter_title = soup.find('meta', attrs={'name': 'twitter:title'})
591 | if twitter_title and twitter_title.get('content'):
592 | return twitter_title['content']
593 |
594 | # Use the standard title tag
595 | if soup.title and soup.title.string:
596 | return soup.title.string.strip()
597 |
598 | # Try to find the first h1
599 | h1 = soup.find('h1')
600 | if h1 and h1.get_text(strip=True):
601 | return h1.get_text(strip=True)
602 |
603 | return "Unknown Title"
604 |
605 | def _extract_publish_date(self, soup: BeautifulSoup) -> Optional[str]:
606 | """Extract the publication date from the webpage."""
607 | # Try to get date from meta tags
608 | for meta in soup.find_all('meta'):
609 | prop = meta.get('property', '').lower()
610 | name = meta.get('name', '').lower()
611 | if 'published_time' in prop or 'publication_date' in name or 'publish-date' in name:
612 | if meta.get('content'):
613 | return meta['content']
614 |
615 | # Look for time tags with datetime attribute
616 | time_tag = soup.find('time')
617 | if time_tag and time_tag.get('datetime'):
618 | return time_tag['datetime']
619 |
620 | return None
621 |
622 | def _extract_author(self, soup: BeautifulSoup) -> Optional[str]:
623 | """Extract the author from the webpage."""
624 | # Try to get author from meta tags
625 | for meta in soup.find_all('meta'):
626 | prop = meta.get('property', '').lower()
627 | name = meta.get('name', '').lower()
628 | if 'author' in prop or 'author' in name:
629 | if meta.get('content'):
630 | return meta['content']
631 |
632 | # Look for structured data with author information
633 | author_elements = soup.select('.author, .byline, .meta-author')
634 | if author_elements:
635 | for element in author_elements:
636 | author_text = element.get_text(strip=True)
637 | if author_text and len(author_text) < 100: # Avoid getting long text that's not actually an author
638 | return author_text
639 |
640 | return None
641 |
642 | def _clean_text(self, text: str) -> str:
643 | """Clean the extracted text."""
644 | # Remove excessive whitespace
645 | text = re.sub(r'\s+', ' ', text)
646 |
647 | # Remove common message patterns
648 | patterns_to_remove = [
649 | r'Cookie Policy',
650 | r'Privacy Policy',
651 | r'Terms of Service',
652 | r'Accept Cookies',
653 | r'\d+ comments',
654 | r'Share on (Facebook|Twitter|LinkedIn)',
655 | r'Click here to subscribe',
656 | r'Sign up for our newsletter',
657 | r'Copyright \d{4}',
658 | r'All rights reserved',
659 | r'Please enable JavaScript'
660 | ]
661 |
662 | for pattern in patterns_to_remove:
663 | text = re.sub(pattern, '', text, flags=re.IGNORECASE)
664 |
665 | # Split by newlines and filter out very short lines that are likely menu items or ads
666 | lines = [line.strip() for line in text.split('\n')]
667 | filtered_lines = [line for line in lines if len(line) > 20 or (len(line) > 0 and line[-1] not in '.,:;')]
668 |
669 | return '\n'.join(filtered_lines).strip()
670 |
671 | def _generate_summary(self, text: str, max_length: int = 200) -> str:
672 | """Generate a simple extractive summary of the content."""
673 | if not text:
674 | return ""
675 |
676 | # Tokenize into sentences
677 | sentences = nltk.sent_tokenize(text)
678 |
679 | if not sentences:
680 | return ""
681 |
682 | # Simple case - if there's only a few sentences, use them all
683 | if len(sentences) <= 3:
684 | return ' '.join(sentences)
685 |
686 | # Tokenize words and create frequency distribution
687 | words = nltk.word_tokenize(text.lower())
688 | stop_words = set(nltk.corpus.stopwords.words('english'))
689 | words = [word for word in words if word.isalnum() and word not in stop_words]
690 |
691 | word_freq = Counter(words)
692 |
693 | # Score sentences based on word frequency
694 | sentence_scores = {}
695 | for i, sentence in enumerate(sentences):
696 | score = 0
697 | sentence_words = nltk.word_tokenize(sentence.lower())
698 | sentence_words = [word for word in sentence_words if word.isalnum()]
699 |
700 | # Prefer sentences that aren't too short or too long
701 | length_factor = min(1.0, len(sentence_words) / 20.0) if len(sentence_words) < 20 else min(1.0, 40.0 / len(sentence_words))
702 |
703 | # Position bias - earlier sentences more likely to be important
704 | position_factor = 1.0 if i < 5 else 0.8
705 |
706 | for word in sentence_words:
707 | if word in word_freq:
708 | score += word_freq[word]
709 |
710 | # Normalize by sentence length to avoid favoring very long sentences
711 | if len(sentence_words) > 0:
712 | sentence_scores[i] = (score / len(sentence_words)) * length_factor * position_factor
713 |
714 | # Get top sentences
715 | top_sentence_indices = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:3]
716 | top_sentence_indices = sorted(top_sentence_indices) # Sort by position in text
717 |
718 | summary = ' '.join([sentences[i] for i in top_sentence_indices])
719 |
720 | # Truncate if necessary
721 | if len(summary) > max_length:
722 | summary = summary[:max_length].rsplit(' ', 1)[0] + '...'
723 |
724 | return summary
725 |
726 | def _extract_key_points(self, text: str, max_points: int = 5) -> List[str]:
727 | """Extract key points from the content."""
728 | if not text:
729 | return []
730 |
731 | # Tokenize into sentences
732 | sentences = nltk.sent_tokenize(text)
733 |
734 | if not sentences or len(sentences) <= max_points:
735 | return sentences
736 |
737 | # Similar scoring approach as summary generation
738 | words = nltk.word_tokenize(text.lower())
739 | stop_words = set(nltk.corpus.stopwords.words('english'))
740 | words = [word for word in words if word.isalnum() and word not in stop_words]
741 |
742 | word_freq = Counter(words)
743 |
744 | # Score sentences
745 | sentence_scores = {}
746 | for i, sentence in enumerate(sentences):
747 | # Skip very short sentences
748 | if len(sentence) < 30:
749 | continue
750 |
751 | score = 0
752 | sentence_words = nltk.word_tokenize(sentence.lower())
753 | sentence_words = [word for word in sentence_words if word.isalnum() and word not in stop_words]
754 |
755 | # Look for indicator phrases for key points
756 | indicator_bonus = 0
757 | indicators = ['importantly', 'significantly', 'notably', 'key', 'crucial', 'essential', 'primary']
758 | for indicator in indicators:
759 | if indicator in sentence_words:
760 | indicator_bonus += 0.5
761 |
762 | for word in sentence_words:
763 | if word in word_freq:
764 | score += word_freq[word]
765 |
766 | if len(sentence_words) > 0:
767 | sentence_scores[i] = (score / len(sentence_words)) + indicator_bonus
768 |
769 | # Get top scoring sentences
770 | top_sentence_indices = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:max_points]
771 | top_sentence_indices = sorted(top_sentence_indices) # Sort by position in text
772 |
773 | return [sentences[i] for i in top_sentence_indices]
774 |
--------------------------------------------------------------------------------