├── .dbs
│   ├── interactions.db
│   └── marine_species.db
├── .env.example
├── .gitignore
├── LICENSE
├── app.py
├── broadcast.py
├── docs
│   ├── demo130.json
│   └── demo_18.json
├── dspy_evaluation.py
├── dspy_inference.py
├── dspy_program
│   ├── program_v1.0.1_20250313195723.pkl
│   └── program_v1.0.3_20250315154834.pkl
├── dspy_query_db.py
├── graph_data_new
│   ├── entity_vectors.json
│   ├── graph_entity_relation_detailed.graphml
│   └── relation_vectors.json
├── images
│   ├── function-diagram.png
│   ├── startup-success.jpg
│   ├── 二维码.jpg
│   ├── 优化样本.jpg
│   ├── 关系信息查询.jpg
│   ├── 实体信息查询.jpg
│   ├── 属性信息查询.jpg
│   ├── 版本选择.jpg
│   ├── 统计信息查询.jpg
│   ├── 训练所有样本.jpg
│   ├── 非实体信息截图.jpg
│   └── 项目技术路线.jpg
├── nanovector_db.py
├── react_tools.py
├── readme.md
├── readme_en.md
├── requirements.txt
└── tools
    ├── entity_extraction.py
    └── entity_extraction_db.py

/.dbs/interactions.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/.dbs/interactions.db
--------------------------------------------------------------------------------
/.dbs/marine_species.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/.dbs/marine_species.db
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
 1 | This project uses DSPy for intent recognition and requires two independently configured models:
 2 | Chinese-language DSPy documentation: https://www.aidoczh.com/dspy/
 3 | 1. QA/reasoning model: handles user queries and reasoning tasks
 4 | 2. Post-training model: runs post-training on the feedback data collected from business QA sessions
 5 | The two models may share a configuration or use different ones; any model served through an OpenAI-SDK-compatible API is supported:
 6 | - OpenAI API series: GPT-3.5/4
 7 | - DeepSeek series: deepseek-chat/coder
 8 | - Alibaba Cloud series: Qwen (Tongyi Qianwen)
 9 | - Baidu Wenxin series: ERNIE-Bot
10 | - Ollama local deployment
11 | - HuggingFace deployment
12 | - vLLM high-performance deployment
13 | 
14 | 
15 | # QA/reasoning model configuration (handles user queries and reasoning)
16 | LLM_TYPE ="deepseek"                     # Model type
17 | API_KEY ="sk-your_api_key"               # API key
18 | BASE_URL ="https://api.deepseek.com/v1"  # API base URL
19 | LLM_MODEL ="deepseek-chat"               # Concrete model name
20 | 
21 | # Ollama configuration (local deployment, suitable for offline environments)
22 | # LLM_TYPE="ollama_chat"                 # Use a local Ollama model
23 | # API_KEY=""                             # No API key is needed for a local Ollama deployment
24 | # BASE_URL="http://localhost:11434"      # Local address of the Ollama service
25 | # LLM_MODEL="gemma3:12b"                 # Concrete model; here the Gemma 3 12B build
26 | 
27 | # Post-training model configuration (used for training and knowledge extraction; may match the QA/reasoning model or use a better-suited one)
28 | Train_LLM_TYPE ="deepseek"                            # Post-training model type
29 | Train_LLM_MODEL ="deepseek-chat"                      # Concrete model used for post-training
30 | Train_OPENAI_API_KEY ="sk-your_api_key"               # API key of the post-training model
31 | Train_OPENAI_BASE_URL ="https://api.deepseek.com/v1"  # API base URL of the post-training model
32 | 
33 | # System environment configuration (core paths and basic settings)
34 | RAG_DIR = "graph_data_new"                        # Directory where the knowledge-graph data is stored
35 | LOG_LEVEL = "DEBUG"                               # Log level; one of DEBUG, INFO, WARNING, ERROR
36 | DATABASE_URL ="sqlite:///.dbs/interactions.db"    # Path of the user-interaction database
37 | SPECIES_DB_URL = "./.dbs/marine_species.db"       # Path of the species-information database
38 | 
39 | # Vector-retrieval configuration (key parameters for retrieval quality and efficiency)
40 | VECTOR_SEARCH_TOP_K = 3         # Maximum number of results returned by vector search
41 | BETTER_THAN_THRESHOLD = 0.7     # Similarity threshold in [0, 1]; higher is stricter
42 | GRAPHML_DIR = "graph_entity_relation_detailed.graphml"  # File path of the stored knowledge graph
43 | 
44 | # Embedding configuration (text-vectorization parameters)
45 | MAX_BATCH_SIZE = 100              # Batch size; affects throughput and memory usage
46 | EMBEDDING_MAX_TOKEN_SIZE = 8192   # Maximum number of tokens processed per call
47 | EMBEDDING_DIM = 1024              # Dimension of the generated vectors
48 | EMBEDDING_MODEL = "text-embedding-v3"  # Embedding model version
49 | EMBEDDING_MODEL_BASE_URL ="https://dashscope.aliyuncs.com/compatible-mode/v1"  # API address of the embedding service
50 | EMBEDDING_MODEL_API_KEY ="sk-your_api_key"  # API key of the embedding service
51 | 
52 | # Retrieval-related configuration
53 | MAX_ITERS = 10   # Maximum number of retrieval (ReAct loop) iterations; the DSPy default is 5
54 | 
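# Usage note (illustrative sketch, not an additional repository file): the variables above are
# consumed with python-dotenv and os.getenv; the snippet below mirrors how dspy_inference.py
# wires the QA/reasoning settings into a DSPy language model.
import os
import dspy
from dotenv import load_dotenv

load_dotenv(override=True)  # values from .env override variables already set in the process

# LLM_TYPE and LLM_MODEL are combined into a "provider/model" string, e.g. "deepseek/deepseek-chat"
lm = dspy.LM(
    f'{os.getenv("LLM_TYPE")}/{os.getenv("LLM_MODEL")}',
    base_url=os.getenv("BASE_URL"),
    api_key=os.getenv("API_KEY"),
)
dspy.configure(lm=lm)

# Numeric settings are read the same way, with a default when the variable is missing
MAX_ITERS = int(os.getenv("MAX_ITERS", "10"))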
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .DS_Store 6 | # C extensions 7 | *.so 8 | .venv/ 9 | .env 10 | # Ignore proprietary data 11 | graph_data_new/ 12 | dspy_program/ 13 | docs/ 14 | .dbs/ 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # UV 105 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | #uv.lock 109 | 110 | # poetry 111 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 112 | # This is especially recommended for binary packages to ensure reproducibility, and is more 113 | # commonly ignored for libraries. 114 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 115 | #poetry.lock 116 | 117 | # pdm 118 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 119 | #pdm.lock 120 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 121 | # in version control. 122 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 123 | .pdm.toml 124 | .pdm-python 125 | .pdm-build/ 126 | 127 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .venv 140 | env/ 141 | venv/ 142 | ENV/ 143 | env.bak/ 144 | venv.bak/ 145 | 146 | # Spyder project settings 147 | .spyderproject 148 | .spyproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | # mkdocs documentation 154 | /site 155 | 156 | # mypy 157 | .mypy_cache/ 158 | .dmypy.json 159 | dmypy.json 160 | 161 | # Pyre type checker 162 | .pyre/ 163 | 164 | # pytype static type analyzer 165 | .pytype/ 166 | 167 | # Cython debug symbols 168 | cython_debug/ 169 | 170 | # PyCharm 171 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 172 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 173 | # and can be added to the global gitignore or merged into this file. For a more nuclear 174 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 175 | #.idea/ 176 | 177 | # Ruff stuff: 178 | .ruff_cache/ 179 | 180 | # PyPI configuration file 181 | .pypirc 182 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 loukie7 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os,io,sys 2 | from dotenv import load_dotenv 3 | load_dotenv(override=True) 4 | from loguru import logger 5 | 6 | # 设置 logger 日志级别 7 | log_level = os.getenv("LOG_LEVEL", "INFO") 8 | logger.remove() # 移除默认处理器 9 | logger.add(sys.stderr, level=log_level) # 添加新的处理器并设置日志级别 10 | logger.info(f"日志级别设置为: {log_level}") 11 | 12 | from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect,Query, Body,File, UploadFile, HTTPException, BackgroundTasks 13 | from fastapi.responses import StreamingResponse 14 | from fastapi.middleware.cors import CORSMiddleware 15 | import dspy 16 | from pydantic import BaseModel, Field 17 | from typing import List,Dict, Any 18 | import tempfile 19 | import json 20 | 21 | from sqlalchemy import create_engine, Column, String, JSON, DateTime,Integer 22 | from sqlalchemy.ext.declarative import declarative_base 23 | from sqlalchemy.orm import sessionmaker 24 | from datetime import datetime 25 | import uuid 26 | import asyncio 27 | from broadcast import ConnectionManager 28 | from dspy_inference import DspyInferenceProcessor 29 | from dspy_evaluation import DspyEvaluationProcessor 30 | 31 | 32 | 33 | app = FastAPI() 34 | app.add_middleware( 35 | CORSMiddleware, 36 | allow_origins=["*"], 37 | allow_credentials=True, 38 | allow_methods=["*"], 39 | allow_headers=["*"], 40 | ) 41 | 42 | 43 | manager = ConnectionManager() 44 | # 初始化 DspyProcessor 45 | dspy_processor = DspyInferenceProcessor() 46 | # 初始化流式模型 47 | streaming_react = dspy_processor.stream_predict 48 | 49 | eval_processor = DspyEvaluationProcessor() 50 | 51 | 52 | predictor_version = "1.0.0" 53 | 54 | # 定义数据库模型 55 | Base = declarative_base() 56 | 57 | # 创建数据库引擎 58 | engine = create_engine(os.getenv("DATABASE_URL", "sqlite:///interactions.db"), echo=False) 59 | 60 | Base.metadata.create_all(engine) 61 | 62 | # 创建会话 63 | SessionLocal = sessionmaker(bind=engine) 64 | 65 | # 定义封装的响应模型 66 | class ResponseWrapper(BaseModel): 67 | status_code: int 68 | detail: str 69 | data: Any 70 | 71 | class Interaction(Base): 72 | __tablename__ = 'interactions' 73 | 74 | id = Column(String, primary_key=True, default=lambda: str(uuid.uuid4())) 75 | timestamp = Column(DateTime, default=datetime.now) 76 | question = Column(String) 77 | model = Column(String) 78 | version = Column(String) 79 | messages = Column(JSON) 80 | retrievmethod = Column(JSON) 81 | prompt = Column(String) 82 | modelResponse = Column(String) 83 | reasoning = Column(String) 84 | processingTime = Column(Integer) 85 | tokens = Column(JSON) 86 | 87 | # 新增版本管理模型 88 | class Version(Base): 89 | __tablename__ = 'versions' 90 | 91 | version = Column(String, primary_key=True) 92 | file_path = Column(String) 93 | description = Column(String) 94 | created_at = Column(DateTime, default=datetime.now) 95 | 96 | # 新增请求体模型 97 | class TrainingRequest(BaseModel): 98 | ids: List[str] 99 | version: str 100 | 101 | 102 | @app.websocket("/ws") 103 | async def websocket_endpoint(websocket: WebSocket): 104 | await manager.connect(websocket) 105 | try: 106 | while True: 107 | # 保持连接(这里简单接收消息,可用于心跳检 108 | await websocket.receive_text() 109 | except WebSocketDisconnect: 110 | manager.disconnect(websocket) 111 | 112 | # 异步生成器:流式返回 ReAct 模块的回答,并在结束后通过 websocket 推送 prompt 历史 113 | async def stream_react_response(prompt: str): 114 | global streaming_react 115 | try: 116 | # 跟踪上一次的内容,用于增量更新 117 | last_reasoning 
= "" 118 | last_answer = "" 119 | 120 | # 修改这里:直接调用 streaming_react 函数 121 | async for chunk in streaming_react(question=prompt): 122 | # 假设每个 chunk 为 Prediction 对象,包含 answer 与 reasoning 字段 123 | if chunk: 124 | # 获取当前的 reasoning 和 answer 125 | current_reasoning = getattr(chunk, "reasoning", "") or "" 126 | current_answer = getattr(chunk, "answer", "") or "" 127 | 128 | # 计算增量内容 129 | reasoning_delta = current_reasoning[len(last_reasoning):] if current_reasoning else "" 130 | answer_delta = current_answer[len(last_answer):] if current_answer else "" 131 | 132 | # 只有当有新内容时才发送 133 | if reasoning_delta or answer_delta: 134 | data = { 135 | "reasoning_delta": reasoning_delta, 136 | "answer_delta": answer_delta, 137 | "reasoning": current_reasoning, # 也可以选择只发送增量 138 | "answer": current_answer # 也可以选择只发送增量 139 | } 140 | logger.info(f"增量数据: {json.dumps(data)}") 141 | yield f"data: {json.dumps(data)}\n\n" 142 | 143 | # 更新上一次的内容 144 | last_reasoning = current_reasoning 145 | last_answer = current_answer 146 | 147 | # 流式结束后的处理... 148 | last_message = dspy_processor.get_last_message() 149 | 150 | # 检查 last_message 是否为 None 或不包含必要字段 151 | if not last_message: 152 | error_data = {"error": "无法获取消息历史", "message": "处理请求时发生错误"} 153 | logger.error(f"last_message 为空或无效") 154 | yield f"data: {json.dumps(error_data)}\n\n" 155 | yield "data: [DONE]\n\n" 156 | return 157 | 158 | # 构造一个只包含所需字段的新字典 159 | data_to_send = { 160 | "question": prompt, 161 | "prompt": last_message.get("prompt"), 162 | "messages": last_message.get("messages"), 163 | "timestamp": last_message.get("timestamp"), 164 | "uuid": last_message.get("uuid"), 165 | "model": last_message.get("model"), 166 | "version": predictor_version 167 | } 168 | 169 | # 从 response 中提取 choices 第一个元素的 message 的 content 字段 170 | try: 171 | # 检查 response 是否存在且包含必要字段 172 | if "response" in last_message and last_message["response"] and "choices" in last_message["response"]: 173 | data_to_send["content"] = last_message["response"].choices[0].message.content 174 | tokens = {} 175 | if "usage" in last_message: 176 | tokens["completion_tokens"] = last_message["usage"].get("completion_tokens", 0) 177 | tokens["prompt_tokens"] = last_message["usage"].get("prompt_tokens", 0) 178 | tokens["total_tokens"] = last_message["usage"].get("total_tokens", 0) 179 | data_to_send["tokens"] = tokens 180 | else: 181 | data_to_send["content"] = None 182 | data_to_send["tokens"] = {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0} 183 | logger.warning("response 字段不存在或格式不正确") 184 | except (KeyError, IndexError, AttributeError) as e: 185 | # 如果不存在该字段则设为 None 或者按需处理 186 | data_to_send["content"] = None 187 | data_to_send["tokens"] = {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0} 188 | logger.error(f"提取 content 时出错:{e}") 189 | 190 | # 将数据转换为 JSON 字符串 191 | json_message = json.dumps(data_to_send, ensure_ascii=False, indent=2) 192 | logger.info(json_message) 193 | 194 | # 修改:不再通过 websocket 广播,而是通过流式返回完整消息 195 | yield f"data: {json.dumps({'prompt_history': json_message})}\n\n" 196 | yield "data: [DONE]\n\n" 197 | 198 | except Exception as e: 199 | # 捕获所有异常,返回错误信息 200 | error_message = str(e) 201 | logger.error(f"stream_react_response 发生错误: {error_message}") 202 | error_data = {"error": "处理请求失败", "message": error_message} 203 | yield f"data: {json.dumps(error_data)}\n\n" 204 | yield "data: [DONE]\n\n" 205 | 206 | 207 | @app.post("/chat") 208 | async def chat(request: Request, prompt: str = Body(..., embed=True), stream: int = Body(None, embed=True), version: str = 
Body(None, embed=True)): 209 | 210 | global predictor_version 211 | global streaming_react # 添加全局声明 212 | try: 213 | # 创建会话 214 | session = SessionLocal() 215 | pred = dspy_processor.model 216 | 217 | predictor_version =dspy_processor.get_version() 218 | # 记录一个当前的版本号,如果版本号没有发生变化,则不需要进行操作 219 | if version and version != predictor_version: 220 | # 查询版本信息 221 | version_info = session.query(Version).filter(Version.version == version).first() 222 | if not version_info: 223 | return ResponseWrapper(status_code=404, detail="error", data={"message": f"Version {version} not found"}) 224 | 225 | # 加载指定版本的模型文件todo 226 | logger.info(f"开始切换版本:{version}/{version_info.file_path}") 227 | file_path = version_info.file_path 228 | dspy_processor.load_model(file_path) 229 | # 更新 predictor_version 230 | predictor_version = version 231 | dspy_processor.set_version(version) 232 | logger.info(f"切换版本成功:{version},清除缓存") 233 | # 重新初始化 streaming_react 234 | streaming_react = dspy_processor.stream_predict # 修改这里:直接赋值函数引用,不要调用 235 | 236 | if stream == 1: 237 | # 流式返回:包装生成器,media_type 为 "text/event-stream" 238 | return StreamingResponse(stream_react_response(prompt), media_type="text/event-stream") 239 | else: 240 | # 非流式返回:直接调用 ReAct 模块,获取最终答案 241 | # 为pred设置独立的llm配置 242 | with dspy.llm_config(model=dspy_processor.lm): 243 | pred = dspy_processor.model 244 | dspyres = pred(question=prompt) 245 | content = dspyres.answer 246 | reasoning = dspyres.reasoning 247 | return ResponseWrapper(status_code=200, detail="success", data={"content": content, "reasoning": reasoning}) 248 | except Exception as e: 249 | return ResponseWrapper(status_code=500, detail="error", data={"message": str(e)}) 250 | finally: 251 | session.close() 252 | # 新增的 API 方法:接收数据并保存到 JSON 文件 253 | @app.post("/save_data") 254 | async def save_data(data: Dict): 255 | try: 256 | # 定义保存数据的文件路径 257 | file_path = "saved_data.json" 258 | 259 | # 检查文件是否存在,如果存在则读取现有数据 260 | if os.path.exists(file_path): 261 | with open(file_path, "r", encoding="utf-8") as file: 262 | existing_data = json.load(file) 263 | else: 264 | existing_data = [] 265 | 266 | # 将新数据添加到现有数据中 267 | existing_data.append(data) 268 | 269 | # 将更新后的数据写回文件 270 | with open(file_path, "w", encoding="utf-8") as file: 271 | json.dump(existing_data, file, ensure_ascii=False, indent=2) 272 | 273 | return ResponseWrapper(status_code=200, detail="success", data={"message": "Data saved successfully"}) 274 | except Exception as e: 275 | return ResponseWrapper(status_code=500, detail="error", data={"message": str(e)}) 276 | 277 | 278 | 279 | # 新增的 API 方法:接收数据并保存到 SQLite 数据库 280 | @app.post("/save_to_db") 281 | async def save_to_db(data: Dict): 282 | try: 283 | # 创建会话 284 | session = SessionLocal() 285 | 286 | # 检查是否已存在相同ID 287 | if data.get("id"): 288 | existing = session.query(Interaction).get(data["id"]) 289 | if existing: 290 | return ResponseWrapper( 291 | status_code=400, 292 | detail="error", 293 | data={"message": f"相同记录 {data['id']} 已存在"} 294 | ) 295 | 296 | 297 | # 格式化 messages 和 retrievmethod 字段 298 | formatted_messages = json.dumps(data.get("messages"), ensure_ascii=False, indent=2) 299 | formatted_retrievmethod = json.dumps(data.get("retrievmethod"), ensure_ascii=False, indent=2) 300 | 301 | 302 | # 创建 Interaction 实例 303 | interaction = Interaction( 304 | id=data.get("id"), 305 | timestamp=datetime.fromisoformat(data.get("timestamp")), 306 | question=data.get("question"), 307 | model=data.get("model"), 308 | version=data.get("version"), 309 | messages=json.loads(formatted_messages), 310 | 
retrievmethod=json.loads(formatted_retrievmethod), 311 | prompt=data.get("prompt"), 312 | modelResponse=data.get("modelResponse"), 313 | reasoning=data.get("reasoning"), 314 | processingTime=data.get("processingTime"), 315 | tokens=data.get("tokens") 316 | ) 317 | 318 | # 添加到会话并提交 319 | session.add(interaction) 320 | session.commit() 321 | 322 | return ResponseWrapper(status_code=200, detail="success", data={"message": "Data saved successfully to database"}) 323 | except Exception as e: 324 | session.rollback() 325 | return ResponseWrapper(status_code=500, detail="error", data={"message": str(e)}) 326 | finally: 327 | session.close() 328 | 329 | @app.delete("/interactions/{interaction_id}", response_model=ResponseWrapper) 330 | async def delete_interaction(interaction_id: str): 331 | try: 332 | session = SessionLocal() 333 | 334 | # 查询要删除的记录 335 | interaction = session.query(Interaction).filter(Interaction.id == interaction_id).first() 336 | 337 | if not interaction: 338 | return ResponseWrapper( 339 | status_code=404, 340 | detail="error", 341 | data={"message": f"ID为 {interaction_id} 的记录不存在"} 342 | ) 343 | 344 | # 执行删除 345 | session.delete(interaction) 346 | session.commit() 347 | 348 | return ResponseWrapper( 349 | status_code=200, 350 | detail="success", 351 | data={"message": "记录删除成功", "deleted_id": interaction_id} 352 | ) 353 | except Exception as e: 354 | session.rollback() 355 | return ResponseWrapper( 356 | status_code=500, 357 | detail="error", 358 | data={"message": f"删除失败: {str(e)}"} 359 | ) 360 | finally: 361 | session.close() 362 | 363 | # 新增的 API 方法:接收数据并更新 SQLite 数据库中的记录 364 | @app.post("/editdata") 365 | async def edit_data(data: Dict): 366 | try: 367 | # 创建会话 368 | session = SessionLocal() 369 | 370 | # 获取 messageId 和更新字段 371 | message_id = data.get("messageId") 372 | update_fields = data.get("updateFields", {}) 373 | 374 | # 根据 messageId 查找记录 375 | interaction = session.query(Interaction).filter(Interaction.id == message_id).first() 376 | 377 | if not interaction: 378 | return ResponseWrapper(status_code=404, detail="error", data={"message": "Record not found"}) 379 | 380 | # 更新指定的字段 381 | for field, value in update_fields.items(): 382 | if hasattr(interaction, field): 383 | if field in ["messages", "retrievmethod"]: 384 | # 格式化 JSON 字段 385 | setattr(interaction, field, json.loads(json.dumps(value, ensure_ascii=False, indent=2))) 386 | else: 387 | setattr(interaction, field, value) 388 | else: 389 | return ResponseWrapper(status_code=400, detail="error", data={"message": f"Field '{field}' does not exist"}) 390 | 391 | # 提交更改 392 | session.commit() 393 | 394 | return ResponseWrapper(status_code=200, detail="success", data={"message": "Data updated successfully"}) 395 | except Exception as e: 396 | session.rollback() 397 | return ResponseWrapper(status_code=500, detail="error", data={"message": str(e)}) 398 | finally: 399 | session.close() 400 | 401 | @app.get("/interactions/{interaction_id}", response_model=ResponseWrapper) 402 | async def get_interaction_by_id(interaction_id: str): 403 | try: 404 | session = SessionLocal() 405 | interaction = session.query(Interaction).filter(Interaction.id == interaction_id).first() 406 | 407 | if not interaction: 408 | return ResponseWrapper( 409 | status_code=404, 410 | detail="error", 411 | data={"message": f"ID为 {interaction_id} 的记录不存在"} 412 | ) 413 | 414 | interaction_data = { 415 | "id": interaction.id, 416 | "timestamp": interaction.timestamp.isoformat(), 417 | "question": interaction.question, 418 | "model": interaction.model, 
419 | "version": interaction.version, 420 | "messages": interaction.messages, 421 | "retrievmethod": interaction.retrievmethod, 422 | "prompt": interaction.prompt, 423 | "modelResponse": interaction.modelResponse, 424 | "reasoning": interaction.reasoning, 425 | "processingTime": interaction.processingTime, 426 | "tokens": interaction.tokens 427 | } 428 | 429 | return ResponseWrapper( 430 | status_code=200, 431 | detail="success", 432 | data=interaction_data 433 | ) 434 | except Exception as e: 435 | return ResponseWrapper( 436 | status_code=500, 437 | detail="error", 438 | data={"message": f"查询失败: {str(e)}"} 439 | ) 440 | finally: 441 | session.close() 442 | 443 | @app.get("/interactions", response_model=ResponseWrapper) 444 | async def get_interactions_by_version( 445 | version: str = Query(None), 446 | page: int = Query(1, ge=1, description="页码,从1开始"), 447 | page_size: int = Query(10, ge=1, le=100, description="每页数量") 448 | ): 449 | try: 450 | session = SessionLocal() 451 | 452 | # 获取最新版本(如果未指定) 453 | # latest_version = session.query(Version.version)\ 454 | # .order_by(Version.created_at.desc())\ 455 | # .first() 456 | # if not latest_version: 457 | # return ResponseWrapper(status_code=404, detail="error", data={"message": "无可用版本"}) 458 | # version = latest_version[0] 459 | # 修改后的代码片段 460 | if not version: 461 | # 移除获取最新版本逻辑,直接构建无版本过滤的查询 462 | base_query = session.query( 463 | Interaction.id, 464 | Interaction.question, 465 | Interaction.version, 466 | Interaction.model, 467 | Interaction.processingTime, 468 | Interaction.timestamp 469 | ).order_by( 470 | Interaction.timestamp.desc() 471 | ) 472 | else: 473 | # 当指定版本时保持原有过滤逻辑 474 | base_query = session.query( 475 | Interaction.id, 476 | Interaction.question, 477 | Interaction.version, 478 | Interaction.model, 479 | Interaction.processingTime, 480 | Interaction.timestamp 481 | ).filter( 482 | Interaction.version == version 483 | ).order_by( 484 | Interaction.timestamp.desc() 485 | ) 486 | 487 | # 分页处理 488 | total_count = base_query.count() 489 | total_pages = (total_count + page_size - 1) // page_size 490 | 491 | interactions = base_query.offset( 492 | (page - 1) * page_size 493 | ).limit( 494 | page_size 495 | ).all() 496 | 497 | # 构建响应数据 498 | interaction_list = [ 499 | { 500 | "id": row.id, 501 | "question": row.question, 502 | "version": row.version, 503 | "model": row.model, 504 | "processingTime": row.processingTime, 505 | "timestamp": row.timestamp.isoformat() 506 | } 507 | for row in interactions 508 | ] 509 | 510 | return ResponseWrapper( 511 | status_code=200, 512 | detail="success", 513 | data={ 514 | "version": version, 515 | "pagination": { 516 | "total": total_count, 517 | "total_pages": total_pages, 518 | "current_page": page, 519 | "page_size": page_size 520 | }, 521 | "interactions": interaction_list 522 | } 523 | ) 524 | except Exception as e: 525 | return ResponseWrapper(status_code=500, detail="error", data={"message": str(e)}) 526 | finally: 527 | session.close() 528 | 529 | # 全局优化任务跟踪 530 | optimization_tasks = {} 531 | 532 | # 异步优化任务 533 | async def run_dspy_optimization(training_data: List[Dict], version: str, ids: List[str]): 534 | task_id = f"optimization_task_{version}_{datetime.now().strftime('%Y%m%d%H%M%S')}" 535 | try: 536 | from dspy.teleprompt import BootstrapFewShot 537 | from dspy.evaluate import Evaluate 538 | from dspy.evaluate.metrics import answer_exact_match 539 | 540 | # 更新状态并发送开始消息 541 | logger.info(f"开始优化任务 {task_id},数据量: {len(training_data)},版本: {version}") 542 | optimization_tasks[task_id] = 
"loading_data" 543 | await manager.broadcast(json.dumps({ 544 | "type": "optimization_status", 545 | "data": { 546 | "task_id": task_id, 547 | "status": "loading_data", 548 | "progress": 5, 549 | "message": "正在准备训练数据..." 550 | } 551 | })) 552 | 553 | # 创建训练集 554 | trainset = [dspy.Example(question=x["question"],reasoning=x["reasoning"], answer=x["modelResponse"]).with_inputs("question") for x in training_data] 555 | logger.info(f"任务 {task_id}: 已创建训练集,共 {len(trainset)} 条数据") 556 | 557 | # 更新状态 558 | optimization_tasks[task_id] = "preparing_model" 559 | await manager.broadcast(json.dumps({ 560 | "type": "optimization_status", 561 | "data": { 562 | "task_id": task_id, 563 | "status": "preparing_model", 564 | "progress": 10, 565 | "message": "正在准备模型..." 566 | } 567 | })) 568 | 569 | # 从最新版本加载预测模型 570 | session = SessionLocal() 571 | 572 | # 修改这里:使用 dspy_processor 的 model 而不是 eval_processor 的 model 573 | # 因为 DspyEvaluationProcessor 没有 model 属性 574 | predict = dspy_processor.model 575 | logger.info(f"任务 {task_id}: 已加载模型") 576 | 577 | # 设置优化器 578 | teleprompter = BootstrapFewShot(metric=eval_processor.llm_biological_metric, max_labeled_demos=15) 579 | 580 | # 更新状态 581 | optimization_tasks[task_id] = "optimizing" 582 | await manager.broadcast(json.dumps({ 583 | "type": "optimization_status", 584 | "data": { 585 | "task_id": task_id, 586 | "status": "optimizing", 587 | "progress": 15, 588 | "message": "正在进行模型优化..." 589 | } 590 | })) 591 | 592 | # 编译优化 593 | logger.info(f"任务 {task_id}: 开始编译优化") 594 | compiled_predictor = teleprompter.compile(predict, trainset=trainset) 595 | logger.info(f"任务 {task_id}: 编译优化完成") 596 | 597 | # 更新状态 598 | optimization_tasks[task_id] = "saving_model" 599 | await manager.broadcast(json.dumps({ 600 | "type": "optimization_status", 601 | "data": { 602 | "task_id": task_id, 603 | "status": "saving_model", 604 | "progress": 50, 605 | "message": "正在保存优化后的模型..." 
606 | } 607 | })) 608 | 609 | # 确保目录存在 610 | os.makedirs("dspy_program", exist_ok=True) 611 | last_version = session.query(Version.version).order_by(Version.created_at.desc()).first().version 612 | 613 | 614 | # 保存优化后的模型 615 | timestamp = datetime.now().strftime("%Y%m%d%H%M%S") 616 | output_path = f"dspy_program/program_v{last_version}_{timestamp}.pkl" 617 | compiled_predictor.save(output_path, save_program=False) 618 | logger.info(f"任务 {task_id}: 已保存模型到 {output_path}") 619 | 620 | # 解析当前版本号,生成新版本号 621 | # 从数据库获取最新版本号,原生新增 622 | major, minor, patch = map(int, last_version.split('.')) 623 | new_version = f"{major}.{minor}.{patch + 1}" 624 | 625 | # 描述信息 626 | description = f"基于 {version} 版本,使用 {len(ids)} 条数据优化生成的新版本" 627 | 628 | # 创建新版本 629 | new_version_instance = Version( 630 | version=new_version, 631 | file_path=output_path, 632 | description=description 633 | ) 634 | 635 | session.add(new_version_instance) 636 | session.commit() 637 | logger.info(f"任务 {task_id}: 已创建新版本 {new_version}") 638 | 639 | # 更新状态为完成 640 | optimization_tasks[task_id] = "completed" 641 | 642 | # 通过 WebSocket 广播版本更新消息 643 | await manager.broadcast(json.dumps({ 644 | "type": "version_update", 645 | "data": { 646 | "old_version": version, 647 | "new_version": new_version, 648 | "description": description, 649 | "model_path": output_path, 650 | "training_ids": ids, 651 | "progress": 100, 652 | "message": f"优化完成,已创建新版本{new_version}" 653 | } 654 | })) 655 | logger.info(f"任务 {task_id}: 优化任务完成") 656 | 657 | except Exception as e: 658 | # 记录错误并通过 WebSocket 发送失败消息 659 | error_message = str(e) 660 | logger.error(f"任务 {task_id} 失败: {error_message}") 661 | optimization_tasks[task_id] = f"failed: {error_message}" 662 | 663 | await manager.broadcast(json.dumps({ 664 | "type": "optimization_failed", 665 | "data": { 666 | "version": version, 667 | "error": error_message, 668 | "task_id": task_id, 669 | "progress": 0, 670 | "message": f"优化失败: {error_message}" 671 | } 672 | })) 673 | finally: 674 | if 'session' in locals(): 675 | session.close() 676 | 677 | @app.post("/addtraining", response_model=ResponseWrapper) 678 | async def add_training(request: TrainingRequest, background_tasks: BackgroundTasks): # 使用新模型 679 | session = None 680 | try: 681 | # 获取ID列表 682 | ids = request.ids 683 | version = request.version 684 | 685 | # 参数校验 686 | if not ids: 687 | return ResponseWrapper( 688 | status_code=400, 689 | detail="error", 690 | data={"message": "未提供有效ID列表"} 691 | ) 692 | if not version: 693 | return ResponseWrapper( 694 | status_code=400, 695 | detail="error", 696 | data={"message": "必须提供版本号参数"} 697 | ) 698 | 699 | session = SessionLocal() 700 | 701 | # 查询数据库并收集数据 702 | training_data = [] 703 | for interaction_id in ids: 704 | interaction = session.query(Interaction).get(interaction_id) 705 | if interaction: 706 | training_data.append({ 707 | "id": interaction.id, 708 | "question": interaction.question, 709 | "reasoning": interaction.reasoning, 710 | "modelResponse": interaction.modelResponse, 711 | "timestamp": interaction.timestamp.isoformat() 712 | }) 713 | 714 | if not training_data: 715 | return ResponseWrapper( 716 | status_code=404, 717 | detail="error", 718 | data={"message": "未找到匹配的记录"} 719 | ) 720 | 721 | # 生成任务ID 722 | timestamp = datetime.now().strftime("%Y%m%d%H%M%S") 723 | task_id = f"optimization_task_{version}_{timestamp}" 724 | 725 | # 在后台启动优化任务前先设置状态 726 | optimization_tasks[task_id] = "pending" 727 | 728 | # 将训练数据和任务信息保存为全局变量,以便后台任务使用 729 | # 这样可以避免在后台任务中重新查询数据库 730 | task_info = { 731 | "training_data": 
training_data, 732 | "version": version, 733 | "ids": [item["id"] for item in training_data], 734 | "task_id": task_id 735 | } 736 | 737 | # 添加后台任务 - 使用普通函数而不是异步函数 738 | background_tasks.add_task( 739 | start_optimization_task, 740 | task_info 741 | ) 742 | 743 | # 立即返回响应,不等待优化任务完成 744 | logger.info(f"已创建优化任务 {task_id},将在后台处理 {len(training_data)} 条数据") 745 | return ResponseWrapper( 746 | status_code=200, 747 | detail="success", 748 | data={ 749 | "message": f"成功收集 {len(training_data)} 条训练数据,已创建后台优化任务", 750 | "task_id": task_id, 751 | "exported_ids": [item["id"] for item in training_data], 752 | "version": version, # 返回版本号用于验证 753 | "optimization_status": "pending" # 返回初始优化状态 754 | } 755 | ) 756 | 757 | except Exception as e: 758 | logger.error(f"创建优化任务失败: {str(e)}") 759 | return ResponseWrapper( 760 | status_code=500, 761 | detail="error", 762 | data={"message": f"处理失败: {str(e)}"} 763 | ) 764 | finally: 765 | if session: 766 | session.close() 767 | 768 | # 新增函数:启动优化任务的普通函数 769 | def start_optimization_task(task_info): 770 | """启动优化任务的普通函数,用于后台任务""" 771 | # 创建一个新的事件循环 772 | loop = asyncio.new_event_loop() 773 | asyncio.set_event_loop(loop) 774 | 775 | # 在新的事件循环中运行异步任务 776 | try: 777 | # 发送初始通知 778 | loop.run_until_complete(manager.broadcast(json.dumps({ 779 | "type": "optimization_created", 780 | "data": { 781 | "task_id": task_info["task_id"], 782 | "status": "pending", 783 | "progress": 0, 784 | "message": f"已创建优化任务,准备处理 {len(task_info['training_data'])} 条数据", 785 | "version": task_info["version"], 786 | "ids": task_info["ids"] 787 | } 788 | }))) 789 | 790 | # 设置状态为 running 791 | optimization_tasks[task_info["task_id"]] = "running" 792 | 793 | # 执行实际的优化任务 794 | loop.run_until_complete(run_dspy_optimization( 795 | task_info["training_data"], 796 | task_info["version"], 797 | task_info["ids"] 798 | )) 799 | except Exception as e: 800 | logger.error(f"优化任务执行失败: {str(e)}") 801 | # 设置任务状态为失败 802 | optimization_tasks[task_info["task_id"]] = f"failed: {str(e)}" 803 | # 发送失败通知 804 | loop.run_until_complete(manager.broadcast(json.dumps({ 805 | "type": "optimization_failed", 806 | "data": { 807 | "version": task_info["version"], 808 | "error": str(e), 809 | "task_id": task_info["task_id"], 810 | "progress": 0, 811 | "message": f"优化失败: {str(e)}" 812 | } 813 | }))) 814 | finally: 815 | # 关闭事件循环 816 | loop.close() 817 | 818 | # 新增的 API 方法:创建新版本 819 | @app.post("/create_version") 820 | async def create_version(file_path: str = Body(..., embed=True), old_version: str = Body(..., embed=True), description: str = Body(..., embed=True)): 821 | try: 822 | # 创建会话 823 | session = SessionLocal() 824 | 825 | # 解析旧版本号 826 | major, minor, patch = map(int, old_version.split('.')) 827 | 828 | # 递增版本号 829 | new_version = f"{major}.{minor}.{patch + 1}" 830 | 831 | # 检查新版本号是否已存在 832 | existing_version = session.query(Version).filter(Version.version == new_version).first() 833 | if existing_version: 834 | return ResponseWrapper(status_code=400, detail="error", data={"message": f"Version {new_version} already exists"}) 835 | 836 | # 创建新版本实例 837 | new_version_instance = Version( 838 | version=new_version, 839 | file_path=file_path, 840 | description=description 841 | ) 842 | 843 | # 添加到会话并提交 844 | session.add(new_version_instance) 845 | session.commit() 846 | 847 | return ResponseWrapper(status_code=200, detail="success", data={"message": "Version created successfully", "new_version": new_version}) 848 | except Exception as e: 849 | session.rollback() 850 | return ResponseWrapper(status_code=500, detail="error", 
data={"message": str(e)}) 851 | finally: 852 | session.close() 853 | 854 | @app.get("/versions", response_model=ResponseWrapper) 855 | async def get_versions(): 856 | try: 857 | # 创建会话 858 | session = SessionLocal() 859 | 860 | # 查询所有版本并按创建时间排序 861 | versions = session.query(Version).order_by(Version.created_at.asc()).all() 862 | 863 | # 提取版本号 864 | version_list = [{"version": version.version, "file_path": version.file_path, "description": version.description, "created_at": version.created_at} for version in versions] 865 | 866 | return ResponseWrapper(status_code=200, detail="success", data={"versions": version_list}) 867 | except Exception as e: 868 | return ResponseWrapper(status_code=500, detail="error", data={"message": str(e)}) 869 | finally: 870 | session.close() 871 | 872 | @app.get("/health",response_model=ResponseWrapper) 873 | async def health_check(): 874 | return ResponseWrapper(status_code=200, detail="success", data={"status": "healthy"}) 875 | 876 | # 添加一个 API 端点查询优化任务状态 877 | @app.get("/optimization_status/{task_id:path}", response_model=ResponseWrapper) 878 | async def get_optimization_status(task_id: str): 879 | try: 880 | if task_id in optimization_tasks: 881 | status = optimization_tasks[task_id] 882 | return ResponseWrapper( 883 | status_code=200, 884 | detail="success", 885 | data={ 886 | "task_id": task_id, 887 | "status": status 888 | } 889 | ) 890 | else: 891 | return ResponseWrapper( 892 | status_code=404, 893 | detail="error", 894 | data={"message": f"未找到对应的优化任务: {task_id}"} 895 | ) 896 | except Exception as e: 897 | return ResponseWrapper( 898 | status_code=500, 899 | detail="error", 900 | data={"message": f"查询失败: {str(e)}"} 901 | ) 902 | 903 | if __name__ == "__main__": 904 | import uvicorn 905 | uvicorn.run(app, host="0.0.0.0", port=8080) 906 | -------------------------------------------------------------------------------- /broadcast.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect 2 | from loguru import logger 3 | # 定义一个 WebSocket 连接管理器 4 | class ConnectionManager: 5 | def __init__(self): 6 | self.active_connections = [] 7 | 8 | async def connect(self, websocket: WebSocket): 9 | await websocket.accept() 10 | self.active_connections.append(websocket) 11 | 12 | def disconnect(self, websocket: WebSocket): 13 | self.active_connections.remove(websocket) 14 | 15 | async def broadcast(self, message: str): 16 | dead_connections = [] 17 | for connection in self.active_connections: 18 | try: 19 | await connection.send_text(message) 20 | logger.info(f"已向客户端推送消息: {message}") 21 | except Exception as e: 22 | dead_connections.append(connection) 23 | logger.error(f"广播消息时出错: {str(e)}") 24 | continue 25 | 26 | # 清理已断开的连接 27 | for dead_connection in dead_connections: 28 | try: 29 | self.active_connections.remove(dead_connection) 30 | except ValueError: 31 | pass -------------------------------------------------------------------------------- /docs/demo_18.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "拉丁学名": "Eptatretus burgeri", 4 | "命名年份": "1855", 5 | "作者": "Girard", 6 | "中文学名": "蒲氏黏盲鳗", 7 | "界": "动物界", 8 | "门": "脊索动物门", 9 | "纲": "圆口纲", 10 | "目": "盲鳗目", 11 | "科": "盲鳗科", 12 | "属": "黏盲鳗属", 13 | "种": "蒲氏黏盲鳗", 14 | "自然分布地": "我国台湾东北海域、黄海南部、东海,以及日本南部海域", 15 | "生活习性": "蒲氏黏盲鳗栖息于水深达200米的海域,肉食性,营寄生生活。繁殖期在秋、冬季,亲鱼游向深水产卵。卵粒大,长椭球形,附着于海藻、礁石上发育。", 16 | "生物特征": 
"蒲氏黏盲鳗体鳗形,口圆形,端位,具短须。眼退化,埋于皮下。分泌黏液多。体黄褐色,以鱼体背中线处具1条白色纵带为特征。体长为40-60厘米。鱼体含有一种芳胺类物质,即黏盲鳗素,对心脏有刺激起博等作用。" 17 | }, 18 | { 19 | "拉丁学名": "Eptatretus okinoseanus", 20 | "命名年份": "1904", 21 | "中文学名": "紫黏盲鳗", 22 | "作者": "Dean", 23 | "界": "动物界", 24 | "门": "脊索动物门", 25 | "纲": "圆口纲", 26 | "目": "盲鳗目", 27 | "科": "盲鳗科", 28 | "属": "黏盲鳗属", 29 | "种": "紫黏盲鳗", 30 | "自然分布地": "我国南海北部、东海,以及日本南部海域", 31 | "生活习性": "紫黏盲鳗栖息于水深200至600米的海域。其生态习性与蒲氏黏盲鳗相似,肉食性,营寄生生活。繁殖期在秋、冬季,亲鱼游向深水产卵。卵粒大,长椭球形,附着于海藻、礁石上发育。", 32 | "生物特征": "紫黏盲鳗体鳗形,体长为60至80厘米。体紫黑色,外鳃孔8对,眼表皮、外鳃孔和腹侧正中皮褶白色。" 33 | }, 34 | { 35 | "拉丁学名": "Eptatretus rubicundus", 36 | "命名年份": "2010", 37 | "作者": "Lee et Mok", 38 | "中文学名": "红盲鳗", 39 | "界": "动物界", 40 | "门": "脊索动物门", 41 | "纲": "圆口纲", 42 | "目": "盲鳗目", 43 | "科": "盲鳗科", 44 | "属": "红盲鳗属", 45 | "种": "红盲鳗", 46 | "自然分布地": "中国台湾海域", 47 | "生活习性": "红身黏盲鳗生活在深海区域,通常在海底泥沙中栖息。它们通过分泌大量黏液来防御捕食者。繁殖季节和繁殖特点尚不明确。", 48 | "生物特征": "红身黏盲鳗体形较粗壮,呈鳗形。鼻管末端有1对小瓣膜,无眼点。体全身粉红色,背中线处无白色纵带。具5对外鳃孔,与鳃区黏液孔均呈一直线排列。黏液孔多达100-102个。口漏斗状,外缘具3枚多尖齿和7枚单尖齿,内缘具2枚多尖齿及7枚单尖齿。" 49 | }, 50 | { 51 | "拉丁学名": "Eptatretus nelsoni", 52 | "命名年份": "1994", 53 | "作者": "Huang et Mok", 54 | "中文学名": "纽氏黏盲鳗", 55 | "界": "动物界", 56 | "门": "脊索动物门", 57 | "纲": "圆口纲", 58 | "目": "盲鳗目", 59 | "科": "盲鳗科", 60 | "属": "黏盲鳗属", 61 | "种": "纽氏黏盲鳗", 62 | "自然分布地": "中国台湾海域", 63 | "生活习性": "纽氏黏盲鳗为深水半寄生种,通常生活在深海区域。它们通过分泌黏液来防御捕食者。繁殖季节和繁殖特点尚不明确。", 64 | "生物特征": "纽氏黏盲鳗体细长,眼退化,埋于皮下。口漏斗状,有口须。鳃孔每侧4个,呈堆状。每个鳃孔周缘均呈白色,左侧最后鳃孔扩大。鳃孔前有黏液孔19个,鳃孔上无黏液孔,躯干部有黏液孔35个,尾部有黏液孔8个。齿式8+3/2+7。体长约20 cm。" 65 | }, 66 | { 67 | "拉丁学名": "Eptatretus yangi", 68 | "命名年份": "1958", 69 | "作者": "Yangi Teng", 70 | "中文学名": "杨氏黏盲鳗", 71 | "界": "动物界", 72 | "门": "脊索动物门", 73 | "纲": "圆口纲", 74 | "目": "盲鳗目", 75 | "科": "盲鳗科", 76 | "属": "黏盲鳗属", 77 | "种": "杨氏黏盲鳗", 78 | "自然分布地": "中国台湾海域", 79 | "生活习性": "杨氏黏盲鳗栖息在较浅的海域,水深20-50米。它们是肉食性,营寄生生活。通常通过拖网捕获。", 80 | "生物特征": "杨氏黏盲鳗体鳗形。外鳃孔每侧5个,相互接近,不规则地排成一堆。鳃孔前有黏液孔16-23个,鳃孔上无黏液孔,躯干部有黏液孔42-47个,尾部有黏液孔8-11个。齿式(6-8)+3/2+(6-8)。体灰褐色,腹部灰色。体长约30 cm。" 81 | }, 82 | { 83 | "拉丁学名": "Eptatretus cheni", 84 | "命名年份": "1975", 85 | "作者": "Shen et Tao", 86 | "中文学名": "陈氏黏盲鳗", 87 | "界": "动物界", 88 | "门": "脊索动物门", 89 | "纲": "圆口纲", 90 | "目": "盲鳗目", 91 | "科": "盲鳗科", 92 | "属": "黏盲鳗属", 93 | "种": "陈氏黏盲鳗", 94 | "自然分布地": "中国台湾海域", 95 | "生活习性": "陈氏黏盲鳗为深水半寄生种,通常生活在深海区域。它们通过分泌黏液来防御捕食者。繁殖季节和繁殖特点尚不明确。", 96 | "生物特征": "陈氏黏盲鳗体鳗形,眼已退化,埋于皮下。口漏斗状,口缘有须。鳃孔每侧5个,呈直线排列。各鳃孔间距离短。鳃孔前有26个黏液孔,鳃孔上无黏液孔,躯干部有45-47个黏液孔,尾部有7-8个黏液孔。齿式(9-11)+3/2+11。体呈暗灰色。体长约16 cm。" 97 | }, 98 | { 99 | "拉丁学名": "Eptatretus sheni", 100 | "命名年份": "1994", 101 | "作者": "Huang et Mok", 102 | "中文学名": "沈氏黏盲鳗", 103 | "界": "动物界", 104 | "门": "脊索动物门", 105 | "纲": "圆口纲", 106 | "目": "盲鳗目", 107 | "科": "盲鳗科", 108 | "属": "黏盲鳗属", 109 | "种": "沈氏黏盲鳗", 110 | "自然分布地": "中国台湾海域", 111 | "生活习性": "沈氏黏盲鳗为深水半寄生种,通常生活在深海区域。它们通过分泌黏液来防御捕食者。繁殖季节和繁殖特点尚不明确。", 112 | "生物特征": "沈氏黏盲鳗体细长似鳗,眼退化,埋于皮下,眼点较明显。口漏斗状,周缘具须。鳃孔每侧6个,呈直线状紧密排列。每个鳃孔周边均有白环。鳃孔前有13-18个黏液孔,鳃孔上有0-2个黏液孔,躯干部有39-46个黏液孔,尾部有8-12个黏液孔。齿式11+3/3+10。体略带褐色。体长约45 cm。" 113 | }, 114 | { 115 | "拉丁学名": " Eptatretus taiwanae", 116 | "命名年份": "1975", 117 | "作者": "Shen et Tao", 118 | "中文学名": "台湾黏盲鳗", 119 | "界": "动物界", 120 | "门": "脊索动物门", 121 | "纲": "圆口纲", 122 | "目": "盲鳗目", 123 | "科": "盲鳗科", 124 | "属": "黏盲鳗属", 125 | "种": "台湾黏盲鳗", 126 | "自然分布地": "中国台湾海域", 127 | "生活习性": "台湾黏盲鳗为深水半寄生种,通常生活在深海区域。它们通过分泌黏液来防御捕食者。繁殖季节和繁殖特点尚不明确。", 128 | "生物特征": "台湾黏盲鳗体细长似鳗,眼退化,埋于皮下。口漏斗状,周缘有须。鳃孔每侧6个,排列成2列,有的不甚规则。鳃孔前有16-19个黏液孔,鳃孔上无黏液孔,躯干部有36-42个黏液孔,尾部有6-9个黏液孔。齿式(6-8)+3/2+(6-7)。体淡红褐色,背缘褐色。" 129 | }, 130 | { 131 | "拉丁学名": "Eptatretus atami", 132 | "命名年份": "1904", 133 | "作者": "Dean", 134 | "中文学名": "阿塔氏黏盲鳗", 135 | "界": "动物界", 136 | 
"门": "脊索动物门", 137 | "纲": "圆口纲", 138 | "目": "盲鳗目", 139 | "科": "盲鳗科", 140 | "属": "黏盲鳗属", 141 | "种": "阿塔氏黏盲鳗", 142 | "自然分布地": "分布于黄海南部海域,以及日本青森以南海域。", 143 | "生活习性": "阿塔氏黏盲鳗产卵期为4-8月份;怀卵量少,为15-30粒。卵径为25-26 mm;卵两端密生附着丝。属肉食性,以其他鱼类和底栖动物为食。", 144 | "生物特征": "阿塔氏黏盲鳗体鳗形。和蒲氏黏盲鳗相似,鳃孔亦为6对,但外鳃孔相互靠近,并呈两列不规则排列。齿式(10-13)+3/2+(10-13)。从头部到尾部腹面并排有两列黏液孔,能分泌大量黏液。体茶褐色,外鳃孔周缘白色。体长50 cm左右。肉可食,皮可制革。" 145 | }, 146 | { 147 | "拉丁学名": "Myxine formosana", 148 | "命名年份": "2001", 149 | "作者": "Mok et Kuo", 150 | "中文学名": "台湾盲鳗", 151 | "界": "动物界", 152 | "门": "脊索动物门", 153 | "纲": "圆口纲", 154 | "目": "盲鳗目", 155 | "科": "盲鳗科", 156 | "属": "盲鳗属", 157 | "种": "台湾盲鳗", 158 | "自然分布地": "分布于我国东海、台湾海域", 159 | "生活习性": "台湾盲鳗为半寄生深水种,生活在深水区域。体长70 cm左右。", 160 | "生物特征": "台湾盲鳗体鳗形,眼退化。外鼻孔1个,开口于吻端。口漏斗状,口缘有短须。每侧1个鳃孔,常具白缘。鳃囊5个。鳃孔前有26-32个黏液孔,鳃孔上无黏液孔,躯干部有54-58个黏液孔,尾部有14个黏液孔。齿较多,齿式(8-12)+3/2+(8-12)。体暗灰色。" 161 | }, 162 | { 163 | "拉丁学名": "Lethenteron camtschaticum ", 164 | "命名年份": "1811", 165 | "作者": "Tilesius", 166 | "中文学名": "东亚叉牙七鳃鳗", 167 | "界": "动物界", 168 | "门": "脊索动物门", 169 | "纲": "圆口纲", 170 | "目": "七鳃鳗目", 171 | "科": "七鳃鳗科", 172 | "属": "叉牙七鳃鳗属", 173 | "种": "东亚叉牙七鳃鳗", 174 | "自然分布地": "分布于中国的黑龙江、图们江流域,偶见于鸭绿江口及江苏近岸水域;以及日本海。", 175 | "生活习性": "东亚叉牙七鳃鳗为洄游性鱼形动物。其幼体至成体栖息于海中,营半寄生生活。性成熟个体于冬季上溯至河口,翌年5-6月在河中筑巢产卵,每次产8万-10万粒,卵黏附于沙砾上发育。亲体产卵后死亡。", 176 | "生物特征": "东亚叉牙七鳃鳗体鳗形。口腹位,呈吸盘状,周缘具穗状突起。眼发达,位于头前部。鳃孔7对,位于眼后,故又有“八目鳗”之称。两背鳍略分离,第2背鳍较高而长,末端附近呈黑色。尾鳍矛状,褐色或黑色。体青绿色,腹部浅黄色或灰白色。体长为50-60 cm。" 177 | }, 178 | { 179 | "拉丁学名": "Lampetra reissneri", 180 | "命名年份": "1869", 181 | "作者": "Dybowski", 182 | "中文学名": "雷氏七鳃鳗", 183 | "界": "动物界", 184 | "门": "脊索动物门", 185 | "纲": "圆口纲", 186 | "目": "七鳃鳗目", 187 | "科": "七鳃鳗科", 188 | "属": "七鳃鳗属", 189 | "种": "雷氏七鳃鳗", 190 | "自然分布地": "中国的黄海北部、黑龙江、松花江、图们江流域,以及日本海等", 191 | "生活习性": "雷氏七鳃鳗其生态习性仍不甚明了。孟庆闻等(1995)、尼科尔斯基(1960)、朱元鼎等(2001)都认为其是不进行洄游的淡水种类,并有在兴凯湖等繁殖的具体报告。而中坊徹次(1993)、刘瑞玉(2008)明确其为江海洄游种类。益田一(1984)记述其幼体从夏到冬完成变态,随后降海潜泥底生活;翌春产卵。", 192 | "生物特征": "雷氏七鳃鳗形态特征与日本七鳃鳗相似,只是吻较宽短,上口齿板齿钝尖,两背鳍连续,尾鳍色较淡。体长可达20 cm,但以小型个体较多见。" 193 | }, 194 | { 195 | "拉丁学名": "Chimaera phantasma", 196 | "命名年份": "1900", 197 | "作者": "Jordan et Snyder", 198 | "中文学名": "黑线银鲛", 199 | "界": "动物界", 200 | "门": "脊索动物门", 201 | "纲": "软骨鱼纲", 202 | "目": "银鲛目", 203 | "科": "银鲛科", 204 | "属": "银鲛属", 205 | "种": "黑线银鲛", 206 | "自然分布地": "分布于我国东海、黄海、台湾海域,以及日本北海道以南海域、朝鲜半岛西南部海域。", 207 | "生活习性": "黑线银鲛属冷温性较深水分布种,栖息水深90-500米。冬季向近海洄游。卵生,卵大且呈纺锤形。主食软体动物。", 208 | "生物特征": "黑线银鲛头高而侧扁。体侧扁,延长,向后细小。尾呈鞭状。雄性的眼前上方具一柄状额鳍脚,腹鳍内侧具一三叉形鳍脚。吻短。口横裂。上颌前齿板喙状;侧齿板宽大,呈三角形。背鳍2个,以低鳍膜相连,第1背鳍具一扁长硬棘。臀鳍低平,后端尖突,与尾鳍下叶分隔处有一缺刻。侧线小波曲状。体银白色,头上部、第1至第2背鳍上部、背侧上部褐色。侧线下方,胸鳍、腹鳍间有一黑色纵带。全长可达1米。" 209 | }, 210 | { 211 | "拉丁学名": "Hydrolagus mitsukurii", 212 | "命名年份": "1904", 213 | "作者": "Jordan et Snyder", 214 | "中文学名": "箕作兔银鲛", 215 | "界": "动物界", 216 | "门": "脊索动物门", 217 | "纲": "软骨鱼纲", 218 | "目": "银鲛目", 219 | "科": "银鲛科", 220 | "属": "兔银鲛属", 221 | "种": "箕作兔银鲛", 222 | "自然分布地": "分布于我国东海、南海,以及日本南部海域、冲绳海漕。", 223 | "生活习性": "箕作兔银鲛栖息水深600-900米。", 224 | "生物特征": "箕作兔银鲛为兔银鲛属鱼类,以臀鳍缺失与银鲛属鱼类相区别。上颌具6枚齿板。第1背鳍鳍棘后缘呈锯齿状,其长度几乎与头长相等。头前部高耸。尾鳍丝状部显著比头长。雄鱼交尾器分2支。体褐色,腹部色略浅,各鳍呈褐色。体侧具若干与侧线平行的浅色纵带。全长58-85厘米。" 225 | }, 226 | { 227 | "拉丁学名": "Hydrolagus purpurescens", 228 | "命名年份": "1905", 229 | "作者": "Gilbert", 230 | "中文学名": "紫银鲛", 231 | "界": "动物界", 232 | "门": "脊索动物门", 233 | "纲": "软骨鱼纲", 234 | "目": "银鲛目", 235 | "科": "银鲛科", 236 | "属": "兔银鲛属", 237 | "种": "紫银鲛", 238 | "自然分布地": "分布于我国南海,以及日本岩手海域、美国夏威夷海域等。", 239 | "生活习性": "紫银鲛栖息水深1120-1920米。", 240 | "生物特征": "紫银鲛无臀鳍。尾鳍下叶前无缺刻。头前部缓尖。第1背鳍鳍棘后缘光滑。尾鳍丝状部较短。侧线有小波纹状弯曲,但侧线上部无短横带。雄鱼交尾器分3支。体褐色,略带紫色,无斑点。全长约80厘米。" 241 | }, 242 | { 243 | "拉丁学名": "Hydrolagus ogilbyi", 244 | 
"命名年份": "1898", 245 | "作者": "Waite", 246 | "中文学名": "奥氏兔银鲛", 247 | "界": "动物界", 248 | "门": "脊索动物门", 249 | "纲": "软骨鱼纲", 250 | "目": "银鲛目", 251 | "科": "兔银鲛科", 252 | "属": "兔银鲛属", 253 | "种": "奥氏兔银鲛", 254 | "自然分布地": "中国黄海、东海、南海、台湾海域,日本南部海域,澳大利亚海域,印度-西太平洋", 255 | "生活习性": "奥氏兔银鲛海洋底栖鱼类,栖息水深120-350米。游泳能力弱,以小型底栖动物为食。偶见于底拖网渔获。", 256 | "生物特征": "奥氏兔银鲛有低平臀鳍,并与尾鳍下叶相连,无缺刻。侧线波纹状,侧线上方具许多短横纹。体淡褐色,腹部色浅。背部有一宽纵带。侧线灰色,各鳍略呈褐色。全长75-95厘米。该鱼背鳍硬棘中空,具毒腺。" 257 | }, 258 | { 259 | "拉丁学名": "Rhinochimaera pacifica", 260 | "命名年份": "1895", 261 | "作者": "Mitsukuri", 262 | "中文学名": "太平洋长吻银鲛", 263 | "界": "动物界", 264 | "门": "脊索动物门", 265 | "纲": "软骨鱼纲", 266 | "目": "银鲛目", 267 | "科": "长吻银鲛科", 268 | "属": "长吻银鲛属", 269 | "种": "太平洋长吻银鲛", 270 | "自然分布地": "中国东海、南海,日本北海道以南海域,新西兰海域,秘鲁海域", 271 | "生活习性": "太平洋长吻银鲛深海底层鱼类,通常栖息水深750-1100米。卵生。", 272 | "生物特征": "太平洋长吻银鲛吻尖长,呈剑状,基部近侧扁。口小,横裂,有齿板3对。躯体侧扁,延长。无臀鳍。雄鱼交尾器棒状。尾鳍上叶为肉质,其边缘有1列30-50个齿状突起。侧线几乎平直。体与各鳍均为黑褐色。全长可达1.3米。" 273 | }, 274 | { 275 | "拉丁学名": "Rhinochimaera africana", 276 | "命名年份": "1990", 277 | "作者": "Stehmann et Ebert", 278 | "中文学名": "非洲长吻银鲛", 279 | "界": "动物界", 280 | "门": "脊索动物门", 281 | "纲": "软骨鱼纲", 282 | "目": "银鲛目", 283 | "科": "长吻银鲛科", 284 | "属": "长吻银鲛属", 285 | "种": "非洲长吻银鲛", 286 | "自然分布地": "中国台湾海域,印度-太平洋深水域", 287 | "生活习性": "非洲长吻银鲛为深海鱼类。", 288 | "生物特征": "非洲长吻银鲛体延长,侧扁。吻长,平扁,吻长大于或等于头长的2倍。眼上侧线管与眼下侧线管于吻腹面交会,交会点到吻端较到鼻管为近。第1背鳍鳍棘较软条为长。胸鳍大。尾鳍上叶无小棘。体黑色。" 289 | } 290 | ] -------------------------------------------------------------------------------- /dspy_evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dspy 3 | from dotenv import load_dotenv 4 | from loguru import logger 5 | 6 | # 确保环境变量已加载 7 | load_dotenv(override=True) 8 | 9 | class DspyEvaluationProcessor: 10 | def __init__(self): 11 | # 初始化评估用的语言模型 12 | self.eval_lm = dspy.LM( 13 | f'{os.getenv("Train_LLM_TYPE")}/{os.getenv("Train_LLM_MODEL")}', 14 | base_url=os.getenv("Train_OPENAI_BASE_URL"), 15 | api_key=os.getenv("Train_OPENAI_API_KEY"), 16 | stream=True # 直接在创建模型时启用流式模式 17 | ) 18 | # 移除全局配置,避免影响其他模块 19 | # dspy.configure(lm=self.eval_lm) 20 | 21 | # 评估相关功能 22 | class BiologicalRetrievalEvaluation(dspy.Signature): 23 | """评估生物检索任务的推理步骤质量""" 24 | question = dspy.InputField(desc="用户的查询问题") 25 | standard_reasoning = dspy.InputField(desc="标准的推理步骤") 26 | predicted_reasoning = dspy.InputField(desc="模型产生的推理步骤") 27 | evaluation_score = dspy.OutputField(desc="评分(0-100)") 28 | evaluation_feedback = dspy.OutputField(desc="详细的评分解释,包括各个方面的得分") 29 | 30 | class LLMBiologicalEvaluator(dspy.Module): 31 | def __init__(self, eval_lm): 32 | super().__init__() 33 | # 使用传入的评估模型 34 | self.eval_lm = eval_lm 35 | 36 | # 使用思维链方式增强评估能力,直接提供指令,并使用专用评估模型 37 | self.eval_chain = dspy.ChainOfThought( 38 | DspyEvaluationProcessor.BiologicalRetrievalEvaluation, 39 | instructions=""" 40 | 您是一位专业的生物检索质量评估专家。您的任务是评估模型产生的生物检索推理步骤质量。 41 | 42 | 请根据以下标准进行评分(总分100分): 43 | 44 | 1. 检索条件识别准确性 (20分) 45 | - 是否正确识别了所有检索条件 46 | - 是否正确区分了精确条件和模糊条件 47 | 48 | 2. 需求识别准确性 (10分) 49 | - 是否正确识别了查询中的所有需求 50 | 51 | 3. 检索策略合理性 (40分) 52 | - 是否先执行精确检索,后执行模糊检索 (10分) 53 | - 后续检索步骤是否基于前面步骤的结果 (10分) 54 | - 筛选顺序是否从限制性强的条件开始 (10分) 55 | - 前面步骤检索到的内容是否把全部信息传递给后面检索所使用的工具 (10分) 56 | 57 | 4. 结果整合正确性和完整性 (30分) 58 | - 答案准确性,与标准答案相比,核心事实是否一致,即使表达方式不同,只要核心信息正确也应得高分 (25分) 59 | - 提取所有需要汇总的信息 (5分) 60 | 61 | 评估须知: 62 | 1. 在评估答案准确性时,请比对预测结果与标准答案的内容,理解语义等价性而非只做字符匹配 63 | 2. 即使表达方式不同,只要内容实质相同,也应给予高分 64 | 3. 同一事实的不同表述方式应被视为正确,如"共有3种"和"总共有三种"表达的是相同含义 65 | 4. 
对每个评分维度提供详细分析和具体理由 66 | """ 67 | ) 68 | # 显式设置评估链使用评估模型 69 | self.eval_chain.lm = self.eval_lm 70 | 71 | def forward(self, example, prediction): 72 | """评估预测与标准答案的匹配程度 73 | 74 | Args: 75 | example: 包含标准答案的示例 76 | prediction: 模型的预测结果 77 | 78 | Returns: 79 | float: 0-1之间的分数 80 | """ 81 | # 如果没有推理步骤,使用简单的答案匹配 82 | if not hasattr(example, 'reasoning') or not hasattr(prediction, 'reasoning'): 83 | return 1.0 if dspy.evaluate.answer_exact_match(example, prediction) else 0.0 84 | 85 | # 准备标准推理步骤 86 | standard_reasoning = "\n".join(example.reasoning) if isinstance(example.reasoning, list) else example.reasoning 87 | 88 | # 获取预测的推理步骤 89 | predicted_reasoning = prediction.reasoning if hasattr(prediction, 'reasoning') else "" 90 | 91 | try: 92 | # 直接使用评估链,不再使用 context 管理器 93 | # 因为我们已经在创建模型时启用了流式模式,并显式设置了评估链使用评估模型 94 | evaluation = self.eval_chain( 95 | question=example.question, 96 | standard_reasoning=standard_reasoning, 97 | predicted_reasoning=predicted_reasoning 98 | ) 99 | 100 | # 将分数从0-100转换为0-1 101 | try: 102 | score = float(evaluation.evaluation_score) / 100.0 103 | # 边界处理 104 | score = max(0.0, min(1.0, score)) 105 | logger.info(f"评估结果: {score} (问题: {example.question[:30]}...)") 106 | return score 107 | except: 108 | # 如果分数转换失败,默认返回0.5 109 | logger.warning(f"评估分数转换失败,使用默认分数0.5") 110 | return 0.5 111 | except Exception as e: 112 | logger.error(f"评估过程出错: {str(e)}") 113 | # 出错时返回默认分数 114 | return 0.5 115 | 116 | def llm_biological_metric(self, example, pred, trace=None, frac=1.0): 117 | """使用大模型评估函数""" 118 | try: 119 | # 创建评估器实例,传入评估模型 120 | evaluator = self.LLMBiologicalEvaluator(self.eval_lm) 121 | 122 | # 确保在 litellm 客户端级别启用流式模式 123 | if hasattr(evaluator.eval_lm, 'client'): 124 | evaluator.eval_lm.client.stream = True 125 | logger.info("已在评估模型客户端级别启用流式模式") 126 | 127 | # 执行评估 128 | result = evaluator(example, pred) 129 | return result 130 | except Exception as e: 131 | logger.error(f"评估指标计算出错: {str(e)}") 132 | # 出错时返回默认分数 133 | return 0.5 -------------------------------------------------------------------------------- /dspy_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dspy 3 | from react_tools import ReActTools, GraphVectorizer 4 | from dotenv import load_dotenv 5 | from dspy_query_db import MarineSpeciesQuery 6 | import json 7 | from loguru import logger 8 | 9 | 10 | # 确保环境变量已加载 11 | load_dotenv(override=True) 12 | 13 | MAX_ITERS = int(os.getenv("MAX_ITERS","10")) 14 | 15 | class DspyInferenceProcessor: 16 | def __init__(self): 17 | self.ragtool = ReActTools() 18 | self.graphvectorizer = GraphVectorizer() 19 | self.query_processor = MarineSpeciesQuery(os.getenv("SPECIES_DB_URL","./.dbs/marine_species.db")) 20 | # 初始化语言模型 21 | self.lm = dspy.LM( 22 | f'{os.getenv("LLM_TYPE")}/{os.getenv("LLM_MODEL")}', 23 | base_url=os.getenv("BASE_URL"), 24 | api_key=os.getenv("API_KEY") 25 | ) 26 | 27 | # 配置 dspy 使用该语言模型 28 | dspy.configure(lm=self.lm) 29 | 30 | # 初始化版本号 31 | self.predictor_version = "1.0.0" 32 | # 初始化 RactModel 33 | self.model = self.RactModel(self) 34 | # 使用 streamify 包装,获得支持流式返回的模块 35 | self.streaming_model = dspy.streamify(self.model) 36 | 37 | def find_nodes_by_node_type(self, start_node, trget_node_type): 38 | ''' 39 | 此方法会根据传入的节点名称,在图数据中以该节点为起点查找包含指定节点类型的节点列表,并返回节点数量与节点列表。 40 | start_node 为开始查找的树节点名称,只允许单个节点、 41 | trget_node_type 目标节点类型,只允许单个类型名称 42 | 返回值为从该节点开始,包含指定属性名的节点数量与节点列表 43 | 已知图数据中存在一系列的海洋生物相关信息: 44 | 1. ⽣物分类学图数据:包括"拉丁学名", "命名年份", "作者", "中文学名", 45 | 2. 
⽣物科属于数据:"界", "门", "纲", "目", "科", "属", "种"(种即是中文学名),它们的从属关系是: 界 -> 门 -> 纲 -> 目 -> 科 ->属 ->种 。 46 | 3. ⽣物特征图数据:包括"自然分布地", "生物特征","生活习性"等。 47 | 本方法可以根据给定的节点名称,在图数据中以此节点为起点查找包含指定该属性的节点或节点列表,例如1:"盲鳗科" "种" 则会返回 盲鳗科所有的种,例如2:"盲鳗科" "界" 则会返回 盲鳗科对应的界, 。 48 | 4. 因为本方法需要的参数是精准的节点属性名称(或节点类型名),建议查询的节点类型属于"自然分布地", "生物特征", "生活习性"等时,或查询返回为空时、查询失败时,先通过get_unique_vector_query_results方法获取准确的节点名称,再通过本方法获取对应的节点信息。 49 | 50 | Args: 51 | start_node: 开始查找的节点名称 52 | trget_node_type: 目标节点类型 53 | Returns: 54 | count: 节点数量 55 | nodes: 节点列表 56 | ''' 57 | nodes = self.ragtool.find_nodes_by_node_type(start_node, trget_node_type) 58 | # 如果nodes为空,则返回0,不为为空时,则返回节点数量与节点列表 59 | if not nodes: 60 | return 0,[] 61 | count = len(nodes) 62 | return count,nodes 63 | 64 | def batch_find_nodes_by_node_type(self, start_nodes, trget_node_type): 65 | ''' 66 | 此方法会根据传入包含多个开始节点的列表,批量查询指定目标节点类型的节点列表,返回多条查询的结果集。 67 | Args: 68 | start_nodes: 开始查找的节点名称列表 69 | trget_node_type: 目标节点类型 70 | Returns: 71 | traget_nodes_list: 多条查询结果的列表 72 | ''' 73 | # 字典格式为,key为节点名称,value为包含指定属性名的节点数量与节点列表 74 | traget_nodes_list = {} 75 | for node in start_nodes: 76 | count, nodes = self.find_nodes_by_node_type(start_nodes, trget_node_type) 77 | traget_nodes_list[node] = {"count": count, "nodes": nodes} 78 | return traget_nodes_list 79 | 80 | def get_unique_vector_query_results(self, query, node_type=None, search_type="all", top_k=1, better_than_threshold=0.65): 81 | """通过向量搜索,获取与查询最相关的实体或关系 82 | Args: 83 | query: 搜索查询文本 84 | node_type: 实体类型筛选条件,如果为None则不筛选。可选值包括: 85 | - species (种、中文名) 86 | - 界 87 | - 门 88 | - 纲 89 | - 目 90 | - 科 91 | - 属 92 | - 位置 93 | - 繁殖特征 94 | - 行为特征 95 | - 体型 96 | - 体色 97 | - 体长 98 | - 特殊特征 99 | search_type: 搜索类型,'all'/'entity'/'relation' 100 | top_k: 返回结果的数量 101 | better_than_threshold: 相似度阈值,只返回相似度高于此值的结果 102 | Returns: 103 | list: 搜索结果,精准的实体名列表 104 | """ 105 | try: 106 | # 添加超时控制 107 | import asyncio 108 | from concurrent.futures import ThreadPoolExecutor 109 | 110 | # 使用线程池执行可能耗时的操作 111 | with ThreadPoolExecutor() as executor: 112 | # 设置超时时间(例如10秒) 113 | future = executor.submit(self.graphvectorizer.search, query, node_type, search_type, top_k, better_than_threshold) 114 | try: 115 | result = future.result(timeout=10) # 10秒超时 116 | return result 117 | except TimeoutError: 118 | logger.error(f"向量搜索超时: query={query}, node_type={node_type}") 119 | return [] # 超时返回空列表 120 | except Exception as e: 121 | # 捕获所有异常,确保不会导致整个流程崩溃 122 | logger.error(f"向量搜索出错: {str(e)}, query={query}, node_type={node_type}") 123 | return [] # 出错返回空列表 124 | 125 | def get_node_attribute(self,node_id): 126 | ''' 127 | 根据节点id获取所有属性,包括中文学名、拉丁学名、命名年份、作者、node_type 128 | Args: 129 | node_id: 节点id 130 | Returns: 131 | list: 属性列表 132 | ''' 133 | return self.ragtool.get_node_attribute(node_id) 134 | def get_adjacent_node_descriptions(self, nodenames): 135 | ''' 136 | 此方法会根据传入的节点列表,获取每个节点相邻所有节点描述,合并到一个列表中返回,非精准检索,谨慎使用 137 | Args: 138 | nodenames: 节点名称列表 139 | Returns: 140 | list: 相邻节点描述列表 141 | ''' 142 | return self.ragtool.get_adjacent_node_descriptions(nodenames) 143 | 144 | def nodes_count(self, nodes): 145 | ''' 146 | 此方法会根据传入的节点列表,统计数量,返回数量 147 | Args: 148 | nodes: 节点列表 149 | Returns: 150 | int: 节点数量 151 | ''' 152 | if not nodes: 153 | return 0 154 | return len(nodes) 155 | 156 | def MarineSpeciesQuery(self,query): 157 | """根据自然语言查询数据库 158 | Args: 159 | natural_language_query: 用户的自然语言查询 160 | 161 | Returns: 162 | 查询结果和解释 163 | """ 164 | result = self.query_processor.query_database(query) 165 | return self.query_processor.format_query_results(result) 166 | 167 | # 定义签名类 168 
| class MarineBiologyKnowledgeQueryAnswer(dspy.Signature): 169 | """ 170 | 针对复杂检索问题的增强签名。 171 | 此签名能够: 172 | 1. 分析用户问题,提取精确检索条件和模糊检索条件 173 | 2. 确定检索顺序和优先级策略 174 | 3. 对多实体结果进行遍历查询 175 | 4. 按照检索需求有序组织答案 176 | """ 177 | # 输入字段 178 | question = dspy.InputField(desc="用户的原始问题") 179 | # 输出字段 180 | answer = dspy.OutputField(desc="根据检索结果综合形成的完整答案,确保涵盖所有检索需求,使用中文回复") 181 | 182 | # 建议添加的问题分类签名 183 | class QuestionClassifier(dspy.Signature): 184 | """对用户问题进行分类""" 185 | question = dspy.InputField(desc="用户的原始问题") 186 | question_type = dspy.OutputField(desc="问题类型,可能的值包括:实体查询/关系查询/属性查询/统计查询等") 187 | search_strategy = dspy.OutputField(desc="建议的检索策略:向量检索/图检索/混合检索") 188 | key_entities = dspy.OutputField(desc="问题中的关键实体列表") 189 | 190 | # 定义 RactModel 类 191 | class RactModel(dspy.Module): 192 | def __init__(self, processor): 193 | super().__init__() 194 | # 保存外部类的引用 195 | self.processor = processor 196 | # 利用 ReAct 将工具函数集成进来 197 | self.react = dspy.ReAct( 198 | DspyInferenceProcessor.MarineBiologyKnowledgeQueryAnswer, 199 | max_iters = MAX_ITERS, 200 | tools=[ 201 | processor.find_nodes_by_node_type, 202 | processor.batch_find_nodes_by_node_type, 203 | processor.get_unique_vector_query_results, 204 | processor.get_node_attribute, 205 | processor.get_adjacent_node_descriptions, 206 | processor.nodes_count 207 | ] 208 | ) 209 | 210 | def forward(self, question): 211 | return self.react(question=question) 212 | 213 | def get_last_message(self): 214 | """获取最后一条消息历史""" 215 | return self.lm.history[-1] if self.lm.history else None 216 | 217 | def load_model(self, file_path): 218 | """加载指定版本的模型""" 219 | result = self.model.load(file_path) 220 | # 加载模型后清除缓存 221 | dspy.settings.configure(cache=None) 222 | return result 223 | 224 | def set_version(self, version): 225 | """设置当前预测器版本""" 226 | self.predictor_version = version 227 | 228 | def get_version(self): 229 | """获取当前预测器版本""" 230 | return self.predictor_version 231 | 232 | def predict(self, question): 233 | """非流式预测""" 234 | return self.model(question=question) 235 | 236 | def stream_predict(self, question): 237 | """流式预测,实现真正的增量输出""" 238 | try: 239 | # 创建一个跟踪状态的对象 240 | class StreamState: 241 | def __init__(self): 242 | self.last_answer = "" 243 | self.last_reasoning = "" 244 | self.is_first_chunk = True 245 | 246 | state = StreamState() 247 | 248 | # 使用 dspy 的流式模型获取结果 249 | async def real_stream(): 250 | # 首先发送一个空的初始状态 251 | if state.is_first_chunk: 252 | initial_prediction = type('Prediction', (), { 253 | 'answer': '', 254 | 'reasoning': '思考中...' 255 | }) 256 | state.is_first_chunk = False 257 | yield initial_prediction 258 | 259 | # 启动非流式预测(在后台运行) 260 | import asyncio 261 | from concurrent.futures import ThreadPoolExecutor 262 | 263 | # 创建一个执行器来运行阻塞的预测 264 | with ThreadPoolExecutor() as executor: 265 | # 提交预测任务到线程池 266 | future = executor.submit(self.predict, question) 267 | 268 | # 每隔一小段时间检查一次结果,模拟流式输出 269 | while not future.done(): 270 | await asyncio.sleep(0.2) # 等待200毫秒 271 | # 发送思考中的状态 272 | thinking_prediction = type('Prediction', (), { 273 | 'answer': state.last_answer, 274 | 'reasoning': state.last_reasoning + "." # 添加一个点表示思考 275 | }) 276 | state.last_reasoning += "." 
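                        # 说明:这里产出的只是"思考中"的占位结果(每次轮询在 reasoning 末尾追加一个点),并非真正的增量 token 流;完整答案需等后台线程预测完成后再分段返回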
277 | yield thinking_prediction 278 | 279 | # 获取最终结果 280 | try: 281 | final_result = future.result() 282 | # 如果最终结果可用,分段返回 283 | if hasattr(final_result, 'answer') and hasattr(final_result, 'reasoning'): 284 | # 将答案和推理过程分成多个部分 285 | answer_parts = self._split_text(final_result.answer, 10) # 分成约10个部分 286 | reasoning_parts = self._split_text(final_result.reasoning, 5) # 分成约5个部分 287 | 288 | # 先返回完整的推理过程 289 | for i, reasoning_part in enumerate(reasoning_parts): 290 | current_reasoning = "".join(reasoning_parts[:i+1]) 291 | prediction = type('Prediction', (), { 292 | 'answer': state.last_answer, 293 | 'reasoning': current_reasoning 294 | }) 295 | state.last_reasoning = current_reasoning 296 | yield prediction 297 | await asyncio.sleep(0.1) # 短暂停顿 298 | 299 | # 然后逐步返回答案 300 | for i, answer_part in enumerate(answer_parts): 301 | current_answer = "".join(answer_parts[:i+1]) 302 | prediction = type('Prediction', (), { 303 | 'answer': current_answer, 304 | 'reasoning': final_result.reasoning 305 | }) 306 | state.last_answer = current_answer 307 | yield prediction 308 | await asyncio.sleep(0.1) # 短暂停顿 309 | except Exception as e: 310 | logger.error(f"获取预测结果时出错: {str(e)}") 311 | error_prediction = type('Prediction', (), { 312 | 'answer': '处理您的请求时出现错误', 313 | 'reasoning': f'发生错误: {str(e)}' 314 | }) 315 | yield error_prediction 316 | 317 | return real_stream() 318 | except Exception as e: 319 | logger.error(f"流式预测出错: {str(e)}") 320 | # 如果流式预测失败,尝试使用非流式预测 321 | try: 322 | logger.info("尝试使用非流式预测作为备选方案") 323 | result = self.predict(question) 324 | # 将非流式结果转换为可迭代对象以模拟流式返回 325 | async def mock_stream(): 326 | yield result 327 | return mock_stream() 328 | except Exception as e2: 329 | logger.error(f"备选预测也失败: {str(e2)}") 330 | raise e # 重新抛出原始异常 331 | 332 | def _split_text(self, text, num_parts): 333 | """将文本分成大约 num_parts 个部分""" 334 | if not text: 335 | return [""] 336 | 337 | # 计算每部分的大致长度 338 | part_length = max(1, len(text) // num_parts) 339 | parts = [] 340 | 341 | for i in range(0, len(text), part_length): 342 | parts.append(text[i:i + part_length]) 343 | 344 | return parts -------------------------------------------------------------------------------- /dspy_program/program_v1.0.1_20250313195723.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/dspy_program/program_v1.0.1_20250313195723.pkl -------------------------------------------------------------------------------- /dspy_program/program_v1.0.3_20250315154834.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/dspy_program/program_v1.0.3_20250315154834.pkl -------------------------------------------------------------------------------- /dspy_query_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | import dspy 5 | import sqlite3 6 | from dotenv import load_dotenv 7 | # 设置DSPy的语言模型 8 | def setup_dspy(): 9 | load_dotenv(override=True) 10 | 11 | if os.getenv("Train_LLM_MODEL"): 12 | Train = dspy.LM( 13 | f'deepseek/{os.getenv("Train_LLM_MODEL")}', 14 | base_url=os.getenv("Train_OPENAI_BASE_URL"), 15 | api_key=os.getenv("Train_OPENAI_API_KEY") 16 | ) 17 | dspy.settings.configure(lm=Train) 18 | else: 19 | # 默认使用OpenAI 20 | dspy.settings.configure(lm="openai") 21 | 22 | 23 | # 在已有的签名定义之后添加 24 | class 
NaturalLanguageToSQL(dspy.Signature): 25 | """将自然语言查询转换为SQL语句。注意:返回纯SQL文本,不要包含```sql或```等代码块标记。 26 | 重要:保持原始查询中的中文词汇不变,不要自动转换为拉丁文或英文。 27 | 当查询涉及到地理位置(distributions表中的location字段)时,必须使用LIKE语句而不是精确匹配, 28 | 例如:WHERE location LIKE '%东海%' 而不是 WHERE location = '东海'""" 29 | query = dspy.InputField(description="用户的自然语言查询") 30 | db_schema = dspy.InputField(description="数据库的表结构信息") 31 | sql = dspy.OutputField(description="生成的SQL查询语句,必须是纯SQL文本,对地理位置使用LIKE操作符") 32 | explanation = dspy.OutputField(description="SQL查询的解释") 33 | 34 | # 在已有的提取器类之后添加 35 | class SQLGenerator(dspy.Module): 36 | def __init__(self): 37 | super().__init__() 38 | self.generator = dspy.ChainOfThought(NaturalLanguageToSQL) 39 | 40 | def forward(self, query, db_schema): 41 | return self.generator(query=query, db_schema=db_schema) 42 | 43 | # 查询相关类 44 | class MarineSpeciesQuery: 45 | def __init__(self, db_path): 46 | """初始化查询器 47 | 48 | Args: 49 | db_path: SQLite数据库文件路径 50 | """ 51 | self.db_path = db_path 52 | setup_dspy() 53 | 54 | def query_database(self, natural_language_query): 55 | """根据自然语言查询数据库 56 | 57 | Args: 58 | natural_language_query: 用户的自然语言查询 59 | 60 | Returns: 61 | 查询结果和解释 62 | """ 63 | # 先获取表中实际的值 64 | with sqlite3.connect(self.db_path) as conn: 65 | cursor = conn.cursor() 66 | cursor.execute("SELECT DISTINCT family FROM species") 67 | families = [row[0] for row in cursor.fetchall()] 68 | 69 | # 获取数据库表结构 70 | with sqlite3.connect(self.db_path) as conn: 71 | cursor = conn.cursor() 72 | 73 | # 获取所有表名 74 | cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") 75 | tables = cursor.fetchall() 76 | 77 | db_schema = [] 78 | for table in tables: 79 | table_name = table[0] 80 | cursor.execute(f"PRAGMA table_info({table_name})") 81 | columns = cursor.fetchall() 82 | 83 | column_info = [] 84 | for col in columns: 85 | column_info.append({ 86 | "name": col[1], 87 | "type": col[2] 88 | }) 89 | 90 | db_schema.append({ 91 | "table": table_name, 92 | "columns": column_info 93 | }) 94 | 95 | db_schema_str = json.dumps(db_schema, ensure_ascii=False, indent=2) 96 | 97 | # 当拼接db_schema_enriched时,添加关于location的使用说明 98 | db_schema_enriched = json.dumps(db_schema, ensure_ascii=False, indent=2) 99 | 100 | # 添加额外使用提示 101 | location_usage_hint = """ 102 | 重要提示:当查询涉及地理位置时,请使用LIKE操作符而不是等号(=)。 103 | 例如: 104 | 正确: WHERE d.location LIKE '%东海%' 105 | 错误: WHERE d.location = '东海' 106 | 107 | 这是因为地理位置通常需要模糊匹配,一个物种可能分布在多个地区, 108 | 或者地理位置描述可能包含其他词汇。 109 | """ 110 | 111 | # 初始化SQL生成器 112 | sql_generator = SQLGenerator() 113 | 114 | # 生成SQL 115 | result = sql_generator(natural_language_query, db_schema_enriched + "\n" + location_usage_hint) 116 | 117 | # 清理SQL,移除Markdown代码块标记 118 | sql = result.sql 119 | sql = re.sub(r'```sql\s*', '', sql) # 移除开始的```sql 120 | sql = re.sub(r'\s*```\s*$', '', sql) # 移除结束的``` 121 | 122 | try: 123 | # 执行SQL查询 124 | print(f"执行SQL查询: {sql}") 125 | cursor.execute(sql) 126 | 127 | # 获取列名 128 | column_names = [description[0] for description in cursor.description] 129 | 130 | # 获取查询结果 131 | rows = cursor.fetchall() 132 | 133 | # 转换为字典列表 134 | results = [] 135 | for row in rows: 136 | result_dict = {} 137 | for i, col_name in enumerate(column_names): 138 | result_dict[col_name] = row[i] 139 | results.append(result_dict) 140 | 141 | return { 142 | "success": True, 143 | "query": natural_language_query, 144 | "sql": sql, # 使用清理后的SQL 145 | "explanation": result.explanation, 146 | "results": results, 147 | "column_names": column_names, 148 | "row_count": len(rows) 149 | } 150 | except Exception as e: 151 | print(f"SQL执行错误: 
{e}") 152 | return { 153 | "success": False, 154 | "query": natural_language_query, 155 | "sql": sql, # 使用清理后的SQL 156 | "explanation": result.explanation, 157 | "error": str(e) 158 | } 159 | 160 | def format_query_results(self, query_result): 161 | """格式化查询结果 162 | 163 | Args: 164 | query_result: 查询结果字典 165 | 166 | Returns: 167 | 格式化的结果字符串 168 | """ 169 | if not query_result["success"]: 170 | return f"查询失败: {query_result['error']}\n原始SQL: {query_result['sql']}" 171 | 172 | output = [] 173 | output.append(f"查询: {query_result['query']}") 174 | output.append(f"SQL: {query_result['sql']}") 175 | output.append(f"解释: {query_result['explanation']}") 176 | output.append(f"找到 {query_result['row_count']} 条结果:") 177 | 178 | if query_result['row_count'] > 0: 179 | # 计算每列的最大宽度 180 | widths = {} 181 | for col in query_result['column_names']: 182 | widths[col] = len(col) 183 | 184 | for row in query_result['results']: 185 | for col in query_result['column_names']: 186 | val = str(row[col]) if row[col] is not None else 'NULL' 187 | widths[col] = max(widths[col], len(val)) 188 | 189 | # 生成表头 190 | header = " | ".join(col.ljust(widths[col]) for col in query_result['column_names']) 191 | separator = "-+-".join("-" * widths[col] for col in query_result['column_names']) 192 | 193 | output.append(header) 194 | output.append(separator) 195 | 196 | # 生成数据行 197 | for row in query_result['results']: 198 | row_str = " | ".join( 199 | str(row[col]).ljust(widths[col]) if row[col] is not None else 'NULL'.ljust(widths[col]) 200 | for col in query_result['column_names'] 201 | ) 202 | output.append(row_str) 203 | 204 | return "\n".join(output) 205 | 206 | 207 | if __name__ == "__main__": 208 | # 直接使用查询处理器示例 209 | query_processor = MarineSpeciesQuery("marine_species.db") 210 | result = query_processor.query_database("分布在东海的盲鳗科哪些生物?有多少?") 211 | formatted_result = query_processor.format_query_results(result) 212 | print(formatted_result) 213 | -------------------------------------------------------------------------------- /images/function-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/function-diagram.png -------------------------------------------------------------------------------- /images/startup-success.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/startup-success.jpg -------------------------------------------------------------------------------- /images/二维码.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/二维码.jpg -------------------------------------------------------------------------------- /images/优化样本.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/优化样本.jpg -------------------------------------------------------------------------------- /images/关系信息查询.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/关系信息查询.jpg -------------------------------------------------------------------------------- /images/实体信息查询.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/实体信息查询.jpg -------------------------------------------------------------------------------- /images/属性信息查询.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/属性信息查询.jpg -------------------------------------------------------------------------------- /images/版本选择.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/版本选择.jpg -------------------------------------------------------------------------------- /images/统计信息查询.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/统计信息查询.jpg -------------------------------------------------------------------------------- /images/训练所有样本.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/训练所有样本.jpg -------------------------------------------------------------------------------- /images/非实体信息截图.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/非实体信息截图.jpg -------------------------------------------------------------------------------- /images/项目技术路线.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/loukie7/Datacapsule/08a6ff167a89234868a3970bc42f93bef41058a3/images/项目技术路线.jpg -------------------------------------------------------------------------------- /nanovector_db.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from dotenv import load_dotenv 3 | from openai import OpenAI 4 | import os 5 | import json 6 | from pathlib import Path 7 | import networkx as nx 8 | from loguru import logger 9 | 10 | class NanoVectorDB: 11 | def __init__(self, db_path: str): 12 | """初始化向量数据库 13 | 14 | Args: 15 | db_path: 数据库文件存储路径 16 | """ 17 | self.db_path = Path(db_path) 18 | self.db_path.mkdir(parents=True, exist_ok=True) 19 | 20 | self.entity_vectors_file = self.db_path / 'entity_vectors.json' 21 | self.relation_vectors_file = self.db_path / 'relation_vectors.json' 22 | logger.info(f"初始化向量数据库: {self.db_path}/entity_vectors.json, relation_vectors.json" ) 23 | # 初始化存储文件 24 | if not self.entity_vectors_file.exists(): 25 | logger.info(f"文件不存在,开始创建向量数据库: {self.db_path}/entity_vectors.json, relation_vectors.json" ) 26 | self._save_vectors(self.entity_vectors_file, []) 27 | if not self.relation_vectors_file.exists(): 28 | logger.info(f"文件不存在,开始创建向量数据库: {self.db_path}/relation_vectors.json" ) 29 | self._save_vectors(self.relation_vectors_file, []) 30 | logger.info(f"开始缓存向量数据: {self.db_path}/entity_vectors.json, relation_vectors.json" ) 31 | # 缓存向量数据 32 | self.entity_vectors_cache = self._load_vectors(self.entity_vectors_file) 33 | self.relation_vectors_cache = self._load_vectors(self.relation_vectors_file) 34 | logger.info(f"已缓存实体向量 {len(self.entity_vectors_cache)} 条,关系向量 
{len(self.relation_vectors_cache)} 条") 35 | 36 | def _save_vectors(self, file_path: Path, vectors: list): 37 | """保存向量数据到文件""" 38 | with open(file_path, 'w', encoding='utf-8') as f: 39 | json.dump(vectors, f, ensure_ascii=False) 40 | 41 | def _load_vectors(self, file_path: Path) -> list: 42 | """从文件加载向量数据""" 43 | logger.info(f"开始加载向量数据库: {file_path}") 44 | with open(file_path, 'r', encoding='utf-8') as f: 45 | data = json.load(f) 46 | logger.info(f"成功加载向量数据库: {file_path}") 47 | return data 48 | 49 | def add_entity(self, entity_id: str, entity_type: str, entity_name: str, embedding: list): 50 | """添加实体向量""" 51 | self.entity_vectors_cache.append({ 52 | 'entity_id': entity_id, 53 | 'entity_type': entity_type, 54 | 'entity_name': entity_name, 55 | 'embedding': embedding 56 | }) 57 | self._save_vectors(self.entity_vectors_file, self.entity_vectors_cache) 58 | 59 | def add_relation(self, source_id: str, target_id: str, relation_type: str, embedding: list): 60 | """添加关系向量""" 61 | self.relation_vectors_cache.append({ 62 | 'source_id': source_id, 63 | 'target_id': target_id, 64 | 'relation_type': relation_type, 65 | 'embedding': embedding 66 | }) 67 | self._save_vectors(self.relation_vectors_file, self.relation_vectors_cache) 68 | 69 | def search_entities(self, query_embedding: list, k: int = 5) -> list: 70 | """搜索最相似的实体""" 71 | results = [] 72 | 73 | for entity in self.entity_vectors_cache: 74 | similarity = 1 - self._cosine_distance(query_embedding, entity['embedding']) 75 | results.append({ 76 | 'type': 'entity', 77 | 'id': entity['entity_id'], 78 | 'entity_type': entity['entity_type'], 79 | 'name': entity['entity_name'], 80 | 'similarity': similarity 81 | }) 82 | 83 | results.sort(key=lambda x: x['similarity'], reverse=True) 84 | return results[:k] 85 | 86 | def search_relations(self, query_embedding: list, k: int = 5) -> list: 87 | """搜索最相似的关系""" 88 | results = [] 89 | 90 | for relation in self.relation_vectors_cache: 91 | similarity = 1 - self._cosine_distance(query_embedding, relation['embedding']) 92 | results.append({ 93 | 'type': 'relation', 94 | 'source': relation['source_id'], 95 | 'target': relation['target_id'], 96 | 'relation_type': relation['relation_type'], 97 | 'similarity': similarity 98 | }) 99 | 100 | results.sort(key=lambda x: x['similarity'], reverse=True) 101 | return results[:k] 102 | 103 | def _cosine_distance(self, v1: list, v2: list) -> float: 104 | """计算余弦距离""" 105 | v1_array = np.array(v1) 106 | v2_array = np.array(v2) 107 | dot_product = np.dot(v1_array, v2_array) 108 | norm_v1 = np.linalg.norm(v1_array) 109 | norm_v2 = np.linalg.norm(v2_array) 110 | return 1 - dot_product / (norm_v1 * norm_v2) -------------------------------------------------------------------------------- /react_tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from openai import OpenAI 4 | from loguru import logger 5 | import networkx as nx 6 | import dspy 7 | from typing import List 8 | from nanovector_db import NanoVectorDB 9 | 10 | 11 | MAX_BATCH_SIZE = os.getenv("MAX_BATCH_SIZE") 12 | VECTOR_SEARCH_TOP_K = int(os.getenv("VECTOR_SEARCH_TOP_K","3")) 13 | BETTER_THAN_THRESHOLD = float(os.getenv("BETTER_THAN_THRESHOLD","0.7")) 14 | WORKING_DIR =os.getenv("RAG_DIR","graph_data") 15 | 16 | client = OpenAI(base_url=os.getenv("EMBEDDING_MODEL_BASE_URL"),api_key=os.getenv("EMBEDDING_MODEL_API_KEY"),) 17 | 18 | # 定义节点类型的层级顺序 19 | NODE_HIERARCHY = { 20 | "界": 1, 21 | "门": 2, 22 | "纲": 3, 23 | "目": 4, 24 | "科": 5, 25 | "属": 
6, 26 | "种": 7, 27 | "中文学名": 7, 28 | "自然分布地": 8, 29 | "生活习性": 8, 30 | "生物特征": 8, 31 | "经济性": 8, 32 | "保护信息": 8, 33 | "食性":8, 34 | "繁殖特征":8, 35 | "行为特征":8, 36 | "体型":8, 37 | "体色":8, 38 | "体长":8, 39 | "特殊特征":8 40 | } 41 | 42 | class ReActTools: 43 | def __init__(self): 44 | logger.info("ReActTools initialized") 45 | GRAPHML_DIR = os.getenv("GRAPHML_DIR","graph_chunk_entity_relation_clean.graphml") 46 | logger.info("init-ReActTools") 47 | logger.info(f"{WORKING_DIR}/{GRAPHML_DIR}") 48 | if os.path.exists(f"{WORKING_DIR}/{GRAPHML_DIR}"): 49 | self.nx = nx.read_graphml(f"{WORKING_DIR}/{GRAPHML_DIR}") 50 | 51 | # 判断是否正确加载到网络图 52 | if self.nx and self.nx.number_of_nodes() >0: 53 | logger.info(f"NetworkX graph loaded successfully! have nodes: {self.nx.number_of_nodes()}") 54 | self.nx_nodes=self.nx.nodes(data=True) 55 | self.entity_type_map = {} 56 | for node in self.nx_nodes: 57 | item = node[1] 58 | id = node[0] 59 | entity_type = item.get('node_type') 60 | if entity_type: # 只处理包含entity_type的节点 61 | if entity_type not in self.entity_type_map: 62 | self.entity_type_map[entity_type] = {} 63 | self.entity_type_map[entity_type][id] = item 64 | else: 65 | logger.warning(f"Warning: Node {id} missing node_type attribute") 66 | else: 67 | logger.error("NetworkX graph is empty!") 68 | 69 | self.dim = int(os.getenv("EMBEDDING_DIM",1536)) 70 | self.vectorizer = GraphVectorizer(WORKING_DIR) 71 | 72 | def openai_embedding_function(self,texts: List[str]): 73 | 74 | response = client.embeddings.create( 75 | input=texts, 76 | model=os.getenv("EMBEDDING_MODEL") 77 | ) 78 | return [x.embedding for x in response.data] 79 | 80 | def find_nodes_by_node_type(self,start_node,attr_name): 81 | ''' 82 | 根据开始节点名查找具有指定属性节点,返回节点信息,节点不存时返回None 83 | ''' 84 | logger.info(f"开始查找 - 起始节点: '{start_node}', 目标属性: '{attr_name}'") 85 | checked_nodes = [] 86 | nodes = set() 87 | self.find_neighbors_recursive(start_node, attr_name, nodes, checked_nodes, depth=0) 88 | logger.info(f"查找完成 - 找到 {len(nodes)} 个节点: {nodes}") 89 | return nodes 90 | 91 | 92 | def find_neighbors_recursive(self,node, target, nodes, checked_nodes, depth=0): 93 | """ 94 | 递归查询某一节点的邻居,并根据目标进行逐层判断,确保递进朝一个方向。 95 | :param node: 当前节点 96 | :param target: 目标节点的类型 97 | :param nodes: 已找到的目标节点列表 98 | :param checked_nodes: 已检查的节点列表 99 | :param depth: 当前递归深度(用于日志缩进) 100 | """ 101 | indent = " " * depth 102 | logger.debug(f"{indent}检查节点: '{node}' (递归深度: {depth}, 已检查节点数: {len(checked_nodes)})") 103 | checked_nodes.append(node) # 标记当前节点已检查 104 | 105 | # 添加异常处理,检查节点是否存在 106 | try: 107 | if node not in self.nx.nodes: 108 | logger.warning(f"{indent}节点 '{node}' 不存在于图中") 109 | return 110 | 111 | source_node_type = self.nx.nodes[node].get("node_type") 112 | if not source_node_type: 113 | logger.warning(f"{indent}节点 '{node}' 没有node_type属性") 114 | return 115 | 116 | logger.debug(f"{indent}当前节点类型: '{source_node_type}'") 117 | except Exception as e: 118 | logger.error(f"{indent}处理节点 '{node}' 时出错: {str(e)}") 119 | return 120 | 121 | # 获取当前节点和目标节点的层级 122 | source_level = NODE_HIERARCHY.get(source_node_type, float('inf')) 123 | target_level = NODE_HIERARCHY.get(target, float('inf')) 124 | logger.debug(f"{indent}层级比较 - 当前节点: {source_level}, 目标节点: {target_level}") 125 | 126 | if source_level == target_level: 127 | logger.info(f"{indent}找到目标节点! 
'{node}' (类型: {source_node_type})") 128 | nodes.add(node) 129 | return 130 | 131 | # 获取邻居节点 132 | try: 133 | # 获取所有相邻节点(包括入边和出边) 134 | neighbors = list(self.nx.neighbors(node)) # 获取出边邻居 135 | predecessors = list(self.nx.predecessors(node)) # 获取入边邻居 136 | all_neighbors = list(set(neighbors + predecessors)) # 合并并去重 137 | logger.debug(f"{indent}找到 {len(all_neighbors)} 个邻居节点(包括入边和出边)") 138 | except Exception as e: 139 | logger.error(f"{indent}获取节点 '{node}' 的邻居时出错: {str(e)}") 140 | return 141 | 142 | for neighbor in all_neighbors: 143 | # 跳过已检查的节点 144 | if neighbor in checked_nodes: 145 | logger.debug(f"{indent}跳过已检查的节点: '{neighbor}'") 146 | continue 147 | 148 | try: 149 | neighbor_type = self.nx.nodes[neighbor].get("node_type") 150 | if not neighbor_type: 151 | logger.debug(f"{indent}邻居节点 '{neighbor}' 没有node_type属性,跳过") 152 | continue 153 | 154 | neighbor_level = NODE_HIERARCHY.get(neighbor_type, float('inf')) 155 | logger.debug(f"{indent}检查邻居: '{neighbor}' (类型: {neighbor_type}, 层级: {neighbor_level})") 156 | 157 | # 如果是目标节点,则添加到结果列表 158 | if neighbor_type == target or (neighbor_level == 7 and neighbor_level == target_level): 159 | logger.info(f"{indent}找到目标节点! '{neighbor}' (类型: {neighbor_type})") 160 | nodes.add(neighbor) 161 | # 如果目标比当前节点层级高,停止递归并返回目标节点 162 | if target_level <= source_level: 163 | logger.debug(f"{indent}目标层级({target_level})小于等于当前层级({source_level}),停止递归") 164 | return 165 | else: 166 | if NODE_HIERARCHY.get(neighbor_type, float('inf')) <= 7: 167 | if target_level < source_level and neighbor_level < source_level: 168 | logger.debug(f"{indent}向上递归: '{neighbor}' (当前层级: {source_level}, 邻居层级: {neighbor_level}, 目标层级: {target_level})") 169 | self.find_neighbors_recursive(neighbor, target, nodes, checked_nodes, depth+1) 170 | elif target_level > source_level and neighbor_level > source_level: 171 | logger.debug(f"{indent}向下递归: '{neighbor}' (当前层级: {source_level}, 邻居层级: {neighbor_level}, 目标层级: {target_level})") 172 | self.find_neighbors_recursive(neighbor, target, nodes, checked_nodes, depth+1) 173 | else: 174 | logger.debug(f"{indent}不符合递归条件,跳过邻居: '{neighbor}'") 175 | else: 176 | logger.debug(f"{indent}邻居层级 > 7,跳过: '{neighbor}' (层级: {neighbor_level})") 177 | except Exception as e: 178 | logger.warning(f"{indent}处理邻居节点 '{neighbor}' 时出错: {str(e)}") 179 | continue 180 | 181 | logger.debug(f"{indent}完成节点 '{node}' 的所有邻居检查") 182 | 183 | # 查询指定节点所有属性 184 | def get_node_attribute(self,node_id): 185 | ''' 186 | 根据节点id获取所有属性,包括中文学名、拉丁学名、命名年份、作者、node_type 187 | :param node_id: 节点id 188 | :return: 属性值 189 | ''' 190 | return self.nx.nodes[node_id] 191 | 192 | def get_adjacent_node_descriptions(self,nodenames): 193 | ''' 194 | 根据列表中节点名获取所有相邻节点的description 195 | :param node_id: 节点id 196 | :return: 所有相依节点信息集合 197 | ''' 198 | result = set() 199 | for nodename in nodenames: 200 | # 获取出边邻居 201 | for neighbor in self.nx.neighbors(nodename): 202 | description = self.nx.nodes[neighbor].get("description") 203 | if description: 204 | result.add(description) 205 | # 获取入边邻居 206 | for predecessor in self.nx.predecessors(nodename): 207 | description = self.nx.nodes[predecessor].get("description") 208 | if description: 209 | result.add(description) 210 | return list(result) 211 | 212 | class GraphVectorizer: 213 | def __init__(self, db_path: str=None, openai_api_key: str = None): 214 | """初始化向量化器 215 | 216 | Args: 217 | db_path: 向量数据库存储路径 218 | openai_api_key: OpenAI API密钥,如果不提供则从环境变量获取 219 | """ 220 | if db_path is None: 221 | db_path = WORKING_DIR 222 | self.db = NanoVectorDB(db_path) 223 | 224 | 225 | def 
_get_embedding(self, text: str) -> list[float]: 226 | """获取文本的向量表示""" 227 | response = client.embeddings.create( 228 | model=os.getenv("EMBEDDING_MODEL"), 229 | input=text, 230 | encoding_format="float" 231 | ) 232 | return response.data[0].embedding 233 | 234 | def vectorize_graph(self, graph_file: str): 235 | """将知识图谱中的实体和关系向量化并存储 236 | 237 | Args: 238 | graph_file: GraphML文件路径 239 | """ 240 | # 读取图谱 241 | G = nx.read_graphml(graph_file) 242 | 243 | # 向量化并存储实体 244 | for node, attrs in G.nodes(data=True): 245 | # 构建实体描述文本 246 | entity_desc = f"实体ID: {node}" 247 | if 'node_type' in attrs: 248 | entity_desc += f", 类型: {attrs['node_type']}" 249 | if 'name' in attrs: 250 | entity_desc += f", 名称: {attrs['name']}" 251 | 252 | # 获取实体向量 253 | embedding = self._get_embedding(entity_desc) 254 | 255 | # 存储实体向量 256 | self.db.add_entity( 257 | entity_id=node, 258 | entity_type=attrs.get('node_type'), 259 | entity_name=attrs.get('name'), 260 | embedding=embedding 261 | ) 262 | 263 | # 向量化并存储关系 264 | for source, target, attrs in G.edges(data=True): 265 | # 构建关系描述文本 266 | relation_desc = f"关系: 从 {source} 到 {target}" 267 | if 'relation' in attrs: 268 | relation_desc += f", 类型: {attrs['relation']}" 269 | 270 | # 获取关系向量 271 | embedding = self._get_embedding(relation_desc) 272 | 273 | # 存储关系向量 274 | self.db.add_relation( 275 | source_id=source, 276 | target_id=target, 277 | relation_type=attrs.get('relation'), 278 | embedding=embedding 279 | ) 280 | 281 | def search(self, query: str, node_type: str = None, search_type: str = 'all', top_k: int = 5, better_than_threshold: float = BETTER_THAN_THRESHOLD): 282 | """搜索与查询最相关的实体或关系 283 | Args: 284 | query: 搜索查询文本 285 | node_type: 实体类型筛选条件,如果为None则不筛选。可选值包括: 286 | - species (种、中文名) 287 | - 界 288 | - 门 289 | - 纲 290 | - 目 291 | - 科 292 | - 属 293 | - 自然分布地 294 | - 食性 295 | - 繁殖特征 296 | - 生活习性 297 | - 体型 298 | - 体色 299 | - 体长 300 | - 特殊特征 301 | k: 返回的结果数量 302 | search_type: 搜索类型,'all'/'entity'/'relation' 303 | better_than_threshold: 相似度阈值,只返回相似度高于此值的结果 304 | 305 | Returns: 306 | list: 搜索结果,精准的实体名列表 307 | """ 308 | # 获取查询向量 309 | query_embedding = self._get_embedding(query) 310 | results = [] 311 | 312 | if search_type in ['all', 'entity']: 313 | entities = self.db.search_entities(query_embedding, k=100) # 获取更多结果用于筛选 314 | # 按node_type筛选 315 | if node_type: 316 | entities = [e for e in entities if e['entity_type'] == node_type] 317 | results.extend(entities) 318 | 319 | if search_type in ['all', 'relation']: 320 | results.extend(self.db.search_relations(query_embedding, k=100)) # 获取更多结果用于筛选 321 | 322 | # 按相似度阈值筛选 323 | results = [r for r in results if r['similarity'] >= better_than_threshold] 324 | 325 | # 按相似度排序 326 | results.sort(key=lambda x: x['similarity'], reverse=True) 327 | return results[:top_k] 328 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 |

🌟 Datacapsule 🌟

2 | 3 | Datacapsule is a knowledge graph-based solution that achieves precise knowledge retrieval through multi-path retrieval. It encompasses multiple functional modules, including retrieval systems, entity relationship extraction, entity attribute extraction, entity linking, structured database construction, and question answering systems, providing strong support for information retrieval and its applications. 4 | 5 |
21 | 22 | 23 | # Background 24 | Knowledge graph multi-path retrieval technology is widely used in the field of information retrieval. By constructing a graph database containing rich entities and relationships, it allows users to query relevant information through various methods (such as keywords, entity linking, etc.). This approach not only improves the efficiency of information retrieval but also helps users better understand and utilize complex relationships in the data. 25 | 26 | However, traditional knowledge graph construction has limited speed and efficiency, necessitating a more efficient method for graph construction. At the same time, there are also issues with the efficiency of graph-based retrieval. Therefore, we propose a knowledge graph-based multi-path retrieval technology solution aimed at improving construction efficiency and optimizing retrieval results. 27 | The system conducts deep understanding of user questions, first determining whether the entities in the user's question exist in the graph. If not, answers are directly obtained through vector retrieval. 28 | 29 | If the entity exists in the graph, the system then determines the type of user question, such as: entity query (e.g., "What is the Taiwan hagfish?"); relationship query (e.g., "What is the relationship between the Taiwan hagfish and the Eptatretus stoutii?"); attribute query (e.g., "What are the living habits of the Eptatretus stoutii?"); statistical query (e.g., "How many species are in the hagfish family?"). Entity queries, relationship queries, and attribute queries are retrieved through graph structure retrieval; statistical queries are retrieved through structured retrieval. 30 | 31 | # Main Features 32 | 33 | 1. **Function Design Diagram**: 34 | ![Function Design Diagram](./images/function-diagram.png) 35 | 36 | 2. 
**Project Structure Overview**: 37 | 38 | - (Backend service directory) 39 | - dspy_program/ (DSPy models and programs directory) 40 | - retrieval_demo_18.json (Small sample dataset) 41 | - retrieval_demo_130.json (Full-scale dataset) 42 | - optimized_program.pkl (Optimized DSPy program) 43 | - signature.py (DSPy signature definition file) 44 | - examples/ (Training example data) 45 | - graph_data_new/ (Knowledge graph data directory) 46 | - knowledge_graph-1.html (Knowledge graph visualization file) 47 | - knowledge_graph-1.graphml (Knowledge graph data file) 48 | - vectors/ (Vector data storage directory) 49 | - bio_vectors.json (Biological entity vector data) 50 | - relation_vectors.json (Relationship vector data) 51 | - tools/ (Tool module directory) 52 | - entity_extraction.py (Entity extraction tool) 53 | - entity_extraction_db.py (Structured database construction tool) 54 | - .dspy_cache/ (DSPy cache directory) 55 | - app.py (Main application entry) 56 | - dspy_evaluation.py (Evaluation module) 57 | - dspy_inference.py (Inference module) 58 | - dspy_query_db.py (Database query module) 59 | - nanovector_db.py (Vector database implementation) 60 | - react_tools.py (Graph query and vector retrieval tools) 61 | - requirements.txt (Dependency list) 62 | - .env (Environment configuration file) 63 | 64 | - (Frontend service directory) 65 | - src/ (Source code directory) 66 | - components/ (Components directory) 67 | - Chat/ (Chat related components) 68 | - Graph/ (Knowledge graph display components) 69 | - UI/ (UI elements components) 70 | - hooks/ (React hook functions) 71 | - services/ (Service call modules) 72 | - App.tsx (Main application component) 73 | - main.tsx (Entry file) 74 | - public/ (Static resource directory) 75 | - images/ (Image resources) 76 | - package.json (Project configuration and dependencies) 77 | - vite.config.ts (Vite configuration file) 78 | - tailwind.config.js (TailwindCSS configuration) 79 | - .env.example (Environment variable example) 80 | 81 | 3. **Knowledge Graph and Structured Database Construction**: Using DSPy as an intent recognition method to process entity extraction and build graph information, corresponding to the `entity_extraction.py` module, extracting the built graph information into structured information and storing it in a database, corresponding to the `entity_extraction_db.py` module. 82 | 83 | 4. **Knowledge Graph Storage and Management**: Knowledge graph storage and management functionality based on NetworkX, supporting dynamic construction and querying of entity relationships, corresponding to the `ReActTools` module in `react_tools.py`. 84 | 85 | 5. **Vector Database Retrieval**: Lightweight vector database based on NanoVector, supporting efficient semantic similarity retrieval, corresponding to the `NanoVectorDB` module in `nanovector_db.py`. 86 | 87 | 6. 
**Graph-based Multi-path Retrieval Method**: 88 | 89 | - Chain of Thought-based reasoning system 90 | - Multi-turn dialogue context understanding 91 | - Forms a complete reasoning and querying system 92 | `dspy_inference.py` integrates various retrieval methods; provides a unified query interface 93 | `dspy_query_db.py` handles structured data queries 94 | `react_tools.py` integrates vector retrieval and graph retrieval, where the `ReActTools` class is responsible for graph structure retrieval, and the `GraphVectorizer` class is responsible for vector retrieval, calling the functionality of `NanoVectordb.py` 95 | `nanovector_db.py` encapsulates the NanoVector library, providing vector database querying, storage, and vector similarity calculation functions 96 | `dspy_evaluation.py` ensures reasoning quality and model optimization 97 | 98 | System Collaborative Workflow: 99 | 1. User initiates a query → `dspy_inference.py` 100 | - Receives the user's question 101 | - Responsible for overall inference process control 102 | - Determines whether entities in the question exist in the knowledge graph: 103 | * If not in the graph: directly uses vector retrieval to get answers 104 | * If in the graph: further determines the question type 105 | - Question type determination and corresponding retrieval strategies: 106 | * Entity query (using graph structure retrieval) 107 | Example: "What is the Taiwan hagfish?" 108 | * Relationship query (using graph structure retrieval) 109 | Example: "What is the relationship between the Taiwan hagfish and the Eptatretus stoutii?" 110 | * Attribute query (using graph structure retrieval) 111 | Example: "What are the living habits of the Eptatretus stoutii?" 112 | * Statistical query (using structured retrieval) 113 | Example: "How many species are there in the order Hexanchiformes?" 114 | 115 | 2. Multi-path Retrieval Phase: 116 | a) Vector Retrieval Path: 117 | `dspy_inference.py → react_tools.py (GraphVectorizer class) → nanovector_db.py` 118 | - Converts questions to vectors 119 | - Calculates vector similarity 120 | - Returns relevant entities 121 | b) Graph Structure Retrieval Path: 122 | `dspy_inference.py → react_tools.py (ReActTools class)` 123 | - Performs graph traversal based on entities 124 | - Finds related nodes and relationships 125 | - Returns structured knowledge 126 | c) Structured Retrieval Path: 127 | `dspy_inference.py → dspy_query_db.py` 128 | - Converts natural language to SQL 129 | - Queries structured database 130 | - Returns exact matching results 131 | 3. Result Integration and Reasoning: 132 | - `dspy_inference.py` integrates multi-path retrieval results 133 | - Uses DSPy for reasoning and answer generation 134 | - Generates structured responses 135 | 4. Evaluation and Optimization: 136 | `dspy_evaluation.py` 137 | - Evaluates answer quality 138 | - Collects user feedback 139 | - Used for model optimization 140 | - Updates optimizer data 141 | 5. Returns Results to User: 142 | - Streams answers back 143 | - Saves interaction records 144 | - Updates system status 145 | 146 | Corresponding to the `dspy_inference.py`, `dspy_evaluation.py`, and `dspy_query_db.py` modules. 147 | 148 | 7. **Real-time Communication and State Synchronization**: 149 | - Real-time message pushing implemented with WebSocket 150 | - Supports streaming conversation responses 151 | - Real-time feedback of optimizer status 152 | Corresponding to the WebSocket implementation in `broadcast.py` and `app.py`. 153 | 154 | 8. 
**Model Optimizer**: 155 | - Supports model optimization based on user feedback 156 | - Version management and rollback functionality 157 | - Visualization of optimizer processes 158 | Corresponding to the evaluation optimization module in `dspy_evaluation.py`. 159 | 160 | 9. **Database Management System**: 161 | - SQLite storage for user interaction records 162 | - Supports batch processing of vector data 163 | - Data version control 164 | Corresponding to the database management functionality in `dspy_query_db.py`. 165 | 166 | 10. **Frontend Interactive Interface**: 167 | - Modern interface based on React 18 + Vite 168 | - Real-time dialogue window 169 | - Collection of user Q&A pairs 170 | - Reasoning process display 171 | - Optimization progress display 172 | Corresponding to the implementation in the directory. 173 | 174 | 11. **System Monitoring and Logging**: 175 | - Hierarchical logging system based on loguru 176 | - Performance monitoring and error tracking 177 | - API call statistics 178 | Corresponding to the logging implementations in various modules. 179 | 180 | 12. **Environment Configuration Management**: 181 | - Support for multiple LLM model configurations 182 | - Flexible environment variable management 183 | - Multi-environment deployment support 184 | Corresponding to the configuration management in `.env` and `.env.example`. 185 | 186 | # Technical Framework 187 | ## **Frontend Technology Stack** 188 | - Development Language: JavaScript+TypeScript 189 | - Frontend Framework: React 18 + Vite 190 | - UI Framework: TailwindCSS 191 | - Development Tools: 192 | * Build Tool: Vite 193 | - Real-time Communication: WebSocket client 194 | 195 | ## **Backend Technology Stack** 196 | - Development Language: Python (Recommended version: 3.8+) 197 | - Web Framework: FastAPI 198 | - Databases: 199 | * Structured Data: SQLite 200 | * Vector Database: NanoVector (Lightweight vector database) 201 | * Graph Structure Information Storage: NetworkX (For knowledge graph storage) 202 | - Knowledge Extraction: 203 | * Entity & Relationship Extraction: DSPy + CoT (Chain of Thought) 204 | - AI Models: 205 | * Embedding Model: See configuration in .env.example 206 | * Large Language Model: Supports OpenAI/DeepSeek, etc., see configuration in .env.example 207 | - Development Tools: 208 | * Dependency Management: pip 209 | * Environment Management: python-dotenv 210 | * Logging System: loguru 211 | 212 | ## **System Architecture** 213 | - Frontend-backend separation architecture 214 | - WebSocket real-time communication 215 | - Vector retrieval + Graph retrieval + text2sql hybrid recall 216 | - DSPy intent understanding and reasoning 217 | 218 | **This project focuses primarily on the implementation of the solution, with some code generated by cursor for efficiency** 219 | 220 | 221 | # Project Dependencies 222 | See requirements.txt for details 223 | 224 | # Quick Start 225 | 226 | ## 1. Install Dependencies 227 | ```bash 228 | pip install -r requirements.txt 229 | ``` 230 | Note: If you encounter errors during installation, it might be due to formatting issues in the requirements.txt file. Suggestions: 231 | - Copy the contents of requirements.txt to a new file 232 | - Check for and remove any special characters 233 | - Use the newly created dependency file for installation 234 | 235 | ## 2. Configure Environment Variables 236 | Create a .env file in the backend directory based on the .env.example template. 
The main configuration items are as follows: 237 | 238 | a) Large Language Model Configuration: 239 | This project uses DSPy for intent recognition and requires configuration of two separate models: 240 | DSPy official documentation: https://www.aidoczh.com/dspy/ 241 | 1. Q&A/Inference Model: Used for processing user queries and reasoning 242 | 2. Optimization Model: Used for model optimization 243 | Both models can use the same or different configurations, supporting OpenAI-SDK format models: 244 | - OpenAI API series: GPT-3.5/4/4o 245 | - DeepSeek series: deepseek-chat/coder 246 | - Alibaba Cloud series: Qwen 247 | - Baidu ERNIE series: ERNIE-Bot 248 | - Ollama local deployment 249 | - HuggingFace deployment 250 | - VLLM high-performance deployment 251 | 252 | 253 | # Q&A/Inference Model Configuration (for processing user queries and reasoning) 254 | LLM_TYPE="deepseek" # Model type (can be replaced with other models) 255 | API_KEY="sk-xxxxxxxxxxxxxxxx" # API key 256 | BASE_URL="xxxxxxxxxxxxxxxxxxxxx" # API base URL 257 | LLM_MODEL="deepseek-chat" # Specific model name 258 | 259 | # Ollama Configuration (Local deployment solution, suitable for offline environments) 260 | # LLM_TYPE="ollama_chat" # Set to use Ollama local model 261 | # API_KEY="" # Ollama local deployment does not require an API key 262 | # BASE_URL="http://localhost:11434" # Local address of Ollama service 263 | # LLM_MODEL="xxxxxxxxxxxxx" # Specific model to use 264 | 265 | # Optimization Model Configuration (for model optimization, can be ignored if not optimizing) 266 | Train_LLM_TYPE="deepseek" # Optimization model type (can be replaced with other models) 267 | Train_LLM_MODEL="deepseek-chat" # Specific model used for optimization 268 | Train_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxx" # API key for the optimization model 269 | Train_OPENAI_BASE_URL="xxxxxxxxxxxxxxxxxxxxx" # API address for the optimization model 270 | 271 | b) System Environment Configuration (Core paths and parameter settings): 272 | ``` 273 | RAG_DIR="graph_data_new" # Knowledge graph data storage directory 274 | LOG_LEVEL="DEBUG" # Log level (Options: DEBUG, INFO, WARNING, ERROR) 275 | DATABASE_URL="sqlite:///.dbs/interactions.db" # Interaction database path 276 | SPECIES_DB_URL="./.dbs/marine_species.db" # Species database path 277 | ``` 278 | 279 | c) Vector Retrieval Configuration (Key parameters affecting retrieval performance): 280 | ``` 281 | VECTOR_SEARCH_TOP_K=3 # Maximum number of results returned by vector retrieval 282 | BETTER_THAN_THRESHOLD=0.7 # Similarity filtering threshold (between 0-1) 283 | GRAPHML_DIR="graph_entity_relation_detailed.graphml" # Knowledge graph storage file 284 | ``` 285 | 286 | d) Embedding Model Configuration (Text vectorization parameters): 287 | ``` 288 | MAX_BATCH_SIZE=100 # Batch size, affects processing speed 289 | EMBEDDING_MAX_TOKEN_SIZE=8192 # Maximum number of tokens per process 290 | EMBEDDING_DIM=1024 # Vector dimension 291 | EMBEDDING_MODEL="xxxxxxxxxxxxxxx" # Embedding model used 292 | EMBEDDING_MODEL_BASE_URL="xxxxxxxxxxxxxxxxxxxxx" 293 | EMBEDDING_MODEL_API_KEY="your-embedding-api-key" # API key for the embedding service 294 | ``` 295 | 296 | Important Notes: 297 | - All items marked as "your-xxx-api-key" must be replaced with your actual API keys 298 | - API keys can be obtained from the respective service provider platforms 299 | - Please ensure all necessary configurations are completed before running the program 300 | - Paths in the configuration file can be adjusted according to your actual 
deployment environment 301 | - It is recommended to verify the correctness of the configuration in a test environment before formal deployment 302 | 303 | ## 3. Run Services 304 | ## Environment Configuration 305 | This project uses environment variables to configure API and WebSocket addresses. 306 | 307 | ### Configuration Steps 308 | 1. Copy the `.env.example` file and rename it to `.env` (or `.env.development`, `.env.production`, etc.) 309 | 2. Modify the variable values in the file according to your environment 310 | 311 | ### Available Environment Variables 312 | - `VITE_API_URL`: Backend API address 313 | - `VITE_WS_URL`: WebSocket service address 314 | 315 | ### Start Backend Service 316 | ```bash 317 | python app.py 318 | ``` 319 | ### Start Frontend Service 320 | ```bash 321 | npm install 322 | - Development environment: `npm run dev` (Uses configuration from `.env.development` or `.env`) 323 | - Production build: `npm run build` (Uses configuration from `.env.production`) 324 | ``` 325 | ## 4. Data Processing Information 326 | This project provides two data processing methods: 327 | 1. Use built-in example data (default method) 328 | 2. Use custom data: 329 | - Use tools/entity_extraction.py for graph data extraction 330 | - Use entity_extraction_db.py for structured data extraction and storage 331 | - Processed data will be automatically stored in the locations specified in the configuration file: 332 | * Graph data: Saved in the directory specified by RAG_DIR 333 | * Structured data: Saved in the database file specified by SPECIES_DB_URL 334 | 335 | ## 5. Operation Steps 336 | **Successful startup interface is as follows**: 337 | ![Startup Success Interface](./images/startup-success.jpg) 338 | 339 | 340 | **Questions with entities not in the graph:** 341 | 342 | ![Non-Entity Information Screenshot](./images/非实体信息截图.jpg) 343 | 344 | **Additional Explanation**: When the entity in a user's query does not exist in the knowledge graph, the system automatically switches to vector retrieval strategy. The current configuration uses a `top_k=1` parameter, returning only the single result with the highest similarity. This design performs well when handling fuzzy queries within a professional domain, but has limitations when facing queries outside the domain: 345 | 346 | 1. For complex questions requiring multiple information sources, a single result may not be comprehensive 347 | 2. For statistical questions (such as "How many types..."), the system can only answer based on limited context 348 | 3. 
For non-professional domain questions, there is insufficient background knowledge for accurate responses 349 | 350 | This limitation is a trade-off in the current system design and can be improved through: 351 | - Adjusting the `top_k` parameter in `dspy_inference.py` to get more results 352 | - Implementing intelligent forwarding to general models for non-domain questions 353 | - Expanding the knowledge graph to cover a wider range of entity information 354 | 355 | **Questions with entities in the graph:** 356 | 357 | - **Entity Query Questions:** 358 | 359 | ![Entity Information Query](./images/实体信息查询.jpg) 360 | 361 | - **Relationship Query Questions:** 362 | 363 | ![Relationship Information Query](./images/关系信息查询.jpg) 364 | 365 | - **Attribute Query Questions:** 366 | 367 | ![Attribute Information Query](./images/属性信息查询.jpg) 368 | 369 | - **Statistical Query Questions:** 370 | 371 | ![Statistical Information Query](./images/统计信息查询.jpg) 372 | 373 | The correctness of the questions can be verified in ~docs/demo130.json 374 | 375 | - **Knowledge Graph Display:** 376 | 377 | - Click the link on the homepage to access the knowledge graph information 378 | 379 | - **Building Optimization Samples**: 380 | 381 | - Manually modify the content in the "Reasoning Process" and "Model Return" sections on the frontend page 382 | - Under the current architecture, small sample optimization data (30-50 entries) can achieve some effect 383 | - ![Optimization Sample](./images/优化样本.jpg) 384 | 385 | - **Optimize Samples:** 386 | 387 | - ![Train All Samples](./images/训练所有样本.jpg) 388 | 389 | ### DSPy Intent Understanding Mechanism 390 | 391 | 1. **Zero-shot Understanding Capability**: 392 | 393 | - The DSPy framework uses the ReAct (Reasoning+Acting) pattern, allowing large models to understand user intent without pre-training 394 | - The system integrates various tool functions through the `ReActModel` class in `dspy_inference.py` 395 | - The large model automatically selects the most appropriate tool based on the question semantics, for example: 396 | * Entity question: "What is the Taiwan hagfish?" → calls `find_nodes_by_node_type` 397 | * Statistical question: "How many species are in the hagfish family?" → calls appropriate counting and query methods 398 | 399 | 2. **Zero-shot Understanding Implementation Principle**: 400 | 401 | - In `dspy_inference.py`, the ReAct module automatically parses the signature and documentation string of each tool function: 402 | ```python 403 | # Core code in dspy_inference.py 404 | self.react = dspy.ReAct( 405 | DspyInferenceProcessor.MarineBiologyKnowledgeQueryAnswer, 406 | max_iters = MAX_ITERS, 407 | tools=[ 408 | processor.find_nodes_by_node_type, 409 | processor.get_unique_vector_query_results, 410 | # ...other tools 411 | ] 412 | ) 413 | ``` 414 | 415 | - The detailed documentation of tool functions provides key context, such as the description in `find_nodes_by_node_type`: 416 | ```python 417 | def find_nodes_by_node_type(self, start_node, trget_node_type): 418 | ''' 419 | This method finds a list of nodes of the specified node type starting from the given node name in the graph data. 420 | start_node is the name of the tree node to start searching from, only allowing a single node. 421 | trget_node_type is the target node type, only allowing a single type name. 422 | The return value is the number and list of nodes with the specified property name starting from that node. 423 | Known graph data contains a series of marine biology related information: 424 | 1. 
Biological taxonomy graph data: including "Latin scientific name", "naming year", "author", "Chinese scientific name" 425 | 2. Biological family and genus data: "kingdom", "phylum", "class", "order", "family", "genus", "species"... 426 | ''' 427 | ``` 428 | 429 | - DSPy internally generates implicit prompts that guide the model in how to select tools for different questions: 430 | * When a question contains "What is the Taiwan hagfish", the model understands this is querying the description of a specific entity 431 | * When a question contains "How many species are in the hagfish family", the model understands this requires a counting operation 432 | 433 | - The large model's chain of thought capability (reflected in `react_tools.py`) allows the system to: 434 | * Analyze key entities and relationships in the question 435 | * Plan multi-step retrieval strategies 436 | * Adjust subsequent operations based on intermediate results 437 | 438 | This zero-shot understanding capability does not rely on predefined hardcoded rules, but rather on: 439 | 1. Clear function naming and documentation 440 | 2. DSPy's prompt engineering automation 441 | 3. The large model's contextual understanding ability 442 | 4. The ReAct framework's reasoning-action loop mechanism 443 | 444 | 3. **Tool Selection Mechanism**: 445 | ```python 446 | self.react = dspy.ReAct( 447 | DspyInferenceProcessor.MarineBiologyKnowledgeQueryAnswer, 448 | max_iters = MAX_ITERS, 449 | tools=[processor.find_nodes_by_node_type, ...] 450 | ) 451 | ``` 452 | - The model analyzes question features through Chain-of-Thought 453 | - Dynamically selects tool combinations based on question type 454 | - Handles multiple question types without hardcoded rules 455 | 456 | ### DSPy Optimization Principles and Effects 457 | 458 | 1. **Optimization Technology Essence**: 459 | - DSPy optimization is not traditional parameter fine-tuning, but **prompt engineering automation** 460 | - The system collects user feedback data through the evaluator in `dspy_evaluation.py` 461 | - The optimization process is stored in program files (.pkl and .json) in the `dspy_program` directory 462 | 463 | 2. **Optimization Process**: 464 | ```python 465 | # Optimization logic in app.py 466 | async def run_dspy_optimization(training_data: List[Dict], version: str, ids: List[str]): 467 | # Collect optimization data 468 | # Build evaluation metrics 469 | # Optimize reasoning program 470 | # Save optimized model 471 | ``` 472 | - Collect user questions and feedback data as optimization samples 473 | - Use BiologicalRetrievalEvaluation to evaluate reasoning quality 474 | - Apply multiple iterations of optimization to generate more precise thinking chain templates 475 | 476 | 3. **Optimization Effects**: 477 | - **Intent Understanding Enhancement**: The system can more accurately distinguish between entity queries, relationship queries, attribute queries, and statistical queries 478 | - **Tool Selection Optimization**: The model learns to combine retrieval tools more efficiently, reducing unnecessary retrieval steps 479 | - **Reasoning Pattern Improvement**: By analyzing successful cases, the system generates more structured reasoning paths 480 | - **Domain Adaptability**: The optimized system shows stronger domain-specific understanding capabilities, especially in handling marine biology terminology 481 | 482 | 4. 
**Version Comparison**: 483 | - The optimization effect can be seen by comparing `program_v1.0.1_20250302192606.json` and `program_v1.0.3_20250315154834.pkl` 484 | 485 | 486 | 487 | 488 | ## 6. Communication and Issue Discussion 489 | ## About Data 490 | 491 | ### 1. Data Source Replacement 492 | 493 | #### Built-in Data Source Replacement 494 | 495 | This project contains two built-in example datasets (`demo18.json` and `demo130.json`), which share the same structure but differ in data volume. Replacement steps: 496 | 497 | ```bash 498 | # Replace small test dataset 499 | cp your_small_dataset.json docs/demo18.json 500 | 501 | # Replace complete dataset 502 | cp your_full_dataset.json docs/demo130.json 503 | ``` 504 | 505 | The two datasets share the same structure and fields, differing only in data volume, convenient for quick testing and complete training. 506 | 507 | #### Custom Data Introduction 508 | 509 | Introducing your own domain data requires the following comprehensive adjustments: 510 | 511 | 1. **Prepare JSON Format Data** 512 | - The system primarily supports JSON format, containing entity, relationship, and attribute fields 513 | 514 | 2. **Entity Extraction and Graph Construction** 515 | - Use `tools/entity_extraction.py` to extract entities from JSON and build the graph 516 | - Need to modify extraction logic to adapt to your data structure 517 | - Customize entity type and relationship type mapping 518 | 519 | 3. **Establish Structured Database** 520 | - Use `tools/entity_extraction_db.py` to create a relational database 521 | - Adjust database table structure design 522 | - Modify field mapping and indexing strategies 523 | 524 | 4. **Comprehensive DSPy Component Adjustment** 525 | 526 | a. `dspy_inference.py`: 527 | - Redefine question types and intent classification 528 | - Modify the `MarineBiologyKnowledgeQueryAnswer` signature class and description 529 | - Adjust ReAct tool selection logic and parameters 530 | - Customize reasoning flow and decision paths 531 | 532 | b. `dspy_evaluation.py`: 533 | - Redesign evaluation metrics and weights 534 | - Modify the `BiologicalRetrievalEvaluation` signature to match the new domain 535 | - Adjust scoring standards and feedback mechanisms 536 | 537 | c. `dspy_query_db.py`: 538 | - Restructure SQL generation logic 539 | - Adjust `NaturalLanguageToSQL` prompts 540 | - Modify database queries and result formatting 541 | 542 | d. `react_tools.py`: 543 | - Redefine `NODE_HIERARCHY` to match the hierarchical relationships of the new domain 544 | - Adjust graph retrieval algorithms and path selection logic 545 | - Modify vector retrieval parameters and thresholds 546 | 547 | 5. **Configuration File Adjustment** 548 | - Modify model parameters in `.env.example` and `.env` 549 | - Adjust vector retrieval parameters and thresholds 550 | - Update data paths and filenames 551 | 552 | 6. **Optimization Data Preparation** 553 | 554 | - Create domain-specific example Q&A pairs 555 | - Write standard reasoning paths as optimization benchmarks 556 | - Design evaluation samples suitable for the new domain 557 | 558 | ### 2. Data Scenario Adaptability 559 | 560 | ### Best Application Scenarios 561 | - **Domains with clear standard answers**: Such as encyclopedia knowledge, product catalogs, technical specifications, etc. 
562 | - **Highly structured data**: Knowledge bases with clear entity relationships and well-defined attributes 563 | - **Professional vertical domains**: Such as the marine biology classification system in this project's example 564 | 565 | ### Scenarios Requiring Additional Work 566 | - **Non-quantifiable evaluation content**: Such as paper summaries, opinion analysis, and other subjective content 567 | - **Scenarios requiring reasoning**: Problems requiring complex logical deduction 568 | - **Multi-source heterogeneous data**: Mixed data from different formats and structures 569 | 570 | In these scenarios, you need to design custom evaluation metrics to effectively measure system performance. 571 | 572 | ### 3. Complete Data Processing Workflow (Future Plans) 573 | 574 | Data cleaning and segmentation are the focus of our next development phase, which will implement the following workflow: 575 | 576 | ### Data Preprocessing Workflow 577 | 578 | 1. **Layout Recognition and Conversion** 579 | - PDF and other documents are converted to structured Markdown through layout recognition models 580 | - Key steps: Automatic recognition → Structured conversion → Manual verification 581 | 582 | 2. **Intelligent Content Segmentation** 583 | - Multiple segmentation strategies: Fixed length, semantic segmentation, page segmentation, recursive chunking 584 | - Adaptive segmentation: Automatically selecting the best segmentation method based on content characteristics 585 | - Post-segmentation manual review to ensure quality 586 | 587 | 3. **Multimodal Vectorization** 588 | - Text: Generate vector representations using large-scale language models 589 | - Images: Process through multimodal models, extracting visual and textual semantics 590 | - Tables: Convert to structured text using specialized models before vectorization 591 | - All non-text content undergoes human confirmation before vectorization 592 | 593 | 4. **Structured Processing** (Optional) 594 | - Convert unstructured content to JSON format using large models 595 | - Field granularity and depth can be customized according to business requirements 596 | - Support for complex nested structures and multi-level relationships 597 | 598 | 5. **Multi-level Index Construction** 599 | - Vector Index: Semantic vectors of all content enter the vector database 600 | - Entity Index: Extracted entities and relationships enter specialized indices 601 | - Structured Index: JSON data is imported into a relational database 602 | - Hybrid Index: Supports multi-path recall and cross-validation 603 | 604 | 605 | ## System Limitations and Improvement Directions 606 | 607 | ### Current Intent Recognition Module Limitations 608 | 609 | 1. **Limited Streaming Output Support** 610 | - The current framework does not support true incremental streaming output 611 | - Large responses may cause extended waiting times on the frontend 612 | - User experience may be affected during complex queries 613 | 614 | 2. **Optimization Effect Quantification Challenges** 615 | - Optimization effects are not easily reflected in quantitative metrics 616 | - Domain adaptability improvements are difficult to measure precisely 617 | - Comparative test benchmarks are not yet complete 618 | 619 | 3. 
**Insufficient Architectural Flexibility** 620 | - The existing framework has high coupling with business logic 621 | - Difficult to quickly adapt to new domains and requirements 622 | - Future goal: Evolve into configurable middleware supporting plugin-based development 623 | 624 | ### Complex Query Processing Capability 625 | 626 | 1. **Multi-condition Filtering Query Support Status** 627 | - The system supports, in principle, statistical queries with multiple filtering conditions 628 | - For example: "How many species of sharks in the order Hexanchiformes are over 3m long and live in the East China Sea?" 629 | 630 | 2. **Query Precision Dependency Factors** 631 | - Query precision is highly dependent on the field granularity of structured data 632 | - Key conditions: 633 | * User filtering conditions must match the structured data fields processed by `entity_extraction_db.py` 634 | * Query fields need to be stored as independent attributes (such as "body length", "natural distribution area") 635 | * If attributes are merged (e.g., multiple features merged into "biological features"), query precision will be significantly reduced 636 | 637 | 3. **Improvement Directions** 638 | - Optimize entity extraction logic to support more fine-grained attribute recognition 639 | - Enhance structured data processing to improve attribute separation and standardization 640 | - Improve fuzzy matching capabilities to handle non-exact condition expressions 641 | - Introduce automatic field mapping to enable intelligent correspondence between user queries and data fields 642 | 643 | ### Response Efficiency Improvement Strategies 644 | 645 | 1. **Local Deployment Optimization** 646 | - Local model deployment can significantly improve overall response speed (see the configuration sketch at the end of this section) 647 | - Recommended high-performance inference frameworks: 648 | * [VLLM](https://github.com/vllm-project/vllm): Supports efficient batch processing and KV caching 649 | * [Xinference](https://github.com/xorbitsai/xinference): Distributed inference support and resource optimization 650 | - Model selection recommendations: 651 | * Locally deploying small models (7B/14B) is not recommended, as their output quality struggles to meet complex reasoning requirements 652 | 653 | 2. **API Service Selection** 654 | - Performance differs significantly between service providers 655 | - Service comparison analysis: 656 | * DeepSeek official API: Complete functionality but slower response, suitable for non-real-time scenarios 657 | - Selection recommendations: 658 | * For cost-sensitive scenarios, choose service providers with better cost-performance ratio while ensuring basic performance 659 | * Recommended to conduct performance and cost comparison tests across multiple service providers before formal deployment 660 | 661 | 662 |
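For local deployment, the switch usually only touches the model configuration. Below is a minimal, hypothetical sketch of pointing the project's DSPy language model at a locally served OpenAI-compatible endpoint (for example one started by VLLM or Xinference); the model name, port, and key are placeholders:

```python
# Hypothetical sketch: configuring DSPy against a local OpenAI-compatible server.
# Model name, port, and API key below are placeholders, not values from this repository.
import dspy

local_lm = dspy.LM(
    "openai/your-local-model",            # model name as exposed by the local server
    base_url="http://localhost:8000/v1",  # local inference endpoint
    api_key="EMPTY",                      # most local servers ignore the key
)
dspy.settings.configure(lm=local_lm)
```

The same values can also be written into `.env` (`LLM_TYPE`, `BASE_URL`, `LLM_MODEL`), which is how the rest of the project picks up its model configuration.

663 | ## Graph Management and Display Information 664 | 665 | ### Graph Database and Visualization Optimization 666 | 667 | 1. 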
**Current Graph Management Architecture** 668 | - Using lightweight graph database implementation (based on NetworkX) 669 | - Features and limitations: 670 | * Efficient and flexible, easy to integrate and deploy 671 | * Lacks professional graph database management interface 672 | * Does not support complex visualization configuration and interaction operations 673 | - Future plans: 674 | * Integrate professional graph databases (such as Neo4j or TigerGraph) 675 | * Develop administrator console, supporting graph structure adjustment 676 | * Optimize storage structure, enhancing large-scale graph processing capabilities 677 | 678 | 2. **Knowledge Graph Display Optimization** 679 | - Current implementation: 680 | * Basic HTML display (`knowledge_graph-1.html`) 681 | * Simple network graph layout, lacking interactive features 682 | * Node and edge styles not professionally designed 683 | - Improvement plans: 684 | * Introduce professional graph visualization libraries (such as ECharts, Graphin, etc.) 685 | * Implement adaptive layout and zoom functions 686 | * Support node grouping, filtering, and highlighting interactive features 687 | 688 | 3. **Reasoning Process Display Information** 689 | - Current design: 690 | * The system deliberately retains and displays detailed reasoning processes 691 | * Purpose: Facilitate developers and users in deeply understanding system decision paths 692 | * Helps debugging and verifying reasoning quality 693 | - Configurable options: 694 | * Production environments can hide detailed reasoning processes through configuration 695 | * Development environments can retain complete thinking chains for development and optimization 696 | * Subsequent versions will provide more fine-grained display control options 697 | 698 | 699 | ## 7. Next Steps 700 | ### **From Solution to End-to-End Product**: 701 | 702 | 1. **Current Positioning and Limitations** 703 | - The current open-source content is essentially a technical solution 704 | - Main challenges: 705 | * When users need to change datasets, extensive code modifications are required 706 | * High degree of customization, limited reusability 707 | * High technical threshold, not suitable for direct use by non-technical teams 708 | 709 | 2. **Product Development Roadmap** 710 | - Core transformation: From code modification to configuration-driven 711 | - Planned features: 712 | * Visual configuration interface: Intent recognition framework signatures, evaluation plans, etc. 713 | * Modular design: Supporting plug-and-play component replacement 714 | * Low-code/no-code interface: Lowering usage threshold 715 | * Automated workflows: Simplifying data preprocessing and model optimization processes 716 | - Goal: Significantly reduce enterprise knowledge base construction and maintenance costs 717 | 718 | 3. **"Datacapsule" Product Vision** 719 | - Product name origin: Datacapsule — small capsule containing enormous energy 720 | - Core value propositions: 721 | * Reduce enterprise knowledge construction difficulty 722 | * Form closed-loop enterprise knowledge barriers 723 | * Release the potential of large models in vertical domains 724 | - Application scenarios: 725 | * Enterprise proprietary knowledge management 726 | * Professional domain intelligent Q&A 727 | * Industry knowledge graph construction and application 728 | 729 | ### Open Collaboration Invitation 730 | 731 | We sincerely invite developers interested in knowledge graphs, large model applications, data processing, and other fields to join the project. 
If interested, please scan the QR code at the end of this README to contact us and explore the future of knowledge enhancement together! 732 | 733 | 734 | 735 | ## 8. Acknowledgments 736 | 737 | **Project Acknowledgments**: Many thanks to the Baidu PaddlePaddle AI Technology Ecosystem Department: 梦姐、楠哥, and 张翔、新飞 for their strong support and help with this project! 738 | 739 | **Project Core Contributors**: Loukie7、Alex—鹏哥 740 | 741 | If you are interested in the project, scan the QR code below to add us as a contact; a product discussion group will be set up later. 742 | 743 | ![QR Code](./images/二维码.jpg) 744 | 745 | -------------------------------------------------------------------------------- /readme_en.md: -------------------------------------------------------------------------------- 1 |

🌟 Datacapsule 🌟

2 | 3 | Datacapsule是一个基于知识图谱的多路召回解决方案,旨在通过多路召回技术,实现精准的知识检索。该解决方案涵盖了检索系统、实体关系抽取、实体属性抽取、实体链接、结构化数据库构建以及问答系统等多个功能模块,为信息检索和应用提供了强大的支持。 4 | 5 |
6 | 7 |
8 | 9 | 10 | 11 | 12 | 13 |
14 | 15 | 16 | 17 | 18 | 19 | 20 |
21 | 22 | 23 | # 背景 24 | 知识图谱的多路召回技术是一种在信息检索领域中广泛使用的技术,它通过构建一个包含丰富实体和关系的图形数据库,使得用户可以通过多种方式(如关键字、实体链接等)来查询相关信息。这种方法不仅可以提高信息检索的效率,还可以帮助用户更好地理解和利用数据中的复杂关系。 25 | 26 | 但是传统知识图谱构建的速度和效率有限,因此需要一种更高效的构建图谱方法,同时基于图谱的检索的效率也存在问题,因此,我们提出了一种基于知识图谱的多路召回技术解决方案,旨在提高构建效率并优化检索效果。 27 | 对用户的问题进行深入理解,首先判断用户问题的实体中是否在图谱中,如果不在直接通过向量检索得到答案。 28 | 29 | 如果在实体中,在判断用户问题的种类,如:实体查询:如"什么是台湾盲鳗?";关系查询:如"台湾盲鳗和蒲氏黏盲鳗有什么关系?";属性查询:如"蒲氏黏盲鳗的生活习性是什么?";统计查询:如"盲鳗科有多少种?"。实体查询、关系查询、属性查询通过图结构检索召回;统计查询通过结构化检索召回 30 | 31 | 32 | # 主要功能介绍 33 | 34 | 1. **功能设计图**: 35 | ![功能设计图](./images/function-diagram.png) 36 | 37 | 2. **项目文件结构概要**: 38 | 39 | - (后端服务目录) 40 | - dspy_program/(DSPy模型及程序目录) 41 | - retrieval_demo_18.json(小型示例数据集) 42 | - retrieval_demo_130.json(完整规模数据集) 43 | - optimized_program.pkl(优化后的DSPy程序) 44 | - signature.py(DSPy签名定义文件) 45 | - examples/(训练示例数据) 46 | - graph_data_new/(知识图谱数据目录) 47 | - knowledge_graph-1.html(知识图谱可视化文件) 48 | - knowledge_graph-1.graphml(知识图谱数据文件) 49 | - vectors/(向量数据存储目录) 50 | - bio_vectors.json(生物实体向量数据) 51 | - relation_vectors.json(关系向量数据) 52 | - tools/(工具类模块目录) 53 | - entity_extraction.py(实体抽取工具) 54 | - entity_extraction_db.py(结构化数据库构建工具) 55 | - .dspy_cache/(DSPy缓存目录) 56 | - app.py(主应用入口) 57 | - dspy_evaluation.py(评估模块) 58 | - dspy_inference.py(推理模块) 59 | - dspy_query_db.py(数据库查询模块) 60 | - nanovector_db.py(向量数据库实现) 61 | - react_tools.py(图谱查询与向量检索工具) 62 | - requirements.txt(依赖包列表) 63 | - .env(环境配置文件) 64 | 65 | - (前端服务目录) 66 | - src/(源代码目录) 67 | - components/(组件目录) 68 | - Chat/(聊天相关组件) 69 | - Graph/(知识图谱展示组件) 70 | - UI/(界面元素组件) 71 | - hooks/(React钩子函数) 72 | - services/(服务调用模块) 73 | - App.tsx(应用主组件) 74 | - main.tsx(入口文件) 75 | - public/(静态资源目录) 76 | - images/(图片资源) 77 | - package.json(项目配置和依赖) 78 | - vite.config.ts(Vite配置文件) 79 | - tailwind.config.js(TailwindCSS配置) 80 | - .env.example(环境变量示例) 81 | 82 | 3. **知识图谱与结构化数据库构建**:基于dspy作为意图识别方法去处理实体抽取,构建图谱信息,对应`entity_extraction.py`模块,将构建的图谱信息抽取为结构化信息存储进数据库中,对应`entity_extraction_db.py`模块。 83 | 84 | 4. **知识图谱存储与管理**:基于 NetworkX 实现的知识图谱存储和管理功能,支持实体关系的动态构建和查询,对应 `react_tools.py` 中的 `ReActTools` 模块。 85 | 86 | 5. **向量数据库检索**:基于 NanoVector 实现的轻量级向量数据库,支持高效的语义相似度检索,对应 `nanovector_db.py` 中的 `NanoVectorDB` 模块。 87 | 88 | 6. **基于图谱的多路召回方法**: 89 | 90 | - 基于 Chain of Thought 的推理系统 91 | - 支持多轮对话的上下文理解 92 | - 形成了一个完整的推理和查询系统 93 | `dspy_inference.py` 整合各种检索方式;提供统一的查询接口 94 | `dspy_query_db.py` 处理结构化数据查询 95 | `react_tools.py` 整合向量检索和图检索,`ReActTools`类负责图结构检索,`GraphVectorizer`类负责向量检索,调用 `NanoVectordb.py` 的功能 96 | `nanovector_db.py` 封装了NanoVector库,提供了向量数据库的查询、存储和向量相似度计算功能 97 | `dspy_evaluation.py` 确保推理质量和模型优化 98 | 99 | 系统协同工作流程: 100 | 1. 用户发起查询 → `dspy_inference.py` 101 | - 接收用户问题 102 | - 负责整体推理流程控制 103 | - 判断问题中的实体是否在知识图谱中: 104 | * 不在图谱中:直接使用向量检索获取答案 105 | * 在图谱中:进一步判断问题类型 106 | - 问题类型判断和对应的检索策略: 107 | * 实体查询(使用图结构检索) 108 | 例如:"什么是台湾盲鳗?" 109 | * 关系查询(使用图结构检索) 110 | 例如:"台湾盲鳗和蒲氏黏盲鳗有什么关系?" 111 | * 属性查询(使用图结构检索) 112 | 例如:"蒲氏黏盲鳗的生活习性是什么?" 113 | * 统计查询(使用结构化检索) 114 | 例如:"虎鲨目的生物有多少种?" 115 | 116 | 2. 多路检索阶段: 117 | a) 向量检索路径: 118 | `dspy_inference.py → react_tools.py (GraphVectorizer类) → nanovector_db.py` 119 | - 将问题转换为向量 120 | - 计算向量相似度 121 | - 返回相关实体 122 | b) 图结构检索路径: 123 | `dspy_inference.py → react_tools.py (ReActTools类)` 124 | - 基于实体进行图遍历 125 | - 查找相关节点和关系 126 | - 返回结构化知识 127 | c) 结构化检索路径: 128 | `dspy_inference.py → dspy_query_db.py` 129 | - 将自然语言转换为SQL 130 | - 查询结构化数据库 131 | - 返回精确匹配结果 132 | 3. 结果整合与推理: 133 | - `dspy_inference.py` 整合多路检索结果 134 | - 使用 DSPy 进行推理和答案生成 135 | - 生成结构化的回答 136 | 4. 评估与优化: 137 | `dspy_evaluation.py` 138 | - 评估答案质量 139 | - 收集用户反馈 140 | - 用于模型优化 141 | - 更新优化器数据 142 | 5. 
返回结果给用户: 143 | - 流式返回答案 144 | - 保存交互记录 145 | - 更新系统状态 146 | 147 | 对应 `dspy_inference.py` 、 `dspy_evaluation.py` 和 `dspy_query_db.py` 模块。 148 | 149 | 7. **实时通信与状态同步**: 150 | - WebSocket 实现的实时消息推送 151 | - 支持流式输出的对话响应 152 | - 优化器状态的实时反馈 153 | 对应 `broadcast.py` 和 `app.py` 中的 WebSocket 实现。 154 | 155 | 8. **模型优化器**: 156 | - 支持基于用户反馈的模型优化 157 | - 版本管理和回滚功能 158 | - 优化器过程可视化 159 | 对应 `dspy_evaluation.py` 中的评估优化模块。 160 | 161 | 9. **数据库管理系统**: 162 | - SQLite 存储用户交互记录 163 | - 支持向量数据的批量处理 164 | - 数据版本控制 165 | 对应 `dspy_query_db.py` 中的数据库管理功能。 166 | 167 | 10. **前端交互界面**: 168 | 169 | - 基于 React 18 + Vite 的现代化界面 170 | - 实时对话窗口 171 | - 用户问答对收集 172 | - 推理过程展示 173 | - 优化进度展示 174 | 对应前端目录的实现。 175 | 176 | 11. **系统监控与日志**: 177 | - 基于 loguru 的分级日志系统 178 | - 性能监控和错误追踪 179 | - API 调用统计 180 | 对应各模块中的日志记录实现。 181 | 182 | 12. **环境配置管理**: 183 | - 支持多种 LLM 模型配置 184 | - 灵活的环境变量管理 185 | - 多环境部署支持 186 | 对应 `.env` 和 `.env.example` 的配置管理。 187 | 188 | 189 | # 技术框架 190 | ## **前端技术栈** 191 | - 开发语言:JavaScript+TypeScript 192 | - 前端框架:React 18 + Vite 193 | - UI 框架:TailwindCSS 194 | - 开发工具: 195 | * 构建工具:Vite 196 | - 实时通信:WebSocket 客户端 197 | 198 | ## **后端技术栈** 199 | - 开发语言:Python (推荐版本:3.8+) 200 | - Web 框架:FastAPI 201 | - 数据库: 202 | * 结构化数据:SQLite 203 | * 向量数据库:NanoVector (轻量级向量数据库) 204 | * 图结构信息存储:NetworkX (用于知识图谱存储) 205 | - 知识抽取: 206 | * 实体&关系抽取:DSPy + CoT (Chain of Thought) 207 | - AI 模型: 208 | * Embedding 模型:支持 配置见 .env.example 209 | * 大语言模型:支持 OpenAI/DeepSeek等,配置见 .env.example 210 | - 开发工具: 211 | * 依赖管理:pip 212 | * 环境管理:python-dotenv 213 | * 日志系统:loguru 214 | 215 | ## **系统架构** 216 | - 前后端分离架构 217 | - WebSocket 实时通信 218 | - 向量检索 + 图检索 + text2sql混合召回 219 | - DSPy 意图理解和推理 220 | 221 | **本项目主要关注解决方案的实现,部分代码由cursor生成提效** 222 | 223 | 224 | # 项目依赖 225 | 详情参考requirements.txt 226 | 227 | 228 | # 快速开始 229 | ## 0.拉取服务 230 | ### 拉取后端服务 231 | ```bash 232 | git clone https://github.com/loukie7/Datacapsule.git 233 | ``` 234 | ### 拉取前端服务 235 | 点击[Datacapsule-webui 项目仓库](https://github.com/loukie7/Datacapsule-webui)查看 236 | 237 | ## 1. 安装依赖 238 | ```bash 239 | cd Datacapsule 240 | pip install -r requirements.txt 241 | ``` 242 | 注意:如果安装时报错,可能是requirements.txt文件格式问题,建议: 243 | - 复制requirements.txt内容到新文件 244 | - 检查并删除可能的特殊字符 245 | - 使用新创建的依赖文件进行安装 246 | 247 | ## 2. 配置环境变量 248 | 在目录下创建.env文件,并按照.env.example模板进行配置。主要配置项如下: 249 | 250 | a) 大语言模型配置: 251 | 本项目使用DSPy进行意图识别,需要配置两个独立的模型: 252 | Dspy官方中文文档信息:https://www.aidoczh.com/dspy/ 253 | 1. 问答/推理模型:用于处理用户查询和推理 254 | 2. 
优化模型:用于模型优化 255 | 两个模型可以使用相同或不同的配置,支持OpenAI-SDK格式的模型: 256 | - OpenAI API系列:GPT-3.5/4/4o 257 | - DeepSeek系列:deepseek-chat/coder 258 | - 阿里云系列:Qwen/通义千问 259 | - 百度文心系列:ERNIE-Bot 260 | - Ollama本地部署 261 | - HuggingFace部署 262 | - VLLM高性能部署 263 | 264 | 265 | # 问答/推理模型配置(用于处理用户查询和推理) 266 | LLM_TYPE="deepseek" # 模型类型(可替换为其他模型) 267 | API_KEY="sk-xxxxxxxxxxxxxxxx" # API密钥 268 | BASE_URL="xxxxxxxxxxxxxxxxxxxxx" # API基础地址 269 | LLM_MODEL="deepseek-chat" # 具体的模型名称 270 | 271 | # Ollama配置(本地部署方案,适合离线环境) 272 | # LLM_TYPE="ollama_chat" # 设置为使用Ollama本地模型 273 | # API_KEY="" # Ollama本地部署不需要API密钥 274 | # BASE_URL="http://localhost:11434" # Ollama服务的本地地址 275 | # LLM_MODEL="xxxxxxxxxxxxx" # 使用的具体模型 276 | 277 | # 优化模型配置(用于模型后优化,如果不优化可以忽略) 278 | Train_LLM_TYPE="deepseek" # 优化模型类型(可替换为其他模型) 279 | Train_LLM_MODEL="deepseek-chat" # 优化使用的具体模型 280 | Train_OPENAI_API_KEY="xxxxxxxxxxxxxxxxxxxxx" # 优化模型的API密钥 281 | Train_OPENAI_BASE_URL="xxxxxxxxxxxxxxxxxxxxx" # 优化模型的API地址 282 | 283 | b) 系统环境配置(核心路径和参数设置): 284 | ``` 285 | RAG_DIR="graph_data_new" # 知识图谱数据存储目录 286 | LOG_LEVEL="DEBUG" # 日志级别(可选:DEBUG, INFO, WARNING, ERROR) 287 | DATABASE_URL="sqlite:///.dbs/interactions.db" # 交互数据库路径 288 | SPECIES_DB_URL="./.dbs/marine_species.db" # 物种数据库路径 289 | ``` 290 | 291 | c) 向量检索配置(影响检索效果的关键参数): 292 | ``` 293 | VECTOR_SEARCH_TOP_K=3 # 向量检索返回的最大结果数 294 | BETTER_THAN_THRESHOLD=0.7 # 相似度筛选阈值(0-1之间) 295 | GRAPHML_DIR="graph_entity_relation_detailed.graphml" # 知识图谱存储文件 296 | ``` 297 | 298 | d) Embedding模型配置(文本向量化参数): 299 | ``` 300 | MAX_BATCH_SIZE=100 # 批处理大小,影响处理速度 301 | EMBEDDING_MAX_TOKEN_SIZE=8192 # 单次处理的最大token数 302 | EMBEDDING_DIM=1024 # 向量维度 303 | EMBEDDING_MODEL="xxxxxxxxxxxxxxx" # 使用的embedding模型 304 | EMBEDDING_MODEL_BASE_URL="xxxxxxxxxxxxxxxxxxxxx" 305 | EMBEDDING_MODEL_API_KEY="your-embedding-api-key" # embedding服务的API密钥 306 | ``` 307 | 308 | 重要注意事项: 309 | - 所有标注为 "your-xxx-api-key" 的配置项必须替换为您申请的实际API密钥 310 | - API密钥可以从相应的服务提供商平台获取: 311 | - 请确保在运行程序前完成所有必要的配置 312 | - 配置文件中的路径可以根据实际部署环境进行调整 313 | - 建议在正式部署前在测试环境中验证配置的正确性 314 | 315 | ## 3. 运行服务 316 | 317 | ## 环境配置 318 | 本项目使用环境变量来配置API和WebSocket地址。 319 | 320 | ### 配置步骤 321 | 1. 复制`.env.example`文件并重命名为`.env`(或`.env.development`、`.env.production`等) 322 | 2. 根据你的环境修改文件中的变量值 323 | 324 | ### 可用环境变量 325 | - `VITE_API_URL`: 后端API地址 326 | - `VITE_WS_URL`: WebSocket服务地址 327 | 328 | ### 启动后端服务 329 | 确保Datacapsule和Datacapsule-webui在一个文件夹内 330 | ```bash 331 | cd Datacapsule 332 | python app.py 333 | ``` 334 | ### 启动前端服务 335 | 点击[Datacapsule-webui 项目仓库](https://github.com/loukie7/Datacapsule-webui)查看 336 | 337 | ## 4. 数据处理说明 338 | 本项目提供了两种数据处理方式: 339 | 1. 使用内置示例数据(默认方式) 340 | 2. 使用自定义数据: 341 | - 使用 tools/entity_extraction.py 进行图数据抽取 342 | - 使用 entity_extraction_db.py 进行结构化数据抽取与存储 343 | - 处理后的数据将自动存储在配置文件指定的位置: 344 | * 图数据:保存在 RAG_DIR 指定目录 345 | * 结构化数据:保存在 SPECIES_DB_URL 指定的数据库文件 346 | 347 | 348 | ## 5. 运行步骤 349 | **启动成功后的界面如下**: 350 | ![启动成功界面](./images/startup-success.jpg) 351 | 352 | 353 | **实体不在图谱中的问题:** 354 | 355 | ![非实体信息截图](./images/非实体信息截图.jpg) 356 | 357 | **补充说明**:当用户查询的实体不存在于知识图谱中时,系统会自动切换至向量检索策略。当前配置使用 `top_k=1` 参数,仅返回相似度最高的单个结果。这种设计在处理专业领域内的模糊查询时表现良好,但面对领域外查询时存在局限性: 358 | 359 | 1. 对于需要综合多个信息源的复杂问题,单一结果可能不够全面 360 | 2. 对于统计类问题(如"有多少种..."),系统只能基于有限上下文回答 361 | 3. 
对于非专业领域的问题,缺乏足够的背景知识进行准确响应 362 | 363 | 此限制是系统当前设计的权衡结果,可通过以下方式改进: 364 | - 在 `dspy_inference.py` 中调整 `top_k` 参数以获取更多结果 365 | - 对非领域问题实现智能转发至通用模型 366 | - 扩展知识图谱覆盖更广泛的实体信息 367 | 368 | **实体在图谱中的问题:** 369 | 370 | - **实体查询问题:** 371 | 372 | ![实体信息查询](./images/实体信息查询.jpg) 373 | 374 | - **关系查询问题:** 375 | 376 | ![关系信息查询](./images/关系信息查询.jpg) 377 | 378 | - **属性查询问题:** 379 | 380 | ![属性信息查询](./images/属性信息查询.jpg) 381 | 382 | - **统计查询问题:** 383 | 384 | ![统计信息查询](./images/统计信息查询.jpg) 385 | 386 | 问题的正确性可以去~docs/demo130.json自行验证 387 | 388 | - **知识图谱展现:** 389 | 390 | - 点击首页link链接即可即可获取知识图谱信息 391 | 392 | - **构建优化样本**: 393 | 394 | - 人工去修改前端页面中“推理过程”和“模型返回”中的内容 395 | - 目前架构下小样本的优化数据(30-50条)能取得一定的效果 396 | - ![优化样本](./images/优化样本.jpg) 397 | 398 | - **优化样本:** 399 | 400 | - ![训练所有样本](./images/训练所有样本.jpg) 401 | 402 | 403 | 404 | 405 | 406 | ### DSPy 意图理解机制 407 | 408 | 1. **零样本理解能力**: 409 | 410 | - DSPy 框架使用 ReAct(Reasoning+Acting)模式,允许大模型在无需预训练的情况下理解用户意图 411 | - 系统通过 `dspy_inference.py` 中的 `ReActModel` 类集成了多种工具函数 412 | - 大模型根据问题语义自动选择最合适的工具,例如: 413 | * 实体问题:"什么是台湾盲鳗?" → 调用 `find_nodes_by_node_type` 414 | * 统计问题:"盲鳗科有多少种?" → 调用适当的计数和查询方法 415 | 416 | 2. **零样本理解的实现原理**: 417 | 418 | - 在 `dspy_inference.py` 中,ReAct 模块会自动解析每个工具函数的签名和文档字符串: 419 | ```python 420 | # dspy_inference.py 中的核心代码 421 | self.react = dspy.ReAct( 422 | DspyInferenceProcessor.MarineBiologyKnowledgeQueryAnswer, 423 | max_iters = MAX_ITERS, 424 | tools=[ 425 | processor.find_nodes_by_node_type, 426 | processor.get_unique_vector_query_results, 427 | # ...其他工具 428 | ] 429 | ) 430 | ``` 431 | 432 | - 工具函数的详细文档提供了关键上下文,如 `find_nodes_by_node_type` 中的描述: 433 | ```python 434 | def find_nodes_by_node_type(self, start_node, trget_node_type): 435 | ''' 436 | 此方法会根据传入的节点名称,在图数据中以该节点为起点查找包含指定节点类型的节点列表。 437 | start_node 为开始查找的树节点名称,只允许单个节点。 438 | trget_node_type 目标节点类型,只允许单个类型名称。 439 | 返回值为从该节点开始,包含指定属性名的节点数量与节点列表。 440 | 已知图数据中存在一系列的海洋生物相关信息: 441 | 1. ⽣物分类学图数据:包括"拉丁学名", "命名年份", "作者", "中文学名" 442 | 2. ⽣物科属于数据:"界", "门", "纲", "目", "科", "属", "种"... 443 | ''' 444 | ``` 445 | 446 | - DSPy 内部生成隐式提示,引导模型如何为不同问题选择工具: 447 | * 当问题包含"台湾盲鳗是什么"时,模型理解这是查询特定实体的描述 448 | * 当问题包含"盲鳗科有多少种"时,模型理解这需要计数操作 449 | 450 | - 大模型的思维链能力(在 `react_tools.py` 中体现)让系统能够: 451 | * 分析问题中的关键实体和关系 452 | * 规划多步检索策略 453 | * 根据中间结果调整后续操作 454 | 455 | 这种零样本理解能力不依赖于预先定义的硬编码规则,而是依托于: 456 | 1. 函数的清晰命名和文档 457 | 2. DSPy的提示工程自动化 458 | 3. 大模型的上下文理解能力 459 | 4. ReAct框架的推理-行动循环机制 460 | 461 | 3. **工具选择机制**: 462 | ```python 463 | self.react = dspy.ReAct( 464 | DspyInferenceProcessor.MarineBiologyKnowledgeQueryAnswer, 465 | max_iters = MAX_ITERS, 466 | tools=[processor.find_nodes_by_node_type, ...] 467 | ) 468 | ``` 469 | - 模型通过思考链(Chain-of-Thought)分析问题特征 470 | - 基于问题类型动态选择工具组合 471 | - 无需硬编码规则即可处理多种问题类型 472 | 473 | ### DSPy 优化原理与效果 474 | 475 | 1. **优化技术本质**: 476 | - DSPy 优化不是传统的参数微调,而是**提示工程自动化** 477 | - 系统通过 `dspy_evaluation.py` 中的评估器收集用户反馈数据 478 | - 优化过程存储在 `dspy_program` 目录中的程序文件(.pkl 和 .json) 479 | 480 | 2. **优化流程**: 481 | ```python 482 | # app.py 中的优化逻辑 483 | async def run_dspy_optimization(training_data: List[Dict], version: str, ids: List[str]): 484 | # 收集优化数据 485 | # 构建评估指标 486 | # 优化推理程序 487 | # 保存优化后的模型 488 | ``` 489 | - 收集用户提问和反馈数据作为优化样本 490 | - 使用 BiologicalRetrievalEvaluation 评估推理质量 491 | - 应用多次迭代优化,生成更精确的思考链模板 492 | 493 | 3. **优化效果**: 494 | - **意图理解增强**:系统能更准确区分实体查询、关系查询、属性查询和统计查询 495 | - **工具选择优化**:模型学会更高效地组合检索工具,减少不必要的检索步骤 496 | - **推理模式改进**:通过分析成功案例,系统生成更结构化的推理路径 497 | - **领域适应性**:优化后的系统表现出更强的领域特定理解能力,尤其在海洋生物学术语处理上 498 | 499 | 4. 
**版本比较**: 500 | - 通过比较 `program_v1.0.1_20250302192606.json` 和 `program_v1.0.3_20250315154834.pkl` 可见优化效果 501 | 502 | 503 | 504 | 505 | ## 6. 交流与问题讨论 506 | ## About data 507 | 508 | ### 1. 数据源替换 509 | 510 | #### 内置数据源替换 511 | 512 | 本项目包含两个内置示例数据集(`demo18.json`和`demo130.json`),它们结构相同但数据量不同。替换步骤: 513 | 514 | ```bash 515 | # 替换小型测试数据集 516 | cp your_small_dataset.json docs/demo18.json 517 | 518 | # 替换完整数据集 519 | cp your_full_dataset.json docs/demo130.json 520 | ``` 521 | 522 | 两个数据集共享相同的结构和字段,仅在数据量上有区别,方便您进行快速测试和完整训练。 523 | 524 | #### 自定义数据引入 525 | 526 | 引入您自己的领域数据需要以下全面调整: 527 | 528 | 1. **准备JSON格式数据** 529 | - 系统优先支持JSON格式,包含实体、关系和属性字段 530 | 531 | 2. **实体抽取与图谱构建** 532 | - 使用`tools/entity_extraction.py`从JSON中提取实体并构建图谱 533 | - 需修改抽取逻辑以适配您的数据结构 534 | - 自定义实体类型和关系类型映射 535 | 536 | 3. **建立结构化数据库** 537 | - 使用`tools/entity_extraction_db.py`创建关系型数据库 538 | - 调整数据库表结构设计 539 | - 修改字段映射和索引策略 540 | 541 | 4. **DSPy组件全面调整** 542 | 543 | a. `dspy_inference.py`: 544 | - 重新定义问题类型和意图分类 545 | - 修改`MarineBiologyKnowledgeQueryAnswer`签名类及描述 546 | - 调整ReAct工具选择逻辑和参数 547 | - 自定义推理流程和决策路径 548 | 549 | b. `dspy_evaluation.py`: 550 | - 重新设计评估指标和权重 551 | - 修改`BiologicalRetrievalEvaluation`签名以匹配新领域 552 | - 调整评分标准和反馈机制 553 | 554 | c. `dspy_query_db.py`: 555 | - 重构SQL生成逻辑 556 | - 调整`NaturalLanguageToSQL`提示 557 | - 修改数据库查询和结果格式化 558 | 559 | d. `react_tools.py`: 560 | - 重新定义`NODE_HIERARCHY`以匹配新领域的层级关系 561 | - 调整图检索算法和路径选择逻辑 562 | - 修改向量检索参数和阈值 563 | 564 | 5. **配置文件调整** 565 | - 修改`.env.example`和`.env`中的模型参数 566 | - 调整向量检索参数和阈值 567 | - 更新数据路径和文件名 568 | 569 | 6. **优化数据准备** 570 | 571 | - 创建领域特定的示例问答对 572 | - 编写标准推理路径作为优化基准 573 | - 设计适合新领域的评估样本 574 | 575 | ### 2. 数据场景适配性 576 | 577 | ### 最佳适用场景 578 | - **有明确标准答案的领域**:如百科知识、产品目录、技术规范等 579 | - **结构化程度高的数据**:实体关系明确、属性定义清晰的知识库 580 | - **专业垂直领域**:如本项目示例中的海洋生物学分类系统 581 | 582 | ### 需要额外工作的场景 583 | - **非量化评估内容**:如论文概要、观点分析等主观内容 584 | - **需要推理的场景**:需要复杂逻辑推导的问题 585 | - **多源异构数据**:来自不同格式、不同结构的混合数据 586 | 587 | 在这些场景中,您需要设计自定义评估指标才能有效衡量系统表现。 588 | 589 | ### 3. 数据处理全流程(未来规划) 590 | 591 | 数据清洗和切分是我们下一阶段的重点开发方向,将实现以下流程: 592 | 593 | ### 数据预处理流程 594 | 595 | 1. **版面识别转换** 596 | - PDF等文档通过版面识别模型转换为结构化Markdown 597 | - 关键步骤:自动识别→结构化转换→人工校验 598 | 599 | 2. **智能内容切分** 600 | - 多种切分策略:固定长度、语义分割、页面分割、递归分块 601 | - 自适应切分:根据内容特点自动选择最佳切分方式 602 | - 切分后进行人工复核确保质量 603 | 604 | 3. **多模态向量化** 605 | - 文本:使用大规模语言模型生成向量表示 606 | - 图像:通过多模态模型处理,提取视觉与文本语义 607 | - 表格:专用模型转换为结构化文本后向量化 608 | - 所有非文本内容经过人工确认后再进行向量化 609 | 610 | 4. **结构化处理**(可选) 611 | - 通过大模型将非结构化内容转换为JSON格式 612 | - 字段粒度和深度可根据业务需求定制 613 | - 支持复杂嵌套结构和多级关系 614 | 615 | 5. **多级索引构建** 616 | - 向量索引:所有内容的语义向量进入向量数据库 617 | - 实体索引:抽取的实体及关系进入专用索引 618 | - 结构化索引:JSON数据导入关系型数据库 619 | - 混合索引:支持多路召回和交叉验证 620 | 621 | 622 | 623 | ## 系统局限性与改进方向 624 | 625 | ### 当前意图识别模块的局限 626 | 627 | 1. **流式输出支持有限** 628 | - 当前框架不支持真正的增量式流式输出 629 | - 大型响应可能导致前端等待时间延长 630 | - 用户体验在复杂查询时可能受到影响 631 | 632 | 2. *优化效果量化挑战** 633 | - 优化效果不易在量化指标上直观体现 634 | - 领域适应性提升难以精确衡量 635 | - 对比测试基准尚不完善 636 | 637 | 3. **架构灵活性不足** 638 | - 现有框架与业务逻辑耦合度较高 639 | - 难以快速适应新领域和新需求 640 | - 未来目标:发展为可配置的中间件形态,支持插件式开发 641 | 642 | ### 复杂查询处理能力 643 | 644 | 1. **多条件筛选查询支持情况** 645 | - 系统原则上支持多条件筛选的统计查询 646 | - 例如:"体长3m以上,生活在东海的虎鲨目鲨鱼有多少种?" 647 | 648 | 2. **查询精度依赖因素** 649 | - 查询精度高度依赖于结构化数据的字段粒度 650 | - 关键条件: 651 | * 用户筛选条件必须与`entity_extraction_db.py`处理的结构化数据字段匹配 652 | * 查询字段需作为独立属性存储(如"体长"、"自然分布地") 653 | * 若属性被合并(如多种特征合并为"生物特征"),查询精度将显著降低 654 | 655 | 3. **改进方向** 656 | - 优化实体抽取逻辑,支持更细粒度的属性识别 657 | - 增强结构化数据处理,改进属性分离与标准化 658 | - 提升模糊匹配能力,处理非精确条件表述 659 | - 引入自动字段映射,实现用户查询与数据字段的智能对应 660 | 661 | ### 响应效率提升策略 662 | 663 | 1. 
**本地部署优化** 664 | - 本地模型部署可显著提升整体响应速度 665 | - 推荐搭配高性能推理框架: 666 | * [VLLM](https://github.com/vllm-project/vllm):支持高效批处理和KV缓存 667 | * [Xinference](https://github.com/xorbitsai/xinference):分布式推理支持和资源优化 668 | - 模型选择建议: 669 | * 不推荐本地部署小参数模型(7B/14B),推理质量难以满足复杂推理需求 670 | 671 | 2. **API服务选择** 672 | - 不同服务提供商性能差异显著 673 | - 服务对比分析: 674 | * DeepSeek官方API:功能完整但响应较慢,适合非实时场景 675 | - 选择建议: 676 | * 对成本敏感的场景,可在保证基本性能的前提下选择性价比更高的服务商 677 | * 建议在正式部署前进行多服务商的性能和成本对比测试 678 | 679 | 680 | 681 | ## 图谱管理与展示说明 682 | 683 | ### 图数据库与可视化优化 684 | 685 | 1. **当前图谱管理架构** 686 | - 采用轻量级图数据库实现(基于NetworkX) 687 | - 特点与局限: 688 | * 高效灵活,便于集成和部署 689 | * 缺少专业的图数据库管理界面 690 | * 不支持复杂的可视化配置和交互操作 691 | - 未来规划: 692 | * 集成专业图数据库(如Neo4j或TigerGraph) 693 | * 开发管理员控制台,支持图谱结构调整 694 | * 优化存储结构,提升大规模图谱处理能力 695 | 696 | 2. **知识图谱展现优化** 697 | - 当前实现: 698 | * 基础HTML展示(`knowledge_graph-1.html`) 699 | * 简单网络图布局,缺乏交互功能 700 | * 节点和边的样式未经专业设计 701 | - 改进计划: 702 | * 引入专业图可视化库(如ECharts、Graphin等) 703 | * 实现自适应布局和缩放功能 704 | * 支持节点分组、过滤和高亮等交互特性 705 | 706 | 3. **推理过程展示说明** 707 | - 当前设计: 708 | * 系统故意保留并展示详细的推理过程 709 | * 目的:方便开发者和用户深入理解系统决策路径 710 | * 有助于调试和验证推理质量 711 | - 可配置选项: 712 | * 生产环境可通过配置隐藏详细推理过程 713 | * 研发环境可保留完整思考链用于开发和优化 714 | * 后续版本将提供更精细的展示控制选项 715 | 716 | 717 | 718 | ## 7. 下一步计划 719 | ### **从解决方案到端到端产品**: 720 | 721 | 1. **当前定位与局限** 722 | - 目前开源内容本质上是一套技术解决方案 723 | - 主要挑战: 724 | * 用户需更换数据集时,需修改大量代码 725 | * 定制化程度高,可复用性有限 726 | * 技术门槛较高,不适合非技术团队直接使用 727 | 728 | 2. **产品化发展路线** 729 | - 核心转变:从代码修改到配置驱动 730 | - 规划功能: 731 | * 可视化配置界面:意图识别框架签名、评估方案等 732 | * 模块化设计:支持即插即用的组件替换 733 | * 低代码/无代码接口:降低使用门槛 734 | * 自动化工作流:简化数据预处理和模型优化过程 735 | - 目标:大幅降低企业知识库构建与维护成本 736 | 737 | 3. **"数据胶囊"产品愿景** 738 | - 产品名称由来:Datacapsule(数据胶囊)—— 小小胶囊蕴含庞大能量 739 | - 核心价值主张: 740 | * 降低企业知识构建难度 741 | * 形成企业闭环的知识壁垒 742 | * 释放大模型在垂直领域的潜力 743 | - 适用场景: 744 | * 企业专有知识管理 745 | * 专业领域智能问答 746 | * 行业知识图谱构建与应用 747 | 748 | ### 开放协作邀请 749 | 750 | 我们诚挚邀请对知识图谱、大模型应用、数据处理等领域感兴趣的开发者加入项目。如有兴趣,请扫描README文件末尾的二维码与我们联系,一起探索知识增强的未来! 751 | 752 | 753 | 754 | ## 8.鸣谢 755 | 756 | **项目鸣谢**:十分感谢百度飞桨AI技术生态部:梦姐、楠哥和张翔、新飞同学对本项目的大力支持与帮助! 
757 | 758 | **项目核心贡献者**:Loukie7、Alex—鹏哥 759 | 760 | 对项目感兴趣的同学可以扫码添加好友,后续会成立产品交流社群 761 | 762 | ![二维码](./images/二维码.jpg) 763 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aioboto3==13.4.0 2 | aiobotocore==2.18.0 3 | aiofiles==24.1.0 4 | aiohappyeyeballs==2.4.6 5 | aiohttp==3.11.12 6 | aioitertools==0.12.0 7 | aiosignal==1.3.2 8 | alembic==1.14.1 9 | annotated-types==0.7.0 10 | anyio==4.8.0 11 | appnope==0.1.4 12 | APScheduler==3.11.0 13 | asttokens==3.0.0 14 | asyncer==0.0.8 15 | asyncpg==0.30.0 16 | attrs==25.1.0 17 | azure-ai-documentintelligence==1.0.0 18 | azure-core==1.32.0 19 | azure-identity==1.20.0 20 | backoff==2.2.1 21 | beautifulsoup4==4.13.3 22 | boto3==1.36.1 23 | botocore==1.36.1 24 | cachetools==5.5.1 25 | certifi==2025.1.31 26 | cffi==1.17.1 27 | charset-normalizer==3.4.1 28 | click==8.1.8 29 | cloudpickle==3.1.1 30 | cobble==0.1.4 31 | colorlog==6.9.0 32 | comm==0.2.2 33 | cryptography==43.0.3 34 | dashscope==1.22.1 35 | datasets==3.3.0 36 | debugpy==1.8.12 37 | decorator==5.1.1 38 | defusedxml==0.7.1 39 | dill==0.3.8 40 | diskcache==5.6.3 41 | distro==1.9.0 42 | dnspython==2.7.0 43 | dspy==2.6.3 44 | email_validator==2.2.0 45 | et_xmlfile==2.0.0 46 | executing==2.2.0 47 | fastapi==0.115.8 48 | fastapi-sso==0.16.0 49 | filelock==3.17.0 50 | frozenlist==1.5.0 51 | fsspec==2024.12.0 52 | gunicorn==22.0.0 53 | h11==0.14.0 54 | httpcore==1.0.7 55 | httptools==0.6.4 56 | httpx==0.28.1 57 | huggingface-hub==0.28.1 58 | idna==3.10 59 | importlib_metadata==8.6.1 60 | ipykernel==6.29.5 61 | ipython==8.32.0 62 | isodate==0.7.2 63 | jedi==0.19.2 64 | Jinja2==3.1.5 65 | jiter==0.8.2 66 | jmespath==1.0.1 67 | joblib==1.4.2 68 | json_repair==0.37.0 69 | jsonlines==4.0.0 70 | jsonpickle==4.0.2 71 | jsonschema==4.23.0 72 | jsonschema-specifications==2024.10.1 73 | jupyter_client==8.6.3 74 | jupyter_core==5.7.2 75 | litellm==1.61.6 76 | loguru==0.7.3 77 | lxml==5.3.1 78 | magicattr==0.1.6 79 | Mako==1.3.9 80 | mammoth==1.9.0 81 | markdownify==1.0.0 82 | markitdown==0.0.1a4 83 | MarkupSafe==3.0.2 84 | matplotlib-inline==0.1.7 85 | msal==1.31.1 86 | msal-extensions==1.2.0 87 | multidict==6.1.0 88 | multiprocess==0.70.16 89 | nano-vectordb==0.0.4.3 90 | nest-asyncio==1.6.0 91 | networkx==3.4.2 92 | numpy==1.26.4 93 | oauthlib==3.2.2 94 | olefile==0.47 95 | openai==1.63.0 96 | openpyxl==3.1.5 97 | optuna==4.2.1 98 | orjson==3.10.15 99 | packaging==24.2 100 | pandas==2.2.3 101 | parso==0.8.4 102 | pathvalidate==3.2.3 103 | pdfminer.six==20240706 104 | pexpect==4.9.0 105 | pillow==11.1.0 106 | platformdirs==4.3.6 107 | portalocker==2.10.1 108 | prompt_toolkit==3.0.50 109 | propcache==0.2.1 110 | psutil==7.0.0 111 | ptyprocess==0.7.0 112 | pure_eval==0.2.3 113 | puremagic==1.28 114 | pyarrow==19.0.0 115 | pycparser==2.22 116 | pydantic==2.10.6 117 | pydantic_core==2.27.2 118 | pydub==0.25.1 119 | Pygments==2.19.1 120 | PyJWT==2.10.1 121 | PyMySQL==1.1.1 122 | PyNaCl==1.5.0 123 | python-dateutil==2.9.0.post0 124 | python-dotenv==1.0.1 125 | python-multipart==0.0.18 126 | python-pptx==1.0.2 127 | pytz==2025.1 128 | pyvis==0.3.2 129 | PyYAML==6.0.2 130 | pyzmq==26.2.1 131 | redis==5.2.1 132 | referencing==0.36.2 133 | regex==2024.11.6 134 | requests==2.32.3 135 | rpds-py==0.22.3 136 | rq==2.1.0 137 | s3transfer==0.11.2 138 | six==1.17.0 139 | sniffio==1.3.1 140 | soupsieve==2.6 141 | SpeechRecognition==3.14.1 142 | SQLAlchemy==2.0.38 143 | stack-data==0.6.3 144 | 
starlette==0.45.3 145 | tenacity==9.0.0 146 | tiktoken==0.7.0 147 | tokenizers==0.21.0 148 | tornado==6.4.2 149 | tqdm==4.67.1 150 | traitlets==5.14.3 151 | typing_extensions==4.12.2 152 | tzdata==2025.1 153 | tzlocal==5.3 154 | ujson==5.10.0 155 | urllib3==2.3.0 156 | uvicorn==0.29.0 157 | uvloop==0.21.0 158 | watchfiles==1.0.4 159 | wcwidth==0.2.13 160 | websocket-client==1.8.0 161 | websockets==15.0 162 | wrapt==1.17.2 163 | xlrd==2.0.1 164 | XlsxWriter==3.2.2 165 | xxhash==3.5.0 166 | yarl==1.18.3 167 | youtube-transcript-api==0.6.3 168 | zipp==3.21.0 169 | -------------------------------------------------------------------------------- /tools/entity_extraction.py: -------------------------------------------------------------------------------- 1 | import os,io,sys 2 | import json 3 | import networkx as nx 4 | import dspy 5 | 6 | # 定义用于分类的签名 7 | class ClassifyDistribution(dspy.Signature): 8 | """将生物的自然分布地文本拆分为多个具体的地理位置实体。""" 9 | text = dspy.InputField() 10 | locations = dspy.OutputField(description="从文本中提取的地理位置列表") 11 | 12 | class ClassifyHabits(dspy.Signature): 13 | """将生物的生活习性文本拆分为多个具体的习性特征实体。""" 14 | text = dspy.InputField() 15 | feeding = dspy.OutputField(description="食性信息") 16 | reproduction = dspy.OutputField(description="繁殖信息") 17 | behavior = dspy.OutputField(description="行为特征") 18 | 19 | class ClassifyFeatures(dspy.Signature): 20 | """将生物的生物特征文本拆分为多个具体的特征实体。""" 21 | text = dspy.InputField() 22 | body_shape = dspy.OutputField(description="体型特征") 23 | body_color = dspy.OutputField(description="体色特征") 24 | body_size = dspy.OutputField(description="体长信息") 25 | special_features = dspy.OutputField(description="特殊特征") 26 | 27 | # 创建分类器 28 | class DistributionExtractor(dspy.Module): 29 | def __init__(self): 30 | self.classifier = dspy.Predict(ClassifyDistribution) 31 | 32 | def forward(self, text): 33 | return self.classifier(text=text) 34 | 35 | class HabitsExtractor(dspy.Module): 36 | def __init__(self): 37 | self.classifier = dspy.Predict(ClassifyHabits) 38 | 39 | def forward(self, text): 40 | return self.classifier(text=text) 41 | 42 | class FeaturesExtractor(dspy.Module): 43 | def __init__(self): 44 | self.classifier = dspy.Predict(ClassifyFeatures) 45 | 46 | def forward(self, text): 47 | return self.classifier(text=text) 48 | 49 | # 设置DSPy的语言模型 50 | def setup_dspy(): 51 | ali= dspy.LM( 52 | f'deepseek/{os.getenv("ALI_LLM_MODEL")}', 53 | base_url=os.getenv("ALI_OPENAI_BASE_URL"), 54 | api_key=os.getenv("ALI_OPENAI_API_KEY") 55 | ) 56 | dspy.settings.configure(lm=ali) 57 | 58 | # 主函数 59 | def process_entities(): 60 | # 设置DSPy 61 | setup_dspy() 62 | 63 | # 初始化提取器 64 | distribution_extractor = DistributionExtractor() 65 | habits_extractor = HabitsExtractor() 66 | features_extractor = FeaturesExtractor() 67 | 68 | # 读取JSON数据 69 | with open('/Users/idw/rags/modellens_dspyv3.0/docs/demo130.json', 'r', encoding='utf-8') as f: 70 | data = json.load(f) 71 | 72 | # 创建有向图 73 | G = nx.DiGraph() 74 | 75 | # 遍历每个生物实体 76 | print("开始处理生物实体数据...") 77 | print(f"共加载 {len(data)} 个生物实体数据") 78 | for entity_index, entity in enumerate(data): 79 | # 使用中文学名作为唯一标识符 80 | entity_id = entity['中文学名'] 81 | print(f"\n[{entity_index+1}/{len(data)}] 正在处理生物: {entity_id}(拉丁学名: {entity['拉丁学名']})") 82 | print(f" 分类信息: 界={entity['界']}, 门={entity['门']}, 纲={entity['纲']}, 目={entity['目']}, 科={entity['科']}, 属={entity['属']}, 种={entity['种']}") 83 | 84 | # 安全获取命名信息,处理可能缺失的字段 85 | naming_year = entity.get('命名年份', '未知') 86 | # 如果命令年份不是字符串则转换为字符串 87 | if not isinstance(naming_year, str): 88 | naming_year = str(naming_year) 89 | 
author = entity.get('作者', '未知') 90 | print(f" 命名信息: 命名年份={naming_year}, 作者={author}") 91 | 92 | # 添加实体节点,包含基本属性,使用get方法安全获取可能缺失的字段 93 | G.add_node(entity_id, 94 | 中文学名=entity['中文学名'], 95 | 拉丁学名=entity['拉丁学名'], # 添加拉丁学名属性 96 | 命名年份=naming_year, 97 | 作者=entity.get('作者', ''), 98 | node_type='种') # 将'species'改为'种' 99 | 100 | # 添加命名年份节点 101 | year_node_id = f"年份_{naming_year}" 102 | G.add_node(year_node_id, name=naming_year, node_type="命名年份") 103 | G.add_edge(entity_id, year_node_id, relation="命名于") 104 | # 添加作者节点 105 | author_node_id = f"作者_{author}" 106 | G.add_node(author_node_id, name=author, node_type="作者") 107 | G.add_edge(entity_id, author_node_id, relation="作者为") 108 | # 添加拉丁学名节点 109 | latin_name_node_id = f"拉丁学名_{entity['拉丁学名']}" 110 | G.add_node(latin_name_node_id, name=entity['拉丁学名'], node_type="拉丁学名") 111 | G.add_edge(entity_id, latin_name_node_id, relation="拉丁学名") 112 | # 添加分类层级关系 113 | print(f" 构建分类层级关系...") 114 | taxonomy_levels = ['界', '门', '纲', '目', '科', '属'] # 移除'种'级别 115 | for i in range(len(taxonomy_levels)): 116 | current_level = taxonomy_levels[i] 117 | current_value = entity[current_level] 118 | 119 | # 添加分类节点 120 | if not G.has_node(current_value): 121 | G.add_node(current_value, node_type=current_level) 122 | print(f" - 添加{current_level}节点: {current_value}") 123 | 124 | # 添加分类关系边 125 | if i > 0: 126 | previous_level = taxonomy_levels[i-1] 127 | previous_value = entity[previous_level] 128 | G.add_edge(previous_value, current_value, relation='包含') 129 | print(f" - 添加关系: {previous_value} 包含 {current_value}") 130 | 131 | # 直接连接属到物种实体,跳过种级别 132 | G.add_edge(entity['属'], entity_id, relation='包含') 133 | print(f" - 添加关系: {entity['属']} 包含 {entity_id}") 134 | 135 | # 处理自然分布地 136 | print(f" 处理 {entity_id} 的自然分布地信息...") 137 | try: 138 | print(f" 原始自然分布地文本: {entity['自然分布地']}") 139 | distribution_result = distribution_extractor(entity['自然分布地']) 140 | print(f" 提取到的地理位置: {distribution_result.locations}") 141 | # 如果是字符串则根据','分割成字符串,还需要','的分割 142 | locations = [] 143 | if isinstance(distribution_result.locations, str): 144 | # 如果是','的还需要分割 145 | if ',' in distribution_result.locations: 146 | locations = distribution_result.locations.split(',') 147 | # 再循环判断一下是否有','的 148 | for location in locations: 149 | if ',' in location: 150 | locations.append(location) 151 | # 去除空格 152 | locations = [location.strip() for location in locations] 153 | else: 154 | locations = distribution_result.locations 155 | 156 | for idx, location in enumerate(locations): 157 | # 过滤掉无效的地理位置信息 158 | if location and location.strip() and location != "无信息" and location != "不明确": 159 | location_id = f"{location}" 160 | G.add_node(location_id, name=location, node_type='自然分布地') 161 | G.add_edge(entity_id, location_id, relation='分布于') 162 | print(f" - 添加地理位置: {location}") 163 | except Exception as e: 164 | print(f" 处理自然分布地时出错: {e}") 165 | # 如果分类失败,添加原始文本 166 | location_id = f"{entity['自然分布地']}" 167 | G.add_node(location_id, name=entity['自然分布地'], node_type='自然分布地') 168 | G.add_edge(entity_id, location_id, relation='分布于') 169 | print(f" 使用原始文本作为地理位置: {entity['自然分布地']}") 170 | 171 | 172 | # 处理生活习性 173 | print(f" 处理 {entity_id} 的生活习性信息...") 174 | try: 175 | print(f" 原始生活习性文本: {entity['生活习性']}") 176 | habits_result = habits_extractor(entity['生活习性']) 177 | print(f" 食性={habits_result.feeding}, 繁殖={habits_result.reproduction}, 行为={habits_result.behavior}") 178 | 179 | # 添加食性信息 180 | if habits_result.feeding and "无具体" not in habits_result.feeding and "不明确" not in habits_result.feeding: 181 | feeding_id = f"{habits_result.feeding}" 
182 | G.add_node(feeding_id, name=habits_result.feeding, node_type='生活习性') 183 | G.add_edge(entity_id, feeding_id, relation='食性为') 184 | print(f" - 添加食性: {habits_result.feeding}") 185 | 186 | # 添加繁殖信息 187 | if habits_result.reproduction and "无具体" not in habits_result.reproduction and "不明确" not in habits_result.reproduction: 188 | reproduction_id = f"{habits_result.reproduction}" 189 | G.add_node(reproduction_id, name=habits_result.reproduction, node_type='生活习性') 190 | G.add_edge(entity_id, reproduction_id, relation='繁殖特征') # 修改关系方向 191 | print(f" - 添加繁殖信息: {habits_result.reproduction}") 192 | 193 | # 添加行为特征 194 | if habits_result.behavior and "无具体" not in habits_result.behavior and "不明确" not in habits_result.behavior: 195 | behavior_id = f"{habits_result.behavior}" 196 | G.add_node(behavior_id, name=habits_result.behavior, node_type='生活习性') 197 | G.add_edge(entity_id, behavior_id, relation='行为特征') # 修改关系方向 198 | print(f" - 添加行为特征: {habits_result.behavior}") 199 | except Exception as e: 200 | print(f" 处理生活习性时出错: {e}") 201 | # 如果分类失败,添加原始文本 202 | habits_id = f"{entity['生活习性']}" 203 | G.add_node(habits_id, name=entity['生活习性'], node_type='生活习性') 204 | G.add_edge(entity_id, habits_id, relation='生活习性') 205 | print(f" 使用原始文本作为生活习性: {entity['生活习性']}") 206 | 207 | 208 | # 处理生物特征 209 | print(f" 处理 {entity_id} 的生物特征信息...") 210 | try: 211 | print(f" 原始生物特征文本: {entity['生物特征']}") 212 | features_result = features_extractor(entity['生物特征']) 213 | print(f" 提取结果: 体型={features_result.body_shape}, 体色={features_result.body_color}, 体长={features_result.body_size}, 特殊特征={features_result.special_features}") 214 | 215 | 216 | # 添加体型特征 217 | if features_result.body_shape and "无具体" not in features_result.body_shape and "不明确" not in features_result.body_shape: 218 | shape_id = f"{features_result.body_shape}" 219 | G.add_node(shape_id, name=features_result.body_shape, node_type='生物特征') 220 | G.add_edge(entity_id, shape_id, relation='体型为') # 修改关系方向 221 | print(f" - 添加体型特征: {features_result.body_shape}") 222 | 223 | # 添加体色特征 224 | if features_result.body_color and "无具体" not in features_result.body_color and "不明确" not in features_result.body_color: 225 | color_id = f"{features_result.body_color}" 226 | G.add_node(color_id, name=features_result.body_color, node_type='生物特征') 227 | G.add_edge(entity_id, color_id, relation='体色为') # 修改关系方向 228 | print(f" - 添加体色特征: {features_result.body_color}") 229 | 230 | # 添加体长信息 231 | if features_result.body_size and "无具体" not in features_result.body_size and "不明确" not in features_result.body_size: 232 | size_id = f"{features_result.body_size}" 233 | G.add_node(size_id, name=features_result.body_size, node_type='生物特征') 234 | G.add_edge(entity_id, size_id, relation='体长为') # 修改关系方向 235 | print(f" - 添加体长信息: {features_result.body_size}") 236 | 237 | # 添加特殊特征 238 | if features_result.special_features and "无具体" not in features_result.special_features and "不明确" not in features_result.special_features: 239 | special_id = f"{features_result.special_features}" 240 | G.add_node(special_id, name=features_result.special_features, node_type='生物特征') 241 | G.add_edge(entity_id, special_id, relation='特殊特征') # 修改关系方向 242 | print(f" - 添加特殊特征: {features_result.special_features}") 243 | except Exception as e: 244 | print(f" 处理生物特征时出错: {e}") 245 | # 如果分类失败,添加原始文本 246 | features_id = f"{entity['生物特征']}" 247 | G.add_node(features_id, name=entity['生物特征'], node_type='生物特征') 248 | G.add_edge(entity_id, features_id, relation='生物特征') 249 | print(f" 使用原始文本作为生物特征: {entity['生物特征']}") 250 | 251 | 252 | # 保存为GraphML格式 253 | 
output_file = '/Users/idw/rags/modellens_dspyv3.0/graph_data_new/graph_entity_relation_detailed.graphml' 254 | print(f"\n保存知识图谱到文件: {output_file}") 255 | nx.write_graphml(G, output_file, encoding='utf-8') 256 | print(f"已成功生成详细的实体关系图: {output_file}") 257 | print(f"图谱统计信息:") 258 | print(f" - 总节点数: {G.number_of_nodes()}") 259 | print(f" - 总边数: {G.number_of_edges()}") 260 | 261 | # 统计各类型节点数量 262 | node_types = {} 263 | for node, attrs in G.nodes(data=True): 264 | node_type = attrs.get('node_type', 'unknown') 265 | node_types[node_type] = node_types.get(node_type, 0) + 1 266 | 267 | print(f" - 节点类型统计:") 268 | for node_type, count in node_types.items(): 269 | print(f" * {node_type}: {count}个节点") 270 | 271 | print(f"处理完成!") 272 | 273 | 274 | if __name__ == "__main__": 275 | import os 276 | from dotenv import load_dotenv 277 | load_dotenv(override=True) 278 | process_entities() 279 | 280 | -------------------------------------------------------------------------------- /tools/entity_extraction_db.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | import dspy 5 | import sqlite3 6 | from dotenv import load_dotenv 7 | 8 | # 定义用于分类的签名 9 | class ClassifyDistribution(dspy.Signature): 10 | """将生物的自然分布地文本拆分为多个具体的地理位置实体。""" 11 | text = dspy.InputField() 12 | locations = dspy.OutputField(description="从文本中提取的地理位置列表") 13 | 14 | class ClassifyHabits(dspy.Signature): 15 | """从生物的生活习性文本中提取数值特征。""" 16 | text = dspy.InputField() 17 | depth = dspy.OutputField(description="栖息水深(米)") 18 | temperature = dspy.OutputField(description="温度范围(摄氏度)") 19 | egg_count = dspy.OutputField(description="产卵量(粒)") 20 | other_numerical = dspy.OutputField(description="其他数值特征,格式为[{名称, 数值, 单位}]") 21 | 22 | class ClassifyFeatures(dspy.Signature): 23 | """从生物的生物特征文本中提取数值特征。""" 24 | text = dspy.InputField() 25 | body_length = dspy.OutputField(description="体长或全长(厘米或米)") 26 | body_weight = dspy.OutputField(description="体重(克或千克)") 27 | other_numerical = dspy.OutputField(description="其他数值特征,格式为[{名称, 数值, 单位}]") 28 | 29 | 30 | 31 | # 创建提取器 32 | class DistributionExtractor(dspy.Module): 33 | def __init__(self): 34 | self.classifier = dspy.Predict(ClassifyDistribution) 35 | 36 | def forward(self, text): 37 | return self.classifier(text=text) 38 | 39 | class HabitsExtractor(dspy.Module): 40 | def __init__(self): 41 | self.classifier = dspy.Predict(ClassifyHabits) 42 | 43 | def forward(self, text): 44 | return self.classifier(text=text) 45 | 46 | class FeaturesExtractor(dspy.Module): 47 | def __init__(self): 48 | self.classifier = dspy.Predict(ClassifyFeatures) 49 | 50 | def forward(self, text): 51 | return self.classifier(text=text) 52 | 53 | # 设置DSPy的语言模型 54 | def setup_dspy(): 55 | load_dotenv(override=True) 56 | 57 | if os.getenv("ALI_LLM_MODEL"): 58 | ali = dspy.LM( 59 | f'deepseek/{os.getenv("ALI_LLM_MODEL")}', 60 | base_url=os.getenv("ALI_OPENAI_BASE_URL"), 61 | api_key=os.getenv("ALI_OPENAI_API_KEY") 62 | ) 63 | dspy.settings.configure(lm=ali) 64 | else: 65 | # 默认使用OpenAI 66 | dspy.settings.configure(lm="openai") 67 | 68 | # 海洋生物数据处理类 69 | class MarineSpeciesProcessor: 70 | def __init__(self, db_path): 71 | """初始化处理器 72 | 73 | Args: 74 | db_path: SQLite数据库文件路径 75 | """ 76 | self.db_path = db_path 77 | setup_dspy() 78 | self.distribution_extractor = DistributionExtractor() 79 | self.habits_extractor = HabitsExtractor() 80 | self.features_extractor = FeaturesExtractor() 81 | self._setup_database() 82 | 83 | def _setup_database(self): 84 | """创建数据库表""" 85 | with 
sqlite3.connect(self.db_path) as conn: 86 | # 物种基本信息表 87 | conn.execute(''' 88 | CREATE TABLE IF NOT EXISTS species ( 89 | id INTEGER PRIMARY KEY, 90 | latin_name TEXT NOT NULL, 91 | naming_year INTEGER, 92 | author TEXT, 93 | chinese_name TEXT, 94 | kingdom TEXT, 95 | phylum TEXT, 96 | class TEXT, 97 | order_name TEXT, 98 | family TEXT, 99 | genus TEXT, 100 | species_name TEXT, 101 | body_length TEXT 102 | ) 103 | ''') 104 | 105 | # 地理分布表 106 | conn.execute(''' 107 | CREATE TABLE IF NOT EXISTS distributions ( 108 | id INTEGER PRIMARY KEY, 109 | species_id INTEGER, 110 | location TEXT, 111 | FOREIGN KEY (species_id) REFERENCES species(id) 112 | ) 113 | ''') 114 | 115 | # 数值特性表 116 | conn.execute(''' 117 | CREATE TABLE IF NOT EXISTS numerical_traits ( 118 | id INTEGER PRIMARY KEY, 119 | species_id INTEGER, 120 | trait_type TEXT, 121 | trait_name TEXT, 122 | value REAL, 123 | unit TEXT, 124 | FOREIGN KEY (species_id) REFERENCES species(id) 125 | ) 126 | ''') 127 | 128 | # 原始文本描述表 129 | conn.execute(''' 130 | CREATE TABLE IF NOT EXISTS descriptions ( 131 | id INTEGER PRIMARY KEY, 132 | species_id INTEGER, 133 | description_type TEXT, 134 | content TEXT, 135 | FOREIGN KEY (species_id) REFERENCES species(id) 136 | ) 137 | ''') 138 | 139 | def _extract_body_length(self, text): 140 | """从生物特征文本中提取体长信息 141 | 142 | Args: 143 | text: 生物特征文本 144 | 145 | Returns: 146 | 提取的体长信息或None 147 | """ 148 | # 匹配常见的体长表述格式 149 | patterns = [ 150 | r'体长(?:为)?(\d+(?:[..]\d+)?(?:\s*[--~~至]\s*\d+(?:[..]\d+)?)?)\s*(?:厘米|cm|CM)', 151 | r'体长(?:为)?约(\d+(?:[..]\d+)?(?:\s*[--~~至]\s*\d+(?:[..]\d+)?)?)\s*(?:厘米|cm|CM)', 152 | r'全长(?:为)?(\d+(?:[..]\d+)?(?:\s*[--~~至]\s*\d+(?:[..]\d+)?)?)\s*(?:厘米|cm|CM)', 153 | r'全长(?:为)?约(\d+(?:[..]\d+)?(?:\s*[--~~至]\s*\d+(?:[..]\d+)?)?)\s*(?:厘米|cm|CM)', 154 | r'全长可达(\d+(?:[..]\d+)?(?:\s*[--~~至]\s*\d+(?:[..]\d+)?)?)\s*(?:米|m|M)', 155 | r'体长约(\d+(?:[..]\d+)?(?:\s*[--~~至]\s*\d+(?:[..]\d+)?)?)\s*(?:厘米|cm|CM)', 156 | r'全长约(\d+(?:[..]\d+)?(?:\s*[--~~至]\s*\d+(?:[..]\d+)?)?)\s*(?:厘米|cm|CM)' 157 | ] 158 | 159 | for pattern in patterns: 160 | match = re.search(pattern, text) 161 | if match: 162 | return match.group(1) 163 | 164 | return None 165 | 166 | def _extract_locations(self, distribution_text): 167 | """使用DSPy从分布地文本中提取具体地理位置 168 | 169 | Args: 170 | distribution_text: 分布地文本 171 | 172 | Returns: 173 | 地理位置列表 174 | """ 175 | try: 176 | result = self.distribution_extractor(distribution_text) 177 | locations = [] 178 | 179 | # 处理返回的地理位置,可能是字符串或列表 180 | if isinstance(result.locations, str): 181 | # 如果是字符串,按逗号分割 182 | if ',' in result.locations: 183 | locations.extend([loc.strip() for loc in result.locations.split(',')]) 184 | elif ',' in result.locations: 185 | locations.extend([loc.strip() for loc in result.locations.split(',')]) 186 | else: 187 | locations.append(result.locations.strip()) 188 | else: 189 | # 如果是列表,直接使用 190 | locations = result.locations 191 | 192 | # 过滤无效位置 193 | filtered_locations = [] 194 | for loc in locations: 195 | if loc and loc.strip() and loc != "无信息" and loc != "不明确": 196 | filtered_locations.append(loc.strip()) 197 | 198 | return filtered_locations 199 | except Exception as e: 200 | print(f"提取地理位置时出错: {e}") 201 | return [] 202 | 203 | def _extract_numerical_traits_from_habits(self, text): 204 | """从生活习性文本中提取数值特征 205 | 206 | Args: 207 | text: 生活习性文本 208 | 209 | Returns: 210 | 数值特征列表,每项包含trait_name, value, unit 211 | """ 212 | try: 213 | result = self.habits_extractor(text) 214 | traits = [] 215 | 216 | # 处理栖息水深 217 | if result.depth and result.depth not in ["无", "未知", 
"不明确"]: 218 | # 提取数字和单位 219 | match = re.search(r'(\d+(?:\.\d+)?(?:\s*[-~]\s*\d+(?:\.\d+)?)?)\s*(米|m)', result.depth) 220 | if match: 221 | value_str = match.group(1) 222 | unit = match.group(2) 223 | 224 | # 如果是范围,取平均值 225 | if '-' in value_str or '~' in value_str: 226 | parts = re.split(r'[-~]', value_str) 227 | try: 228 | value = (float(parts[0].strip()) + float(parts[1].strip())) / 2 229 | except: 230 | value = float(parts[0].strip()) 231 | else: 232 | value = float(value_str) 233 | 234 | traits.append({ 235 | "name": "栖息水深", 236 | "value": value, 237 | "unit": unit 238 | }) 239 | 240 | # 处理温度范围 241 | if result.temperature and result.temperature not in ["无", "未知", "不明确"]: 242 | match = re.search(r'(\d+(?:\.\d+)?(?:\s*[-~]\s*\d+(?:\.\d+)?)?)\s*(°C|℃)', result.temperature) 243 | if match: 244 | value_str = match.group(1) 245 | unit = match.group(2) 246 | 247 | if '-' in value_str or '~' in value_str: 248 | parts = re.split(r'[-~]', value_str) 249 | try: 250 | value = (float(parts[0].strip()) + float(parts[1].strip())) / 2 251 | except: 252 | value = float(parts[0].strip()) 253 | else: 254 | value = float(value_str) 255 | 256 | traits.append({ 257 | "name": "适宜温度", 258 | "value": value, 259 | "unit": unit 260 | }) 261 | 262 | # 处理产卵量 263 | if result.egg_count and result.egg_count not in ["无", "未知", "不明确"]: 264 | match = re.search(r'(\d+(?:\.\d+)?(?:\s*[-~]\s*\d+(?:\.\d+)?)?万?\s*)(粒|个)', result.egg_count) 265 | if match: 266 | value_str = match.group(1) 267 | unit = match.group(2) 268 | 269 | # 处理"万"单位 270 | multiplier = 10000 if "万" in value_str else 1 271 | value_str = value_str.replace("万", "").strip() 272 | 273 | if '-' in value_str or '~' in value_str: 274 | parts = re.split(r'[-~]', value_str) 275 | try: 276 | value = (float(parts[0].strip()) + float(parts[1].strip())) / 2 * multiplier 277 | except: 278 | value = float(parts[0].strip()) * multiplier 279 | else: 280 | value = float(value_str) * multiplier 281 | 282 | traits.append({ 283 | "name": "产卵量", 284 | "value": value, 285 | "unit": unit 286 | }) 287 | 288 | # 处理其他数值特征 289 | if result.other_numerical and isinstance(result.other_numerical, list): 290 | for trait in result.other_numerical: 291 | if isinstance(trait, dict) and 'name' in trait and 'value' in trait and 'unit' in trait: 292 | traits.append(trait) 293 | 294 | return traits 295 | except Exception as e: 296 | print(f"从生活习性提取数值特征时出错: {e}") 297 | return [] 298 | 299 | def _extract_numerical_traits_from_features(self, text): 300 | """从生物特征文本中提取数值特征 301 | 302 | Args: 303 | text: 生物特征文本 304 | 305 | Returns: 306 | 数值特征列表,每项包含trait_name, value, unit 307 | """ 308 | try: 309 | result = self.features_extractor(text) 310 | traits = [] 311 | 312 | # 处理体长 313 | if result.body_length and result.body_length not in ["无", "未知", "不明确"]: 314 | match = re.search(r'(\d+(?:\.\d+)?(?:\s*[-~]\s*\d+(?:\.\d+)?)?)\s*(厘米|cm|CM|米|m)', result.body_length) 315 | if match: 316 | value_str = match.group(1) 317 | unit = match.group(2) 318 | 319 | # 单位标准化 320 | if unit.lower() in ['cm', 'cm', '厘米']: 321 | unit = '厘米' 322 | elif unit.lower() in ['m', 'm', '米']: 323 | unit = '米' 324 | 325 | # 如果是范围,取平均值 326 | if '-' in value_str or '~' in value_str: 327 | parts = re.split(r'[-~]', value_str) 328 | try: 329 | value = (float(parts[0].strip()) + float(parts[1].strip())) / 2 330 | except: 331 | value = float(parts[0].strip()) 332 | else: 333 | value = float(value_str) 334 | 335 | traits.append({ 336 | "name": "体长", 337 | "value": value, 338 | "unit": unit 339 | }) 340 | 341 | # 处理体重 342 | if result.body_weight 
and result.body_weight not in ["无", "未知", "不明确"]: 343 | match = re.search(r'(\d+(?:\.\d+)?(?:\s*[-~]\s*\d+(?:\.\d+)?)?)\s*(克|g|千克|kg)', result.body_weight) 344 | if match: 345 | value_str = match.group(1) 346 | unit = match.group(2) 347 | 348 | # 单位标准化 349 | if unit.lower() in ['g', 'g', '克']: 350 | unit = '克' 351 | elif unit.lower() in ['kg', 'kg', '千克']: 352 | unit = '千克' 353 | 354 | if '-' in value_str or '~' in value_str: 355 | parts = re.split(r'[-~]', value_str) 356 | try: 357 | value = (float(parts[0].strip()) + float(parts[1].strip())) / 2 358 | except: 359 | value = float(parts[0].strip()) 360 | else: 361 | value = float(value_str) 362 | 363 | traits.append({ 364 | "name": "体重", 365 | "value": value, 366 | "unit": unit 367 | }) 368 | 369 | # 处理其他数值特征 370 | if result.other_numerical and isinstance(result.other_numerical, list): 371 | for trait in result.other_numerical: 372 | if isinstance(trait, dict) and 'name' in trait and 'value' in trait and 'unit' in trait: 373 | traits.append(trait) 374 | 375 | return traits 376 | except Exception as e: 377 | print(f"从生物特征提取数值特征时出错: {e}") 378 | return [] 379 | 380 | def process_json_file(self, json_file_path): 381 | """处理JSON文件,提取数据并存入SQLite数据库 382 | 383 | Args: 384 | json_file_path: JSON文件路径 385 | """ 386 | with open(json_file_path, 'r', encoding='utf-8') as f: 387 | species_data = json.load(f) 388 | 389 | print(f"开始处理生物实体数据...") 390 | print(f"共加载 {len(species_data)} 个生物实体数据") 391 | 392 | with sqlite3.connect(self.db_path) as conn: 393 | cursor = conn.cursor() 394 | 395 | for entity_index, entity in enumerate(species_data): 396 | # 使用中文学名作为标识 397 | entity_id = entity['中文学名'] 398 | print(f"\n[{entity_index+1}/{len(species_data)}] 正在处理生物: {entity_id}(拉丁学名: {entity['拉丁学名']})") 399 | 400 | # 从生物特征中提取体长 401 | body_length = None 402 | if '生物特征' in entity: 403 | body_length = self._extract_body_length(entity['生物特征']) 404 | 405 | # 安全获取命名信息,处理可能缺失的字段 406 | naming_year = entity.get('命名年份', None) 407 | # 如果命令年份不是字符串则转换为字符串 408 | if naming_year and not isinstance(naming_year, int): 409 | try: 410 | naming_year = int(naming_year) 411 | except: 412 | naming_year = None 413 | 414 | # 插入物种基本信息 415 | cursor.execute(''' 416 | INSERT INTO species ( 417 | latin_name, naming_year, author, chinese_name, 418 | kingdom, phylum, class, order_name, 419 | family, genus, species_name, body_length 420 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 421 | ''', ( 422 | entity.get('拉丁学名', ''), 423 | naming_year, 424 | entity.get('作者', ''), 425 | entity.get('中文学名', ''), 426 | entity.get('界', ''), 427 | entity.get('门', ''), 428 | entity.get('纲', ''), 429 | entity.get('目', ''), 430 | entity.get('科', ''), 431 | entity.get('属', ''), 432 | entity.get('种', ''), 433 | body_length 434 | )) 435 | 436 | species_id = cursor.lastrowid 437 | print(f" 已添加物种基本信息,ID: {species_id}") 438 | 439 | # 处理原始描述文本 440 | for desc_type in ['生活习性', '生物特征']: 441 | if desc_type in entity: 442 | cursor.execute(''' 443 | INSERT INTO descriptions (species_id, description_type, content) 444 | VALUES (?, ?, ?) 445 | ''', (species_id, desc_type, entity[desc_type])) 446 | 447 | # 处理地理分布 448 | if '自然分布地' in entity: 449 | print(f" 处理 {entity_id} 的自然分布地信息...") 450 | print(f" 原始自然分布地文本: {entity['自然分布地']}") 451 | locations = self._extract_locations(entity['自然分布地']) 452 | 453 | for location in locations: 454 | cursor.execute(''' 455 | INSERT INTO distributions (species_id, location) 456 | VALUES (?, ?) 
457 | ''', (species_id, location)) 458 | print(f" - 添加地理位置: {location}") 459 | 460 | # 处理生活习性中的数值特征 461 | if '生活习性' in entity: 462 | print(f" 处理 {entity_id} 的生活习性信息...") 463 | print(f" 原始生活习性文本: {entity['生活习性']}") 464 | traits = self._extract_numerical_traits_from_habits(entity['生活习性']) 465 | 466 | for trait in traits: 467 | cursor.execute(''' 468 | INSERT INTO numerical_traits ( 469 | species_id, trait_type, trait_name, value, unit 470 | ) VALUES (?, ?, ?, ?, ?) 471 | ''', ( 472 | species_id, 473 | '生活习性', 474 | trait['name'], 475 | trait['value'], 476 | trait['unit'] 477 | )) 478 | print(f" - 添加数值特征: {trait['name']} = {trait['value']} {trait['unit']}") 479 | 480 | # 处理生物特征中的数值特征 481 | if '生物特征' in entity: 482 | print(f" 处理 {entity_id} 的生物特征信息...") 483 | print(f" 原始生物特征文本: {entity['生物特征']}") 484 | traits = self._extract_numerical_traits_from_features(entity['生物特征']) 485 | 486 | for trait in traits: 487 | cursor.execute(''' 488 | INSERT INTO numerical_traits ( 489 | species_id, trait_type, trait_name, value, unit 490 | ) VALUES (?, ?, ?, ?, ?) 491 | ''', ( 492 | species_id, 493 | '生物特征', 494 | trait['name'], 495 | trait['value'], 496 | trait['unit'] 497 | )) 498 | print(f" - 添加数值特征: {trait['name']} = {trait['value']} {trait['unit']}") 499 | 500 | conn.commit() 501 | print(f"\n数据已成功导入到数据库: {self.db_path}") 502 | 503 | # 输出统计信息 504 | cursor.execute("SELECT COUNT(*) FROM species") 505 | species_count = cursor.fetchone()[0] 506 | 507 | cursor.execute("SELECT COUNT(*) FROM distributions") 508 | distributions_count = cursor.fetchone()[0] 509 | 510 | cursor.execute("SELECT COUNT(*) FROM numerical_traits") 511 | traits_count = cursor.fetchone()[0] 512 | 513 | print(f"数据库统计信息:") 514 | print(f" - 总物种数: {species_count}") 515 | print(f" - 总地理分布记录: {distributions_count}") 516 | print(f" - 总数值特征记录: {traits_count}") 517 | print(f"处理完成!") 518 | 519 | 520 | # 使用示例 521 | if __name__ == "__main__": 522 | processor = MarineSpeciesProcessor("./dbs/marine_species.db") 523 | # 加工单个JSON文件 524 | # processor.process_json_file("docs/demo_18.json") 525 | --------------------------------------------------------------------------------