├── src
    └── pdf_reader
    │   ├── __main__.py
    │   ├── __init__.py
    │   └── server.py
├── .gitignore
├── requirements.txt
├── pyproject.toml
├── LICENSE
└── README.md


/src/pdf_reader/__main__.py:
--------------------------------------------------------------------------------
"""
Launch script for the PDF Reader server.
"""
import asyncio
from .server import main

# When this module is executed as a script, run `main` (imported from
# .server) to completion with asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())
9 | 


--------------------------------------------------------------------------------
/src/pdf_reader/__init__.py:
--------------------------------------------------------------------------------
1 | from . import server
2 | import asyncio
3 | 
def main():
    """Main entry point for the package.

    Passes the result of ``server.main()`` to ``asyncio.run`` and blocks
    until it finishes.
    """
    asyncio.run(server.main())
7 | 
8 | __all__ = ['main', 'server']
9 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual Environment
24 | .env
25 | .venv
26 | env/
27 | venv/
28 | ENV/
29 | 
30 | # IDE
31 | .idea/
32 | .vscode/
33 | *.swp
34 | *.swo
35 | 
36 | # OS
37 | .DS_Store
38 | Thumbs.db
39 | 
40 | # Project specific
41 | *.pdf
42 | model_cache/
43 | *.log
44 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | mcp>=0.1.0
 2 | PyMuPDF>=1.23.8
 3 | Pillow>=10.1.0  # 使用标准 Pillow 包
 4 | nltk>=3.8.1
 5 | spacy>=3.7.2
 6 | pdfminer.six>=20221105
 7 | pandas>=2.1.4
 8 | tabula-py>=2.9.0
 9 | scikit-learn>=1.3.2
10 | langdetect>=1.0.9
11 | transformers>=4.36.1
12 | torch>=2.1.2  # 如果有NVIDIA GPU，建议安装CUDA版本
13 | sentence-transformers>=2.2.2
14 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
15 | # 新增性能优化相关的包
16 | ujson>=5.9.0  # 更快的JSON处理
17 | pyarrow>=14.0.2  # 更快的数据处理
18 | psutil>=5.9.7  # 系统资源监控
19 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "pdf_reader"
 7 | version = "0.1.0"
 8 | description = "MCP server for reading PDF files"
 9 | requires-python = ">=3.10"
10 | dependencies = [
11 |     "mcp",
12 |     "PyMuPDF",      # PDF处理
13 |     "Pillow",       # 图像处理
14 |     "nltk",         # 自然语言处理
15 |     "spacy",        # NLP分析
16 |     "pdfminer.six", # PDF文本提取增强
17 |     "pandas",       # 表格数据处理
18 |     "tabula-py",    # PDF表格提取
19 |     "scikit-learn", # 机器学习支持
20 |     "langdetect",   # 语言检测
21 |     "transformers", # 文本分析和分类
22 |     "torch",        # PyTorch支持
23 |     "sentence-transformers", # 文本相似度
24 | ]
25 | 
26 | [project.scripts]
27 | pdf_reader = "pdf_reader:main"
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 [Saury1120]
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PDF-MCP ![GitHub stars](https://img.shields.io/github/stars/saury1120/pdf-mcp.svg?style=social&label=Star)
  2 | 
  3 | [中文](#pdf-mcp-服务) | [English](#pdf-mcp-service)
  4 | 
  5 | ## 📄 PDF-MCP 服务
  6 | 
  7 | 高性能 PDF 文档处理服务，支持文本、图片、表格提取及高级分析。
  8 | 
  9 | ## ✨ 主要特性
 10 | 
 11 | - **📜 文本提取**：多语言支持，保留格式。
 12 | - **🖼️ 图片处理**：提取与优化。
 13 | - **📊 表格识别**：结构化数据输出。
 14 | - **🧠 智能分类**：基于深度学习。
 15 | - **🔍 相似度分析**：跨语言比较。
 16 | - **🌐 多语言支持**：100+ 种语言。
 17 | 
 18 | ## 💻 系统要求
 19 | 
 20 | - **🖥️ 硬件**：2 核 CPU，4GB 内存。
 21 | - **⚙️ 软件**：Python 3.10+，可选 CUDA 支持。
 22 | 
 23 | ## 🚀 快速开始
 24 | 
 25 | 1. 🗂️ 克隆仓库并进入目录：
 26 |    ```bash
 27 |    git clone https://github.com/saury1120/pdf-mcp.git
 28 |    cd pdf-mcp
 29 |    ```
 30 | 2. 🛠️ 创建虚拟环境并安装依赖：
 31 |    ```bash
 32 |    uv venv
 33 |    source .venv/bin/activate
 34 |    uv pip install -r requirements.txt
 35 |    ```
 36 | 3. ▶️ 启动服务：
 37 |    ```bash
  38 |    uv run pdf_reader
  39 |    ```
 40 |    
 41 | ### Claude Desktop 配置
 42 | 1. 找到配置文件：
 43 |    - macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`
 44 |    - Windows: `%AppData%/Claude/claude_desktop_config.json`
  45 | 2. 添加以下配置（将 `/path/to/pdf-mcp` 替换为实际路径）：
  46 | ```json
  47 | {
  48 |     "mcpServers": {
  49 |         "pdf_reader": {
  50 |             "command": "uv",
  51 |             "args": [
  52 |                 "--directory",
  53 |                 "/path/to/pdf-mcp",
 54 |                 "run",
 55 |                 "pdf_reader"
 56 |             ]
 57 |         }
 58 |     }
 59 | }
 60 |  ```
 61 | 
 62 | 
 63 | # PDF-MCP Service
 64 | 
 65 | A high-performance PDF document processing service supporting text, image, table extraction, and advanced analysis.
 66 | 
 67 | ## ✨ Key Features
 68 | 
 69 | - **📜 Text Extraction**: Multilingual support, retains formatting.
 70 | - **🖼️ Image Processing**: Extraction and optimization.
 71 | - **📊 Table Recognition**: Structured data output.
 72 | - **🧠 Intelligent Classification**: Based on deep learning.
 73 | - **🔍 Similarity Analysis**: Cross-language comparison.
 74 | - **🌐 Multilingual Support**: 100+ languages.
 75 | 
 76 | ## 💻 System Requirements
 77 | 
 78 | - **🖥️ Hardware**: 2-core CPU, 4GB RAM.
 79 | - **⚙️ Software**: Python 3.10+, optional CUDA support.
 80 | 
 81 | ## 🚀 Quick Start
 82 | 
 83 | 1. 🗂️ Clone the repository and enter the directory:
 84 |    ```bash
 85 |    git clone https://github.com/saury1120/pdf-mcp.git
 86 |    cd pdf-mcp
 87 |    ```
 88 | 2. 🛠️ Create a virtual environment and install dependencies:
 89 |    ```bash
 90 |    uv venv
 91 |    source .venv/bin/activate
 92 |    uv pip install -r requirements.txt
 93 |    ```
 94 | 3. ▶️ Start the service:
 95 |    ```bash
  96 |    uv run pdf_reader
  97 |    ```
 98 | 
  99 | ## Claude Desktop Configuration (replace `/path/to/pdf-mcp` with the actual path)
 100 | ```json
 101 | {
 102 |     "mcpServers": {
 103 |         "pdf_reader": {
 104 |             "command": "uv",
 105 |             "args": [
 106 |                 "--directory",
 107 |                 "/path/to/pdf-mcp",
108 |                 "run",
109 |                 "pdf_reader"
110 |             ]
111 |         }
112 |     }
113 | }
114 | ```
115 | 


--------------------------------------------------------------------------------
/src/pdf_reader/server.py:
--------------------------------------------------------------------------------
   1 | from typing import Any, List, Dict
   2 | import asyncio
   3 | import base64
   4 | import fitz  # PyMuPDF
   5 | from PIL import Image
   6 | import io
   7 | import nltk
   8 | import spacy
   9 | import pandas as pd
  10 | from tabula import read_pdf
  11 | from pdfminer.high_level import extract_text as pdfminer_extract_text
  12 | from mcp.server.models import InitializationOptions
  13 | import mcp.types as types
  14 | from mcp.server import NotificationOptions, Server
  15 | import mcp.server.stdio
  16 | from sklearn.feature_extraction.text import TfidfVectorizer
  17 | from sklearn.naive_bayes import MultinomialNB
  18 | from langdetect import detect
  19 | from transformers import pipeline
  20 | from sentence_transformers import SentenceTransformer
  21 | import torch
  22 | import numpy as np
  23 | from collections import defaultdict
  24 | import threading
  25 | from functools import lru_cache
  26 | import concurrent.futures
  27 | import time
  28 | import psutil
  29 | 
  30 | class ModelManager:
  31 |     _instance = None
  32 |     _models = {}
  33 |     _lock = threading.Lock()
  34 |     _last_used = {}
  35 |     _memory_threshold = 0.8  # 内存使用率阈值
  36 |     _max_idle_time = 300  # 模型最大空闲时间（秒）
  37 | 
  38 |     def __new__(cls):
  39 |         if cls._instance is None:
  40 |             with cls._lock:
  41 |                 if cls._instance is None:
  42 |                     cls._instance = super().__new__(cls)
  43 |         return cls._instance
  44 | 
  45 |     def __init__(self):
  46 |         if hasattr(self, '_initialized'):
  47 |             return
  48 |         self._initialized = True
  49 |         self._model_configs = {
  50 |             'spacy': {
  51 |                 'name': 'en_core_web_sm',
  52 |                 'loader': self._load_spacy,
  53 |                 'quantize': False
  54 |             },
  55 |             'classifier': {
  56 |                 'name': 'facebook/bart-large-mnli',
  57 |                 'loader': self._load_classifier,
  58 |                 'quantize': {
  59 |                     'enabled': True,
  60 |                     'method': 'dynamic',
  61 |                     'dtype': torch.qint8,
  62 |                     'layers': [torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d],
  63 |                     'calibration_size': 100
  64 |                 }
  65 |             },
  66 |             'sentence_transformer': {
  67 |                 'name': 'paraphrase-MiniLM-L6-v2',
  68 |                 'loader': self._load_sentence_transformer,
  69 |                 'quantize': {
  70 |                     'enabled': True,
  71 |                     'method': 'dynamic',
  72 |                     'dtype': torch.qint8,
  73 |                     'layers': [torch.nn.Linear],
  74 |                     'calibration_size': 100
  75 |                 }
  76 |             }
  77 |         }
  78 |         self.device = self._get_optimal_device()
  79 |         self._setup_quantization_backend()
  80 |         self._cleanup_thread = threading.Thread(target=self._cleanup_models, daemon=True)
  81 |         self._cleanup_thread.start()
  82 | 
  83 |     def _get_optimal_device(self):
  84 |         """根据系统配置选择最优设备"""
  85 |         if not torch.cuda.is_available():
  86 |             return 'cpu'
  87 |         
  88 |         # 检查GPU内存
  89 |         gpu_memory = torch.cuda.get_device_properties(0).total_memory
  90 |         if gpu_memory < 4 * 1024 * 1024 * 1024:  # 小于4GB
  91 |             return 'cpu'
  92 |         
  93 |         return 'cuda'
  94 | 
  95 |     def _get_optimal_quantization_config(self, model_type):
  96 |         """根据设备和模型类型选择最优量化配置"""
  97 |         base_config = self._model_configs[model_type]['quantize']
  98 |         if not base_config:
  99 |             return base_config
 100 | 
 101 |         if self.device == 'cpu':
 102 |             return {
 103 |                 'enabled': True,
 104 |                 'method': 'dynamic',
 105 |                 'dtype': torch.qint8,
 106 |                 'layers': base_config['layers'],
 107 |                 'calibration_size': 50  # CPU下使用更小的校准集
 108 |             }
 109 |         else:
 110 |             return {
 111 |                 'enabled': True,
 112 |                 'method': 'static',
 113 |                 'dtype': torch.float16,  # GPU下使用半精度
 114 |                 'layers': base_config['layers'],
 115 |                 'calibration_size': base_config['calibration_size']
 116 |             }
 117 | 
 118 |     def get_model(self, model_type: str):
 119 |         """延迟加载模型"""
 120 |         with self._lock:
 121 |             if model_type not in self._models:
 122 |                 config = self._model_configs.get(model_type)
 123 |                 if not config:
 124 |                     raise ModelLoadError(f"Unknown model type: {model_type}")
 125 |                 
 126 |                 # 检查内存使用情况
 127 |                 self._check_memory_usage()
 128 |                 
 129 |                 # 加载模型
 130 |                 try:
 131 |                     model = config['loader'](config['name'])
 132 |                     quant_config = self._get_optimal_quantization_config(model_type)
 133 |                     if quant_config and quant_config['enabled']:
 134 |                         model = self._prepare_model_for_quantization(model, quant_config)
 135 |                     self._models[model_type] = model
 136 |                 except Exception as e:
 137 |                     raise ModelLoadError(f"Failed to load model {model_type}: {str(e)}")
 138 |             
 139 |             # 更新最后使用时间
 140 |             self._last_used[model_type] = time.time()
 141 |             return self._models[model_type]
 142 | 
 143 |     def _check_memory_usage(self):
 144 |         """检查内存使用情况并在必要时卸载模型"""
 145 |         memory_percent = psutil.virtual_memory().percent / 100
 146 | 
 147 |         if memory_percent > self._memory_threshold:
 148 |             self._unload_least_used_model()
 149 | 
 150 |     def _unload_least_used_model(self):
 151 |         """卸载最少使用的模型"""
 152 |         if not self._last_used:
 153 |             return
 154 | 
 155 |         current_time = time.time()
 156 |         least_used_model = min(self._last_used.items(), key=lambda x: x[1])[0]
 157 |         if current_time - self._last_used[least_used_model] > self._max_idle_time:
 158 |             self._unload_model(least_used_model)
 159 | 
 160 |     def _unload_model(self, model_type: str):
 161 |         """卸载指定模型"""
 162 |         if model_type in self._models:
 163 |             del self._models[model_type]
 164 |             del self._last_used[model_type]
 165 |             # 强制进行垃圾回收
 166 |             import gc
 167 |             gc.collect()
 168 |             if self.device == 'cuda':
 169 |                 torch.cuda.empty_cache()
 170 | 
 171 |     def _cleanup_models(self):
 172 |         """定期清理未使用的模型"""
 173 |         while True:
 174 |             time.sleep(60)  # 每分钟检查一次
 175 |             with self._lock:
 176 |                 current_time = time.time()
 177 |                 models_to_unload = [
 178 |                     model_type for model_type, last_used in self._last_used.items()
 179 |                     if current_time - last_used > self._max_idle_time
 180 |                 ]
 181 |                 for model_type in models_to_unload:
 182 |                     self._unload_model(model_type)
 183 | 
 184 |     def _setup_quantization_backend(self):
 185 |         """设置量化后端"""
 186 |         if self.device == 'cuda':
 187 |             # 在GPU上使用CUDA量化后端
 188 |             torch.backends.quantized.engine = 'fbgemm'
 189 |         else:
 190 |             # 在CPU上使用fbgemm (Windows compatible)
 191 |             torch.backends.quantized.engine = 'fbgemm'
 192 | 
 193 |     def _prepare_model_for_quantization(self, model, config):
 194 |         """准备模型进行量化"""
 195 |         if not config['enabled']:
 196 |             return model
 197 | 
 198 |         if config['method'] == 'dynamic':
 199 |             return self._apply_dynamic_quantization(model, config)
 200 |         elif config['method'] == 'static':
 201 |             return self._apply_static_quantization(model, config)
 202 |         return model
 203 | 
 204 |     def _apply_dynamic_quantization(self, model, config):
 205 |         """应用动态量化"""
 206 |         try:
 207 |             print(f"Applying dynamic quantization with dtype {config['dtype']}")
 208 |             model = torch.quantization.quantize_dynamic(
 209 |                 model,
 210 |                 qconfig_spec={
 211 |                     layer: torch.quantization.default_dynamic_qconfig
 212 |                     for layer in config['layers']
 213 |                 },
 214 |                 dtype=config['dtype']
 215 |             )
 216 |             print("Dynamic quantization applied successfully")
 217 |             return model
 218 |         except Exception as e:
 219 |             print(f"Dynamic quantization failed: {str(e)}")
 220 |             return model
 221 | 
 222 |     def _apply_static_quantization(self, model, config):
 223 |         """应用静态量化"""
 224 |         try:
 225 |             print(f"Applying static quantization")
 226 |             # 准备量化配置
 227 |             model.qconfig = torch.quantization.get_default_qconfig('fbgemm' if self.device == 'cuda' else 'fbgemm')
 228 |             
 229 |             # 融合操作
 230 |             model = torch.quantization.fuse_modules(model, [['conv', 'bn', 'relu']])
 231 |             
 232 |             # 准备量化
 233 |             model = torch.quantization.prepare(model)
 234 |             
 235 |             # 校准（这里需要实际的校准数据）
 236 |             # self._calibrate_model(model, config['calibration_size'])
 237 |             
 238 |             # 转换为量化模型
 239 |             model = torch.quantization.convert(model)
 240 |             
 241 |             print("Static quantization applied successfully")
 242 |             return model
 243 |         except Exception as e:
 244 |             print(f"Static quantization failed: {str(e)}")
 245 |             return model
 246 | 
 247 |     def _calibrate_model(self, model, calibration_size):
 248 |         """使用校准数据集校准模型（用于静态量化）"""
 249 |         # 这里应该使用实际的校准数据
 250 |         # 为了示例，我们使用随机数据
 251 |         with torch.no_grad():
 252 |             for _ in range(calibration_size):
 253 |                 dummy_input = torch.randn(1, 3, 224, 224)
 254 |                 model(dummy_input)
 255 | 
 256 |     def _load_spacy(self):
 257 |         try:
 258 |             return spacy.load('en_core_web_sm')
 259 |         except OSError:
 260 |             spacy.cli.download('en_core_web_sm')
 261 |             return spacy.load('en_core_web_sm')
 262 | 
 263 |     def _load_classifier(self):
 264 |         print("Loading classifier model...")
 265 |         model = pipeline("zero-shot-classification",
 266 |                         model='facebook/bart-large-mnli',
 267 |                         device=self.device)
 268 |         
 269 |         if self._model_configs['classifier']['quantize']['enabled']:
 270 |             print("Applying quantization to classifier model")
 271 |             model.model = self._prepare_model_for_quantization(
 272 |                 model.model, 
 273 |                 self._model_configs['classifier']['quantize']
 274 |             )
 275 |         return model
 276 | 
 277 |     def _load_sentence_transformer(self):
 278 |         print("Loading sentence transformer model...")
 279 |         model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
 280 |         model = model.to(self.device)
 281 |         
 282 |         if self._model_configs['sentence_transformer']['quantize']['enabled']:
 283 |             print("Applying quantization to sentence transformer model")
 284 |             model.encoder = self._prepare_model_for_quantization(
 285 |                 model.encoder,
 286 |                 self._model_configs['sentence_transformer']['quantize']
 287 |             )
 288 |         return model
 289 | 
 290 |     def clear_cache(self, model_type: str = None):
 291 |         with self._lock:
 292 |             if model_type:
 293 |                 if model_type in self._models:
 294 |                     del self._models[model_type]
 295 |             else:
 296 |                 self._models.clear()
 297 | 
 298 |     def get_model_memory_usage(self, model_type: str = None):
 299 |         """获取模型内存使用情况"""
 300 |         if model_type:
 301 |             if model_type in self._models:
 302 |                 model = self._models[model_type]
 303 |                 return self._get_model_size(model)
 304 |             return None
 305 |         
 306 |         memory_usage = {}
 307 |         for model_type, model in self._models.items():
 308 |             memory_usage[model_type] = self._get_model_size(model)
 309 |         return memory_usage
 310 | 
 311 |     def _get_model_size(self, model):
 312 |         """计算模型大小（以MB为单位）"""
 313 |         param_size = 0
 314 |         buffer_size = 0
 315 |         
 316 |         for param in model.parameters():
 317 |             param_size += param.nelement() * param.element_size()
 318 |         
 319 |         for buffer in model.buffers():
 320 |             buffer_size += buffer.nelement() * buffer.element_size()
 321 |             
 322 |         size_mb = (param_size + buffer_size) / 1024 / 1024
 323 |         return round(size_mb, 2)
 324 | 
 325 | class ModelContext:
 326 |     """模型使用的上下文管理器"""
 327 |     def __init__(self, model_type: str, manager: 'ModelManager'):
 328 |         self.model_type = model_type
 329 |         self.manager = manager
 330 |         self.model = None
 331 |         self.error = None
 332 | 
 333 |     def __enter__(self):
 334 |         try:
 335 |             self.model = self.manager.get_model(self.model_type)
 336 |             return self.model
 337 |         except Exception as e:
 338 |             self.error = e
 339 |             raise ModelError(f"Error loading model {self.model_type}: {str(e)}")
 340 | 
 341 |     def __exit__(self, exc_type, exc_val, exc_tb):
 342 |         if exc_type is not None:
 343 |             # 记录错误但不处理
 344 |             print(f"Error using model {self.model_type}: {str(exc_val)}")
 345 |         return False  # 让异常继续传播
 346 | 
 347 | class ModelError(Exception):
 348 |     """模型相关错误的基类"""
 349 |     pass
 350 | 
 351 | class ModelLoadError(ModelError):
 352 |     """模型加载错误"""
 353 |     pass
 354 | 
 355 | class ModelInferenceError(ModelError):
 356 |     """模型推理错误"""
 357 |     pass
 358 | 
# Create the module-level ModelManager instance
model_manager = ModelManager()

# Server initialization
server = Server("pdf_reader")
 364 | 
 365 | # 下载必要的NLTK数据
 366 | nltk_resources = ['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words', 'stopwords']
 367 | for resource in nltk_resources:
 368 |     try:
 369 |         nltk.data.find(f'tokenizers/{resource}')
 370 |     except LookupError:
 371 |         nltk.download(resource, quiet=True)
 372 | 
 373 | @server.list_tools()
 374 | async def handle_list_tools() -> list[types.Tool]:
 375 |     """列出可用的工具"""
 376 |     return [
 377 |         types.Tool(
 378 |             name="extract-text",
 379 |             description="从PDF文件中提取文本内容",
 380 |             inputSchema={
 381 |                 "type": "object",
 382 |                 "properties": {
 383 |                     "file_path": {
 384 |                         "type": "string",
 385 |                         "description": "PDF文件的路径",
 386 |                     },
 387 |                     "page_number": {
 388 |                         "type": "integer",
 389 |                         "description": "要提取的页码（从0开始）",
 390 |                     },
 391 |                 },
 392 |                 "required": ["file_path"],
 393 |             },
 394 |         ),
 395 |         types.Tool(
 396 |             name="extract-images",
 397 |             description="从PDF文件中提取图片",
 398 |             inputSchema={
 399 |                 "type": "object",
 400 |                 "properties": {
 401 |                     "file_path": {
 402 |                         "type": "string",
 403 |                         "description": "PDF文件的路径",
 404 |                     },
 405 |                     "page_number": {
 406 |                         "type": "integer",
 407 |                         "description": "要提取的页码（从0开始）",
 408 |                     },
 409 |                 },
 410 |                 "required": ["file_path"],
 411 |             },
 412 |         ),
 413 |         types.Tool(
 414 |             name="extract-tables",
 415 |             description="从PDF文件中提取表格",
 416 |             inputSchema={
 417 |                 "type": "object",
 418 |                 "properties": {
 419 |                     "file_path": {
 420 |                         "type": "string",
 421 |                         "description": "PDF文件的路径",
 422 |                     },
 423 |                     "page_number": {
 424 |                         "type": "integer",
 425 |                         "description": "要提取的页码（从0开始）",
 426 |                     },
 427 |                 },
 428 |                 "required": ["file_path"],
 429 |             },
 430 |         ),
 431 |         types.Tool(
 432 |             name="analyze-content",
 433 |             description="分析PDF文件内容，提取关键信息",
 434 |             inputSchema={
 435 |                 "type": "object",
 436 |                 "properties": {
 437 |                     "file_path": {
 438 |                         "type": "string",
 439 |                         "description": "PDF文件的路径",
 440 |                     },
 441 |                     "analysis_type": {
 442 |                         "type": "string",
 443 |                         "description": "分析类型：entities（实体）, summary（摘要）, keywords（关键词）",
 444 |                         "enum": ["entities", "summary", "keywords"],
 445 |                     },
 446 |                 },
 447 |                 "required": ["file_path", "analysis_type"],
 448 |             },
 449 |         ),
 450 |         types.Tool(
 451 |             name="get-metadata",
 452 |             description="获取PDF文件的元数据信息",
 453 |             inputSchema={
 454 |                 "type": "object",
 455 |                 "properties": {
 456 |                     "file_path": {
 457 |                         "type": "string",
 458 |                         "description": "PDF文件的路径",
 459 |                     },
 460 |                 },
 461 |                 "required": ["file_path"],
 462 |             },
 463 |         ),
 464 |         types.Tool(
 465 |             name="classify-document",
 466 |             description="对PDF文档进行分类",
 467 |             inputSchema={
 468 |                 "type": "object",
 469 |                 "properties": {
 470 |                     "file_path": {
 471 |                         "type": "string",
 472 |                         "description": "PDF文件的路径",
 473 |                     },
 474 |                     "categories": {
 475 |                         "type": "array",
 476 |                         "items": {"type": "string"},
 477 |                         "description": "可能的分类类别列表",
 478 |                     },
 479 |                 },
 480 |                 "required": ["file_path", "categories"],
 481 |             },
 482 |         ),
 483 |         types.Tool(
 484 |             name="calculate-similarity",
 485 |             description="计算两个PDF文档的相似度",
 486 |             inputSchema={
 487 |                 "type": "object",
 488 |                 "properties": {
 489 |                     "file_path1": {
 490 |                         "type": "string",
 491 |                         "description": "第一个PDF文件的路径",
 492 |                     },
 493 |                     "file_path2": {
 494 |                         "type": "string",
 495 |                         "description": "第二个PDF文件的路径",
 496 |                     },
 497 |                 },
 498 |                 "required": ["file_path1", "file_path2"],
 499 |             },
 500 |         ),
 501 |         types.Tool(
 502 |             name="detect-languages",
 503 |             description="检测PDF文档中使用的语言",
 504 |             inputSchema={
 505 |                 "type": "object",
 506 |                 "properties": {
 507 |                     "file_path": {
 508 |                         "type": "string",
 509 |                         "description": "PDF文件的路径",
 510 |                     },
 511 |                 },
 512 |                 "required": ["file_path"],
 513 |             },
 514 |         ),
 515 |         types.Tool(
 516 |             name="advanced-analysis",
 517 |             description="执行高级文本分析",
 518 |             inputSchema={
 519 |                 "type": "object",
 520 |                 "properties": {
 521 |                     "file_path": {
 522 |                         "type": "string",
 523 |                         "description": "PDF文件的路径",
 524 |                     },
 525 |                 },
 526 |                 "required": ["file_path"],
 527 |             },
 528 |         ),
 529 |     ]
 530 | 
 531 | async def extract_text_from_pdf(file_path: str, page_number: int = None) -> str:
 532 |     """从PDF中提取文本"""
 533 |     try:
 534 |         doc = fitz.open(file_path)
 535 |         if page_number is not None:
 536 |             if 0 <= page_number < len(doc):
 537 |                 text = doc[page_number].get_text()
 538 |                 doc.close()
 539 |                 return text
 540 |             else:
 541 |                 doc.close()
 542 |                 return f"页码 {page_number} 超出范围。PDF共有 {len(doc)} 页。"
 543 |         
 544 |         # 如果没有指定页码，提取所有页面的文本
 545 |         text = ""
 546 |         for page in doc:
 547 |             text += page.get_text() + "\n"
 548 |         doc.close()
 549 |         return text
 550 |     except Exception as e:
 551 |         return f"提取文本时出错: {str(e)}"
 552 | 
 553 | async def extract_images_from_pdf(file_path: str, page_number: int = None):
 554 |     """从PDF中提取图片，返回base64编码的图片列表"""
 555 |     try:
 556 |         doc = fitz.open(file_path)
 557 |         images = []
 558 |         pages = [page_number] if page_number is not None else range(len(doc))
 559 |         
 560 |         for page_num in pages:
 561 |             page = doc[page_num]
 562 |             image_list = page.get_images()
 563 |             
 564 |             # 并行处理图片
 565 |             def process_image(img_index):
 566 |                 try:
 567 |                     xref = image_list[img_index][0]
 568 |                     base_image = doc.extract_image(xref)
 569 |                     image_bytes = base_image["image"]
 570 |                     
 571 |                     # 转换和优化图片
 572 |                     image = Image.open(io.BytesIO(image_bytes))
 573 |                     image = optimize_image(image)
 574 |                     
 575 |                     # 转换为base64
 576 |                     buffered = io.BytesIO()
 577 |                     image.save(buffered, format="PNG", optimize=True)
 578 |                     img_str = base64.b64encode(buffered.getvalue()).decode()
 579 |                     return img_str
 580 |                 except Exception as e:
 581 |                     print(f"处理图片时出错: {str(e)}")
 582 |                     return None
 583 |             
 584 |             # 使用线程池并行处理图片
 585 |             with concurrent.futures.ThreadPoolExecutor() as executor:
 586 |                 futures = [executor.submit(process_image, i) for i in range(len(image_list))]
 587 |                 for future in concurrent.futures.as_completed(futures):
 588 |                     if future.result():
 589 |                         images.append(future.result())
 590 |         
 591 |         doc.close()
 592 |         return images
 593 |     except Exception as e:
 594 |         print(f"提取图片时出错: {str(e)}")
 595 |         return []
 596 | 
 597 | async def extract_tables_from_pdf(file_path: str, page_number: int = None) -> List[str]:
 598 |     """从PDF中提取表格"""
 599 |     try:
 600 |         if page_number is not None:
 601 |             tables = read_pdf(file_path, pages=page_number + 1)  # tabula使用1-based页码
 602 |         else:
 603 |             tables = read_pdf(file_path, pages='all')
 604 |         
 605 |         if not tables:
 606 |             return ["未找到表格"]
 607 |         
 608 |         result = []
 609 |         for i, table in enumerate(tables):
 610 |             result.append(f"表格 {i+1}:\n{table.to_string()}\n---")
 611 |         return result
 612 |     except Exception as e:
 613 |         return [f"提取表格时出错: {str(e)}"]
 614 | 
 615 | async def analyze_pdf_content(file_path: str, analysis_type: str) -> Dict[str, Any]:
 616 |     """分析PDF内容"""
 617 |     try:
 618 |         text = extract_text_from_pdf(file_path)
 619 |         
 620 |         if analysis_type == "entities":
 621 |             with ModelContext('spacy', model_manager) as nlp:
 622 |                 doc = nlp(text)
 623 |                 entities = [(ent.text, ent.label_) for ent in doc.ents]
 624 |                 return {"entities": entities}
 625 |             
 626 |         elif analysis_type == "summary":
 627 |             with ModelContext('classifier', model_manager) as classifier:
 628 |                 sentences = nltk.sent_tokenize(text)
 629 |                 results = classifier(sentences, 
 630 |                                   candidate_labels=["important", "not important"],
 631 |                                   multi_label=False)
 632 |                 important_sentences = [sent for sent, score in zip(sentences, results['scores']) 
 633 |                                     if score > 0.7]
 634 |                 return {"summary": " ".join(important_sentences[:5])}
 635 |             
 636 |         elif analysis_type == "keywords":
 637 |             with ModelContext('spacy', model_manager) as nlp:
 638 |                 doc = nlp(text)
 639 |                 keywords = [token.text for token in doc if not token.is_stop and token.is_alpha]
 640 |                 return {"keywords": list(set(keywords[:20]))}
 641 |             
 642 |     except ModelError as e:
 643 |         return {"error": f"Model error: {str(e)}"}
 644 |     except Exception as e:
 645 |         return {"error": f"Unexpected error: {str(e)}"}
 646 | 
 647 | async def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
 648 |     """获取PDF元数据"""
 649 |     try:
 650 |         doc = fitz.open(file_path)
 651 |         metadata = doc.metadata
 652 |         doc.close()
 653 |         return {
 654 |             "title": metadata.get("title", "未知"),
 655 |             "author": metadata.get("author", "未知"),
 656 |             "subject": metadata.get("subject", "未知"),
 657 |             "keywords": metadata.get("keywords", "未知"),
 658 |             "creator": metadata.get("creator", "未知"),
 659 |             "producer": metadata.get("producer", "未知"),
 660 |             "creation_date": metadata.get("creationDate", "未知"),
 661 |             "modification_date": metadata.get("modDate", "未知"),
 662 |             "page_count": doc.page_count
 663 |         }
 664 |     except Exception as e:
 665 |         return {"error": str(e)}
 666 | 
 667 | async def classify_document(file_path: str, categories: List[str]) -> Dict[str, Any]:
 668 |     """对文档进行分类"""
 669 |     try:
 670 |         text = pdfminer_extract_text(file_path)
 671 |         with ModelContext('classifier', model_manager) as classifier:
 672 |             result = classifier(text, categories)
 673 |             return {
 674 |                 "labels": result["labels"],
 675 |                 "scores": [float(score) for score in result["scores"]]
 676 |             }
 677 |     except ModelError as e:
 678 |         return {"error": f"Model error: {str(e)}"}
 679 |     except Exception as e:
 680 |         return {"error": f"Unexpected error: {str(e)}"}
 681 | 
 682 | async def calculate_similarity(file_path1: str, file_path2: str) -> Dict[str, float]:
 683 |     """计算两个文档的相似度"""
 684 |     try:
 685 |         text1 = pdfminer_extract_text(file_path1)
 686 |         text2 = pdfminer_extract_text(file_path2)
 687 |         
 688 |         with ModelContext('sentence_transformer', model_manager) as model:
 689 |             # 将文本分成较小的块进行处理
 690 |             def chunk_text(text, chunk_size=1000):
 691 |                 return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
 692 |             
 693 |             # 计算文本块的嵌入向量
 694 |             def get_embeddings(text):
 695 |                 chunks = chunk_text(text)
 696 |                 embeddings = model.encode(chunks)
 697 |                 return np.mean(embeddings, axis=0)
 698 |             
 699 |             # 计算两个文档的相似度
 700 |             embedding1 = get_embeddings(text1)
 701 |             embedding2 = get_embeddings(text2)
 702 |             similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))
 703 |             
 704 |             return {"similarity_score": float(similarity)}
 705 |             
 706 |     except ModelError as e:
 707 |         return {"error": f"Model error: {str(e)}"}
 708 |     except Exception as e:
 709 |         return {"error": f"Unexpected error: {str(e)}"}
 710 | 
 711 | async def detect_languages(file_path: str) -> Dict[str, Any]:
 712 |     """检测文档中的语言"""
 713 |     try:
 714 |         text = pdfminer_extract_text(file_path)
 715 |         with ModelContext('spacy', model_manager) as nlp:
 716 |             # 将文本分成段落
 717 |             paragraphs = text.split('\n\n')
 718 |             language_info = []
 719 |             
 720 |             for para in paragraphs:
 721 |                 if not para.strip():
 722 |                     continue
 723 |                     
 724 |                 try:
 725 |                     lang = detect(para)
 726 |                     doc = nlp(para)
 727 |                     # 获取段落的语言特征
 728 |                     features = {
 729 |                         'text': para[:100] + '...' if len(para) > 100 else para,
 730 |                         'language': lang,
 731 |                         'tokens': len(doc),
 732 |                         'sentences': len(list(doc.sents))
 733 |                     }
 734 |                     language_info.append(features)
 735 |                 except Exception as e:
 736 |                     print(f"Error processing paragraph: {str(e)}")
 737 |                     continue
 738 |             
 739 |             return {
 740 |                 "language_analysis": language_info,
 741 |                 "document_stats": {
 742 |                     "total_paragraphs": len(paragraphs),
 743 |                     "processed_paragraphs": len(language_info)
 744 |                 }
 745 |             }
 746 |             
 747 |     except ModelError as e:
 748 |         return {"error": f"Model error: {str(e)}"}
 749 |     except Exception as e:
 750 |         return {"error": f"Unexpected error: {str(e)}"}
 751 | 
 752 | async def advanced_text_analysis(file_path: str) -> Dict[str, Any]:
 753 |     """执行高级文本分析"""
 754 |     try:
 755 |         text = pdfminer_extract_text(file_path)
 756 |         
 757 |         with ModelContext('spacy', model_manager) as nlp:
 758 |             doc = nlp(text)
 759 |             
 760 |             # 1. 复杂度分析
 761 |             sentences = list(doc.sents)
 762 |             avg_sentence_length = sum(len(sent) for sent in sentences) / len(sentences)
 763 |             
 764 |             # 2. 词性分布
 765 |             pos_dist = defaultdict(int)
 766 |             for token in doc:
 767 |                 pos_dist[token.pos_] += 1
 768 |             
 769 |             # 3. 依存关系分析
 770 |             dep_dist = defaultdict(int)
 771 |             for token in doc:
 772 |                 dep_dist[token.dep_] += 1
 773 |             
 774 |             # 4. 主题建模（使用TF-IDF找出最重要的词组）
 775 |             vectorizer = TfidfVectorizer(max_features=10)
 776 |             tfidf_matrix = vectorizer.fit_transform([text])
 777 |             feature_names = vectorizer.get_feature_names_out()
 778 |             scores = tfidf_matrix.toarray()[0]
 779 |             important_phrases = [
 780 |                 {"phrase": phrase, "importance": float(score)} 
 781 |                 for phrase, score in zip(feature_names, scores)
 782 |             ]
 783 |             
 784 |             return {
 785 |                 "complexity_metrics": {
 786 |                     "avg_sentence_length": float(avg_sentence_length),
 787 |                     "vocabulary_size": len(set(token.text.lower() for token in doc)),
 788 |                     "readability_score": float(avg_sentence_length * 0.39 + 11.8)
 789 |                 },
 790 |                 "pos_distribution": dict(pos_dist),
 791 |                 "dependency_patterns": dict(dep_dist),
 792 |                 "important_phrases": sorted(important_phrases, 
 793 |                                          key=lambda x: x["importance"], 
 794 |                                          reverse=True)[:10]
 795 |             }
 796 |             
 797 |     except ModelError as e:
 798 |         return {"error": f"Model error: {str(e)}"}
 799 |     except Exception as e:
 800 |         return {"error": f"Unexpected error: {str(e)}"}
 801 | 
 802 | @lru_cache(maxsize=100)
 803 | def process_text(text: str) -> str:
 804 |     """处理文本并缓存结果"""
 805 |     try:
 806 |         with ModelContext('spacy', model_manager) as nlp:
 807 |             doc = nlp(text)
 808 |             return " ".join([token.text for token in doc])
 809 |     except ModelError as e:
 810 |         print(f"Error processing text: {str(e)}")
 811 |         return text  # 返回原始文本作为后备方案
 812 |     except Exception as e:
 813 |         print(f"Unexpected error processing text: {str(e)}")
 814 |         return text
 815 | 
 816 | async def main():
 817 |     """运行服务器"""
 818 |     try:
 819 |         print("PDF Reader MCP 服务启动中...")
 820 |         
 821 |         # 在后台线程中初始化依赖
 822 |         init_thread = threading.Thread(target=initialize_dependencies)
 823 |         init_thread.start()
 824 |         
 825 |         # 启动服务器
 826 |         async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
 827 |             await server.run(
 828 |                 read_stream,
 829 |                 write_stream,
 830 |                 InitializationOptions(
 831 |                     server_name="pdf_reader",
 832 |                     server_version="0.1.0",
 833 |                     capabilities=server.get_capabilities(
 834 |                         notification_options=NotificationOptions(),
 835 |                         experimental_capabilities={},
 836 |                     ),
 837 |                 ),
 838 |             )
 839 |     except Exception as e:
 840 |         print(f"服务器运行错误: {str(e)}")
 841 |         raise
 842 | 
 843 | def initialize_dependencies():
 844 |     """异步初始化所需的依赖"""
 845 |     try:
 846 |         # NLTK数据 - 使用异步方式下载
 847 |         nltk_resources = ['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words', 'stopwords']
 848 |         for resource in nltk_resources:
 849 |             try:
 850 |                 nltk.data.find(f'tokenizers/{resource}')
 851 |             except LookupError:
 852 |                 nltk.download(resource, quiet=True)
 853 |         
 854 |         print("NLTK resources loaded")
 855 |         print(f"GPU acceleration: {'available' if torch.cuda.is_available() else 'not available'}")
 856 |         
 857 |         return True
 858 |     except Exception as e:
 859 |         print(f"Initialization failed: {str(e)}")
 860 |         return False
 861 | 
 862 | # 优化图片处理
 863 | def optimize_image(image: Image.Image, max_size: int = 1024) -> Image.Image:
 864 |     """优化图片大小和质量"""
 865 |     if max(image.size) > max_size:
 866 |         ratio = max_size / max(image.size)
 867 |         new_size = tuple(int(dim * ratio) for dim in image.size)
 868 |         image = image.resize(new_size, Image.Resampling.LANCZOS)
 869 |     return image
 870 | 
 871 | if __name__ == "__main__":
 872 |     # 设置torch使用的线程数
 873 |     torch.set_num_threads(4)
 874 |     
 875 |     # 确保在主模块中运行
 876 |     import sys
 877 |     if 'src.pdf_reader.server' in sys.modules:
 878 |         del sys.modules['src.pdf_reader.server']
 879 |     
 880 |     # 初始化并运行服务器
 881 |     asyncio.run(main())
 882 | 
 883 | @server.call_tool()
 884 | async def handle_call_tool(
 885 |     name: str, arguments: dict | None
 886 | ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
 887 |     """处理工具调用请求"""
 888 |     if not arguments:
 889 |         raise ValueError("缺少参数")
 890 | 
 891 |     file_path = arguments.get("file_path")
 892 |     if not file_path:
 893 |         raise ValueError("缺少文件路径")
 894 | 
 895 |     if name == "extract-text":
 896 |         page_number = arguments.get("page_number")
 897 |         text = await extract_text_from_pdf(file_path, page_number)
 898 |         return [types.TextContent(type="text", text=text)]
 899 |     
 900 |     elif name == "extract-images":
 901 |         page_number = arguments.get("page_number")
 902 |         images = await extract_images_from_pdf(file_path, page_number)
 903 |         result = []
 904 |         for i, img_base64 in enumerate(images):
 905 |             if img_base64.startswith("提取图片时出错"):
 906 |                 result.append(types.TextContent(type="text", text=img_base64))
 907 |             else:
 908 |                 result.append(types.ImageContent(
 909 |                     type="image",
 910 |                     format="image/png",
 911 |                     data=img_base64
 912 |                 ))
 913 |         return result if result else [types.TextContent(type="text", text="未找到图片")]
 914 |     
 915 |     elif name == "extract-tables":
 916 |         page_number = arguments.get("page_number")
 917 |         tables = await extract_tables_from_pdf(file_path, page_number)
 918 |         return [types.TextContent(type="text", text="\n".join(tables))]
 919 |     
 920 |     elif name == "analyze-content":
 921 |         analysis_type = arguments.get("analysis_type")
 922 |         if not analysis_type:
 923 |             raise ValueError("缺少分析类型")
 924 |         
 925 |         result = await analyze_pdf_content(file_path, analysis_type)
 926 |         if "error" in result:
 927 |             return [types.TextContent(type="text", text=f"分析出错: {result['error']}")]
 928 |         
 929 |         if analysis_type == "entities":
 930 |             text = "识别到的实体:\n"
 931 |             for entity_type, entities in result["entities"].items():
 932 |                 text += f"\n{entity_type}:\n- " + "\n- ".join(entities)
 933 |         elif analysis_type == "summary":
 934 |             text = f"文档摘要:\n{result['summary']}"
 935 |         elif analysis_type == "keywords":
 936 |             text = "关键词:\n- " + "\n- ".join(result["keywords"])
 937 |         
 938 |         return [types.TextContent(type="text", text=text)]
 939 |     
 940 |     elif name == "get-metadata":
 941 |         metadata = await get_pdf_metadata(file_path)
 942 |         if "error" in metadata:
 943 |             return [types.TextContent(type="text", text=f"获取元数据出错: {metadata['error']}")]
 944 |         
 945 |         text = "PDF元数据:\n"
 946 |         for key, value in metadata.items():
 947 |             text += f"{key}: {value}\n"
 948 |         
 949 |         return [types.TextContent(type="text", text=text)]
 950 |     
 951 |     elif name == "classify-document":
 952 |         categories = arguments.get("categories")
 953 |         if not categories:
 954 |             raise ValueError("缺少分类类别")
 955 |         
 956 |         result = await classify_document(file_path, categories)
 957 |         if "error" in result:
 958 |             return [types.TextContent(type="text", text=f"分类出错: {result['error']}")]
 959 |         
 960 |         text = "文档分类结果:\n"
 961 |         for label, score in zip(result["labels"], result["scores"]):
 962 |             text += f"{label}: {score:.2%}\n"
 963 |         
 964 |         return [types.TextContent(type="text", text=text)]
 965 |     
 966 |     elif name == "calculate-similarity":
 967 |         file_path2 = arguments.get("file_path2")
 968 |         if not file_path2:
 969 |             raise ValueError("缺少第二个文件路径")
 970 |         
 971 |         result = await calculate_similarity(file_path, file_path2)
 972 |         if "error" in result:
 973 |             return [types.TextContent(type="text", text=f"计算相似度出错: {result['error']}")]
 974 |         
 975 |         text = f"文档相似度: {result['similarity_score']:.2%}\n"
 976 |         text += result["interpretation"]
 977 |         
 978 |         return [types.TextContent(type="text", text=text)]
 979 |     
 980 |     elif name == "detect-languages":
 981 |         result = await detect_languages(file_path)
 982 |         if "error" in result:
 983 |             return [types.TextContent(type="text", text=f"语言检测出错: {result['error']}")]
 984 |         
 985 |         text = f"主要语言: {result['primary_language']}\n\n"
 986 |         text += "语言分布:\n"
 987 |         for lang, ratio in result["language_distribution"].items():
 988 |             text += f"{lang}: {ratio:.1%}\n"
 989 |         
 990 |         return [types.TextContent(type="text", text=text)]
 991 |     
 992 |     elif name == "advanced-analysis":
 993 |         result = await advanced_text_analysis(file_path)
 994 |         if "error" in result:
 995 |             return [types.TextContent(type="text", text=f"分析出错: {result['error']}")]
 996 |         
 997 |         text = "高级文本分析结果:\n\n"
 998 |         
 999 |         # 复杂度指标
1000 |         text += "1. 复杂度指标:\n"
1001 |         metrics = result["complexity_metrics"]
1002 |         text += f"- 平均句子长度: {metrics['avg_sentence_length']:.1f}\n"
1003 |         text += f"- 词汇量: {metrics['vocabulary_size']}\n"
1004 |         text += f"- 可读性评分: {metrics['readability_score']:.1f}\n\n"
1005 |         
1006 |         # 词性分布
1007 |         text += "2. 词性分布:\n"
1008 |         for pos, count in result["pos_distribution"].items():
1009 |             text += f"- {pos}: {count}\n"
1010 |         text += "\n"
1011 |         
1012 |         # 重要短语
1013 |         text += "3. 重要短语:\n"
1014 |         for item in result["important_phrases"]:
1015 |             text += f"- {item['phrase']}: {item['importance']:.3f}\n"
1016 |         
1017 |         return [types.TextContent(type="text", text=text)]
1018 |     
1019 |     else:
1020 |         raise ValueError(f"未知的工具: {name}")


--------------------------------------------------------------------------------