├── src └── pdf_reader │ ├── __main__.py │ ├── __init__.py │ └── server.py ├── .gitignore ├── requirements.txt ├── pyproject.toml ├── LICENSE └── README.md /src/pdf_reader/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | PDF Reader Server 启动脚本 3 | """ 4 | import asyncio 5 | from .server import main 6 | 7 | if __name__ == "__main__": 8 | asyncio.run(main()) 9 | -------------------------------------------------------------------------------- /src/pdf_reader/__init__.py: -------------------------------------------------------------------------------- 1 | from . import server 2 | import asyncio 3 | 4 | def main(): 5 | """Main entry point for the package.""" 6 | asyncio.run(server.main()) 7 | 8 | __all__ = ['main', 'server'] 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | ENV/ 29 | 30 | # IDE 31 | .idea/ 32 | .vscode/ 33 | *.swp 34 | *.swo 35 | 36 | # OS 37 | .DS_Store 38 | Thumbs.db 39 | 40 | # Project specific 41 | *.pdf 42 | model_cache/ 43 | *.log 44 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mcp>=0.1.0 2 | PyMuPDF>=1.23.8 3 | Pillow>=10.1.0 # 使用标准 Pillow 包 4 | nltk>=3.8.1 5 | spacy>=3.7.2 6 | pdfminer.six>=20221105 7 | pandas>=2.1.4 8 | tabula-py>=2.9.0 9 | scikit-learn>=1.3.2 10 | langdetect>=1.0.9 11 | transformers>=4.36.1 12 | torch>=2.1.2 # 如果有NVIDIA GPU,建议安装CUDA版本 13 | 
sentence-transformers>=2.2.2 14 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl 15 | # 新增性能优化相关的包 16 | ujson>=5.9.0 # 更快的JSON处理 17 | pyarrow>=14.0.2 # 更快的数据处理 18 | psutil>=5.9.7 # 系统资源监控 19 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "pdf_reader" 7 | version = "0.1.0" 8 | description = "MCP server for reading PDF files" 9 | requires-python = ">=3.10" 10 | dependencies = [ 11 | "mcp", 12 | "PyMuPDF", # PDF处理 13 | "Pillow", # 图像处理 14 | "nltk", # 自然语言处理 15 | "spacy", # NLP分析 16 | "pdfminer.six", # PDF文本提取增强 17 | "pandas", # 表格数据处理 18 | "tabula-py", # PDF表格提取 19 | "scikit-learn", # 机器学习支持 20 | "langdetect", # 语言检测 21 | "transformers", # 文本分析和分类 22 | "torch", # PyTorch支持 23 | "sentence-transformers", # 文本相似度 24 | ] 25 | 26 | [project.scripts] 27 | pdf_reader = "pdf_reader:main" 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 [Saury1120] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PDF-MCP ![GitHub stars](https://img.shields.io/github/stars/saury1120/pdf-mcp.svg?style=social&label=Star) 2 | 3 | [中文](#pdf-mcp-服务) | [English](#pdf-mcp-service) 4 | 5 | ## 📄 PDF-MCP 服务 6 | 7 | 高性能 PDF 文档处理服务,支持文本、图片、表格提取及高级分析。 8 | 9 | ## ✨ 主要特性 10 | 11 | - **📜 文本提取**:多语言支持,保留格式。 12 | - **🖼️ 图片处理**:提取与优化。 13 | - **📊 表格识别**:结构化数据输出。 14 | - **🧠 智能分类**:基于深度学习。 15 | - **🔍 相似度分析**:跨语言比较。 16 | - **🌐 多语言支持**:100+ 种语言。 17 | 18 | ## 💻 系统要求 19 | 20 | - **🖥️ 硬件**:2 核 CPU,4GB 内存。 21 | - **⚙️ 软件**:Python 3.10+,可选 CUDA 支持。 22 | 23 | ## 🚀 快速开始 24 | 25 | 1. 🗂️ 克隆仓库并进入目录: 26 | ```bash 27 | git clone https://github.com/saury1120/pdf-mcp.git 28 | cd pdf-mcp 29 | ``` 30 | 2. 🛠️ 创建虚拟环境并安装依赖: 31 | ```bash 32 | uv venv 33 | source .venv/bin/activate 34 | uv pip install -r requirements.txt 35 | ``` 36 | 3. ▶️ 启动服务: 37 | ```bash 38 | uv run pdf_reader 39 | 40 | 41 | ### Claude Desktop 配置 42 | 1. 找到配置文件: 43 | - macOS: `~/Library/Application Support/Claude/claude_desktop_config.json` 44 | - Windows: `%AppData%/Claude/claude_desktop_config.json` 45 | 2. 
添加以下配置(请将 `/path/to/pdf-mcp` 替换为实际路径): 46 | ```json 47 | { 48 | "mcpServers": { 49 | "pdf_reader": { 50 | "command": "uv", 51 | "args": [ 52 | "--directory", 53 | "/path/to/pdf-mcp", 54 | "run", 55 | "pdf_reader" 56 | ] 57 | } 58 | } 59 | } 60 | ``` 61 | 62 | 63 | # PDF-MCP Service 64 | 65 | A high-performance PDF document processing service supporting text, image, table extraction, and advanced analysis. 66 | 67 | ## ✨ Key Features 68 | 69 | - **📜 Text Extraction**: Multilingual support, retains formatting. 70 | - **🖼️ Image Processing**: Extraction and optimization. 71 | - **📊 Table Recognition**: Structured data output. 72 | - **🧠 Intelligent Classification**: Based on deep learning. 73 | - **🔍 Similarity Analysis**: Cross-language comparison. 74 | - **🌐 Multilingual Support**: 100+ languages. 75 | 76 | ## 💻 System Requirements 77 | 78 | - **🖥️ Hardware**: 2-core CPU, 4GB RAM. 79 | - **⚙️ Software**: Python 3.10+, optional CUDA support. 80 | 81 | ## 🚀 Quick Start 82 | 83 | 1. 🗂️ Clone the repository and enter the directory: 84 | ```bash 85 | git clone https://github.com/saury1120/pdf-mcp.git 86 | cd pdf-mcp 87 | ``` 88 | 2. 🛠️ Create a virtual environment and install dependencies: 89 | ```bash 90 | uv venv 91 | source .venv/bin/activate 92 | uv pip install -r requirements.txt 93 | ``` 94 | 3. 
▶️ Start the service: 95 | ```bash 96 | uv run pdf_reader 97 | 98 | 99 | ## Claude Desktop 100 | ```json 101 | { 102 | "mcpServers": { 103 | "pdf_reader": { 104 | "command": "uv", 105 | "args": [ 106 | "--directory", 107 | "/path/to/pdf-mcp", # 替换为实际路径 108 | "run", 109 | "pdf_reader" 110 | ] 111 | } 112 | } 113 | } 114 | ``` 115 | -------------------------------------------------------------------------------- /src/pdf_reader/server.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Dict 2 | import asyncio 3 | import base64 4 | import fitz # PyMuPDF 5 | from PIL import Image 6 | import io 7 | import nltk 8 | import spacy 9 | import pandas as pd 10 | from tabula import read_pdf 11 | from pdfminer.high_level import extract_text as pdfminer_extract_text 12 | from mcp.server.models import InitializationOptions 13 | import mcp.types as types 14 | from mcp.server import NotificationOptions, Server 15 | import mcp.server.stdio 16 | from sklearn.feature_extraction.text import TfidfVectorizer 17 | from sklearn.naive_bayes import MultinomialNB 18 | from langdetect import detect 19 | from transformers import pipeline 20 | from sentence_transformers import SentenceTransformer 21 | import torch 22 | import numpy as np 23 | from collections import defaultdict 24 | import threading 25 | from functools import lru_cache 26 | import concurrent.futures 27 | import time 28 | import psutil 29 | 30 | class ModelManager: 31 | _instance = None 32 | _models = {} 33 | _lock = threading.Lock() 34 | _last_used = {} 35 | _memory_threshold = 0.8 # 内存使用率阈值 36 | _max_idle_time = 300 # 模型最大空闲时间(秒) 37 | 38 | def __new__(cls): 39 | if cls._instance is None: 40 | with cls._lock: 41 | if cls._instance is None: 42 | cls._instance = super().__new__(cls) 43 | return cls._instance 44 | 45 | def __init__(self): 46 | if hasattr(self, '_initialized'): 47 | return 48 | self._initialized = True 49 | self._model_configs = { 50 | 'spacy': { 51 | 
'name': 'en_core_web_sm', 52 | 'loader': self._load_spacy, 53 | 'quantize': False 54 | }, 55 | 'classifier': { 56 | 'name': 'facebook/bart-large-mnli', 57 | 'loader': self._load_classifier, 58 | 'quantize': { 59 | 'enabled': True, 60 | 'method': 'dynamic', 61 | 'dtype': torch.qint8, 62 | 'layers': [torch.nn.Linear, torch.nn.Conv1d, torch.nn.Conv2d], 63 | 'calibration_size': 100 64 | } 65 | }, 66 | 'sentence_transformer': { 67 | 'name': 'paraphrase-MiniLM-L6-v2', 68 | 'loader': self._load_sentence_transformer, 69 | 'quantize': { 70 | 'enabled': True, 71 | 'method': 'dynamic', 72 | 'dtype': torch.qint8, 73 | 'layers': [torch.nn.Linear], 74 | 'calibration_size': 100 75 | } 76 | } 77 | } 78 | self.device = self._get_optimal_device() 79 | self._setup_quantization_backend() 80 | self._cleanup_thread = threading.Thread(target=self._cleanup_models, daemon=True) 81 | self._cleanup_thread.start() 82 | 83 | def _get_optimal_device(self): 84 | """根据系统配置选择最优设备""" 85 | if not torch.cuda.is_available(): 86 | return 'cpu' 87 | 88 | # 检查GPU内存 89 | gpu_memory = torch.cuda.get_device_properties(0).total_memory 90 | if gpu_memory < 4 * 1024 * 1024 * 1024: # 小于4GB 91 | return 'cpu' 92 | 93 | return 'cuda' 94 | 95 | def _get_optimal_quantization_config(self, model_type): 96 | """根据设备和模型类型选择最优量化配置""" 97 | base_config = self._model_configs[model_type]['quantize'] 98 | if not base_config: 99 | return base_config 100 | 101 | if self.device == 'cpu': 102 | return { 103 | 'enabled': True, 104 | 'method': 'dynamic', 105 | 'dtype': torch.qint8, 106 | 'layers': base_config['layers'], 107 | 'calibration_size': 50 # CPU下使用更小的校准集 108 | } 109 | else: 110 | return { 111 | 'enabled': True, 112 | 'method': 'static', 113 | 'dtype': torch.float16, # GPU下使用半精度 114 | 'layers': base_config['layers'], 115 | 'calibration_size': base_config['calibration_size'] 116 | } 117 | 118 | def get_model(self, model_type: str): 119 | """延迟加载模型""" 120 | with self._lock: 121 | if model_type not in self._models: 122 | 
config = self._model_configs.get(model_type) 123 | if not config: 124 | raise ModelLoadError(f"Unknown model type: {model_type}") 125 | 126 | # 检查内存使用情况 127 | self._check_memory_usage() 128 | 129 | # 加载模型 130 | try: 131 | model = config['loader'](config['name']) 132 | quant_config = self._get_optimal_quantization_config(model_type) 133 | if quant_config and quant_config['enabled']: 134 | model = self._prepare_model_for_quantization(model, quant_config) 135 | self._models[model_type] = model 136 | except Exception as e: 137 | raise ModelLoadError(f"Failed to load model {model_type}: {str(e)}") 138 | 139 | # 更新最后使用时间 140 | self._last_used[model_type] = time.time() 141 | return self._models[model_type] 142 | 143 | def _check_memory_usage(self): 144 | """检查内存使用情况并在必要时卸载模型""" 145 | memory_percent = psutil.virtual_memory().percent / 100 146 | 147 | if memory_percent > self._memory_threshold: 148 | self._unload_least_used_model() 149 | 150 | def _unload_least_used_model(self): 151 | """卸载最少使用的模型""" 152 | if not self._last_used: 153 | return 154 | 155 | current_time = time.time() 156 | least_used_model = min(self._last_used.items(), key=lambda x: x[1])[0] 157 | if current_time - self._last_used[least_used_model] > self._max_idle_time: 158 | self._unload_model(least_used_model) 159 | 160 | def _unload_model(self, model_type: str): 161 | """卸载指定模型""" 162 | if model_type in self._models: 163 | del self._models[model_type] 164 | del self._last_used[model_type] 165 | # 强制进行垃圾回收 166 | import gc 167 | gc.collect() 168 | if self.device == 'cuda': 169 | torch.cuda.empty_cache() 170 | 171 | def _cleanup_models(self): 172 | """定期清理未使用的模型""" 173 | while True: 174 | time.sleep(60) # 每分钟检查一次 175 | with self._lock: 176 | current_time = time.time() 177 | models_to_unload = [ 178 | model_type for model_type, last_used in self._last_used.items() 179 | if current_time - last_used > self._max_idle_time 180 | ] 181 | for model_type in models_to_unload: 182 | self._unload_model(model_type) 183 
| 184 | def _setup_quantization_backend(self): 185 | """设置量化后端""" 186 | if self.device == 'cuda': 187 | # 在GPU上使用CUDA量化后端 188 | torch.backends.quantized.engine = 'fbgemm' 189 | else: 190 | # 在CPU上使用fbgemm (Windows compatible) 191 | torch.backends.quantized.engine = 'fbgemm' 192 | 193 | def _prepare_model_for_quantization(self, model, config): 194 | """准备模型进行量化""" 195 | if not config['enabled']: 196 | return model 197 | 198 | if config['method'] == 'dynamic': 199 | return self._apply_dynamic_quantization(model, config) 200 | elif config['method'] == 'static': 201 | return self._apply_static_quantization(model, config) 202 | return model 203 | 204 | def _apply_dynamic_quantization(self, model, config): 205 | """应用动态量化""" 206 | try: 207 | print(f"Applying dynamic quantization with dtype {config['dtype']}") 208 | model = torch.quantization.quantize_dynamic( 209 | model, 210 | qconfig_spec={ 211 | layer: torch.quantization.default_dynamic_qconfig 212 | for layer in config['layers'] 213 | }, 214 | dtype=config['dtype'] 215 | ) 216 | print("Dynamic quantization applied successfully") 217 | return model 218 | except Exception as e: 219 | print(f"Dynamic quantization failed: {str(e)}") 220 | return model 221 | 222 | def _apply_static_quantization(self, model, config): 223 | """应用静态量化""" 224 | try: 225 | print(f"Applying static quantization") 226 | # 准备量化配置 227 | model.qconfig = torch.quantization.get_default_qconfig('fbgemm' if self.device == 'cuda' else 'fbgemm') 228 | 229 | # 融合操作 230 | model = torch.quantization.fuse_modules(model, [['conv', 'bn', 'relu']]) 231 | 232 | # 准备量化 233 | model = torch.quantization.prepare(model) 234 | 235 | # 校准(这里需要实际的校准数据) 236 | # self._calibrate_model(model, config['calibration_size']) 237 | 238 | # 转换为量化模型 239 | model = torch.quantization.convert(model) 240 | 241 | print("Static quantization applied successfully") 242 | return model 243 | except Exception as e: 244 | print(f"Static quantization failed: {str(e)}") 245 | return model 246 | 
247 | def _calibrate_model(self, model, calibration_size): 248 | """使用校准数据集校准模型(用于静态量化)""" 249 | # 这里应该使用实际的校准数据 250 | # 为了示例,我们使用随机数据 251 | with torch.no_grad(): 252 | for _ in range(calibration_size): 253 | dummy_input = torch.randn(1, 3, 224, 224) 254 | model(dummy_input) 255 | 256 | def _load_spacy(self): 257 | try: 258 | return spacy.load('en_core_web_sm') 259 | except OSError: 260 | spacy.cli.download('en_core_web_sm') 261 | return spacy.load('en_core_web_sm') 262 | 263 | def _load_classifier(self): 264 | print("Loading classifier model...") 265 | model = pipeline("zero-shot-classification", 266 | model='facebook/bart-large-mnli', 267 | device=self.device) 268 | 269 | if self._model_configs['classifier']['quantize']['enabled']: 270 | print("Applying quantization to classifier model") 271 | model.model = self._prepare_model_for_quantization( 272 | model.model, 273 | self._model_configs['classifier']['quantize'] 274 | ) 275 | return model 276 | 277 | def _load_sentence_transformer(self): 278 | print("Loading sentence transformer model...") 279 | model = SentenceTransformer('paraphrase-MiniLM-L6-v2') 280 | model = model.to(self.device) 281 | 282 | if self._model_configs['sentence_transformer']['quantize']['enabled']: 283 | print("Applying quantization to sentence transformer model") 284 | model.encoder = self._prepare_model_for_quantization( 285 | model.encoder, 286 | self._model_configs['sentence_transformer']['quantize'] 287 | ) 288 | return model 289 | 290 | def clear_cache(self, model_type: str = None): 291 | with self._lock: 292 | if model_type: 293 | if model_type in self._models: 294 | del self._models[model_type] 295 | else: 296 | self._models.clear() 297 | 298 | def get_model_memory_usage(self, model_type: str = None): 299 | """获取模型内存使用情况""" 300 | if model_type: 301 | if model_type in self._models: 302 | model = self._models[model_type] 303 | return self._get_model_size(model) 304 | return None 305 | 306 | memory_usage = {} 307 | for model_type, model 
in self._models.items(): 308 | memory_usage[model_type] = self._get_model_size(model) 309 | return memory_usage 310 | 311 | def _get_model_size(self, model): 312 | """计算模型大小(以MB为单位)""" 313 | param_size = 0 314 | buffer_size = 0 315 | 316 | for param in model.parameters(): 317 | param_size += param.nelement() * param.element_size() 318 | 319 | for buffer in model.buffers(): 320 | buffer_size += buffer.nelement() * buffer.element_size() 321 | 322 | size_mb = (param_size + buffer_size) / 1024 / 1024 323 | return round(size_mb, 2) 324 | 325 | class ModelContext: 326 | """模型使用的上下文管理器""" 327 | def __init__(self, model_type: str, manager: 'ModelManager'): 328 | self.model_type = model_type 329 | self.manager = manager 330 | self.model = None 331 | self.error = None 332 | 333 | def __enter__(self): 334 | try: 335 | self.model = self.manager.get_model(self.model_type) 336 | return self.model 337 | except Exception as e: 338 | self.error = e 339 | raise ModelError(f"Error loading model {self.model_type}: {str(e)}") 340 | 341 | def __exit__(self, exc_type, exc_val, exc_tb): 342 | if exc_type is not None: 343 | # 记录错误但不处理 344 | print(f"Error using model {self.model_type}: {str(exc_val)}") 345 | return False # 让异常继续传播 346 | 347 | class ModelError(Exception): 348 | """模型相关错误的基类""" 349 | pass 350 | 351 | class ModelLoadError(ModelError): 352 | """模型加载错误""" 353 | pass 354 | 355 | class ModelInferenceError(ModelError): 356 | """模型推理错误""" 357 | pass 358 | 359 | # 创建全局ModelManager实例 360 | model_manager = ModelManager() 361 | 362 | # 服务器初始化 363 | server = Server("pdf_reader") 364 | 365 | # 下载必要的NLTK数据 366 | nltk_resources = ['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words', 'stopwords'] 367 | for resource in nltk_resources: 368 | try: 369 | nltk.data.find(f'tokenizers/{resource}') 370 | except LookupError: 371 | nltk.download(resource, quiet=True) 372 | 373 | @server.list_tools() 374 | async def handle_list_tools() -> list[types.Tool]: 375 | """列出可用的工具""" 376 | 
return [ 377 | types.Tool( 378 | name="extract-text", 379 | description="从PDF文件中提取文本内容", 380 | inputSchema={ 381 | "type": "object", 382 | "properties": { 383 | "file_path": { 384 | "type": "string", 385 | "description": "PDF文件的路径", 386 | }, 387 | "page_number": { 388 | "type": "integer", 389 | "description": "要提取的页码(从0开始)", 390 | }, 391 | }, 392 | "required": ["file_path"], 393 | }, 394 | ), 395 | types.Tool( 396 | name="extract-images", 397 | description="从PDF文件中提取图片", 398 | inputSchema={ 399 | "type": "object", 400 | "properties": { 401 | "file_path": { 402 | "type": "string", 403 | "description": "PDF文件的路径", 404 | }, 405 | "page_number": { 406 | "type": "integer", 407 | "description": "要提取的页码(从0开始)", 408 | }, 409 | }, 410 | "required": ["file_path"], 411 | }, 412 | ), 413 | types.Tool( 414 | name="extract-tables", 415 | description="从PDF文件中提取表格", 416 | inputSchema={ 417 | "type": "object", 418 | "properties": { 419 | "file_path": { 420 | "type": "string", 421 | "description": "PDF文件的路径", 422 | }, 423 | "page_number": { 424 | "type": "integer", 425 | "description": "要提取的页码(从0开始)", 426 | }, 427 | }, 428 | "required": ["file_path"], 429 | }, 430 | ), 431 | types.Tool( 432 | name="analyze-content", 433 | description="分析PDF文件内容,提取关键信息", 434 | inputSchema={ 435 | "type": "object", 436 | "properties": { 437 | "file_path": { 438 | "type": "string", 439 | "description": "PDF文件的路径", 440 | }, 441 | "analysis_type": { 442 | "type": "string", 443 | "description": "分析类型:entities(实体), summary(摘要), keywords(关键词)", 444 | "enum": ["entities", "summary", "keywords"], 445 | }, 446 | }, 447 | "required": ["file_path", "analysis_type"], 448 | }, 449 | ), 450 | types.Tool( 451 | name="get-metadata", 452 | description="获取PDF文件的元数据信息", 453 | inputSchema={ 454 | "type": "object", 455 | "properties": { 456 | "file_path": { 457 | "type": "string", 458 | "description": "PDF文件的路径", 459 | }, 460 | }, 461 | "required": ["file_path"], 462 | }, 463 | ), 464 | types.Tool( 465 | 
async def extract_text_from_pdf(file_path: str, page_number: int = None) -> str:
    """Extract text from a PDF.

    Args:
        file_path: Path of the PDF to open.
        page_number: Zero-based page to extract; ``None`` extracts every page.

    Returns:
        The page text, or all pages' text with a newline after each page.
        An out-of-range ``page_number`` or any failure is reported as a
        message string rather than raised.
    """
    try:
        doc = fitz.open(file_path)
        try:
            # Read the page count while the document is still open; it is
            # needed for the out-of-range message below.
            page_count = len(doc)
            if page_number is not None:
                if 0 <= page_number < page_count:
                    return doc[page_number].get_text()
                return f"页码 {page_number} 超出范围。PDF共有 {page_count} 页。"

            # No page requested: concatenate every page.
            return "".join(page.get_text() + "\n" for page in doc)
        finally:
            # Close on every path, including exceptions.
            doc.close()
    except Exception as e:
        return f"提取文本时出错: {str(e)}"
async def get_pdf_metadata(file_path: str) -> Dict[str, Any]:
    """Return the PDF's document-information fields and page count.

    Missing, ``None`` or empty fields are reported as ``"未知"``. Any
    failure is returned as ``{"error": message}`` instead of being raised.
    """
    try:
        doc = fitz.open(file_path)
        try:
            # Read everything before closing: these attributes are not
            # accessible on a closed document.
            metadata = doc.metadata or {}
            page_count = doc.page_count
        finally:
            doc.close()
    except Exception as e:
        return {"error": str(e)}

    # Output key -> key in the document's metadata dict.
    field_map = (
        ("title", "title"),
        ("author", "author"),
        ("subject", "subject"),
        ("keywords", "keywords"),
        ("creator", "creator"),
        ("producer", "producer"),
        ("creation_date", "creationDate"),
        ("modification_date", "modDate"),
    )
    result: Dict[str, Any] = {
        out_key: metadata.get(src_key) or "未知" for out_key, src_key in field_map
    }
    result["page_count"] = page_count
    return result
async def calculate_similarity(file_path1: str, file_path2: str) -> Dict[str, float]:
    """Compute the cosine similarity between two PDF documents.

    Each document's text is split into 1000-character chunks, the chunks are
    embedded with the sentence-transformer model, and the mean chunk
    embeddings are compared.

    Returns:
        ``{"similarity_score": float}`` on success, or ``{"error": message}``
        when a document has no extractable text or something fails.
    """
    def chunk_text(text, chunk_size=1000):
        # Fixed-size character windows keep each encode() input bounded.
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    try:
        text1 = pdfminer_extract_text(file_path1)
        text2 = pdfminer_extract_text(file_path2)

        # Without text there is nothing to embed (e.g. a scanned PDF);
        # report that explicitly and skip loading the model.
        if not (text1 and text1.strip()) or not (text2 and text2.strip()):
            return {"error": "No extractable text found in one or both documents"}

        with ModelContext('sentence_transformer', model_manager) as model:
            def get_embeddings(text):
                # One document vector: the mean of its chunk embeddings.
                return np.mean(model.encode(chunk_text(text)), axis=0)

            embedding1 = get_embeddings(text1)
            embedding2 = get_embeddings(text2)

        norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
        # A zero vector has no direction; report 0.0 instead of dividing by zero.
        if not norm_product:
            return {"similarity_score": 0.0}
        similarity = np.dot(embedding1, embedding2) / norm_product
        return {"similarity_score": float(similarity)}

    except ModelError as e:
        return {"error": f"Model error: {str(e)}"}
    except Exception as e:
        return {"error": f"Unexpected error: {str(e)}"}
async def advanced_text_analysis(file_path: str) -> Dict[str, Any]:
    """Run a linguistic analysis of a PDF's text.

    Returns:
        A dict with ``complexity_metrics`` (average sentence length in
        tokens, vocabulary size, a length-based readability score),
        ``pos_distribution``, ``dependency_patterns`` and up to ten
        ``important_phrases`` ranked by TF-IDF weight. Failures, including a
        document without extractable text, are returned as
        ``{"error": message}``.
    """
    try:
        text = pdfminer_extract_text(file_path)
        # Nothing to analyse (e.g. a scanned PDF): say so instead of failing
        # later with a division-by-zero or empty-vocabulary error.
        if not text or not text.strip():
            return {"error": "No extractable text found in document"}

        with ModelContext('spacy', model_manager) as nlp:
            doc = nlp(text)

            # 1. Complexity: mean number of tokens per sentence.
            sentences = list(doc.sents)
            if sentences:
                avg_sentence_length = sum(len(sent) for sent in sentences) / len(sentences)
            else:
                avg_sentence_length = 0.0

            # 2 + 3. Part-of-speech and dependency-label counts in one pass.
            pos_dist = defaultdict(int)
            dep_dist = defaultdict(int)
            for token in doc:
                pos_dist[token.pos_] += 1
                dep_dist[token.dep_] += 1

            # 4. Key terms: the ten highest-weighted TF-IDF features.
            try:
                vectorizer = TfidfVectorizer(max_features=10)
                tfidf_matrix = vectorizer.fit_transform([text])
                feature_names = vectorizer.get_feature_names_out()
                scores = tfidf_matrix.toarray()[0]
                important_phrases = [
                    {"phrase": phrase, "importance": float(score)}
                    for phrase, score in zip(feature_names, scores)
                ]
            except ValueError:
                # Raised when the text yields no usable vocabulary; the
                # other metrics are still worth returning.
                important_phrases = []

            return {
                "complexity_metrics": {
                    "avg_sentence_length": float(avg_sentence_length),
                    "vocabulary_size": len({token.text.lower() for token in doc}),
                    # NOTE(review): coefficients resemble Flesch-Kincaid but
                    # no syllable count is involved -- confirm intended formula.
                    "readability_score": float(avg_sentence_length * 0.39 + 11.8)
                },
                "pos_distribution": dict(pos_dist),
                "dependency_patterns": dict(dep_dist),
                "important_phrases": sorted(important_phrases,
                                            key=lambda x: x["importance"],
                                            reverse=True)[:10]
            }

    except ModelError as e:
        return {"error": f"Model error: {str(e)}"}
    except Exception as e:
        return {"error": f"Unexpected error: {str(e)}"}
notification_options=NotificationOptions(), 835 | experimental_capabilities={}, 836 | ), 837 | ), 838 | ) 839 | except Exception as e: 840 | print(f"服务器运行错误: {str(e)}") 841 | raise 842 | 843 | def initialize_dependencies(): 844 | """异步初始化所需的依赖""" 845 | try: 846 | # NLTK数据 - 使用异步方式下载 847 | nltk_resources = ['punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words', 'stopwords'] 848 | for resource in nltk_resources: 849 | try: 850 | nltk.data.find(f'tokenizers/{resource}') 851 | except LookupError: 852 | nltk.download(resource, quiet=True) 853 | 854 | print("NLTK resources loaded") 855 | print(f"GPU acceleration: {'available' if torch.cuda.is_available() else 'not available'}") 856 | 857 | return True 858 | except Exception as e: 859 | print(f"Initialization failed: {str(e)}") 860 | return False 861 | 862 | # 优化图片处理 863 | def optimize_image(image: Image.Image, max_size: int = 1024) -> Image.Image: 864 | """优化图片大小和质量""" 865 | if max(image.size) > max_size: 866 | ratio = max_size / max(image.size) 867 | new_size = tuple(int(dim * ratio) for dim in image.size) 868 | image = image.resize(new_size, Image.Resampling.LANCZOS) 869 | return image 870 | 871 | if __name__ == "__main__": 872 | # 设置torch使用的线程数 873 | torch.set_num_threads(4) 874 | 875 | # 确保在主模块中运行 876 | import sys 877 | if 'src.pdf_reader.server' in sys.modules: 878 | del sys.modules['src.pdf_reader.server'] 879 | 880 | # 初始化并运行服务器 881 | asyncio.run(main()) 882 | 883 | @server.call_tool() 884 | async def handle_call_tool( 885 | name: str, arguments: dict | None 886 | ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]: 887 | """处理工具调用请求""" 888 | if not arguments: 889 | raise ValueError("缺少参数") 890 | 891 | file_path = arguments.get("file_path") 892 | if not file_path: 893 | raise ValueError("缺少文件路径") 894 | 895 | if name == "extract-text": 896 | page_number = arguments.get("page_number") 897 | text = await extract_text_from_pdf(file_path, page_number) 898 | return 
[types.TextContent(type="text", text=text)] 899 | 900 | elif name == "extract-images": 901 | page_number = arguments.get("page_number") 902 | images = await extract_images_from_pdf(file_path, page_number) 903 | result = [] 904 | for i, img_base64 in enumerate(images): 905 | if img_base64.startswith("提取图片时出错"): 906 | result.append(types.TextContent(type="text", text=img_base64)) 907 | else: 908 | result.append(types.ImageContent( 909 | type="image", 910 | format="image/png", 911 | data=img_base64 912 | )) 913 | return result if result else [types.TextContent(type="text", text="未找到图片")] 914 | 915 | elif name == "extract-tables": 916 | page_number = arguments.get("page_number") 917 | tables = await extract_tables_from_pdf(file_path, page_number) 918 | return [types.TextContent(type="text", text="\n".join(tables))] 919 | 920 | elif name == "analyze-content": 921 | analysis_type = arguments.get("analysis_type") 922 | if not analysis_type: 923 | raise ValueError("缺少分析类型") 924 | 925 | result = await analyze_pdf_content(file_path, analysis_type) 926 | if "error" in result: 927 | return [types.TextContent(type="text", text=f"分析出错: {result['error']}")] 928 | 929 | if analysis_type == "entities": 930 | text = "识别到的实体:\n" 931 | for entity_type, entities in result["entities"].items(): 932 | text += f"\n{entity_type}:\n- " + "\n- ".join(entities) 933 | elif analysis_type == "summary": 934 | text = f"文档摘要:\n{result['summary']}" 935 | elif analysis_type == "keywords": 936 | text = "关键词:\n- " + "\n- ".join(result["keywords"]) 937 | 938 | return [types.TextContent(type="text", text=text)] 939 | 940 | elif name == "get-metadata": 941 | metadata = await get_pdf_metadata(file_path) 942 | if "error" in metadata: 943 | return [types.TextContent(type="text", text=f"获取元数据出错: {metadata['error']}")] 944 | 945 | text = "PDF元数据:\n" 946 | for key, value in metadata.items(): 947 | text += f"{key}: {value}\n" 948 | 949 | return [types.TextContent(type="text", text=text)] 950 | 951 | elif name == 
"classify-document": 952 | categories = arguments.get("categories") 953 | if not categories: 954 | raise ValueError("缺少分类类别") 955 | 956 | result = await classify_document(file_path, categories) 957 | if "error" in result: 958 | return [types.TextContent(type="text", text=f"分类出错: {result['error']}")] 959 | 960 | text = "文档分类结果:\n" 961 | for label, score in zip(result["labels"], result["scores"]): 962 | text += f"{label}: {score:.2%}\n" 963 | 964 | return [types.TextContent(type="text", text=text)] 965 | 966 | elif name == "calculate-similarity": 967 | file_path2 = arguments.get("file_path2") 968 | if not file_path2: 969 | raise ValueError("缺少第二个文件路径") 970 | 971 | result = await calculate_similarity(file_path, file_path2) 972 | if "error" in result: 973 | return [types.TextContent(type="text", text=f"计算相似度出错: {result['error']}")] 974 | 975 | text = f"文档相似度: {result['similarity_score']:.2%}\n" 976 | text += result["interpretation"] 977 | 978 | return [types.TextContent(type="text", text=text)] 979 | 980 | elif name == "detect-languages": 981 | result = await detect_languages(file_path) 982 | if "error" in result: 983 | return [types.TextContent(type="text", text=f"语言检测出错: {result['error']}")] 984 | 985 | text = f"主要语言: {result['primary_language']}\n\n" 986 | text += "语言分布:\n" 987 | for lang, ratio in result["language_distribution"].items(): 988 | text += f"{lang}: {ratio:.1%}\n" 989 | 990 | return [types.TextContent(type="text", text=text)] 991 | 992 | elif name == "advanced-analysis": 993 | result = await advanced_text_analysis(file_path) 994 | if "error" in result: 995 | return [types.TextContent(type="text", text=f"分析出错: {result['error']}")] 996 | 997 | text = "高级文本分析结果:\n\n" 998 | 999 | # 复杂度指标 1000 | text += "1. 
复杂度指标:\n" 1001 | metrics = result["complexity_metrics"] 1002 | text += f"- 平均句子长度: {metrics['avg_sentence_length']:.1f}\n" 1003 | text += f"- 词汇量: {metrics['vocabulary_size']}\n" 1004 | text += f"- 可读性评分: {metrics['readability_score']:.1f}\n\n" 1005 | 1006 | # 词性分布 1007 | text += "2. 词性分布:\n" 1008 | for pos, count in result["pos_distribution"].items(): 1009 | text += f"- {pos}: {count}\n" 1010 | text += "\n" 1011 | 1012 | # 重要短语 1013 | text += "3. 重要短语:\n" 1014 | for item in result["important_phrases"]: 1015 | text += f"- {item['phrase']}: {item['importance']:.3f}\n" 1016 | 1017 | return [types.TextContent(type="text", text=text)] 1018 | 1019 | else: 1020 | raise ValueError(f"未知的工具: {name}") --------------------------------------------------------------------------------