├── rules
│   ├── 9.论文规则集.xlsx
│   ├── 1.教育经历规则集.xlsx
│   ├── 16.考核规则集.xlsx
│   ├── 17.附件规则集.xlsx
│   ├── 2.工作经历规则集.xlsx
│   ├── 8.项目经历规则集.xlsx
│   ├── 交叉检验规则.md
│   ├── 14.资质证书规则集.xlsx
│   └── 11.专利(著作权)情况规则集.xlsx
├── src
│   ├── __init__.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── warning_config.py
│   │   ├── api_config.py
│   │   ├── model_config.py
│   │   └── redis.py
│   ├── models
│   │   ├── __init__.py
│   │   └── state.py
│   ├── nodes
│   │   ├── __init__.py
│   │   ├── file_processing.py
│   │   ├── cross_validation.py
│   │   ├── report_generation.py
│   │   ├── core_info_extraction.py
│   │   ├── validation.py
│   │   └── pdf_extraction.py
│   ├── graph
│   │   ├── __init__.py
│   │   ├── workflow.py
│   │   ├── state.py
│   │   └── edges.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── workflow_integration.py
│   │   ├── common_utils.py
│   │   ├── cache_manager.py
│   │   ├── langsmith_utils.py
│   │   └── file_utils.py
│   └── agent.py
├── langgraph.json
├── __init__.py
├── README.md
├── graph_def.py
├── .gitignore
├── requirements.txt
├── .env.example
├── pyproject.toml
└── static
    ├── styles.css
    └── index.html

/rules/9.论文规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/9.论文规则集.xlsx --------------------------------------------------------------------------------
/rules/1.教育经历规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/1.教育经历规则集.xlsx --------------------------------------------------------------------------------
/rules/16.考核规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/16.考核规则集.xlsx --------------------------------------------------------------------------------
/rules/17.附件规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/17.附件规则集.xlsx --------------------------------------------------------------------------------
/rules/2.工作经历规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/2.工作经历规则集.xlsx --------------------------------------------------------------------------------
/rules/8.项目经历规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/8.项目经历规则集.xlsx --------------------------------------------------------------------------------
/rules/交叉检验规则.md: -------------------------------------------------------------------------------- 1 | 1.所有材料中的主人公姓名必须一致 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | --------------------------------------------------------------------------------
/rules/14.资质证书规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/14.资质证书规则集.xlsx --------------------------------------------------------------------------------
/rules/11.专利(著作权)情况规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/11.专利(著作权)情况规则集.xlsx --------------------------------------------------------------------------------
/src/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | 主要源代码目录 3 | 4 | 包含系统的核心模块: 5 | - graph: LangGraph工作流定义 6 | - nodes: 各个处理节点实现 7 | - tools: 工具函数和辅助模块 8 | - models: 数据模型和状态定义 9 | - services: 业务服务层 10 | """ -------------------------------------------------------------------------------- /langgraph.json: -------------------------------------------------------------------------------- 1 | {{ 2 | "dependencies": ["."], 3 | "graphs": {{ 4 | "audit_workflow": "graph_def:graph" 5 | }}, 6 | "dockerfile_lines": [], 7 | "python_version": "3.12", 8 | "env": ".env", 9 | "port": 8123 10 | }} -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 企业级职称评审材料审核系统 3 | 基于 LangGraph 框架的智能化审核流程 4 | 5 | 项目结构说明: 6 | - src/: 主要源代码目录 7 | - graph/: LangGraph工作流定义 8 | - nodes/: 各个处理节点实现 9 | - tools/: 工具函数和辅助模块 10 | - models/: 数据模型和状态定义 11 | - services/: 业务服务层 12 | - config/: 配置文件 13 | - tests/: 测试代码 14 | - docs/: 文档目录 15 | - data/: 数据存储目录 16 | """ 17 | 18 | # 系统版本信息 19 | __version__ = "1.0.0" 20 | __author__ = "Abruzz1" 21 | __description__ = "企业级职称评审材料审核系统" -------------------------------------------------------------------------------- /src/config/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 配置模块 3 | 4 | 包含项目所有配置相关的功能: 5 | - Redis 配置和连接管理 6 | - 环境变量配置 7 | - 其他系统配置 8 | """ 9 | 10 | from .model_config import ( 11 | model_config, 12 | setup_model_environment, 13 | setup_model_environment_sync, 14 | print_model_help 15 | ) 16 | 17 | __all__ = [ 18 | 'model_config', 19 | 'setup_model_environment', 20 | 'setup_model_environment_sync', 21 | 'print_model_help' 22 | ] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 企业级职称评审材料审核系统 2 | 3 | 基于LangGraph框架构建的智能化职称评审材料审核系统,通过AI技术自动化处理和校验职称申报材料。 4 | 5 | 🔧 **集成LangSmith调试和监控功能** - 提供完整的工作流追踪、性能监控和调试支持。 6 | 7 | ## 系统架构 8 | 9 | 系统采用LangGraph图形化工作流设计,包含以下主要模块: 10 | 11 | 1. **文件处理模块** - ZIP解压、文件分类 12 | 2. **PDF智能处理** - 页数检测、智能分片 13 | 3. **内容提取** - AI识别、17类材料分类 14 | 4. **规则校验** - 各类材料规则验证 15 | 5. **交叉校验** - 核心信息一致性检查 16 | 6. **报告生成** - HTML格式化输出 17 | 18 | ## 安装说明 19 | 1.创建虚拟环境:python -m venv venv 20 | 21 | 激活虚拟环境:venv\Scripts\activate(Windows)或 source venv/bin/activate(Linux/macOS) 22 | 23 | 2.安装依赖:pip install . 24 | 25 | 或使用 pip install -r requirements.txt 26 | 27 | 3.打开开发工具:langgraph dev 28 | 29 | 4.启动网页端:python web_app_v2.py 30 | 31 | 
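32 | ## 快速上手
33 | 
34 | 以下是一个最小调用示例(示意用法:图对象来自 graph_def.py,即 langgraph.json 注册的 audit_workflow;入口状态字段以 src/graph/state.py 中的 AuditState 为准,这里假定通过 uploaded_file 字段传入ZIP材料包路径,materials.zip 为假定的示例文件名):
35 | 
36 | ```python
37 | import asyncio
38 | from graph_def import graph  # langgraph.json 中注册的 audit_workflow 图
39 | 
40 | # 假定:初始状态只需提供ZIP路径;file_processing 节点兼容 uploaded_file / zip_file_path 两个字段
41 | state = asyncio.run(graph.ainvoke({"uploaded_file": "materials.zip"}))
42 | 
43 | # report_generation 节点会写出HTML报告,并在最终状态中返回保存路径
44 | print(state.get("report_path"))
45 | ```
46 | 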
-------------------------------------------------------------------------------- /graph_def.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | LangGraph 工作流图定义 4 | 5 | 专门用于 LangGraph Studio 的图定义文件 6 | 避免复杂的导入路径问题 7 | """ 8 | 9 | import sys 10 | import os 11 | 12 | # 确保项目路径在sys.path中 13 | project_root = os.path.dirname(os.path.abspath(__file__)) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | try: 18 | # 导入工作流创建函数 19 | from src.graph.workflow import create_audit_workflow 20 | 21 | # 创建图对象 22 | graph = create_audit_workflow() 23 | 24 | print("✅ LangGraph 工作流图已成功创建") 25 | 26 | except Exception as e: 27 | print(f"❌ 创建图失败: {e}") 28 | import traceback 29 | traceback.print_exc() 30 | raise -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 数据模型包 3 | 4 | 定义系统中使用的所有数据模型: 5 | - 状态管理模型 6 | - 业务数据模型 7 | - 配置模型 8 | 9 | 模型使用状态说明: 10 | ✅ 高度活跃: CoreInfo, RuleInfo, RuleFileInfo, MaterialProcessingStats 11 | ⚠️ 部分使用: ValidationResult, CrossValidationResult, AuditReport 12 | ✖️ 已移除: FileInfo, MaterialInfo, ReportSummary 13 | """ 14 | 15 | from .state import ( 16 | CoreInfo, 17 | ValidationResult, 18 | CrossValidationResult, 19 | RuleInfo, 20 | RuleFileInfo, 21 | AuditReport, 22 | AuditState, 23 | MaterialProcessingStats 24 | ) 25 | 26 | __all__ = [ 27 | "CoreInfo", 28 | "ValidationResult", 29 | "CrossValidationResult", 30 | "RuleInfo", 31 | "RuleFileInfo", 32 | "AuditReport", 33 | "AuditState", 34 | "MaterialProcessingStats" 35 | ] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python bytecode / cache 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Virtual environments 7 | .venv/ 8 | venv/ 9 | env/ 10 | venv313/ 11 | 12 | # Test caches & coverage 13 | .pytest_cache/ 14 | .mypy_cache/ 15 | .ruff_cache/ 16 | .tox/ 17 | .nox/ 18 | .coverage* 19 | coverage.xml 20 | htmlcov/ 21 | 22 | # Packaging / build artifacts 23 | build/ 24 | dist/ 25 | .eggs/ 26 | *.egg-info/ 27 | *.egg 28 | pip-wheel-metadata/ 29 | 30 | # Jupyter 31 | .ipynb_checkpoints/ 32 | 33 | # Logs 34 | logs/ 35 | *.log 36 | 37 | # IDE / OS 38 | .vscode/ 39 | .idea/ 40 | .DS_Store 41 | Thumbs.db 42 | desktop.ini 43 | 44 | # Environment files 45 | .env 46 | !.env.example 47 | 48 | # Project-specific temporary/data dirs 49 | test_data/ 50 | temp_pdf_processing/ 51 | uploads/ 52 | extracted/ 53 | .model_cache/ 54 | .langgraph_api/ 55 | .qoder/ 56 | 57 | # Optional: front-end deps if ever used 58 | node_modules/ 59 | 60 | -------------------------------------------------------------------------------- /src/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 节点定义模块 3 | 4 | 包含LangGraph所有节点定义: 5 | - ZIP解压和文件夹处理节点 (file_processing) 6 | - PDF内容提取节点 (pdf_extraction) 7 | - 核心信息提取节点 (core_info_extraction) 8 | - 规则校验节点 (validation) 9 | - 交叉校验节点 (cross_validation) 10 | - 报告生成节点 (report_generation) 11 | - 规则集加载节点 (load_rules) 12 | - 规则集提取节点 (extract_rules) 13 | """ 14 | 15 | # 从独立的节点文件中导入各个节点 16 | from .file_processing import file_processing_node 17 | 18 | from .pdf_extraction import 
pdf_extraction_node 19 | from .core_info_extraction import core_info_extraction_node 20 | from .validation import validation_node 21 | from .cross_validation import cross_validation_node 22 | from .report_generation import report_generation_node 23 | 24 | # 规则处理节点 25 | from .rules_processing import load_rules_node, extract_rules_node 26 | 27 | 28 | __all__ = [ 29 | "file_processing_node", 30 | "pdf_extraction_node", 31 | "core_info_extraction_node", 32 | "validation_node", 33 | "cross_validation_node", 34 | "report_generation_node", 35 | "load_rules_node", 36 | "extract_rules_node" 37 | ] -------------------------------------------------------------------------------- /src/graph/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | LangGraph工作流定义模块 3 | 4 | 包含系统的主要工作流: 5 | - workflow.py: 主要的审核工作流定义 6 | - state.py: 工作流状态管理 7 | - edges.py: 边和路由逻辑定义 8 | """ 9 | 10 | from .workflow import ( 11 | create_audit_workflow, 12 | get_default_workflow 13 | ) 14 | 15 | from .state import ( 16 | AuditState, 17 | WorkflowConfig, 18 | create_initial_state, 19 | update_state_step, 20 | add_warning, 21 | set_error, 22 | mark_complete 23 | ) 24 | 25 | from .edges import ( 26 | should_continue_processing, 27 | route_folder_validation, 28 | route_to_cross_validation, 29 | should_generate_report, 30 | check_core_info_for_cross_validation, 31 | check_pdf_extraction_status 32 | ) 33 | 34 | __all__ = [ 35 | # Workflow functions (优化后的版本,只保留主工作流) 36 | "create_audit_workflow", 37 | "get_default_workflow", 38 | 39 | # State management 40 | "AuditState", 41 | "WorkflowConfig", 42 | "create_initial_state", 43 | "update_state_step", 44 | "add_warning", 45 | "set_error", 46 | "mark_complete", 47 | 48 | # Edge routing functions 49 | "should_continue_processing", 50 | "route_folder_validation", 51 | "route_to_cross_validation", 52 | "should_generate_report", 53 | "check_core_info_for_cross_validation", 54 | "check_pdf_extraction_status" 55 | ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 基础依赖包 2 | langgraph>=0.2.0 3 | langsmith>=0.1.0 4 | langtrace-python-sdk>=2.0.0 5 | pydantic>=2.0.0 6 | typing-extensions>=4.0.0 7 | langchain-core>=0.1.0 # LangGraph核心依赖 8 | langgraph-cli>=0.1.0 # LangGraph开发工具 9 | langgraph-checkpoint-redis>=0.1.0 # Redis检查点 10 | 11 | # 谷歌AI API(新版本) 12 | google-genai>=1.33.0 13 | 14 | # 环境变量管理 15 | python-dotenv>=1.0.0 16 | 17 | # 文件处理 18 | pathlib2 19 | zipfile36>=0.1.0 20 | python-magic>=0.4.0 21 | Pillow>=10.0.0 22 | 23 | # Web框架 24 | fastapi>=0.104.0 25 | uvicorn>=0.24.0 26 | python-multipart>=0.0.6 27 | sse-starlette>=1.6.0 # Server-Sent Events支持 28 | starlette>=0.27.0 29 | 30 | # 数据处理 31 | pandas>=2.0.0 32 | numpy>=1.24.0 33 | 34 | # HTML报告生成 35 | jinja2>=3.1.0 36 | weasyprint>=60.0 37 | 38 | # 配置管理 39 | python-dotenv>=1.0.0 40 | pyyaml>=6.0 41 | 42 | # 日志和监控 43 | loguru>=0.7.0 44 | prometheus-client>=0.19.0 45 | 46 | # 测试框架 47 | pytest>=7.4.0 48 | pytest-asyncio>=0.21.0 49 | pytest-cov>=4.1.0 50 | 51 | # 开发工具 52 | black>=23.0.0 53 | isort>=5.12.0 54 | flake8>=6.0.0 55 | mypy>=1.6.0 56 | 57 | # OCR和AI 58 | pytesseract>=0.3.10 59 | opencv-python>=4.8.0 60 | 61 | # 数据库 62 | sqlalchemy>=2.0.0 63 | alembic>=1.12.0 64 | 65 | # 异步处理 66 | aiofiles>=23.2.0 67 | celery>=5.3.0 68 | redis>=5.0.0 69 | 70 | # 文档处理 71 | markdown>=3.5.0 72 | markdownify>=0.11.0 73 | 74 | # 工作流状态管理 75 | psycopg>=3.1.0 # PostgreSQL支持 76 | 
asyncpg>=0.29.0 # 异步PostgreSQL -------------------------------------------------------------------------------- /src/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 工具模块导出 3 | 4 | 按功能模块组织的工具函数导出,包括: 5 | - AI模型工具(ai_utils) 6 | - 文件处理工具(file_utils) 7 | - 通用工具(common_utils) 8 | - 工作流集成工具(workflow_integration) 9 | """ 10 | 11 | # AI模型工具 12 | from .ai_utils import ( 13 | extract_core_information_with_ai, 14 | validate_material_with_ai, 15 | cross_validate_materials_with_ai, 16 | extract_category_core_info_with_ai 17 | ) 18 | 19 | # 文件处理工具 20 | from .file_utils import ( 21 | extract_zip_file, 22 | validate_folder_structure, 23 | analyze_markdown_structure, 24 | extract_markdown_content 25 | ) 26 | 27 | # 通用工具 28 | from .common_utils import ( 29 | extract_with_regex, 30 | generate_html_report 31 | ) 32 | 33 | # 工作流集成工具 34 | from .workflow_integration import ( 35 | extract_core_information_from_json, 36 | extract_core_information, 37 | validate_material_rules 38 | ) 39 | 40 | __all__ = [ 41 | # AI模型工具 42 | "extract_core_information_with_ai", 43 | "validate_material_with_ai", 44 | "cross_validate_materials_with_ai", 45 | "extract_category_core_info_with_ai", 46 | 47 | # 文件处理工具 48 | "extract_zip_file", 49 | "validate_folder_structure", 50 | "analyze_markdown_structure", 51 | "extract_markdown_content", 52 | 53 | # 通用工具 54 | "extract_with_regex", 55 | "generate_html_report", 56 | 57 | # 工作流集成工具 58 | "extract_core_information_from_json", 59 | "extract_core_information", 60 | "validate_material_rules" 61 | ] -------------------------------------------------------------------------------- /src/config/warning_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | 警告配置管理 3 | 4 | 统一管理系统中的警告过滤器,特别针对第三方库的弃用警告 5 | """ 6 | 7 | import warnings 8 | import os 9 | 10 | 11 | def setup_warning_filters(): 12 | """ 13 | 设置系统警告过滤器 14 | 15 | 主要针对以下警告进行优化: 16 | 1. pkg_resources弃用警告(来自Marker内部) 17 | 2. 
其他第三方库的不必要警告 18 | """ 19 | 20 | # 抑制pkg_resources弃用警告 21 | # 这个警告来自Marker库内部,用户无法控制 22 | warnings.filterwarnings( 23 | "ignore", 24 | category=DeprecationWarning, 25 | module="pkg_resources" 26 | ) 27 | 28 | # 抑制setuptools相关的pkg_resources警告 29 | warnings.filterwarnings( 30 | "ignore", 31 | message=".*pkg_resources is deprecated.*", 32 | category=UserWarning 33 | ) 34 | 35 | # 抑制其他第三方库的常见警告 36 | warnings.filterwarnings( 37 | "ignore", 38 | category=DeprecationWarning, 39 | module="transformers" 40 | ) 41 | 42 | # 可选:在开发模式下显示所有警告 43 | if os.environ.get("LANGGRAPH_DEBUG", "false").lower() == "true": 44 | warnings.resetwarnings() 45 | warnings.simplefilter("always", DeprecationWarning) 46 | print("🔍 调试模式:显示所有警告信息") 47 | else: 48 | print("✅ 已配置警告过滤器,抑制第三方库不必要的警告") 49 | 50 | 51 | def suppress_marker_warnings(): 52 | """ 53 | 保持兼容性函数(已无作用) 54 | """ 55 | pass 56 | 57 | 58 | def get_warning_env_vars(): 59 | """ 60 | 获取用于抑制警告的环境变量字典 61 | 62 | Returns: 63 | 环境变量字典 64 | """ 65 | return { 66 | "PYTHONWARNINGS": "ignore::DeprecationWarning:pkg_resources", 67 | "TRANSFORMERS_VERBOSITY": "error", # 降低transformers库的输出等级 68 | "TOKENIZERS_PARALLELISM": "false", # 避免tokenizers并发警告 69 | } 70 | 71 | 72 | # 自动在模块导入时设置警告过滤器 73 | if __name__ != "__main__": 74 | setup_warning_filters() -------------------------------------------------------------------------------- /src/nodes/file_processing.py: -------------------------------------------------------------------------------- 1 | """ 2 | ZIP解压节点 3 | 4 | 专门处理ZIP压缩包解压和17个标准文件夹结构验证 5 | """ 6 | 7 | from typing import Dict, Any 8 | from pathlib import Path 9 | from src.graph.state import AuditState 10 | from src.tools import ( 11 | extract_zip_file, 12 | validate_folder_structure 13 | ) 14 | 15 | 16 | async def file_processing_node(state: AuditState) -> Dict[str, Any]: 17 | """ 18 | ZIP解压节点 - 解压ZIP文件并验证17个标准文件夹结构 19 | """ 20 | try: 21 | # 支持两种输入字段名(向后兼容) 22 | zip_path = state.get("uploaded_file") or state.get("zip_file_path") 23 | 24 | if not zip_path: 25 | return { 26 | "current_step": "zip_extraction_failed", 27 | "error_message": "未找到上传的ZIP文件路径" 28 | } 29 | 30 | print(f"📦 开始解压ZIP文件: {Path(zip_path).name}") 31 | 32 | # 解压 ZIP 文件 33 | extraction_result = await extract_zip_file(zip_path) 34 | 35 | if not extraction_result: 36 | return { 37 | "current_step": "zip_extraction_failed", 38 | "error_message": "ZIP文件解压失败" 39 | } 40 | 41 | # 获取解压后的根目录 42 | extraction_path = extraction_result.get("extraction_path") 43 | extracted_files = extraction_result.get("files", []) 44 | 45 | # 检查解压是否成功 46 | if not extraction_path: 47 | return { 48 | "current_step": "zip_extraction_failed", 49 | "error_message": "ZIP文件解压失败,无法获取解压路径" 50 | } 51 | 52 | print(f"📁 ZIP解压完成,提取到: {extraction_path}") 53 | print(f"📊 共解压 {len(extracted_files)} 个文件") 54 | 55 | # 验证17个标准文件夹结构 56 | folder_validation = await validate_folder_structure(extraction_path) 57 | 58 | return { 59 | "extraction_path": extraction_path, 60 | "extracted_files": extracted_files, 61 | "folder_validation": folder_validation, 62 | "current_step": "zip_extraction_completed", 63 | "file_type": "zip" 64 | } 65 | 66 | except Exception as e: 67 | print(f"❌ ZIP解压失败: {str(e)}") 68 | return { 69 | "current_step": "zip_extraction_failed", 70 | "error_message": f"ZIP解压失败: {str(e)}" 71 | } -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # 环境配置示例文件 2 | # 复制为 .env 并填入实际配置值 3 | 4 | # OpenAI API配置 5 | 
OPENAI_API_KEY=your_openai_api_key_here 6 | OPENAI_BASE_URL=https://api.openai.com/v1 7 | OPENAI_MODEL=gpt-4 8 | 9 | # 应用配置 10 | APP_NAME=职称评审材料审核系统 11 | APP_VERSION=1.0.0 12 | DEBUG=True 13 | LOG_LEVEL=INFO 14 | 15 | # 文件存储配置 16 | UPLOAD_DIR=./data/uploads 17 | TEMP_DIR=./data/temp 18 | OUTPUT_DIR=./data/outputs 19 | MAX_FILE_SIZE=104857600 # 100MB 20 | 21 | # 数据库配置 22 | DATABASE_URL=sqlite:///./data/audit_system.db 23 | 24 | # Redis配置(用于任务队列) 25 | REDIS_URL=redis://localhost:6379/0 26 | 27 | # API服务配置 28 | HOST=0.0.0.0 29 | PORT=8000 30 | WORKERS=4 31 | 32 | # AI处理配置 33 | MAX_CONCURRENT_TASKS=10 34 | PDF_MAX_PAGES=100 35 | CHUNK_SIZE=2048 36 | OVERLAP_SIZE=200 37 | 38 | # 规则校验配置 39 | ENABLE_STRICT_MODE=True 40 | AUTO_RETRY_COUNT=3 41 | VALIDATION_TIMEOUT=300 42 | 43 | # 报告生成配置 44 | REPORT_TEMPLATE_DIR=./templates 45 | REPORT_ASSETS_DIR=./assets 46 | ENABLE_PDF_EXPORT=True 47 | 48 | # 安全配置 49 | SECRET_KEY=your_secret_key_here 50 | ACCESS_TOKEN_EXPIRE_MINUTES=30 51 | 52 | # 监控配置 53 | ENABLE_METRICS=True 54 | METRICS_PORT=9090 55 | 56 | # Marker + Gemma AI配置 57 | # 谷歌AI API配置 58 | GOOGLE_API_KEY= 59 | 60 | 61 | 62 | # Marker配置 63 | MARKER_USE_LLM=true 64 | MARKER_OUTPUT_FORMAT=json 65 | MARKER_FORMAT_LINES=true 66 | 67 | # 设备配置 68 | TORCH_DEVICE=cuda 69 | 70 | # Hugging Face配置(如果需要) 71 | HF_TOKEN=your_huggingface_token_here 72 | 73 | # LangSmith配置(用于调试和监控) 74 | LANGSMITH_API_KEY=your_langsmith_api_key_here 75 | LANGCHAIN_TRACING_V2=true 76 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com 77 | LANGCHAIN_PROJECT=Audit_Workflow_Debug 78 | LANGSMITH_TRACING=true 79 | 80 | # LangSmith配置(可选,用于调试和监控) 81 | LANGSMITH_API_KEY=your_langsmith_api_key_here 82 | LANGCHAIN_TRACING_V2=true 83 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com 84 | LANGCHAIN_PROJECT=Audit_Workflow_Debug 85 | LANGSMITH_TRACING=true 86 | 87 | # Google AI API配置 88 | GOOGLE_API_KEY=your_google_api_key_here 89 | GEMINI_MODEL=gemini-2.5-flash # 可选值: gemini-1.5-flash, gemini-2.5-flash, gemini-pro 90 | 91 | # 应用配置 92 | ENVIRONMENT=development 93 | HOST=0.0.0.0 94 | PORT=8000 95 | DEBUG=true 96 | 97 | # Redis配置(用于缓存和任务队列) 98 | REDIS_URL=redis://localhost:6379/0 99 | REDIS_PASSWORD= 100 | REDIS_DB=0 101 | 102 | # PostgreSQL配置(可选,用于持久化存储) 103 | DATABASE_URL=postgresql://user:password@localhost:5432/langgraph_audit 104 | POSTGRES_USER=langgraph_user 105 | POSTGRES_PASSWORD=your_password 106 | POSTGRES_DB=langgraph_audit 107 | POSTGRES_HOST=localhost 108 | POSTGRES_PORT=5432 109 | 110 | # 文件处理配置 111 | MAX_FILE_SIZE=100MB 112 | ALLOWED_FILE_TYPES=.zip,.md,.txt,.pdf 113 | UPLOAD_DIR=./uploads 114 | EXTRACTED_DIR=./extracted 115 | REPORTS_DIR=./reports 116 | 117 | # 缓存配置 118 | CACHE_ENABLED=true 119 | CACHE_TTL=3600 120 | CACHE_MAX_SIZE=1000 121 | 122 | # 日志配置 123 | LOG_LEVEL=INFO 124 | LOG_FORMAT=json 125 | LOG_FILE=./logs/app.log 126 | 127 | # 安全配置 128 | SECRET_KEY=your-secret-key-here 129 | CORS_ORIGINS=* 130 | ALLOWED_HOSTS=localhost,127.0.0.1,0.0.0.0 131 | 132 | # 工作流配置 133 | WORKFLOW_TIMEOUT=300 134 | MAX_RETRIES=3 135 | CONCURRENT_TASKS=5 136 | 137 | # OCR配置 138 | TESSERACT_PATH=/usr/bin/tesseract 139 | TESSERACT_DATA_PATH=/usr/share/tesseract-ocr/4.00/tessdata 140 | 141 | # 开发工具配置 142 | LANGCHAIN_VERBOSE=false 143 | LANGCHAIN_DEBUG=false 144 | -------------------------------------------------------------------------------- /src/tools/workflow_integration.py: -------------------------------------------------------------------------------- 1 | """ 2 | 审核工作流集成工具 3 | 4 | 提供审核系统的核心集成函数,连接各个工具模块 5 | """ 6 | 
7 | from typing import List, Dict, Any 8 | from pathlib import Path 9 | from src.models.state import ValidationResult, CoreInfo 10 | from src.tools import ( 11 | extract_core_information_with_ai, 12 | validate_material_with_ai, 13 | extract_with_regex 14 | ) 15 | 16 | def extract_core_information_from_json(json_extractions: List[Dict[str, Any]]) -> CoreInfo: 17 | """使用Gemma AI从JSON提取结果中智能提取核心信息""" 18 | print("🤖 使用Gemma模型进行智能信息提取...") 19 | 20 | # 整合所有文档内容 21 | combined_content = "" 22 | extracted_from = [] 23 | 24 | for json_extraction in json_extractions: 25 | file_path = json_extraction.get("file_path", "") 26 | content_blocks = json_extraction.get("content_blocks", []) 27 | 28 | for block in content_blocks: 29 | content = block.get("content", "") 30 | if content.strip(): 31 | combined_content += content + "\n" 32 | 33 | if file_path: 34 | extracted_from.append(Path(file_path).name) 35 | 36 | if not combined_content.strip(): 37 | return CoreInfo(name="", id_number="", extracted_from=extracted_from) 38 | 39 | # 使用AI提取,失败时降级到正则表达式 40 | ai_result = extract_core_information_with_ai(combined_content, extracted_from) 41 | 42 | if ai_result: 43 | return CoreInfo( 44 | name=ai_result["name"], 45 | id_number=ai_result["id_number"], 46 | extracted_from=ai_result["extracted_from"] 47 | ) 48 | else: 49 | name, id_number = extract_with_regex(combined_content) 50 | return CoreInfo(name=name, id_number=id_number, extracted_from=extracted_from) 51 | 52 | def extract_core_information(materials: List[Dict[str, Any]]) -> CoreInfo: 53 | """提取核心信息(简化版) - 使用Dict替代MaterialInfo""" 54 | # 将Dict转换为JSON格式进行处理 55 | json_extractions = [] 56 | for material in materials: 57 | json_extraction = { 58 | "file_path": material.get("material_id", ""), 59 | "content_blocks": [{"content": material.get("content", "")}] 60 | } 61 | json_extractions.append(json_extraction) 62 | 63 | return extract_core_information_from_json(json_extractions) 64 | 65 | def validate_material_rules(material: Dict[str, Any]) -> List[ValidationResult]: 66 | """使用Gemma AI进行智能审核 - 使用Dict替代MaterialInfo""" 67 | material_type = material.get("material_type", "") 68 | content = material.get("content", "") 69 | 70 | print(f"🤖 使用Gemma模型审核材料: {material_type}") 71 | 72 | # 使用AI进行智能审核 73 | ai_results = validate_material_with_ai(material_type, content) 74 | 75 | if ai_results: 76 | results = [] 77 | for item in ai_results: 78 | if isinstance(item, dict) and "rule_name" in item: 79 | results.append(ValidationResult( 80 | rule_id=f"GEMMA_{len(results)+1:03d}", 81 | rule_name=item.get("rule_name", "智能审核"), 82 | status=item.get("status", "WARNING"), 83 | message=item.get("message", "审核完成") 84 | )) 85 | return results 86 | else: 87 | # AI失败时返回默认验证结果 88 | return [ValidationResult( 89 | rule_id="FALLBACK_001", 90 | rule_name="默认审核", 91 | status="WARNING", 92 | message="AI审核失败,使用默认审核规则" 93 | )] 94 | 95 | -------------------------------------------------------------------------------- /src/config/api_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | API配置工具 3 | 4 | 用于配置PDF提取API端点和相关参数 5 | """ 6 | 7 | from typing import Dict, Any, Optional 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | # 全局API配置 13 | _api_config = { 14 | "pdf_extraction_endpoint": "http://183.203.184.233:8888/pdf_parse_supplychain", # 用户提供的实际端点 15 | "timeout": 60, 16 | "max_file_size": 20 * 1024 * 1024, # 20MB 17 | "supported_formats": [".pdf"] 18 | } 19 | 20 | 21 | def configure_pdf_api(endpoint: str, timeout: int = 60, 
max_file_size: int = 20 * 1024 * 1024) -> None: 22 | """ 23 | 配置PDF提取API 24 | 25 | Args: 26 | endpoint: API端点URL 27 | timeout: 超时时间(秒) 28 | max_file_size: 最大文件大小(字节) 29 | """ 30 | global _api_config 31 | 32 | _api_config.update({ 33 | "pdf_extraction_endpoint": endpoint, 34 | "timeout": timeout, 35 | "max_file_size": max_file_size 36 | }) 37 | 38 | logger.info(f"PDF API已配置: {endpoint}") 39 | print(f"✅ PDF提取API已配置: {endpoint}") 40 | 41 | 42 | def get_pdf_api_config() -> Dict[str, Any]: 43 | """ 44 | 获取当前PDF API配置 45 | 46 | Returns: 47 | API配置字典 48 | """ 49 | return _api_config.copy() 50 | 51 | 52 | def is_pdf_api_configured() -> bool: 53 | """ 54 | 检查PDF API是否已配置 55 | 56 | Returns: 57 | 是否已配置 58 | """ 59 | return _api_config.get("pdf_extraction_endpoint") is not None 60 | 61 | 62 | async def validate_pdf_file(file_path: str) -> Dict[str, Any]: 63 | """ 64 | 验证PDF文件是否符合要求 65 | 66 | Args: 67 | file_path: PDF文件路径 68 | 69 | Returns: 70 | 验证结果 71 | """ 72 | import os 73 | from pathlib import Path 74 | 75 | try: 76 | import asyncio 77 | from pathlib import Path 78 | file_path_obj = Path(file_path) 79 | 80 | # 使用异步方式检查文件是否存在 81 | file_exists = await asyncio.to_thread(file_path_obj.exists) 82 | if not file_exists: 83 | return { 84 | "valid": False, 85 | "error": "文件不存在" 86 | } 87 | 88 | # 检查文件扩展名 89 | if file_path_obj.suffix.lower() not in _api_config["supported_formats"]: 90 | return { 91 | "valid": False, 92 | "error": f"不支持的文件格式: {file_path_obj.suffix}" 93 | } 94 | 95 | # 使用异步方式检查文件大小 96 | file_stat = await asyncio.to_thread(file_path_obj.stat) 97 | file_size = file_stat.st_size 98 | if file_size > _api_config["max_file_size"]: 99 | return { 100 | "valid": False, 101 | "error": f"文件过大: {file_size} > {_api_config['max_file_size']}" 102 | } 103 | 104 | return { 105 | "valid": True, 106 | "file_size": file_size, 107 | "format": file_path_obj.suffix.lower() 108 | } 109 | 110 | except Exception as e: 111 | return { 112 | "valid": False, 113 | "error": f"文件验证失败: {str(e)}" 114 | } 115 | 116 | 117 | def create_pdf_api_headers() -> Dict[str, str]: 118 | """ 119 | 创建PDF API请求头(基于提供的API示例) 120 | 121 | Returns: 122 | 请求头字典 123 | """ 124 | return { 125 | "accept": "application/json", # 与示例一致 126 | "User-Agent": "LangGraph-PDF-Extractor/1.0" 127 | # Content-Type 会由 aiohttp 自动设置为 multipart/form-data 128 | } 129 | 130 | 131 | def get_pdf_api_params() -> Dict[str, str]: 132 | """ 133 | 获取PDF API查询参数(基于提供的API示例) 134 | 135 | Returns: 136 | API查询参数字典 137 | """ 138 | return { 139 | "parse_method": "auto", 140 | "is_json_md_dump": "false", 141 | "output_dir": "output", 142 | "return_layout": "false", 143 | "return_info": "false", 144 | "return_content_list": "false", 145 | "return_images": "false" 146 | } 147 | 148 | 149 | def build_pdf_api_url(base_endpoint: str, custom_params: Optional[Dict[str, str]] = None) -> str: 150 | """ 151 | 构建完整的PDF API URL 152 | 153 | Args: 154 | base_endpoint: 基础端点URL(不包含查询参数) 155 | custom_params: 自定义参数(可选) 156 | 157 | Returns: 158 | 完整的API URL 159 | """ 160 | params = get_pdf_api_params() 161 | 162 | # 如果有自定义参数,覆盖默认参数 163 | if custom_params: 164 | params.update(custom_params) 165 | 166 | # 构建查询字符串 167 | query_string = "&".join([f"{k}={v}" for k, v in params.items()]) 168 | 169 | # 处理base_endpoint是否已经包含查询参数 170 | separator = "&" if "?" in base_endpoint else "?" 
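# 即:当 base_endpoint 已包含查询参数(带 "?")时用 "&" 续接,否则以 "?" 开始查询串,避免拼出 "??" 形式的非法URL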
171 | 172 | return f"{base_endpoint}{separator}{query_string}" -------------------------------------------------------------------------------- /src/nodes/cross_validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | 交叉校验节点 3 | 4 | 对核心信息进行交叉校验: 5 | 1. 姓名一致性校验 6 | 2. 身份证一致性校验 7 | 3. 基于rules文件夹中的交叉检验规则 8 | """ 9 | 10 | from typing import Dict, Any 11 | from src.graph.state import AuditState 12 | from src.tools.ai_utils import cross_validate_materials_with_ai 13 | 14 | 15 | def cross_validation_node(state: AuditState) -> Dict[str, Any]: 16 | """ 17 | 完全无缓存的交叉校验节点 - 每次都处理全新数据 18 | 19 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 20 | """ 21 | try: 22 | print(f"🔍 开始无缓存交叉校验节点...") 23 | 24 | # 🔍 获取核心信息(优先使用核心信息提取节点的结果) 25 | core_info = state.get("core_info") 26 | all_extracted_info = state.get("api_extraction_results", {}) or state.get("extracted_content", {}) 27 | current_step = state.get("current_step", "未知") 28 | 29 | print(f"🔍 当前状态详细信息:") 30 | print(f" 当前步骤: {current_step}") 31 | print(f" 核心信息状态: {'有效' if core_info else '无'}") 32 | print(f" 提取材料数量: {len(all_extracted_info)}") 33 | 34 | # 🚨 优先检查核心信息提取节点的结果 35 | if not core_info: 36 | print(f"⚠️ 没有找到核心信息,检查核心信息提取节点是否正常执行") 37 | raise Exception("未找到任何核心信息用于交叉校验") 38 | 39 | # 🔍 验证核心信息的数据结构 40 | if not isinstance(core_info, dict): 41 | print(f"⚠️ 核心信息格式不正确: {type(core_info)}") 42 | # 尝试转换为字典格式 43 | if hasattr(core_info, 'name') and hasattr(core_info, 'id_number'): 44 | core_info = { 45 | "attachments": { 46 | "name": getattr(core_info, 'name', ''), 47 | "id_number": getattr(core_info, 'id_number', ''), 48 | "extracted_from": getattr(core_info, 'extracted_from', []) 49 | } 50 | } 51 | else: 52 | raise Exception(f"核心信息格式不可识别: {type(core_info)}") 53 | 54 | # 🔍 统计有效的核心信息条目 55 | valid_entries = 0 56 | name_sources = [] 57 | id_sources = [] 58 | 59 | for category, info in core_info.items(): 60 | if isinstance(info, dict) and (info.get('name') or info.get('id_number')): 61 | valid_entries += 1 62 | if info.get('name'): 63 | name_sources.append(f"{category}: {info['name']}") 64 | if info.get('id_number'): 65 | id_sources.append(f"{category}: {info['id_number']}") 66 | 67 | print(f"📋 有效核心信息条目: {valid_entries}") 68 | print(f"📋 姓名信息来源: {len(name_sources)} 项") 69 | print(f"📋 身份证信息来源: {len(id_sources)} 项") 70 | 71 | if valid_entries == 0: 72 | print(f"⚠️ 所有核心信息条目都为空,无法进行交叉校验") 73 | raise Exception("所有核心信息条目都为空,无法进行交叉校验") 74 | 75 | # 🚨 直接执行交叉验证 - 不使用缓存,使用核心信息提取节点的结果 76 | cross_validation_results = cross_validate_materials_with_ai(all_extracted_info, core_info) 77 | 78 | # 直接转换AI结果为标准格式 - 不存入缓存 79 | converted_results = [] 80 | for ai_result in cross_validation_results: 81 | status = ai_result.get('status', 'WARNING') 82 | if status == 'PASS' or '✅' in status: 83 | result_status = '✅通过' 84 | elif status == 'WARNING' or '⚠️' in status: 85 | result_status = '⚠️警告' 86 | elif status == 'ERROR' or '❌' in status: 87 | result_status = '❌不通过' 88 | else: 89 | result_status = '⚠️警告' 90 | 91 | converted_result = { 92 | "rule_name": ai_result.get('rule_name', '未知规则'), 93 | "result": result_status, 94 | "details": ai_result.get('message', 'AI交叉校验完成'), 95 | "priority": ai_result.get('priority', '极高'), 96 | "material_type": "AI交叉校验", 97 | "rule_content": ai_result.get('rule_content', ''), 98 | "timestamp": _get_current_timestamp() 99 | } 100 | converted_results.append(converted_result) 101 | 102 | # 🚨 直接返回结果,不使用任何缓存机制 103 | print(f"✅ 无缓存交叉校验完成,生成{len(converted_results)}项结果") 104 | 105 | return { 106 | "cross_validation": 
converted_results, 107 | "current_step": "cross_validation_completed", 108 | "processing_logs": [ 109 | f"交叉校验完成,生成{len(converted_results)}项结果", 110 | f"基于{valid_entries}项有效核心信息进行校验", 111 | "已完全取消缓存机制,确保数据全新" 112 | ] 113 | } 114 | 115 | except Exception as e: 116 | print(f"❌ 交叉校验失败: {str(e)}") 117 | return { 118 | "current_step": "cross_validation_failed", 119 | "error_message": f"交叉校验失败: {str(e)}", 120 | "processing_logs": [f"交叉校验失败: {str(e)}"] 121 | } 122 | 123 | 124 | def _get_current_timestamp() -> str: 125 | """获取当前时间戳""" 126 | from datetime import datetime 127 | return datetime.now().isoformat() -------------------------------------------------------------------------------- /src/nodes/report_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | 报告生成节点 - 完全无缓存版本 3 | 4 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 5 | """ 6 | 7 | from typing import Dict, Any 8 | from src.graph.state import AuditState 9 | 10 | 11 | def report_generation_node(state: AuditState) -> Dict[str, Any]: 12 | """ 13 | 完全无缓存的报告生成节点 - 每次都处理全新数据 14 | 15 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 16 | """ 17 | try: 18 | print(f"📄 开始无缓存报告生成...") 19 | 20 | # 生成报告ID 21 | from datetime import datetime 22 | import uuid 23 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 24 | report_id = f"AUDIT_{timestamp}_{str(uuid.uuid4())[:8].upper()}" 25 | 26 | # 直接获取当前状态的所有数据 - 不使用任何缓存 27 | material_validation = state.get("material_validation", {}) 28 | cross_validation = state.get("cross_validation", []) 29 | 30 | print(f"🔍 当前状态数据:") 31 | print(f" 材料校验结果: {len(material_validation)} 项") 32 | print(f" 交叉校验结果: {len(cross_validation)} 项") 33 | 34 | # 直接整合所有数据 - 不做缓存检查 35 | all_results = [] 36 | 37 | # 整合material_validation数据 38 | for material_type, results in material_validation.items(): 39 | if isinstance(results, list): 40 | all_results.extend(results) 41 | elif results: 42 | all_results.append(results) 43 | 44 | # 整合cross_validation数据 45 | if isinstance(cross_validation, list): 46 | all_results.extend(cross_validation) 47 | 48 | if not all_results: 49 | print("⚠️ 未找到任何校验结果,生成空报告") 50 | 51 | print(f"📊 报告数据统计: 共{len(all_results)}项结果") 52 | 53 | # 直接生成HTML报告 - 不使用缓存的复杂逻辑 54 | html_report = _generate_html_report(all_results, report_id) 55 | 56 | # 保存报告文件 57 | report_path = f"audit_report_{timestamp}.html" 58 | 59 | if report_path and html_report: 60 | with open(report_path, 'w', encoding='utf-8') as f: 61 | f.write(html_report) 62 | 63 | print(f"✅ 报告已生成: {report_path}") 64 | else: 65 | raise Exception("报告路径或内容为空") 66 | 67 | return { 68 | "audit_report": html_report, 69 | "report_path": report_path, 70 | "current_step": "completed", 71 | "is_complete": True, 72 | "processing_logs": [ 73 | f"报告生成完成: {report_id}", 74 | f"处理了{len(all_results)}项结果", 75 | "已完全取消缓存机制,确保数据全新", 76 | f"报告已保存至: {report_path}" 77 | ] 78 | } 79 | 80 | except Exception as e: 81 | print(f"❌ 报告生成失败: {str(e)}") 82 | return { 83 | "current_step": "report_generation_failed", 84 | "error_message": f"报告生成失败: {str(e)}" 85 | } 86 | 87 | 88 | def _generate_html_report(all_results: list, report_id: str) -> str: 89 | """ 90 | 生成简化的HTML报告 - 完全无缓存机制 91 | """ 92 | from datetime import datetime 93 | 94 | print(f"📊 报告生成使用数据,共{len(all_results)}项结果") 95 | 96 | # 按材料类型分组 97 | material_groups = {} 98 | for result in all_results: 99 | material_type = result.get('material_type', '未知类型') 100 | if material_type not in material_groups: 101 | material_groups[material_type] = [] 102 | material_groups[material_type].append(result) 103 | 104 | # 统计数据 105 | 
error_count = sum(1 for r in all_results if r.get('result', '').startswith('❌')) 106 | warning_count = sum(1 for r in all_results if r.get('result', '').startswith('⚠️')) 107 | pass_count = sum(1 for r in all_results if r.get('result', '').startswith('✅')) 108 | total_validations = len(all_results) 109 | 110 | print(f"📊 统计: 错误{error_count}, 警告{warning_count}, 通过{pass_count}") 111 | 112 | # 生成基本的HTML报告结构 113 | html_template = f"""
114 | <!DOCTYPE html>
115 | <html lang="zh-CN">
116 | <head>
117 | <meta charset="UTF-8">
118 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
119 | <title>职称评审材料审核报告 - {report_id}</title>
120 | <style>
121 | body {{ font-family: "Microsoft YaHei", Arial, sans-serif; margin: 24px; color: #333; }}
122 | h1 {{ border-bottom: 2px solid #4a90d9; padding-bottom: 8px; }}
123 | .summary {{ display: flex; gap: 12px; margin: 16px 0; }}
124 | .summary-item {{ flex: 1; padding: 12px; background: #f7f9fc; border-radius: 6px; text-align: center; }}
125 | .details {{ margin-top: 24px; }}
126 | .material-group {{ margin: 16px 0; padding: 8px 12px; border: 1px solid #e0e0e0; border-radius: 6px; }}
127 | .result-item {{ padding: 8px; margin: 6px 0; border-left: 4px solid #ccc; }}
128 | .result-item.error {{ border-left-color: #c62828; }}
129 | .result-item.warning {{ border-left-color: #f9a825; }}
130 | .result-item.pass {{ border-left-color: #2e7d32; }}
131 | .result-item p {{ margin: 4px 0 0; color: #666; }}
132 | </style>
133 | </head>
134 | <body>
135 | <h1>职称评审材料审核报告</h1>
136 | <p>报告ID: {report_id}</p>
137 | <p>生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
138 |
139 | <div class="summary">
140 |
141 | <div class="summary-item">
142 | <h3>总计</h3>
143 | <p>{len(all_results)} 项检查</p>
144 | </div>
145 | <div class="summary-item">
146 | <h3>错误</h3>
147 | <p>{error_count} 项</p>
148 | </div>
149 | <div class="summary-item">
150 | <h3>警告</h3>
151 | <p>{warning_count} 项</p>
152 | </div>
153 | <div class="summary-item">
154 | <h3>通过</h3>
155 | <p>{pass_count} 项</p>
156 | </div>
157 | </div>
158 |
159 | <div class="details">
160 | <h2>详细结果</h2>
161 | """ 162 | 163 | # 添加材料组详情 164 | for material_type, results in material_groups.items(): 165 | html_template += f"""
166 | <div class="material-group">
167 | <h3>{material_type} ({len(results)} 项)</h3>
168 | """ 169 | for result in results[:10]: # 限制显示数量 170 | result_class = "error" if result.get('result', '').startswith('❌') else "warning" if result.get('result', '').startswith('⚠️') else "pass" 171 | html_template += f"""
172 | <div class="result-item {result_class}">
173 | <strong>{result.get('rule_name', '未知规则')}</strong>: {result.get('result', '未知')}
174 | <p>{result.get('details', '无详情')}</p>
175 | </div>
176 | """ 177 | html_template += "</div>" 178 | 179 | html_template += """
180 | </div>
181 | </body>
182 | </html>
183 | """ 184 | 185 | return html_template -------------------------------------------------------------------------------- /src/config/model_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | 配置管理器 3 | 4 | 用于管理OCR API配置及环境变量 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | from typing import Dict, Optional 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class ModelConfig: 15 | """配置管理器""" 16 | 17 | def __init__(self): 18 | self.project_root = Path(__file__).parent.parent.parent 19 | self.cache_dir = self.project_root / ".model_cache" 20 | 21 | # 智能初始化:检查是否在异步环境中 22 | try: 23 | import asyncio 24 | # 尝试获取当前任务,如果成功说明在异步环境中 25 | asyncio.current_task() 26 | logger.info("🔄 检测到异步环境,将延迟创建缓存目录") 27 | except RuntimeError: 28 | # 不在异步环境中,可以安全创建目录 29 | self.setup_cache_directories_sync() 30 | except Exception: 31 | # 如果检测失败,使用同步方式(向后兼容) 32 | self.setup_cache_directories_sync() 33 | 34 | async def setup_cache_directories(self): 35 | """设置缓存目录(异步版本)""" 36 | try: 37 | import asyncio 38 | # 使用异步方式创建目录 39 | await asyncio.to_thread(self.cache_dir.mkdir, parents=True, exist_ok=True) 40 | logger.info(f"📁 缓存目录: {self.cache_dir}") 41 | 42 | except Exception as e: 43 | logger.error(f"❌ 缓存目录设置失败: {e}") 44 | 45 | def setup_cache_directories_sync(self): 46 | """设置缓存目录(同步版本,仅用于初始化)""" 47 | try: 48 | # 创建本地缓存目录 49 | self.cache_dir.mkdir(parents=True, exist_ok=True) 50 | logger.info(f"📁 缓存目录: {self.cache_dir}") 51 | 52 | except Exception as e: 53 | logger.error(f"❌ 缓存目录设置失败: {e}") 54 | 55 | async def is_models_cached(self) -> bool: 56 | """检查缓存是否存在(异步版本)""" 57 | import asyncio 58 | return await asyncio.to_thread(self.cache_dir.exists) 59 | 60 | async def get_cache_size(self) -> str: 61 | """获取缓存目录大小(异步版本)""" 62 | try: 63 | import asyncio 64 | total_size = 0 65 | 66 | # 使用异步方式遍历文件 67 | async def calculate_size(): 68 | nonlocal total_size 69 | paths = await asyncio.to_thread(list, self.cache_dir.rglob("*")) 70 | for path in paths: 71 | is_file = await asyncio.to_thread(path.is_file) 72 | if is_file: 73 | stat_result = await asyncio.to_thread(path.stat) 74 | total_size += stat_result.st_size 75 | 76 | await calculate_size() 77 | 78 | # 转换为可读格式 79 | if total_size < 1024: 80 | return f"{total_size} B" 81 | elif total_size < 1024**2: 82 | return f"{total_size/1024:.1f} KB" 83 | elif total_size < 1024**3: 84 | return f"{total_size/1024**2:.1f} MB" 85 | else: 86 | return f"{total_size/1024**3:.1f} GB" 87 | 88 | except Exception as e: 89 | logger.error(f"❌ 获取缓存大小失败: {e}") 90 | return "未知" 91 | 92 | async def clear_cache(self): 93 | """清理缓存(异步版本)""" 94 | try: 95 | import shutil 96 | import asyncio 97 | if self.cache_dir.exists(): 98 | await asyncio.to_thread(shutil.rmtree, self.cache_dir) 99 | logger.info("🧹 缓存已清理") 100 | await self.setup_cache_directories() 101 | except Exception as e: 102 | logger.error(f"❌ 清理缓存失败: {e}") 103 | 104 | def clear_cache_sync(self): 105 | """清理缓存(同步版本)""" 106 | try: 107 | import shutil 108 | if self.cache_dir.exists(): 109 | shutil.rmtree(self.cache_dir) 110 | logger.info("🧹 缓存已清理") 111 | self.setup_cache_directories_sync() 112 | except Exception as e: 113 | logger.error(f"❌ 清理缓存失败: {e}") 114 | 115 | async def get_status(self) -> Dict[str, str]: 116 | """获取配置状态(异步版本)""" 117 | cache_size = await self.get_cache_size() 118 | return { 119 | "cache_dir": str(self.cache_dir), 120 | "cache_size": cache_size, 121 | "ocr_api_enabled": "启用", 122 | } 123 | 124 | def get_status_sync(self) -> Dict[str, str]: 
125 | """获取配置状态(同步版本)""" 126 | try: 127 | total_size = 0 128 | if self.cache_dir.exists(): 129 | for path in self.cache_dir.rglob("*"): 130 | if path.is_file(): 131 | total_size += path.stat().st_size 132 | 133 | # 转换为可读格式 134 | if total_size < 1024: 135 | cache_size = f"{total_size} B" 136 | elif total_size < 1024**2: 137 | cache_size = f"{total_size/1024:.1f} KB" 138 | elif total_size < 1024**3: 139 | cache_size = f"{total_size/1024**2:.1f} MB" 140 | else: 141 | cache_size = f"{total_size/1024**3:.1f} GB" 142 | except Exception as e: 143 | logger.error(f"❌ 获取缓存大小失败: {e}") 144 | cache_size = "未知" 145 | 146 | return { 147 | "cache_dir": str(self.cache_dir), 148 | "cache_size": cache_size, 149 | "ocr_api_enabled": "启用", 150 | } 151 | 152 | 153 | # 全局配置实例 154 | model_config = ModelConfig() 155 | 156 | 157 | async def setup_model_environment(): 158 | """设置环境(在应用启动时调用,异步版本)""" 159 | logger.info("🔧 正在设置环境...") 160 | 161 | # 设置缓存目录 162 | await model_config.setup_cache_directories() 163 | 164 | # 打印状态信息 165 | status = await model_config.get_status() 166 | logger.info("📊 配置状态:") 167 | for key, value in status.items(): 168 | logger.info(f" {key}: {value}") 169 | 170 | def setup_model_environment_sync(): 171 | """设置环境(同步版本)""" 172 | logger.info("🔧 正在设置环境...") 173 | 174 | # 设置缓存目录 175 | model_config.setup_cache_directories_sync() 176 | 177 | # 打印状态信息 178 | status = model_config.get_status_sync() 179 | logger.info("📊 配置状态:") 180 | for key, value in status.items(): 181 | logger.info(f" {key}: {value}") 182 | 183 | 184 | def print_model_help(): 185 | """打印配置帮助信息""" 186 | help_text = """ 187 | 🔧 OCR API配置选项: 188 | 189 | 环境变量设置: 190 | OCR_API_BASE_URL=http://183.203.184.233:8888 # OCR API地址 191 | 192 | 使用说明: 193 | 1. 启动OCR API服务 194 | 确保您的OCR API服务正在运行 195 | 默认地址: http://183.203.184.233:8888 196 | 197 | 2. 
启动主应用 198 | python web_app_v2.py 199 | 200 | 缓存位置: {cache_dir} 201 | """.format(cache_dir=model_config.cache_dir) 202 | 203 | print(help_text) 204 | 205 | 206 | if __name__ == "__main__": 207 | print_model_help() -------------------------------------------------------------------------------- /src/tools/common_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 通用工具 3 | 4 | 提供通用的工具函数: 5 | - 正则表达式提取 6 | - 数据清理和验证 7 | - HTML报告生成 8 | - 日志记录 9 | """ 10 | 11 | import re 12 | from typing import Dict, Any, List, Optional, Union 13 | from pathlib import Path 14 | from src.models.state import CoreInfo, ValidationResult as StateValidationResult 15 | 16 | def extract_with_regex(content: str) -> tuple[str, str]: 17 | """使用正则表达式的备用提取方法(增强版)""" 18 | name = "" 19 | id_number = "" 20 | 21 | # 提取姓名(多种格式匹配) 22 | name_patterns = [ 23 | r"姓[名]*[::]\s*([^\s\n\r\t]+)", # 姓名: 24 | r"申请人[::]\s*([^\s\n\r\t]+)", # 申请人: 25 | r"姓[\s]*名[\s]*[::]\s*([^\s\n\r\t]+)", # 姓 名: 26 | r"^([\u4e00-\u9fff]{2,4})[\s]*[男女]", # 中文姓名后面跟性别 27 | ] 28 | 29 | for pattern in name_patterns: 30 | name_match = re.search(pattern, content, re.MULTILINE) 31 | if name_match: 32 | potential_name = name_match.group(1).strip() 33 | # 验证姓名的合理性(中文字符2-4个字) 34 | if re.match(r'^[\u4e00-\u9fff]{2,4}$', potential_name): 35 | name = potential_name 36 | break 37 | 38 | # 提取身份证号(多种格式匹配) 39 | id_patterns = [ 40 | r"身份证[号码]*[::]\s*(\d{17}[\dX])", # 身份证号: 41 | r"公民身份号码[::]\s*(\d{17}[\dX])", # 公民身份号码: 42 | r"ID[\s]*Number[\s]*[::]\s*(\d{17}[\dX])", # ID Number: 43 | r"(\d{17}[\dX])(?![\d])", # 直接匹配18位数字(排除更长数字) 44 | ] 45 | 46 | for pattern in id_patterns: 47 | id_match = re.search(pattern, content) 48 | if id_match: 49 | potential_id = id_match.group(1) 50 | # 验证身份证号格式 51 | if re.match(r'^\d{17}[\dX]$', potential_id): 52 | id_number = potential_id 53 | break 54 | 55 | if name or id_number: 56 | print(f"✅ 正则提取成功: 姓名='{name}', 身份证='{id_number}'") 57 | else: 58 | print("⚠️ 正则提取未找到有效信息") 59 | 60 | return name, id_number 61 | 62 | def generate_html_report(core_info: Optional[Union[CoreInfo, Dict[str, Any]]], validation_results: List[Any]) -> str: 63 | """生成HTML格式化报告""" 64 | # 处理core_info为None的情况 65 | if core_info is None: 66 | name = '未提取' 67 | id_number = '未提取' 68 | extracted_from = [] 69 | else: 70 | # 支持CoreInfo对象和Dict两种类型 71 | if isinstance(core_info, dict): 72 | name = core_info.get('name', '') or '未提取' 73 | id_number = core_info.get('id_number', '') or '未提取' 74 | extracted_from = core_info.get('extracted_from', []) or [] 75 | else: 76 | # CoreInfo对象 77 | name = getattr(core_info, 'name', None) or '未提取' 78 | id_number = getattr(core_info, 'id_number', None) or '未提取' 79 | extracted_from = getattr(core_info, 'extracted_from', []) or [] 80 | 81 | html_template = f"""
82 | <!DOCTYPE html>
83 | <html lang="zh-CN">
84 | <head>
85 | <meta charset="UTF-8">
86 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
87 | <title>职称评审材料审核报告</title>
88 | <style>
89 | body {{ font-family: "Microsoft YaHei", Arial, sans-serif; margin: 24px; color: #333; }}
90 | .container {{ max-width: 900px; margin: 0 auto; }}
91 | h1 {{ border-bottom: 2px solid #4a90d9; padding-bottom: 8px; }}
92 | .timestamp {{ color: #888; font-size: 14px; }}
93 | .section {{ margin-top: 24px; }}
94 | .info-grid {{ display: grid; grid-template-columns: 1fr 1fr; gap: 8px; }}
95 | .info-item {{ padding: 8px; background: #f7f9fc; border-radius: 4px; }}
96 | .result-item {{ padding: 10px; margin: 8px 0; border-left: 4px solid #ccc; }}
97 | .result-pass {{ border-left-color: #2e7d32; }}
98 | .result-warning {{ border-left-color: #f9a825; }}
99 | .result-error {{ border-left-color: #c62828; }}
100 | .badge {{ padding: 2px 8px; border-radius: 10px; font-size: 12px; }}
101 | .badge-pass {{ background: #e8f5e9; color: #2e7d32; }}
102 | .badge-warning {{ background: #fff8e1; color: #f9a825; }}
103 | .badge-error {{ background: #ffebee; color: #c62828; }}
104 | .badge-unknown {{ background: #eeeeee; color: #666; }}
105 | .no-results {{ color: #888; text-align: center; padding: 16px; }}
106 | </style>
107 | </head>
108 | <body>
109 | <h1>📄 职称评审材料审核报告</h1>
110 | <p class="timestamp">生成时间:</p>
111 | <hr>
112 | <div class="section">
113 | <h2>👤 核心信息</h2>
114 | <div class="info-grid">
115 | <div class="info-item">
116 | 姓名: {name}
117 | </div>
118 | <div class="info-item">
119 | 身份证号: {id_number}
120 | </div>
121 | <div class="info-item">
122 | 信息来源: {', '.join(extracted_from) if extracted_from else '无'}
123 | </div>
124 | </div>
125 | </div>
126 | <div class="section">
127 | <h2>✅ 审核结果</h2>
128 |
129 | """ 130 | 131 | if validation_results: 132 | for result in validation_results: 133 | # 处理不同的ValidationResult类型 134 | # 支持既有status属性,也支持result属性 135 | status = getattr(result, 'status', None) or getattr(result, 'result', 'UNKNOWN') 136 | rule_name = getattr(result, 'rule_name', '未知规则') 137 | message = getattr(result, 'message', None) or getattr(result, 'details', '无详细信息') 138 | 139 | # 统一处理status格式 140 | if '✅' in status or status == 'PASS': 141 | status_normalized = 'pass' 142 | status_display = '✅通过' 143 | elif '⚠️' in status or status == 'WARNING': 144 | status_normalized = 'warning' 145 | status_display = '⚠️警告' 146 | elif '❌' in status or status == 'ERROR': 147 | status_normalized = 'error' 148 | status_display = '❌不通过' 149 | else: 150 | status_normalized = 'unknown' 151 | status_display = status 152 | 153 | status_class = f"result-{status_normalized}" 154 | badge_class = f"badge-{status_normalized}" 155 | 156 | html_template += f"""
157 | <div class="result-item {status_class}">
158 | <strong>{rule_name}</strong>
159 | <span class="badge {badge_class}">{status_display}</span>
160 | <p>{message}</p>
161 | </div>
162 | """ 163 | else: 164 | html_template += "<p class='no-results'>无审核结果</p>" 165 | 166 | html_template += """
</div></body></html>
167 | 168 | 171 | 172 | """ 173 | 174 | return html_template -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "langgraph-audit-system" 7 | version = "0.1.0" 8 | description = "An enterprise-level intelligent title evaluation material review system built on the LangGraph framework" 9 | authors = [ 10 | {name = "LangGraph Audit Team", email = "team@langgraph-audit.com"} 11 | ] 12 | license = {text = "MIT"} 13 | readme = "README.md" 14 | requires-python = ">=3.10,<3.13" 15 | keywords = ["langgraph", "audit", "ai", "workflow", "title-evaluation"] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12", 24 | "Topic :: Software Development :: Libraries :: Python Modules", 25 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 26 | ] 27 | 28 | # Core dependencies for production 29 | dependencies = [ 30 | # LangGraph core dependencies 31 | "langgraph>=0.2.0", 32 | "langsmith>=0.1.0", 33 | "langtrace-python-sdk>=2.0.0", 34 | "pydantic>=2.0.0", 35 | "typing-extensions>=4.0.0", 36 | "langchain-core>=0.1.0", # LangGraph core dependency 37 | "langgraph-cli>=0.1.0", # LangGraph development tools 38 | 39 | # LangGraph Redis integration 40 | "langgraph-checkpoint-redis>=0.1.0", 41 | 42 | # PostgreSQL integration 43 | "psycopg[binary,pool]>=3.1.0", 44 | "asyncpg>=0.29.0", 45 | 46 | # AI API integration 47 | "google-generativeai>=0.3.0", 48 | 49 | # Environment and configuration 50 | "python-dotenv>=1.0.0", 51 | "pyyaml>=6.0", 52 | 53 | # File processing utilities 54 | "pathlib2", 55 | "zipfile36>=0.1.0", 56 | "python-magic>=0.4.0", 57 | "Pillow>=10.0.0", 58 | 59 | # Web framework 60 | "fastapi>=0.104.0", 61 | "uvicorn>=0.24.0", 62 | "python-multipart>=0.0.6", 63 | "sse-starlette>=1.6.0", # Server-Sent Events support 64 | "starlette>=0.27.0", 65 | 66 | # Data processing 67 | "pandas>=2.0.0", 68 | "numpy>=1.24.0", 69 | 70 | # HTML report generation 71 | "jinja2>=3.1.0", 72 | "weasyprint>=60.0", 73 | 74 | # Monitoring and logging 75 | "loguru>=0.7.0", 76 | "prometheus-client>=0.19.0", 77 | 78 | # OCR and computer vision 79 | "pytesseract>=0.3.10", 80 | "opencv-python>=4.8.0", 81 | 82 | # Database support 83 | "sqlalchemy>=2.0.0", 84 | "alembic>=1.12.0", 85 | 86 | # Async processing 87 | "aiofiles>=23.2.0", 88 | "aiohttp>=3.8.0", 89 | "celery>=5.3.0", 90 | "redis>=5.0.0", 91 | 92 | # Document processing 93 | "markdown>=3.5.0", 94 | "markdownify>=0.11.0", 95 | ] 96 | 97 | [project.optional-dependencies] 98 | # Development dependencies 99 | dev = [ 100 | "pytest>=7.4.0", 101 | "pytest-asyncio>=0.21.0", 102 | "pytest-cov>=4.1.0", 103 | "black>=23.0.0", 104 | "isort>=5.12.0", 105 | "flake8>=6.0.0", 106 | "mypy>=1.6.0", 107 | ] 108 | 109 | # Testing dependencies 110 | test = [ 111 | "pytest>=7.4.0", 112 | "pytest-asyncio>=0.21.0", 113 | "pytest-cov>=4.1.0", 114 | "pytest-mock>=3.11.0", 115 | "httpx>=0.25.0", # for testing FastAPI endpoints 116 | ] 117 | 118 | # Documentation dependencies 119 | docs = [ 120 | "mkdocs>=1.5.0", 121 | "mkdocs-material>=9.4.0", 122 | "mkdocstrings[python]>=0.23.0", 
123 | ] 124 | 125 | # Full development environment 126 | all = [ 127 | "langgraph-audit-system[dev,test,docs]" 128 | ] 129 | 130 | [project.urls] 131 | Homepage = "https://github.com/your-org/langgraph-audit-system" 132 | Documentation = "https://your-org.github.io/langgraph-audit-system" 133 | Repository = "https://github.com/your-org/langgraph-audit-system.git" 134 | Issues = "https://github.com/your-org/langgraph-audit-system/issues" 135 | 136 | [project.scripts] 137 | # Command line entry points 138 | langgraph-audit = "src.agent:main" 139 | audit-debug = "debug_langsmith:main" 140 | check-health = "check_health:main" 141 | 142 | [tool.hatch.build.targets.wheel] 143 | packages = ["src"] 144 | 145 | [tool.hatch.build.targets.sdist] 146 | include = [ 147 | "src/", 148 | "rules/", 149 | "test_data/", 150 | "README.md", 151 | "langgraph.json", 152 | "pyproject.toml", 153 | ] 154 | exclude = [ 155 | "**/__pycache__/", 156 | "**/*.pyc", 157 | "**/*.pyo", 158 | "**/*.orig", 159 | "**/*.rej", 160 | "**/*~", 161 | "**/#*#", 162 | "**/.#*", 163 | ".git/", 164 | ".pytest_cache/", 165 | ".coverage", 166 | ] 167 | 168 | # Black code formatting configuration 169 | [tool.black] 170 | line-length = 88 171 | target-version = ["py310", "py311", "py312"] 172 | include = '\.pyi?$' 173 | extend-exclude = ''' 174 | /( 175 | # directories 176 | \.eggs 177 | | \.git 178 | | \.hg 179 | | \.mypy_cache 180 | | \.tox 181 | | \.venv 182 | | _build 183 | | buck-out 184 | | build 185 | | dist 186 | )/ 187 | ''' 188 | 189 | # isort import sorting configuration 190 | [tool.isort] 191 | profile = "black" 192 | multi_line_output = 3 193 | line_length = 88 194 | known_first_party = ["src"] 195 | known_third_party = ["langgraph", "langsmith", "pydantic", "fastapi"] 196 | 197 | # Flake8 linting configuration 198 | [tool.flake8] 199 | max-line-length = 88 200 | extend-ignore = ["E203", "W503", "E501"] 201 | exclude = [ 202 | ".git", 203 | "__pycache__", 204 | "build", 205 | "dist", 206 | ".eggs", 207 | "*.egg-info", 208 | ".venv", 209 | ".pytest_cache", 210 | ] 211 | 212 | # MyPy type checking configuration 213 | [tool.mypy] 214 | python_version = "3.10" 215 | warn_return_any = true 216 | warn_unused_configs = true 217 | disallow_untyped_defs = true 218 | disallow_incomplete_defs = true 219 | check_untyped_defs = true 220 | disallow_untyped_decorators = true 221 | no_implicit_optional = true 222 | warn_redundant_casts = true 223 | warn_unused_ignores = true 224 | warn_no_return = true 225 | warn_unreachable = true 226 | strict_equality = true 227 | 228 | [[tool.mypy.overrides]] 229 | module = [ 230 | "pytesseract", 231 | "cv2", 232 | "weasyprint", 233 | "celery", 234 | "redis", 235 | ] 236 | ignore_missing_imports = true 237 | 238 | # Pytest configuration 239 | [tool.pytest.ini_options] 240 | minversion = "7.0" 241 | addopts = "-ra -q --strict-markers --strict-config" 242 | testpaths = ["tests"] 243 | python_files = ["test_*.py", "*_test.py"] 244 | python_classes = ["Test*"] 245 | python_functions = ["test_*"] 246 | markers = [ 247 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 248 | "integration: marks tests as integration tests", 249 | "unit: marks tests as unit tests", 250 | "langsmith: marks tests that require LangSmith API", 251 | ] 252 | 253 | # Coverage configuration 254 | [tool.coverage.run] 255 | source = ["src"] 256 | branch = true 257 | omit = [ 258 | "*/tests/*", 259 | "*/test_*", 260 | "*/__pycache__/*", 261 | "*/migrations/*", 262 | ] 263 | 264 | [tool.coverage.report] 265 | precision = 2 266 
| exclude_lines = [ 267 | "pragma: no cover", 268 | "def __repr__", 269 | "if self.debug:", 270 | "if settings.DEBUG", 271 | "raise AssertionError", 272 | "raise NotImplementedError", 273 | "if 0:", 274 | "if __name__ == .__main__.:", 275 | "class .*\\bProtocol\\):", 276 | "@(abc\\.)?abstractmethod", 277 | ] -------------------------------------------------------------------------------- /src/graph/workflow.py: -------------------------------------------------------------------------------- 1 | """ 2 | 主要的职称评审材料审核工作流定义 - 完全无缓存版本 3 | 4 | 🚨 已完全取消缓存机制,确保每个节点传输的信息都是全新的、一次性的 5 | 6 | 包括: 7 | 1. ZIP解压和文件夹验证 8 | 2. PDF内容提取和核心信息提取 9 | 3. 规则集加载和提取(并行处理) 10 | 4. 规则校验和交叉验证 11 | 5. 报告生成 12 | 13 | 只包含一个主工作流:create_audit_workflow() 14 | """ 15 | 16 | # LangGraph 核心导入 - 移除缓存相关的导入 17 | from langgraph.graph import StateGraph, START, END # type: ignore 18 | 19 | # 导入 RetryPolicy 20 | try: 21 | from langgraph.types import RetryPolicy # type: ignore 22 | RETRY_POLICY_AVAILABLE = True 23 | except ImportError: 24 | RetryPolicy = None 25 | RETRY_POLICY_AVAILABLE = False 26 | 27 | # 已完全移除 checkpointer 和内存存储器相关导入 28 | 29 | from .state import AuditState 30 | from .edges import ( 31 | check_pdf_extraction_status, 32 | create_parallel_branches, # 并行分支路由 33 | after_rules_loaded, # 规则加载后路由 34 | check_rules_for_validation, # 规则验证路由 35 | check_pdf_extraction_for_parallel_processing # PDF提取并行分发路由 36 | ) 37 | from src.tools.langsmith_utils import ( 38 | setup_langsmith_environment, 39 | event_logger, 40 | with_langsmith_tracing 41 | ) 42 | 43 | 44 | @with_langsmith_tracing 45 | def create_audit_workflow(): 46 | """ 47 | 创建完全无缓存的职称评审材料审核工作流 48 | 49 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 50 | 51 | 工作流程: 52 | ZIP解压 -> 并行分支: 53 | 分支1: PDF内容提取 -> 核心信息提取 -> 交叉校验 54 | 分支2: 规则集加载 -> 规则提取 -> 汇入验证 55 | 最后: 报告生成 56 | 57 | Returns: 58 | 编译后的LangGraph工作流(无缓存) 59 | """ 60 | # 延迟导入以避免循环依赖 61 | from src.nodes import ( 62 | file_processing_node, 63 | core_info_extraction_node, 64 | validation_node, 65 | report_generation_node 66 | ) 67 | from src.nodes.pdf_extraction import pdf_extraction_node 68 | from src.nodes.cross_validation import cross_validation_node 69 | from src.nodes.rules_processing import load_rules_node, extract_rules_node 70 | 71 | # 初始化LangSmith环境 72 | setup_langsmith_environment() 73 | 74 | workflow = StateGraph(AuditState) 75 | 76 | # 根据LangGraph最佳实践添加重试策略(仅在可用时) 77 | retry_policy_io = None 78 | retry_policy_ai = None 79 | retry_policy_general = None 80 | 81 | if RETRY_POLICY_AVAILABLE and RetryPolicy is not None: 82 | retry_policy_io = RetryPolicy(max_attempts=3, retry_on=[IOError, FileNotFoundError]) 83 | retry_policy_ai = RetryPolicy(max_attempts=5, retry_on=[TimeoutError, ConnectionError]) 84 | retry_policy_general = RetryPolicy(max_attempts=2) 85 | 86 | # 添加所有节点并配置重试策略 87 | workflow.add_node( 88 | "file_processing", 89 | _wrap_node_with_logging(file_processing_node, "file_processing"), 90 | retry_policy=retry_policy_io 91 | ) 92 | workflow.add_node( 93 | "pdf_extraction", 94 | _wrap_node_with_logging(pdf_extraction_node, "pdf_extraction"), 95 | retry_policy=retry_policy_ai 96 | ) 97 | workflow.add_node( 98 | "core_info_extraction", 99 | _wrap_node_with_logging(core_info_extraction_node, "core_info_extraction") 100 | ) 101 | workflow.add_node( 102 | "validation", 103 | _wrap_node_with_logging(validation_node, "validation"), 104 | retry_policy=retry_policy_ai 105 | ) 106 | workflow.add_node( 107 | "cross_validation", 108 | _wrap_node_with_logging(cross_validation_node, "cross_validation"), 109 | 
retry_policy=retry_policy_general 110 | ) 111 | workflow.add_node( 112 | "report_generation", 113 | _wrap_node_with_logging(report_generation_node, "report_generation"), 114 | retry_policy=retry_policy_general 115 | ) 116 | workflow.add_node( 117 | "load_rules", 118 | _wrap_node_with_logging(load_rules_node, "load_rules"), 119 | retry_policy=retry_policy_general 120 | ) 121 | workflow.add_node( 122 | "extract_rules", 123 | _wrap_node_with_logging(extract_rules_node, "extract_rules"), 124 | retry_policy=retry_policy_ai 125 | ) 126 | 127 | # 定义工作流边连接:添加规则集并行处理支持 128 | workflow.add_edge(START, "file_processing") 129 | 130 | # 从file_processing分叉到并行处理路径 131 | workflow.add_conditional_edges( 132 | "file_processing", 133 | create_parallel_branches, 134 | ["pdf_extraction", "load_rules"] # 支持并行分支 135 | ) 136 | 137 | # 规则处理分支 138 | workflow.add_conditional_edges( 139 | "load_rules", 140 | after_rules_loaded, 141 | { 142 | "extract_rules": "extract_rules", 143 | "rules_load_failed": END 144 | } 145 | ) 146 | 147 | # 规则提取完成后,将规则通过条件边传递给validation 148 | workflow.add_conditional_edges( 149 | "extract_rules", 150 | check_rules_for_validation, 151 | ["validation", "cross_validation"] # 支持Send API并行分发 152 | ) 153 | 154 | # PDF提取后进入核心信息提取(主流程) 155 | workflow.add_conditional_edges( 156 | "pdf_extraction", 157 | check_pdf_extraction_status, 158 | { 159 | "pdf_extraction_success": "core_info_extraction", 160 | "pdf_extraction_failed": END 161 | } 162 | ) 163 | 164 | # 🛠️ 关键修复:简化工作流连接,避免多重触发导致的缓存问题 165 | # 删除直接边,只使用条件边触发节点,确保数据一致性 166 | 167 | # validation和cross_validation完成后进入报告生成 168 | workflow.add_edge("validation", "report_generation") 169 | workflow.add_edge("cross_validation", "report_generation") 170 | workflow.add_edge("core_info_extraction", "report_generation") 171 | 172 | workflow.add_edge("report_generation", END) 173 | 174 | # 编译工作流 - 完全无缓存版本 175 | # 🚨 已移除所有checkpointer和内存存储相关的配置 176 | # 确保每个节点传输的信息都是全新的、一次性的 177 | return workflow.compile() 178 | 179 | 180 | 181 | 182 | 183 | def _wrap_node_with_logging(node_func, node_name: str): 184 | """ 185 | 包装节点函数以添加LangSmith日志记录 186 | 187 | Args: 188 | node_func: 节点函数 189 | node_name: 节点名称 190 | 191 | Returns: 192 | 包装后的节点函数 193 | """ 194 | import asyncio 195 | import inspect 196 | 197 | # 检查节点函数是否为异步函数 198 | if inspect.iscoroutinefunction(node_func): 199 | # 异步节点包装器 200 | async def async_wrapped_node(state): 201 | try: 202 | # 记录节点开始 203 | event_logger.log_node_start(node_name, state) 204 | 205 | # 执行异步节点函数 206 | result = await node_func(state) 207 | 208 | # 记录节点完成 209 | event_logger.log_node_complete(node_name, result) 210 | 211 | return result 212 | 213 | except Exception as e: 214 | # 记录节点错误 215 | event_logger.log_node_error(node_name, e) 216 | raise 217 | 218 | return async_wrapped_node 219 | else: 220 | # 同步节点包装器 221 | def sync_wrapped_node(state): 222 | try: 223 | # 记录节点开始 224 | event_logger.log_node_start(node_name, state) 225 | 226 | # 执行节点函数 227 | result = node_func(state) 228 | 229 | # 记录节点完成 230 | event_logger.log_node_complete(node_name, result) 231 | 232 | return result 233 | 234 | except Exception as e: 235 | # 记录节点错误 236 | event_logger.log_node_error(node_name, e) 237 | raise 238 | 239 | return sync_wrapped_node 240 | 241 | 242 | 243 | 244 | 245 | # 延迟创建默认工作流,避免循环导入 246 | default_workflow = None 247 | 248 | def get_default_workflow(): 249 | """获取默认工作流(延迟创建)""" 250 | global default_workflow 251 | if default_workflow is None: 252 | default_workflow = create_audit_workflow() 253 | return default_workflow 254 | 255 | 256 | 257 | 258 | 259 | 260 | 
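# --- Usage sketch (added for illustration; not part of the original module) ---
# Minimal example of compiling and invoking the graph. The archive path and
# session id below are hypothetical; note that create_audit_workflow() will
# prompt for LANGSMITH_API_KEY if it is not already set in the environment.
if __name__ == "__main__":
    import asyncio

    from src.graph.state import create_initial_state

    graph = get_default_workflow()
    state = create_initial_state("test_data/sample.zip", session_id="demo-session")

    # ainvoke() is used because several nodes are async coroutines.
    final_state = asyncio.run(graph.ainvoke(state))
    print(final_state.get("current_step"), final_state.get("is_complete"))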
261 | -------------------------------------------------------------------------------- /src/graph/state.py: -------------------------------------------------------------------------------- 1 | """ 2 | LangGraph工作流状态管理 3 | 4 | 定义审核流程中的状态结构: 5 | - AuditState: 主要的审核状态 6 | - 各个节点间的状态传递规则 7 | - 状态的序列化和反序列化 8 | - 支持并发安全的状态管理 9 | """ 10 | 11 | from typing import Dict, List, Any, Optional, TypedDict, Annotated 12 | from dataclasses import dataclass, field 13 | from pathlib import Path 14 | import operator 15 | 16 | 17 | def step_reducer(existing: str, new: str) -> str: 18 | """current_step字段的reducer函数:后写入优先,确保并发安全""" 19 | # 对于步骤状态,使用最新的值(last write wins) 20 | return new if new else existing 21 | 22 | 23 | class AuditState(TypedDict): 24 | """审核工作流状态定义(支持并发安全) 25 | 26 | 注意:此处声明的键必须覆盖所有节点读写的字段; 27 | 未在此处声明的字段在LangGraph状态合并时可能被丢弃, 28 | 因而需要在这里统一、规范地进行声明。 29 | """ 30 | 31 | # 输入文件信息 32 | uploaded_file: Optional[str] # 上传的ZIP压缩包路径 33 | file_type: str # 文件类型 (zip) 34 | extraction_path: Optional[str] # ZIP解压后的根目录 35 | extracted_files: Annotated[List[str], operator.add] # 解压得到的文件列表(并发安全) 36 | 37 | # 文件夹结构验证 38 | folder_validation: Dict[str, Any] # 17个标准文件夹验证结果 39 | folder_classification: Dict[str, List[str]] # 文件夹分类结果 {文件夹名: [.pdf文件列表]} 40 | 41 | # PDF内容提取和分析(新增) 42 | pdf_extraction_results: Dict[str, Any] # PDF文件提取结果 43 | api_extraction_results: Dict[str, Any] # 通过API提取的JSON结果 44 | 45 | # PDF API配置(新增) 46 | pdf_api_endpoint: Optional[str] # PDF提取API端点 47 | 48 | # 内容提取和分析 49 | extracted_content: Dict[str, Any] # 从PDF文件提取的内容信息 50 | content_analysis: Dict[str, Any] # AI分析的结构化内容 51 | core_info: Optional[Dict[str, Any]] # 核心信息(姓名、身份证号等) 52 | 53 | # 验证结果(使用reducer确保并发安全) 54 | material_validation: Dict[str, List[Any]] # 材料校验结果 55 | cross_validation: Annotated[List[Any], operator.add] # 交叉校验结果(并发安全) 56 | validation_results: Annotated[List[Dict[str, Any]], operator.add] # 所有校验结果(并发安全) 57 | # 详细验证结果与摘要(供报告节点直接消费) 58 | validation_results_detailed: Annotated[List[Dict[str, Any]], operator.add] # 详细验证结果 59 | validation_summary: Optional[Dict[str, Any]] # 验证摘要 60 | 61 | # 规则集处理(新增并行处理支持) 62 | rules_data: Annotated[List[Dict[str, Any]], operator.add] # 加载的规则集数据(并发安全) 63 | parsed_rules: List[Any] # 🚨 移除reducer,直接替换而不是累加规则(支持RuleInfo对象和字典格式) 64 | rules_by_category: Dict[str, List[Any]] # 按1-17项分类的规则集 65 | 66 | # 缓存管理(新增) 67 | validation_cache: Annotated[List[Dict[str, Any]], operator.add] # 验证结果缓存 68 | cross_validation_cache: Annotated[List[Dict[str, Any]], operator.add] # 交叉验证结果缓存 69 | 70 | # 报告生成 71 | audit_report: Optional[str] # 生成的审核报告 72 | report_path: Optional[str] # 报告文件路径 73 | report_summary: Optional[Dict[str, Any]] # 报告摘要(便于前端展示) 74 | quality_score: Optional[float] # 报告质量评分 75 | compliance_status: Optional[str] # 合规性状态(PASS/WARNING/FAIL) 76 | 77 | # 处理统计(可选,供调试/展示) 78 | processing_stats: Optional[Dict[str, Any]] # 处理统计信息 79 | 80 | # 流程控制(使用reducer确保并发安全) 81 | current_step: Annotated[str, step_reducer] # 当前步骤(并发安全) 82 | error_message: Optional[str] # 错误信息 83 | warnings: Annotated[List[str], operator.add] # 警告信息(并发安全) 84 | processing_logs: Annotated[List[str], operator.add] # 处理日志(并发安全) 85 | is_complete: bool # 是否完成 86 | 87 | # 会话管理(LangGraph官方持久化支持) 88 | session_id: Optional[str] # 会话ID 89 | 90 | 91 | @dataclass 92 | class WorkflowConfig: 93 | """工作流配置""" 94 | 95 | # 文件处理配置 96 | max_file_size: int = 50 * 1024 * 1024 # 50MB (ZIP压缩包) 97 | supported_formats: List[str] = field(default_factory=lambda: ['.zip']) 98 | 99 | # 文件夹验证配置 100 | required_folders: List[str] = field(default_factory=lambda: [ 101 | "1.教育经历", 
"2.工作经历", "3.继续教育(培训情况)", "4.学术技术兼职情况", 102 | "5.获奖情况", "6.获得荣誉称号情况", "7.主持参与科研项目(基金)情况", 103 | "8.主持参与工程技术项目情况", "9.论文", "10.著(译)作(教材)", 104 | "11.专利(著作权)情况", "12.主持参与指定标准情况", 105 | "13.成果被批示、采纳、运用和推广情况", "14.资质证书", 106 | "15.奖惩情况", "16.考核情况", "17.申报材料附件信息" 107 | ]) 108 | 109 | # PDF处理配置 110 | max_pdf_file_size: int = 20 * 1024 * 1024 # 20MB per PDF file 111 | pdf_api_timeout: int = 60 # PDF API提取超时时间(秒) 112 | pdf_api_endpoint: Optional[str] = None # PDF提取API端点 113 | 114 | # AI处理配置 115 | ai_timeout: int = 300 # AI处理超时时间(秒) 116 | max_retries: int = 3 # 最大重试次数 117 | 118 | # 输出配置 119 | output_dir: str = 'output' 120 | report_template: str = 'templates/audit_report.html' 121 | 122 | 123 | def create_initial_state( 124 | uploaded_file: str, 125 | session_id: Optional[str] = None 126 | ) -> AuditState: 127 | """创建初始状态(支持并发安全)""" 128 | 129 | file_path = Path(uploaded_file) 130 | file_type = file_path.suffix.lower() 131 | 132 | # 尝试从配置获取PDF API端点 133 | pdf_api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" # 默认配置 134 | try: 135 | from src.config.api_config import get_pdf_api_config 136 | api_config = get_pdf_api_config() 137 | configured_endpoint = api_config.get("pdf_extraction_endpoint") 138 | if configured_endpoint: 139 | pdf_api_endpoint = configured_endpoint 140 | print(f"✅ 从配置文件加载PDF API端点: {pdf_api_endpoint}") 141 | else: 142 | print(f"⚠️ 配置文件中未找到PDF API端点,使用默认值: {pdf_api_endpoint}") 143 | except ImportError: 144 | print(f"⚠️ 无法导入API配置模块,使用默认PDF API端点: {pdf_api_endpoint}") 145 | except Exception as e: 146 | print(f"⚠️ 读取API配置失败: {e},使用默认PDF API端点: {pdf_api_endpoint}") 147 | 148 | # 确保API端点不为空 149 | if not pdf_api_endpoint: 150 | pdf_api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 151 | print(f"🔧 强制设置默认PDF API端点: {pdf_api_endpoint}") 152 | 153 | return AuditState( 154 | # 输入文件信息 155 | uploaded_file=uploaded_file, 156 | file_type=file_type, 157 | extraction_path=None, 158 | extracted_files=[], 159 | 160 | # 文件夹结构验证 161 | folder_validation={}, 162 | folder_classification={}, 163 | 164 | # PDF内容提取和分析(新增) 165 | pdf_extraction_results={}, 166 | api_extraction_results={}, 167 | 168 | # PDF API配置 169 | pdf_api_endpoint=pdf_api_endpoint, 170 | 171 | # 内容提取和分析 172 | extracted_content={}, 173 | content_analysis={}, 174 | core_info=None, 175 | 176 | # 验证结果(初始化为空列表以支持reducer) 177 | material_validation={}, 178 | cross_validation=[], 179 | validation_results=[], 180 | validation_results_detailed=[], 181 | validation_summary=None, 182 | 183 | # 规则集处理(初始化为空列表以支持reducer) 184 | rules_data=[], 185 | parsed_rules=[], # 支持RuleInfo对象和字典格式 186 | rules_by_category={}, 187 | 188 | # 缓存管理(新增) 189 | validation_cache=[], 190 | cross_validation_cache=[], 191 | 192 | # 报告生成 193 | audit_report=None, 194 | report_path=None, 195 | report_summary=None, 196 | quality_score=None, 197 | compliance_status=None, 198 | processing_stats=None, 199 | 200 | # 流程控制 201 | current_step="zip_extraction", 202 | error_message=None, 203 | warnings=[], 204 | processing_logs=[], 205 | is_complete=False, 206 | 207 | # 会话管理 208 | session_id=session_id 209 | ) 210 | 211 | 212 | def update_state_step(state: AuditState, step: str) -> Dict[str, Any]: 213 | """更新状态步骤(并发安全)""" 214 | # 使用reducer模式更新step,避免直接赋值 215 | return {"current_step": step} 216 | 217 | 218 | def add_warning(state: AuditState, warning: str) -> Dict[str, Any]: 219 | """添加警告信息""" 220 | return {"warnings": [warning]} 221 | 222 | 223 | def set_error(state: AuditState, error: str) -> Dict[str, Any]: 224 | """设置错误信息""" 225 | return {"error_message": 
error} 226 | 227 | 228 | def mark_complete(state: AuditState) -> Dict[str, Any]: 229 | """标记流程完成(并发安全)""" 230 | return { 231 | "is_complete": True, 232 | "current_step": "completed" 233 | } 234 | -------------------------------------------------------------------------------- /src/tools/cache_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | 缓存管理工具 3 | 4 | 用于管理validation和cross_validation阶段的缓存结果: 5 | 1. 按材料类型分类整理 6 | 2. 按优先级排序(高优先级错误在前) 7 | 3. 过滤通过的结果(仅显示警告和错误) 8 | 4. 生成结构化的报告数据 9 | """ 10 | 11 | from typing import Dict, List, Any, Optional 12 | from collections import defaultdict 13 | 14 | 15 | class ValidationCacheManager: 16 | """验证缓存管理器""" 17 | 18 | def __init__(self): 19 | self.priority_order = { 20 | "极高": 1, 21 | "高": 2, 22 | "中": 3, 23 | "低": 4 24 | } 25 | 26 | self.status_order = { 27 | "❌不通过": 1, 28 | "⚠️警告": 2, 29 | "✅通过": 3 30 | } 31 | 32 | def organize_validation_cache(self, validation_cache: List[Dict[str, Any]], 33 | cross_validation_cache: List[Dict[str, Any]]) -> Dict[str, Any]: 34 | """ 35 | 整理验证缓存数据 36 | 37 | Args: 38 | validation_cache: 材料验证缓存结果 39 | cross_validation_cache: 交叉验证缓存结果 40 | 41 | Returns: 42 | 整理后的报告数据 43 | """ 44 | print("📊 开始整理验证缓存数据...") 45 | 46 | # 按材料类型分类 47 | material_groups = self._group_by_material_type(validation_cache) 48 | 49 | # 添加交叉验证结果 50 | if cross_validation_cache: 51 | material_groups["交叉校验"] = cross_validation_cache 52 | 53 | # 过滤和排序每个材料类型的结果 54 | filtered_groups = {} 55 | total_issues = 0 56 | 57 | for material_type, results in material_groups.items(): 58 | # 过滤掉通过的结果,只保留警告和错误 59 | filtered_results = self._filter_non_passing_results(results) 60 | 61 | if filtered_results: 62 | # 按优先级和状态排序 63 | sorted_results = self._sort_results_by_priority(filtered_results) 64 | filtered_groups[material_type] = sorted_results 65 | total_issues += len(sorted_results) 66 | 67 | print(f" 📋 {material_type}: {len(sorted_results)}个问题") 68 | 69 | # 生成统计信息 70 | statistics = self._generate_statistics(validation_cache, cross_validation_cache) 71 | 72 | print(f"✅ 缓存数据整理完成,共发现{total_issues}个需要关注的问题") 73 | 74 | return { 75 | "material_groups": filtered_groups, 76 | "statistics": statistics, 77 | "total_issues": total_issues, 78 | "processed_at": self._get_current_timestamp() 79 | } 80 | 81 | def _group_by_material_type(self, validation_cache: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: 82 | """按材料类型分组""" 83 | groups = defaultdict(list) 84 | 85 | for result in validation_cache: 86 | material_type = result.get("material_type", "未知类型") 87 | groups[material_type].append(result) 88 | 89 | return dict(groups) 90 | 91 | def _filter_non_passing_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 92 | """过滤掉通过的结果,只保留警告和错误""" 93 | return [ 94 | result for result in results 95 | if result.get("result", "").strip() != "✅通过" 96 | ] 97 | 98 | def _sort_results_by_priority(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 99 | """按优先级和状态排序(高优先级、错误状态在前)""" 100 | def sort_key(result): 101 | priority = result.get("priority", "中") 102 | status = result.get("result", "⚠️警告") 103 | 104 | priority_score = self.priority_order.get(priority, 3) 105 | status_score = self.status_order.get(status, 2) 106 | 107 | return (priority_score, status_score) 108 | 109 | return sorted(results, key=sort_key) 110 | 111 | def _generate_statistics(self, validation_cache: List[Dict[str, Any]], 112 | cross_validation_cache: List[Dict[str, Any]]) -> Dict[str, Any]: 113 | """生成统计信息""" 114 | all_results = 
validation_cache + cross_validation_cache 115 | 116 | # 按状态统计 117 | status_counts = defaultdict(int) 118 | priority_counts = defaultdict(int) 119 | material_counts = defaultdict(int) 120 | 121 | for result in all_results: 122 | status = result.get("result", "⚠️警告") 123 | priority = result.get("priority", "中") 124 | material_type = result.get("material_type", "未知类型") 125 | 126 | status_counts[status] += 1 127 | priority_counts[priority] += 1 128 | material_counts[material_type] += 1 129 | 130 | return { 131 | "total_results": len(all_results), 132 | "validation_results": len(validation_cache), 133 | "cross_validation_results": len(cross_validation_cache), 134 | "status_distribution": dict(status_counts), 135 | "priority_distribution": dict(priority_counts), 136 | "material_distribution": dict(material_counts), 137 | "issues_count": len([r for r in all_results if r.get("result", "").strip() != "✅通过"]) 138 | } 139 | 140 | def get_report_summary(self, organized_data: Dict[str, Any]) -> Dict[str, Any]: 141 | """ 142 | 生成报告摘要 143 | 144 | Args: 145 | organized_data: 整理后的数据 146 | 147 | Returns: 148 | 报告摘要信息 149 | """ 150 | material_groups = organized_data.get("material_groups", {}) 151 | statistics = organized_data.get("statistics", {}) 152 | 153 | # 计算各类问题数量 154 | error_count = sum( 155 | len([r for r in results if r.get("result", "").startswith("❌")]) 156 | for results in material_groups.values() 157 | ) 158 | 159 | warning_count = sum( 160 | len([r for r in results if r.get("result", "").startswith("⚠️")]) 161 | for results in material_groups.values() 162 | ) 163 | 164 | # 最高优先级问题 165 | high_priority_issues = [] 166 | for material_type, results in material_groups.items(): 167 | for result in results: 168 | if result.get("priority") in ["极高", "高"]: 169 | high_priority_issues.append({ 170 | "material_type": material_type, 171 | "rule_name": result.get("rule_name", ""), 172 | "details": result.get("details", ""), 173 | "priority": result.get("priority", "") 174 | }) 175 | 176 | return { 177 | "total_materials_checked": len(statistics.get("material_distribution", {})), 178 | "total_issues": organized_data.get("total_issues", 0), 179 | "error_count": error_count, 180 | "warning_count": warning_count, 181 | "high_priority_count": len(high_priority_issues), 182 | "high_priority_issues": high_priority_issues[:5], # 只显示前5个 183 | "material_issue_summary": { 184 | material_type: len(results) 185 | for material_type, results in material_groups.items() 186 | } 187 | } 188 | 189 | def _get_current_timestamp(self) -> str: 190 | """获取当前时间戳""" 191 | from datetime import datetime 192 | return datetime.now().isoformat() 193 | 194 | 195 | # 全局缓存管理器实例 196 | cache_manager = ValidationCacheManager() 197 | 198 | 199 | def organize_audit_cache(state) -> Dict[str, Any]: 200 | """ 201 | 整理审核缓存数据的便捷函数 202 | 203 | Args: 204 | state: 审核状态 205 | 206 | Returns: 207 | 整理后的缓存数据 208 | """ 209 | validation_cache = state.get("validation_cache", []) 210 | cross_validation_cache = state.get("cross_validation_cache", []) 211 | 212 | return cache_manager.organize_validation_cache(validation_cache, cross_validation_cache) 213 | 214 | 215 | def get_report_data_from_cache(state) -> Dict[str, Any]: 216 | """ 217 | 从缓存中获取报告数据 218 | 219 | Args: 220 | state: 审核状态 221 | 222 | Returns: 223 | 报告数据 224 | """ 225 | organized_data = organize_audit_cache(state) 226 | summary = cache_manager.get_report_summary(organized_data) 227 | 228 | return { 229 | "organized_data": organized_data, 230 | "summary": summary, 231 | "cache_processed": True 232 | } 
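# --- Usage sketch (added for illustration; not part of the original module) ---
# Demonstrates the record shape ValidationCacheManager expects. The sample
# records below are hypothetical; in production they are appended to
# validation_cache / cross_validation_cache by the validation nodes.
if __name__ == "__main__":
    demo_state = {
        "validation_cache": [
            {"material_type": "9.论文", "rule_name": "第一作者核查",
             "result": "❌不通过", "priority": "高", "details": "示例:作者署名与申报人不符"},
            {"material_type": "1.教育经历", "rule_name": "学历证书核查",
             "result": "✅通过", "priority": "中", "details": ""},
        ],
        "cross_validation_cache": [],
    }
    report = get_report_data_from_cache(demo_state)
    # The passing record is filtered out, so only one issue remains.
    print(report["summary"]["total_issues"], report["summary"]["error_count"])  # -> 1 1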
-------------------------------------------------------------------------------- /src/tools/langsmith_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | LangSmith集成工具类 3 | 4 | 提供LangGraph项目的调试、监控和评估功能 5 | """ 6 | 7 | import os 8 | import uuid 9 | from typing import Dict, Any, Optional, List 10 | from datetime import datetime 11 | import getpass 12 | 13 | def setup_langsmith_environment(): 14 | """ 15 | 设置LangSmith环境变量 16 | 17 | 根据LangGraph最佳实践配置LangSmith追踪 18 | """ 19 | def _set_env(var: str): 20 | """安全地设置环境变量""" 21 | if not os.environ.get(var): 22 | # 优先从.env文件读取,如果没有则提示输入 23 | value = getpass.getpass(f"请输入 {var}: ") 24 | os.environ[var] = value 25 | 26 | # 设置必要的API密钥 27 | _set_env("LANGSMITH_API_KEY") 28 | 29 | # 配置LangSmith追踪 30 | os.environ["LANGCHAIN_TRACING_V2"] = "true" 31 | os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" 32 | os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT", "Audit_Workflow_Debug") 33 | os.environ["LANGSMITH_TRACING"] = "true" 34 | 35 | print("✅ LangSmith环境配置完成") 36 | print(f"📊 项目名称: {os.environ['LANGCHAIN_PROJECT']}") 37 | 38 | 39 | def create_run_config( 40 | run_name: Optional[str] = None, 41 | tags: Optional[List[str]] = None, 42 | metadata: Optional[Dict[str, Any]] = None, 43 | thread_id: Optional[str] = None 44 | ) -> Dict[str, Any]: 45 | """ 46 | 创建LangGraph运行配置,支持LangSmith追踪 47 | 48 | Args: 49 | run_name: 运行名称 50 | tags: 标签列表 51 | metadata: 元数据 52 | thread_id: 线程ID 53 | 54 | Returns: 55 | 配置字典 56 | """ 57 | config = {} 58 | 59 | # 生成唯一的运行ID 60 | if not config.get("run_id"): 61 | config["run_id"] = str(uuid.uuid4()) 62 | 63 | # 设置运行名称 64 | if run_name: 65 | config["run_name"] = run_name 66 | else: 67 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 68 | config["run_name"] = f"audit_workflow_{timestamp}" 69 | 70 | # 设置标签 71 | default_tags = ["audit_workflow", "langgraph", "production"] 72 | if tags: 73 | config["tags"] = default_tags + tags 74 | else: 75 | config["tags"] = default_tags 76 | 77 | # 设置元数据 78 | default_metadata = { 79 | "version": "1.0.0", 80 | "environment": os.getenv("ENVIRONMENT", "development"), 81 | "project": "职称评审材料审核系统" 82 | } 83 | if metadata: 84 | default_metadata.update(metadata) 85 | config["metadata"] = default_metadata 86 | 87 | # 设置可配置参数 88 | configurable = {} 89 | if thread_id: 90 | configurable["thread_id"] = thread_id 91 | 92 | if configurable: 93 | config["configurable"] = configurable 94 | 95 | return config 96 | 97 | 98 | def log_workflow_step(step_name: str, status: str, data: Optional[Dict] = None): 99 | """ 100 | 记录工作流步骤,便于调试 101 | 102 | Args: 103 | step_name: 步骤名称 104 | status: 状态 (started, completed, failed) 105 | data: 附加数据 106 | """ 107 | timestamp = datetime.now().isoformat() 108 | log_entry = { 109 | "timestamp": timestamp, 110 | "step": step_name, 111 | "status": status, 112 | "data": data or {} 113 | } 114 | 115 | # 使用结构化日志,LangSmith可以捕获 116 | print(f"🔍 [{timestamp}] {step_name.upper()}: {status}") 117 | if data: 118 | print(f" 📝 数据: {data}") 119 | 120 | 121 | def create_debug_config(breakpoints: Optional[List[str]] = None) -> Dict[str, Any]: 122 | """ 123 | 创建调试配置 124 | 125 | Args: 126 | breakpoints: 断点列表 127 | 128 | Returns: 129 | 调试配置 130 | """ 131 | config = create_run_config( 132 | run_name="debug_session", 133 | tags=["debug", "development"], 134 | metadata={"mode": "debug"} 135 | ) 136 | 137 | if breakpoints: 138 | config["breakpoints"] = breakpoints 139 | 140 | # 启用详细追踪 141 | config["recursion_limit"] = 50 142 | 
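    # Note (added): LangGraph aborts with GraphRecursionError once a run exceeds
    # the recursion limit (default 25); 50 gives the parallel branches headroom.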
143 | return config 144 | 145 | 146 | def hide_sensitive_data(inputs: Dict[str, Any]) -> Dict[str, Any]: 147 | """ 148 | 隐藏敏感数据,避免在LangSmith中暴露 149 | 150 | Args: 151 | inputs: 输入数据 152 | 153 | Returns: 154 | 脱敏后的数据 155 | """ 156 | copied = inputs.copy() 157 | 158 | # 隐藏敏感字段 159 | sensitive_fields = ["api_key", "password", "token", "secret"] 160 | 161 | for key in copied: 162 | if any(sensitive in key.lower() for sensitive in sensitive_fields): 163 | copied[key] = "***HIDDEN***" 164 | 165 | # 隐藏长文本内容 166 | if isinstance(copied[key], str) and len(copied[key]) > 1000: 167 | copied[key] = copied[key][:100] + "...[内容过长已截断]" 168 | 169 | return copied 170 | 171 | 172 | class LangSmithEventLogger: 173 | """LangSmith事件记录器""" 174 | 175 | def __init__(self, project_name: str = "Audit_Workflow"): 176 | self.project_name = project_name 177 | self.events = [] 178 | 179 | def log_node_start(self, node_name: str, state: Dict[str, Any]): 180 | """记录节点开始""" 181 | event = { 182 | "type": "node_start", 183 | "node": node_name, 184 | "timestamp": datetime.now().isoformat(), 185 | "state_keys": list(state.keys()) 186 | } 187 | self.events.append(event) 188 | log_workflow_step(f"节点开始: {node_name}", "started") 189 | 190 | def log_node_complete(self, node_name: str, result: Dict[str, Any]): 191 | """记录节点完成""" 192 | event = { 193 | "type": "node_complete", 194 | "node": node_name, 195 | "timestamp": datetime.now().isoformat(), 196 | "result_keys": list(result.keys()) 197 | } 198 | self.events.append(event) 199 | log_workflow_step(f"节点完成: {node_name}", "completed", {"result_keys": list(result.keys())}) 200 | 201 | def log_node_error(self, node_name: str, error: Exception): 202 | """记录节点错误""" 203 | event = { 204 | "type": "node_error", 205 | "node": node_name, 206 | "timestamp": datetime.now().isoformat(), 207 | "error": str(error), 208 | "error_type": type(error).__name__ 209 | } 210 | self.events.append(event) 211 | log_workflow_step(f"节点错误: {node_name}", "failed", {"error": str(error)}) 212 | 213 | def get_events(self) -> List[Dict[str, Any]]: 214 | """获取所有事件""" 215 | return self.events 216 | 217 | def clear_events(self): 218 | """清空事件""" 219 | self.events.clear() 220 | 221 | 222 | # 全局事件记录器实例 223 | event_logger = LangSmithEventLogger() 224 | 225 | 226 | def with_langsmith_tracing(func): 227 | """ 228 | 装饰器:为函数添加LangSmith追踪 229 | """ 230 | def wrapper(*args, **kwargs): 231 | from langchain_core.tracers.context import tracing_v2_enabled 232 | from langsmith import Client 233 | 234 | # 创建LangSmith客户端,隐藏敏感数据 235 | client = Client( 236 | hide_inputs=hide_sensitive_data, 237 | hide_outputs=hide_sensitive_data 238 | ) 239 | 240 | # 在追踪上下文中执行函数 241 | with tracing_v2_enabled(client=client): 242 | return func(*args, **kwargs) 243 | 244 | return wrapper 245 | 246 | 247 | def stream_with_debug(graph, inputs: Dict[str, Any], config: Optional[Dict[str, Any]] = None): 248 | """ 249 | 流式执行图并输出调试信息 250 | 251 | Args: 252 | graph: LangGraph图实例 253 | inputs: 输入数据 254 | config: 配置信息 255 | 256 | Yields: 257 | 流式输出结果 258 | """ 259 | if not config: 260 | config = create_debug_config() 261 | 262 | print(f"🚀 开始执行工作流...") 263 | print(f"📊 运行ID: {config.get('run_id')}") 264 | print(f"🏷️ 标签: {config.get('tags', [])}") 265 | 266 | try: 267 | # 使用debug模式流式执行 268 | for chunk in graph.stream(inputs, config, stream_mode="debug"): 269 | print(f"🔍 调试信息: {chunk}") 270 | yield chunk 271 | 272 | except Exception as e: 273 | print(f"❌ 执行失败: {str(e)}") 274 | event_logger.log_node_error("workflow", e) 275 | raise 276 | 277 | 278 | if __name__ == 
"__main__": 279 | # 测试LangSmith配置 280 | setup_langsmith_environment() 281 | 282 | # 测试配置创建 283 | test_config = create_run_config( 284 | run_name="test_run", 285 | tags=["test"], 286 | metadata={"test": True} 287 | ) 288 | print(f"测试配置: {test_config}") -------------------------------------------------------------------------------- /src/config/redis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Redis 配置和连接管理 3 | 4 | 提供 Redis 连接配置、健康检查和连接池管理功能 5 | """ 6 | 7 | import os 8 | import redis 9 | from typing import Optional, Dict, Any 10 | from dataclasses import dataclass 11 | from loguru import logger 12 | 13 | @dataclass 14 | class RedisConfig: 15 | """Redis 配置类""" 16 | 17 | # 连接配置 18 | host: str = "localhost" 19 | port: int = 6379 20 | db: int = 0 21 | password: Optional[str] = None 22 | 23 | # 连接池配置 24 | max_connections: int = 20 25 | retry_on_timeout: bool = True 26 | 27 | # 超时配置 28 | socket_connect_timeout: int = 5 29 | socket_timeout: int = 5 30 | 31 | # TTL 配置 (用于 LangGraph checkpointer) 32 | default_ttl: int = 3600 # 1小时,单位:秒 33 | refresh_on_read: bool = True 34 | 35 | # 键前缀 36 | checkpoint_prefix: str = "langgraph:checkpoint:" 37 | store_prefix: str = "langgraph:store:" 38 | 39 | @classmethod 40 | def from_env(cls) -> "RedisConfig": 41 | """从环境变量创建 Redis 配置""" 42 | return cls( 43 | host=os.getenv("REDIS_HOST", "localhost"), 44 | port=int(os.getenv("REDIS_PORT", "6379")), 45 | db=int(os.getenv("REDIS_DB", "0")), 46 | password=os.getenv("REDIS_PASSWORD"), 47 | max_connections=int(os.getenv("REDIS_MAX_CONNECTIONS", "20")), 48 | socket_connect_timeout=int(os.getenv("REDIS_SOCKET_CONNECT_TIMEOUT", "5")), 49 | socket_timeout=int(os.getenv("REDIS_SOCKET_TIMEOUT", "5")), 50 | default_ttl=int(os.getenv("REDIS_DEFAULT_TTL", "3600")), 51 | refresh_on_read=os.getenv("REDIS_REFRESH_ON_READ", "true").lower() == "true", 52 | checkpoint_prefix=os.getenv("REDIS_CHECKPOINT_PREFIX", "langgraph:checkpoint:"), 53 | store_prefix=os.getenv("REDIS_STORE_PREFIX", "langgraph:store:") 54 | ) 55 | 56 | def get_connection_url(self) -> str: 57 | """获取 Redis 连接 URL""" 58 | if self.password: 59 | return f"redis://:{self.password}@{self.host}:{self.port}/{self.db}" 60 | return f"redis://{self.host}:{self.port}/{self.db}" 61 | 62 | def get_ttl_config(self) -> Dict[str, Any]: 63 | """获取 TTL 配置字典""" 64 | return { 65 | "default_ttl": self.default_ttl // 60, # LangGraph Redis 期望分钟 66 | "refresh_on_read": self.refresh_on_read 67 | } 68 | 69 | 70 | class RedisManager: 71 | """Redis 连接管理器""" 72 | 73 | def __init__(self, config: Optional[RedisConfig] = None): 74 | self.config = config or RedisConfig.from_env() 75 | self._redis_client: Optional[redis.Redis] = None 76 | 77 | @property 78 | def redis_client(self) -> redis.Redis: 79 | """获取 Redis 客户端(单例模式)""" 80 | if self._redis_client is None: 81 | # 创建连接池 82 | pool = redis.ConnectionPool( 83 | host=self.config.host, 84 | port=self.config.port, 85 | db=self.config.db, 86 | password=self.config.password, 87 | max_connections=self.config.max_connections, 88 | retry_on_timeout=self.config.retry_on_timeout, 89 | socket_connect_timeout=self.config.socket_connect_timeout, 90 | socket_timeout=self.config.socket_timeout, 91 | decode_responses=True 92 | ) 93 | # 显式创建同步 Redis 客户端 94 | self._redis_client = redis.Redis(connection_pool=pool) 95 | return self._redis_client 96 | 97 | def test_connection(self) -> bool: 98 | """测试 Redis 连接""" 99 | try: 100 | # 直接调用 ping(),同步客户端应该返回布尔值 101 | result = self.redis_client.ping() 102 | # 确保结果是布尔值 103 | 
success = bool(result) 104 | if success: 105 | logger.info(f"✅ Redis 连接成功: {self.config.host}:{self.config.port}") 106 | else: 107 | logger.error(f"❌ Redis ping 返回 False") 108 | return success 109 | except Exception as e: 110 | logger.error(f"❌ Redis 连接失败: {e}") 111 | return False 112 | 113 | def get_info(self) -> Dict[str, Any]: 114 | """获取 Redis 服务器信息""" 115 | try: 116 | info = self.redis_client.info() 117 | # 确保返回的是字典类型 118 | if isinstance(info, dict): 119 | return info 120 | else: 121 | logger.warning(f"Redis info() 返回了非字典类型: {type(info)}") 122 | return {} 123 | except Exception as e: 124 | logger.error(f"获取 Redis 信息失败: {e}") 125 | return {} 126 | 127 | def clear_cache(self, pattern: str = "*") -> int: 128 | """清理缓存""" 129 | try: 130 | keys = self.redis_client.keys(pattern) 131 | # 确保 keys 是列表类型 132 | if isinstance(keys, (list, tuple)) and keys: 133 | deleted = self.redis_client.delete(*keys) 134 | # 安全处理 deleted 的类型转换 135 | if isinstance(deleted, int): 136 | deleted_count = deleted 137 | else: 138 | # 对于非整数类型(包括异步类型),返回 0 139 | logger.warning(f"Redis delete() 返回了非整数类型: {type(deleted)},默认为 0") 140 | deleted_count = 0 141 | 142 | logger.info(f"清理了 {deleted_count} 个缓存键") 143 | return deleted_count 144 | elif isinstance(keys, (list, tuple)): 145 | # 空列表 146 | return 0 147 | else: 148 | logger.warning(f"Redis keys() 返回了非列表类型: {type(keys)}") 149 | return 0 150 | except Exception as e: 151 | logger.error(f"清理缓存失败: {e}") 152 | return 0 153 | 154 | def get_memory_usage(self) -> Dict[str, Any]: 155 | """获取内存使用情况""" 156 | try: 157 | # 直接调用 info() 方法,同步 Redis 客户端应该返回字典 158 | info = self.redis_client.info("memory") 159 | 160 | # 确保返回的是字典类型 161 | if isinstance(info, dict): 162 | return { 163 | "used_memory": info.get("used_memory", 0), 164 | "used_memory_human": info.get("used_memory_human", "0B"), 165 | "used_memory_peak": info.get("used_memory_peak", 0), 166 | "used_memory_peak_human": info.get("used_memory_peak_human", "0B"), 167 | "total_system_memory": info.get("total_system_memory", 0), 168 | "total_system_memory_human": info.get("total_system_memory_human", "0B") 169 | } 170 | else: 171 | # 如果不是字典类型,记录警告并返回空字典 172 | logger.warning(f"Redis info() 返回了非字典类型: {type(info)}") 173 | return {} 174 | except Exception as e: 175 | logger.error(f"获取内存使用情况失败: {e}") 176 | return {} 177 | 178 | def close(self): 179 | """关闭 Redis 连接""" 180 | if self._redis_client: 181 | self._redis_client.close() 182 | self._redis_client = None 183 | logger.info("Redis 连接已关闭") 184 | 185 | 186 | # 全局 Redis 管理器实例 187 | _redis_manager: Optional[RedisManager] = None 188 | 189 | 190 | def get_redis_manager() -> RedisManager: 191 | """获取全局 Redis 管理器实例""" 192 | global _redis_manager 193 | if _redis_manager is None: 194 | _redis_manager = RedisManager() 195 | return _redis_manager 196 | 197 | 198 | def get_redis_config() -> RedisConfig: 199 | """获取 Redis 配置""" 200 | return get_redis_manager().config 201 | 202 | 203 | def test_redis_connection() -> bool: 204 | """测试 Redis 连接""" 205 | return get_redis_manager().test_connection() 206 | 207 | 208 | # 健康检查函数 209 | def redis_health_check() -> Dict[str, Any]: 210 | """Redis 健康检查""" 211 | manager = get_redis_manager() 212 | 213 | health_info = { 214 | "service": "redis", 215 | "status": "unknown", 216 | "details": {} 217 | } 218 | 219 | try: 220 | # 测试连接 221 | if manager.test_connection(): 222 | health_info["status"] = "healthy" 223 | health_info["details"]["connection"] = "ok" 224 | 225 | # 获取服务器信息 226 | info = manager.get_info() 227 | if isinstance(info, dict): 228 | 
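                # Note (added): these keys come from the Redis INFO command;
                # uptime_in_seconds is, as the name says, reported in seconds.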
health_info["details"]["version"] = info.get("redis_version", "unknown") 229 | health_info["details"]["uptime"] = info.get("uptime_in_seconds", 0) 230 | 231 | # 获取内存使用情况 232 | memory_info = manager.get_memory_usage() 233 | health_info["details"]["memory"] = memory_info 234 | 235 | else: 236 | health_info["status"] = "unhealthy" 237 | health_info["details"]["error"] = "connection_failed" 238 | 239 | except Exception as e: 240 | health_info["status"] = "unhealthy" 241 | health_info["details"]["error"] = str(e) 242 | 243 | return health_info -------------------------------------------------------------------------------- /static/styles.css: -------------------------------------------------------------------------------- 1 | /* LangGraph 职称评审系统 - 样式文件 */ 2 | 3 | :root { 4 | --primary-color: #0d6efd; 5 | --secondary-color: #6c757d; 6 | --success-color: #198754; 7 | --danger-color: #dc3545; 8 | --warning-color: #ffc107; 9 | --info-color: #0dcaf0; 10 | --light-color: #f8f9fa; 11 | --dark-color: #212529; 12 | } 13 | 14 | body { 15 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 16 | background-color: #f5f7fa; 17 | color: var(--dark-color); 18 | } 19 | 20 | .navbar-brand { 21 | font-weight: 600; 22 | font-size: 1.25rem; 23 | } 24 | 25 | .card { 26 | border: none; 27 | border-radius: 12px; 28 | transition: all 0.3s ease; 29 | } 30 | 31 | .card:hover { 32 | transform: translateY(-2px); 33 | } 34 | 35 | .card-header { 36 | border-radius: 12px 12px 0 0 !important; 37 | border: none; 38 | font-weight: 600; 39 | } 40 | 41 | .btn { 42 | border-radius: 8px; 43 | font-weight: 500; 44 | transition: all 0.3s ease; 45 | } 46 | 47 | .btn:hover { 48 | transform: translateY(-1px); 49 | } 50 | 51 | /* 特色卡片样式 */ 52 | .feature-card { 53 | text-align: center; 54 | padding: 2rem 1rem; 55 | border-radius: 12px; 56 | background: white; 57 | box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); 58 | transition: all 0.3s ease; 59 | margin-bottom: 1.5rem; 60 | } 61 | 62 | .feature-card:hover { 63 | transform: translateY(-4px); 64 | box-shadow: 0 8px 15px rgba(0, 0, 0, 0.15); 65 | } 66 | 67 | .feature-card i { 68 | font-size: 2.5rem; 69 | margin-bottom: 1rem; 70 | } 71 | 72 | .feature-card h5 { 73 | color: var(--dark-color); 74 | margin-bottom: 0.5rem; 75 | font-weight: 600; 76 | } 77 | 78 | .feature-card p { 79 | color: var(--secondary-color); 80 | margin: 0; 81 | font-size: 0.9rem; 82 | } 83 | 84 | /* 进度条样式 */ 85 | .progress { 86 | border-radius: 10px; 87 | background-color: #e9ecef; 88 | } 89 | 90 | .progress-bar { 91 | border-radius: 10px; 92 | transition: width 0.6s ease; 93 | } 94 | 95 | /* 工作流步骤样式 */ 96 | .workflow-step { 97 | display: flex; 98 | align-items: center; 99 | padding: 1rem; 100 | margin-bottom: 0.5rem; 101 | border-radius: 8px; 102 | background: white; 103 | border-left: 4px solid #e9ecef; 104 | transition: all 0.3s ease; 105 | } 106 | 107 | .workflow-step.active { 108 | border-left-color: var(--primary-color); 109 | background: rgba(13, 110, 253, 0.05); 110 | } 111 | 112 | .workflow-step.completed { 113 | border-left-color: var(--success-color); 114 | background: rgba(25, 135, 84, 0.05); 115 | } 116 | 117 | .workflow-step.error { 118 | border-left-color: var(--danger-color); 119 | background: rgba(220, 53, 69, 0.05); 120 | } 121 | 122 | .workflow-step-icon { 123 | width: 40px; 124 | height: 40px; 125 | border-radius: 50%; 126 | display: flex; 127 | align-items: center; 128 | justify-content: center; 129 | margin-right: 1rem; 130 | font-size: 1.2rem; 131 | background: #e9ecef; 132 | color: 
var(--secondary-color); 133 | transition: all 0.3s ease; 134 | } 135 | 136 | .workflow-step.active .workflow-step-icon { 137 | background: var(--primary-color); 138 | color: white; 139 | } 140 | 141 | .workflow-step.completed .workflow-step-icon { 142 | background: var(--success-color); 143 | color: white; 144 | } 145 | 146 | .workflow-step.error .workflow-step-icon { 147 | background: var(--danger-color); 148 | color: white; 149 | } 150 | 151 | .workflow-step-content h6 { 152 | margin: 0 0 0.25rem 0; 153 | font-weight: 600; 154 | } 155 | 156 | .workflow-step-content p { 157 | margin: 0; 158 | color: var(--secondary-color); 159 | font-size: 0.9rem; 160 | } 161 | 162 | /* 日志容器样式 */ 163 | .log-container { 164 | height: 300px; 165 | overflow-y: auto; 166 | background: #2c3e50; 167 | color: #ecf0f1; 168 | font-family: 'Courier New', monospace; 169 | font-size: 0.85rem; 170 | padding: 1rem; 171 | border-radius: 8px; 172 | } 173 | 174 | .log-entry { 175 | margin-bottom: 0.5rem; 176 | padding: 0.25rem 0.5rem; 177 | border-radius: 4px; 178 | word-wrap: break-word; 179 | } 180 | 181 | .log-entry.started { 182 | background: rgba(13, 110, 253, 0.2); 183 | border-left: 3px solid var(--primary-color); 184 | } 185 | 186 | .log-entry.progress { 187 | background: rgba(255, 193, 7, 0.2); 188 | border-left: 3px solid var(--warning-color); 189 | } 190 | 191 | .log-entry.completed { 192 | background: rgba(25, 135, 84, 0.2); 193 | border-left: 3px solid var(--success-color); 194 | } 195 | 196 | .log-entry.error { 197 | background: rgba(220, 53, 69, 0.2); 198 | border-left: 3px solid var(--danger-color); 199 | } 200 | 201 | .log-timestamp { 202 | color: #95a5a6; 203 | font-size: 0.75rem; 204 | margin-right: 0.5rem; 205 | } 206 | 207 | /* 任务列表样式 */ 208 | .task-item { 209 | padding: 1rem; 210 | border-bottom: 1px solid #e9ecef; 211 | cursor: pointer; 212 | transition: all 0.3s ease; 213 | } 214 | 215 | .task-item:hover { 216 | background: rgba(13, 110, 253, 0.05); 217 | } 218 | 219 | .task-item.active { 220 | background: rgba(13, 110, 253, 0.1); 221 | border-left: 4px solid var(--primary-color); 222 | } 223 | 224 | .task-status { 225 | font-size: 0.75rem; 226 | padding: 0.25rem 0.5rem; 227 | border-radius: 12px; 228 | font-weight: 600; 229 | text-transform: uppercase; 230 | } 231 | 232 | .task-status.started { 233 | background: rgba(13, 110, 253, 0.1); 234 | color: var(--primary-color); 235 | } 236 | 237 | .task-status.processing { 238 | background: rgba(255, 193, 7, 0.1); 239 | color: #996404; 240 | } 241 | 242 | .task-status.completed { 243 | background: rgba(25, 135, 84, 0.1); 244 | color: var(--success-color); 245 | } 246 | 247 | .task-status.failed { 248 | background: rgba(220, 53, 69, 0.1); 249 | color: var(--danger-color); 250 | } 251 | 252 | /* 状态徽章样式 */ 253 | .badge.bg-primary { 254 | background-color: var(--primary-color) !important; 255 | } 256 | 257 | .badge.bg-warning { 258 | background-color: var(--warning-color) !important; 259 | color: var(--dark-color) !important; 260 | } 261 | 262 | .badge.bg-success { 263 | background-color: var(--success-color) !important; 264 | } 265 | 266 | .badge.bg-danger { 267 | background-color: var(--danger-color) !important; 268 | } 269 | 270 | /* 文件输入样式 */ 271 | .form-control:focus { 272 | border-color: var(--primary-color); 273 | box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25); 274 | } 275 | 276 | .form-select:focus { 277 | border-color: var(--primary-color); 278 | box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25); 279 | } 280 | 281 | /* 动画效果 */ 282 | @keyframes 
pulse { 283 | 0% { 284 | transform: scale(1); 285 | } 286 | 50% { 287 | transform: scale(1.05); 288 | } 289 | 100% { 290 | transform: scale(1); 291 | } 292 | } 293 | 294 | .pulse { 295 | animation: pulse 2s infinite; 296 | } 297 | 298 | @keyframes fadeInUp { 299 | from { 300 | opacity: 0; 301 | transform: translateY(30px); 302 | } 303 | to { 304 | opacity: 1; 305 | transform: translateY(0); 306 | } 307 | } 308 | 309 | .fade-in-up { 310 | animation: fadeInUp 0.6s ease-out; 311 | } 312 | 313 | /* 响应式设计 */ 314 | @media (max-width: 768px) { 315 | .feature-card { 316 | padding: 1.5rem 1rem; 317 | } 318 | 319 | .workflow-step { 320 | flex-direction: column; 321 | text-align: center; 322 | } 323 | 324 | .workflow-step-icon { 325 | margin-right: 0; 326 | margin-bottom: 0.5rem; 327 | } 328 | 329 | .log-container { 330 | height: 200px; 331 | font-size: 0.8rem; 332 | } 333 | } 334 | 335 | /* 滚动条样式 */ 336 | .log-container::-webkit-scrollbar { 337 | width: 8px; 338 | } 339 | 340 | .log-container::-webkit-scrollbar-track { 341 | background: #34495e; 342 | border-radius: 4px; 343 | } 344 | 345 | .log-container::-webkit-scrollbar-thumb { 346 | background: #7f8c8d; 347 | border-radius: 4px; 348 | } 349 | 350 | .log-container::-webkit-scrollbar-thumb:hover { 351 | background: #95a5a6; 352 | } 353 | 354 | /* 加载动画 */ 355 | .spinner-border-sm { 356 | width: 1rem; 357 | height: 1rem; 358 | } 359 | 360 | /* 工具提示样式 */ 361 | .tooltip { 362 | font-size: 0.8rem; 363 | } 364 | 365 | /* 模态框样式 */ 366 | .modal-content { 367 | border-radius: 12px; 368 | border: none; 369 | box-shadow: 0 20px 25px rgba(0, 0, 0, 0.15); 370 | } 371 | 372 | .modal-header { 373 | border-radius: 12px 12px 0 0; 374 | border-bottom: 1px solid #e9ecef; 375 | } 376 | 377 | /* 代码块样式 */ 378 | pre { 379 | background: #2c3e50 !important; 380 | color: #ecf0f1 !important; 381 | border: none; 382 | border-radius: 8px; 383 | font-family: 'Courier New', monospace; 384 | font-size: 0.85rem; 385 | max-height: 400px; 386 | overflow-y: auto; 387 | } 388 | 389 | /* 连接状态指示器 */ 390 | .connection-status { 391 | position: fixed; 392 | top: 20px; 393 | right: 20px; 394 | z-index: 1050; 395 | padding: 0.5rem 1rem; 396 | border-radius: 20px; 397 | font-size: 0.8rem; 398 | font-weight: 600; 399 | transition: all 0.3s ease; 400 | } 401 | 402 | .connection-status.connected { 403 | background: rgba(25, 135, 84, 0.9); 404 | color: white; 405 | } 406 | 407 | .connection-status.disconnected { 408 | background: rgba(220, 53, 69, 0.9); 409 | color: white; 410 | } 411 | 412 | .connection-status.connecting { 413 | background: rgba(255, 193, 7, 0.9); 414 | color: var(--dark-color); 415 | } -------------------------------------------------------------------------------- /src/tools/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 文件处理工具 3 | 4 | 提供文件处理相关的工具函数: 5 | - ZIP文件解压 6 | - 17个标准文件夹结构验证 7 | - Markdown文件处理 8 | - 文件路径处理 9 | """ 10 | 11 | import zipfile 12 | import re 13 | import markdown 14 | from pathlib import Path 15 | from typing import List, Dict, Any, Optional 16 | 17 | async def extract_zip_file(zip_path: str) -> Dict[str, Any]: 18 | """ 19 | 解压ZIP文件并返回解压结果 20 | 21 | Args: 22 | zip_path: ZIP文件路径 23 | 24 | Returns: 25 | 解压结果字典,包含解压路径和文件列表 26 | """ 27 | try: 28 | import asyncio 29 | 30 | # 使用异步方式处理ZIP文件 31 | def _extract_zip(): 32 | with zipfile.ZipFile(zip_path, 'r') as zip_ref: 33 | # 解压到当前目录的 extracted 文件夹 34 | extract_dir = Path(zip_path).parent / "extracted" 35 | extract_dir.mkdir(exist_ok=True) 36 | 37 | 
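                # Note (added): zipfile already strips ".." and absolute member
                # paths, but extractall() enforces no size limits - for untrusted
                # uploads, a total-size/compression-ratio check here would block
                # zip bombs.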
zip_ref.extractall(extract_dir) 38 | 39 | # 收集所有解压的文件 40 | extracted_files = [] 41 | import os 42 | # 使用os.walk代替rglob来避免阻塞调用 43 | for root, dirs, files in os.walk(extract_dir): 44 | for file in files: 45 | file_path = os.path.join(root, file) 46 | extracted_files.append(file_path) 47 | return extract_dir, extracted_files 48 | 49 | extract_dir, extracted_files = await asyncio.to_thread(_extract_zip) 50 | 51 | return { 52 | "extraction_path": str(extract_dir), 53 | "files": extracted_files, 54 | "success": True 55 | } 56 | 57 | except Exception as e: 58 | print(f"解压失败: {e}") 59 | return { 60 | "extraction_path": None, 61 | "files": [], 62 | "success": False, 63 | "error": str(e) 64 | } 65 | 66 | async def validate_folder_structure(extraction_path: str) -> Dict[str, Any]: 67 | """ 68 | 验证17个标准文件夹结构 69 | 70 | 支持文件夹在根目录或下一层子目录中 71 | 72 | Args: 73 | extraction_path: 解压后的根目录路径 74 | 75 | Returns: 76 | 验证结果字典 77 | """ 78 | # 17个标准文件夹名称 79 | standard_folders = [ 80 | "1.教育经历", 81 | "2.工作经历", 82 | "3.继续教育(培训情况)", 83 | "4.学术技术兼职情况", 84 | "5.获奖情况", 85 | "6.获得荣誉称号情况", 86 | "7.主持参与科研项目(基金)情况", 87 | "8.主持参与工程技术项目情况", 88 | "9.论文", 89 | "10.著(译)作(教材)", 90 | "11.专利(著作权)情况", 91 | "12.主持参与指定标准情况", 92 | "13.成果被批示、采纳、运用和推广情况", 93 | "14.资质证书", 94 | "15.奖惩情况", 95 | "16.考核情况", 96 | "17.申报材料附件信息" 97 | ] 98 | 99 | extraction_dir = Path(extraction_path) 100 | 101 | # 递归查找17个标准文件夹(在根目录或下一层子目录中) 102 | async def find_folders_recursively(search_dir: Path, max_depth: int = 2) -> Dict[str, str]: 103 | """ 104 | 递归查找标准文件夹 105 | 106 | Args: 107 | search_dir: 搜索目录 108 | max_depth: 最大搜索深度(1=仅根目录,2=根目录+一层子目录) 109 | 110 | Returns: 111 | 找到的文件夹映射 {文件夹名: 路径} 112 | """ 113 | found_folders = {} 114 | 115 | async def _search_directory(current_dir: Path, current_depth: int): 116 | if current_depth > max_depth: 117 | return 118 | 119 | try: 120 | # 使用异步方式读取目录 - 使用os.scandir避免阻塞 121 | import asyncio 122 | import os 123 | 124 | try: 125 | # 使用asyncio.to_thread包装os.scandir调用 126 | def _list_directory(): 127 | return list(os.scandir(current_dir)) 128 | 129 | entries = await asyncio.to_thread(_list_directory) 130 | 131 | for entry in entries: 132 | # 检查是否是目录 133 | def _check_is_dir(): 134 | return entry.is_dir() 135 | 136 | if await asyncio.to_thread(_check_is_dir): 137 | folder_name = entry.name 138 | # 检查是否是标准文件夹 139 | if folder_name in standard_folders and folder_name not in found_folders: 140 | found_folders[folder_name] = str(entry.path) 141 | print(f"📁 找到标准文件夹: {folder_name} -> {entry.path}") 142 | 143 | # 如果还没达到最大深度,继续递归搜索 144 | if current_depth < max_depth: 145 | from pathlib import Path 146 | await _search_directory(Path(entry.path), current_depth + 1) 147 | except OSError as e: 148 | print(f"⚠️ 无法扫描目录 {current_dir}: {e}") 149 | except PermissionError: 150 | print(f"⚠️ 无法访问目录: {current_dir}") 151 | 152 | await _search_directory(search_dir, 1) 153 | return found_folders 154 | 155 | print(f"🔍 开始递归查找17个标准文件夹(最大深度2层)...") 156 | found_folder_paths = await find_folders_recursively(extraction_dir, max_depth=2) 157 | 158 | # 构建文件夹信息 159 | folders_found = [] 160 | missing_folders = [] 161 | 162 | for standard_folder in standard_folders: 163 | if standard_folder in found_folder_paths: 164 | folders_found.append({ 165 | "name": standard_folder, 166 | "path": found_folder_paths[standard_folder], 167 | "exists": True 168 | }) 169 | else: 170 | missing_folders.append(standard_folder) 171 | 172 | # 获取所有实际存在的文件夹(用于检查额外文件夹) 173 | import asyncio 174 | import os 175 | all_actual_folders = [] 176 | 177 | async def collect_folders(): 178 | # 
使用os.walk代替rglob来避免阻塞调用 179 | def _walk_dirs(): 180 | folders = [] 181 | for root, dirs, files in os.walk(extraction_dir): 182 | for dir_name in dirs: 183 | folders.append(dir_name) 184 | return folders 185 | 186 | folder_names = await asyncio.to_thread(_walk_dirs) 187 | all_actual_folders.extend(folder_names) 188 | 189 | await collect_folders() 190 | 191 | # 检查额外的文件夹 192 | extra_folders = [] 193 | for actual_folder in set(all_actual_folders): 194 | if actual_folder not in standard_folders: 195 | extra_folders.append(actual_folder) 196 | 197 | # 判断是否合规 198 | is_valid = len(missing_folders) == 0 199 | 200 | print(f"📊 文件夹验证结果: 找到 {len(folders_found)}/{len(standard_folders)} 个标准文件夹") 201 | if missing_folders: 202 | print(f"⚠️ 缺失的文件夹: {missing_folders}") 203 | 204 | return { 205 | "is_valid": is_valid, 206 | "folders_found": folders_found, 207 | "missing_folders": missing_folders, 208 | "extra_folders": extra_folders, 209 | "total_standard_folders": len(standard_folders), 210 | "found_count": len(folders_found) 211 | } 212 | 213 | 214 | def analyze_markdown_structure(md_content: str) -> Dict[str, Any]: 215 | """ 216 | 分析Markdown文件结构 217 | 218 | Args: 219 | md_content: Markdown内容 220 | 221 | Returns: 222 | 结构分析结果 223 | """ 224 | import datetime 225 | 226 | try: 227 | # 基本统计信息 228 | lines = md_content.split('\n') 229 | 230 | # 提取标题 231 | headers = [] 232 | for line in lines: 233 | if line.strip().startswith('#'): 234 | level = len(line) - len(line.lstrip('#')) 235 | title = line.strip('#').strip() 236 | headers.append({ 237 | "level": level, 238 | "title": title 239 | }) 240 | 241 | # 提取列表项 242 | list_items = [] 243 | for line in lines: 244 | stripped = line.strip() 245 | if stripped.startswith('-') or stripped.startswith('*') or re.match(r'^\d+\.', stripped): 246 | list_items.append(stripped) 247 | 248 | return { 249 | "total_lines": len(lines), 250 | "total_chars": len(md_content), 251 | "headers": headers, 252 | "list_items": list_items, 253 | "has_content": len(md_content.strip()) > 0, 254 | "extraction_timestamp": datetime.datetime.now().isoformat() 255 | } 256 | 257 | except Exception as e: 258 | return { 259 | "total_lines": 0, 260 | "total_chars": 0, 261 | "headers": [], 262 | "list_items": [], 263 | "has_content": False, 264 | "error": str(e) 265 | } 266 | 267 | 268 | async def extract_markdown_content(md_file_path: str) -> Dict[str, Any]: 269 | """ 270 | 提取Markdown文件内容 271 | 272 | Args: 273 | md_file_path: Markdown文件路径 274 | 275 | Returns: 276 | 提取结果 277 | """ 278 | try: 279 | import asyncio 280 | # 使用异步方式读取文件 281 | def _read_file(): 282 | with open(md_file_path, 'r', encoding='utf-8') as f: 283 | return f.read() 284 | 285 | content = await asyncio.to_thread(_read_file) 286 | 287 | structure = analyze_markdown_structure(content) 288 | 289 | return { 290 | "file_path": md_file_path, 291 | "content": content, 292 | "structure": structure, 293 | "success": True 294 | } 295 | 296 | except Exception as e: 297 | return { 298 | "file_path": md_file_path, 299 | "content": "", 300 | "structure": {}, 301 | "success": False, 302 | "error": str(e) 303 | } -------------------------------------------------------------------------------- /src/nodes/core_info_extraction.py: -------------------------------------------------------------------------------- 1 | """ 2 | 核心信息提取节点 3 | 4 | 从1-17项材料中分别提取各自的核心信息: 5 | - 每项材料提取相应的关键字段 6 | - 输出17个字段的结构化信息 7 | - 支持AI增强的信息提取 8 | """ 9 | 10 | from typing import Dict, Any, Optional 11 | from src.graph.state import AuditState 12 | from src.tools.ai_utils import 
extract_core_information_with_ai, extract_category_core_info_with_ai 13 | 14 | 15 | def core_info_extraction_node(state: AuditState) -> Dict[str, Any]: 16 | """ 17 | 完全无缓存的核心信息提取节点 - 每次都处理全新数据 18 | 19 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 20 | """ 21 | try: 22 | print(f"🎯 开始无缓存核心信息提取...") 23 | 24 | # 直接获取当前状态的数据 - 不使用任何缓存 25 | api_extraction_results = state.get("api_extraction_results", {}) 26 | extracted_content = state.get("extracted_content", {}) 27 | 28 | print(f"🔍 当前状态数据:") 29 | print(f" API提取结果: {len(api_extraction_results)} 项") 30 | print(f" 备用提取内容: {len(extracted_content)} 项") 31 | 32 | # 确定使用哪个数据源 - 直接判断,不做缓存检查 33 | if api_extraction_results: 34 | data_source = api_extraction_results 35 | print(f"✅ 使用API提取结果: {len(api_extraction_results)} 项") 36 | elif extracted_content: 37 | data_source = extracted_content 38 | print(f"⚠️ 使用备用提取内容: {len(extracted_content)} 项") 39 | else: 40 | print("⚠️ 没有找到提取的内容,跳过核心信息提取") 41 | return { 42 | "core_info": _create_empty_core_info_structure(), 43 | "current_step": "core_info_extraction_skipped", 44 | "processing_logs": ["未找到有效数据,跳过核心信息提取"] 45 | } 46 | 47 | # 直接创建17项核心信息结构 - 不使用缓存 48 | core_info_structure = _create_empty_core_info_structure() 49 | 50 | # 1-17项材料分类映射 51 | material_categories = { 52 | "1.教育经历": "education", 53 | "教育经历": "education", 54 | "2.工作经历": "work_experience", 55 | "工作经历": "work_experience", 56 | "3.继续教育(培训情况)": "continuing_education", 57 | "继续教育": "continuing_education", 58 | "培训情况": "continuing_education", 59 | "4.学术技术兼职情况": "academic_positions", 60 | "学术技术兼职情况": "academic_positions", 61 | "5.获奖情况": "awards", 62 | "获奖情况": "awards", 63 | "6.获得荣誉称号情况": "honors", 64 | "荣誉称号": "honors", 65 | "7.主持参与科研项目(基金)情况": "research_projects", 66 | "科研项目": "research_projects", 67 | "8.主持参与工程技术项目情况": "engineering_projects", 68 | "工程项目": "engineering_projects", 69 | "9.论文": "papers", 70 | "论文": "papers", 71 | "10.著(译)作(教材)": "publications", 72 | "著作": "publications", 73 | "教材": "publications", 74 | "11.专利(著作权)情况": "patents", 75 | "专利": "patents", 76 | "12.主持参与指定标准情况": "standards", 77 | "标准制定": "standards", 78 | "13.成果被批示、采纳、运用和推广情况": "achievements", 79 | "成果应用": "achievements", 80 | "14.资质证书": "certificates", 81 | "资质证书": "certificates", 82 | "15.奖惩情况": "rewards_punishments", 83 | "奖惩情况": "rewards_punishments", 84 | "16.考核情况": "evaluations", 85 | "考核情况": "evaluations", 86 | "17.申报材料附件信息": "attachments", 87 | "附件信息": "attachments" 88 | } 89 | 90 | print(f"📁 发现 {len(data_source)} 个材料类型需要提取核心信息") 91 | 92 | # 处理每个材料类型 93 | for folder_name, folder_data in data_source.items(): 94 | print(f"🔍 正在处理: {folder_name}") 95 | 96 | # 确定材料类别 97 | category_key = None 98 | for key, category in material_categories.items(): 99 | if key in folder_name or folder_name in key: 100 | category_key = category 101 | break 102 | 103 | if not category_key: 104 | print(f"⚠️ 未识别的材料类型: {folder_name},归类为附件信息") 105 | category_key = "attachments" 106 | 107 | # 提取材料内容 108 | material_content = _extract_material_content_from_folder(folder_data) 109 | 110 | if not material_content.strip(): 111 | print(f"⚠️ {folder_name} 没有有效内容") 112 | continue 113 | 114 | # 使用AI提取该材料类型的核心信息 115 | try: 116 | extracted_info = extract_category_core_info_with_ai( 117 | category_key, folder_name, material_content 118 | ) 119 | 120 | if extracted_info: 121 | core_info_structure[category_key] = extracted_info 122 | print(f"✅ {folder_name} 核心信息提取成功") 123 | else: 124 | print(f"⚠️ {folder_name} 核心信息提取失败") 125 | 126 | except Exception as e: 127 | print(f"⚠️ {folder_name} 信息提取异常: {e}") 128 | # 创建默认结构,保持数据一致性 129 | 
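                # Note (added): per-category failures are recorded below instead
                # of re-raised, so one bad folder cannot abort the whole extraction.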
core_info_structure[category_key] = {
130 |                     "name": None,
131 |                     "id_number": None,
132 |                     "extracted_from": [folder_name],
133 |                     "content_summary": None,
134 |                     "key_info": {
135 |                         "category": category_key,
136 |                         "folder_name": folder_name,
137 |                         "error": str(e),
138 |                         "extracted_at": _get_current_timestamp()
139 |                     }
140 |                 }
141 |                 continue
142 | 
143 |         # 统计提取结果
144 |         extracted_categories = []
145 |         name_count = 0
146 |         id_count = 0
147 | 
148 |         for category, info in core_info_structure.items():
149 |             if info and info.get('name'):
150 |                 name_count += 1
151 |             if info and info.get('id_number'):
152 |                 id_count += 1
153 |             if info and (info.get('name') or info.get('id_number') or info.get('content_summary')):
154 |                 extracted_categories.append(category)
155 | 
156 |         print(f"✅ 核心信息提取完成:")
157 |         print(f"   成功处理 {len(extracted_categories)} 项材料")
158 |         print(f"   提取到姓名的材料: {name_count} 项")
159 |         print(f"   提取到身份证号的材料: {id_count} 项")
160 | 
161 |         # 🚨 确保数据结构符合交叉校验节点的期望
162 |         return {
163 |             "core_info": core_info_structure,
164 |             "current_step": "core_info_extraction_completed",
165 |             "processing_logs": [
166 |                 f"核心信息提取完成: 成功处理{len(extracted_categories)}项材料",
167 |                 f"提取到姓名的材料: {name_count}项",
168 |                 f"提取到身份证号的材料: {id_count}项"
169 |             ]
170 |         }
171 | 
172 |     except Exception as e:
173 |         print(f"❌ 核心信息提取失败: {str(e)}")
174 |         # 🚨 即使失败也要返回有效的空结构,确保后续节点能正常处理
175 |         return {
176 |             "core_info": _create_empty_core_info_structure(),
177 |             "current_step": "core_info_extraction_failed",
178 |             "error_message": f"核心信息提取失败: {str(e)}",
179 |             "processing_logs": [f"核心信息提取失败: {str(e)}"]
180 |         }
181 | 
182 | 
183 | def _create_empty_core_info_structure() -> Dict[str, Any]:
184 |     """创建空的1-17项核心信息结构,每项都包含姓名和身份证号用于交叉校验"""
185 |     import copy  # 修复:必须深拷贝,否则 .copy() 浅拷贝会让17项材料共享同一个 extracted_from 列表和 key_info 字典
186 |     base_structure = {
187 |         "name": None,             # 姓名(从该项材料中提取)
188 |         "id_number": None,        # 身份证号(从该项材料中提取)
189 |         "extracted_from": [],     # 信息来源文件
190 |         "content_summary": None,  # 内容摘要
191 |         "key_info": {}            # 该项材料的关键信息
192 |     }
193 | 
194 |     return {
195 |         # 1-17项材料,每项都包含姓名和身份证号用于交叉校验
196 |         "education": copy.deepcopy(base_structure),               # 1.教育经历
197 |         "work_experience": copy.deepcopy(base_structure),         # 2.工作经历
198 |         "continuing_education": copy.deepcopy(base_structure),    # 3.继续教育(培训情况)
199 |         "academic_positions": copy.deepcopy(base_structure),      # 4.学术技术兼职情况
200 |         "awards": copy.deepcopy(base_structure),                  # 5.获奖情况
201 |         "honors": copy.deepcopy(base_structure),                  # 6.获得荣誉称号情况
202 |         "research_projects": copy.deepcopy(base_structure),       # 7.主持参与科研项目(基金)情况
203 |         "engineering_projects": copy.deepcopy(base_structure),    # 8.主持参与工程技术项目情况
204 |         "papers": copy.deepcopy(base_structure),                  # 9.论文
205 |         "publications": copy.deepcopy(base_structure),            # 10.著(译)作(教材)
206 |         "patents": copy.deepcopy(base_structure),                 # 11.专利(著作权)情况
207 |         "standards": copy.deepcopy(base_structure),               # 12.主持参与指定标准情况
208 |         "achievements": copy.deepcopy(base_structure),            # 13.成果被批示、采纳、运用和推广情况
209 |         "certificates": copy.deepcopy(base_structure),            # 14.资质证书
210 |         "rewards_punishments": copy.deepcopy(base_structure),     # 15.奖惩情况
211 |         "evaluations": copy.deepcopy(base_structure),             # 16.考核情况
212 |         "attachments": copy.deepcopy(base_structure)              # 17.申报材料附件信息
213 |     }
214 | 
215 | 
216 | def _extract_material_content_from_folder(folder_data: Any) -> str:
217 |     """从文件夹数据中提取材料内容"""
218 |     material_content = ""
219 | 
220 |     if isinstance(folder_data, list):
221 |         # 处理api_extraction_results格式
222 |         for json_item in folder_data:
223 |             if isinstance(json_item, dict):
224 |                 content = json_item.get("content", {})
225 |                 if isinstance(content, dict):
226 |                     # 尝试多种可能的内容字段
227 |                     for key in ["md_content", "raw_markdown", "text", "content"]:
228 |                         if key in content:
229 |                             text_content = str(content[key])
230 |                             if text_content.strip():
231 | material_content += text_content + "\n\n" 232 | break 233 | if not material_content: 234 | material_content += str(content) + "\n\n" 235 | else: 236 | material_content += str(content) + "\n\n" 237 | 238 | elif isinstance(folder_data, dict): 239 | # 处理extracted_content格式 240 | content_list = folder_data.get("content", []) 241 | if isinstance(content_list, list): 242 | for item in content_list: 243 | if isinstance(item, dict): 244 | if "json_data" in item: 245 | json_data = item["json_data"] 246 | content = json_data.get("content", {}) 247 | if isinstance(content, dict): 248 | for key in ["md_content", "raw_markdown", "text", "content"]: 249 | if key in content: 250 | text_content = str(content[key]) 251 | if text_content.strip(): 252 | material_content += text_content + "\n\n" 253 | break 254 | else: 255 | material_content += str(content) + "\n\n" 256 | elif "content" in item: 257 | material_content += str(item["content"]) + "\n\n" 258 | else: 259 | material_content += str(item) + "\n\n" 260 | 261 | return material_content.strip() 262 | 263 | 264 | def _get_current_timestamp() -> str: 265 | """获取当前时间戳""" 266 | from datetime import datetime 267 | return datetime.now().isoformat() -------------------------------------------------------------------------------- /src/graph/edges.py: -------------------------------------------------------------------------------- 1 | """ 2 | LangGraph边和路由逻辑定义 3 | 4 | 包含工作流中的条件边和路由函数: 5 | - 根据PDF页数决定处理策略的路由 6 | - 根据材料类型决定校验规则的路由 7 | - 根据校验结果决定后续流程的路由 8 | - 支持Send API实现的并行分支 9 | """ 10 | 11 | from typing import Dict, Any, List, Union 12 | from .state import AuditState 13 | 14 | # 导入Send API用于并行处理 15 | try: 16 | from langgraph.types import Send 17 | SEND_AVAILABLE = True 18 | except ImportError: 19 | Send = None 20 | SEND_AVAILABLE = False 21 | 22 | 23 | def should_continue_processing(state: AuditState) -> str: 24 | """ 25 | 判断是否继续处理流程 26 | 27 | Returns: 28 | "continue": 继续处理 29 | "error": 发生错误,终止流程 30 | """ 31 | if state.get("error_message"): 32 | return "error" 33 | 34 | if not state.get("uploaded_file"): 35 | return "error" 36 | 37 | return "continue" 38 | 39 | 40 | def route_folder_validation(state: AuditState) -> str: 41 | """ 42 | 根据文件夹结构验证结果决定处理策略 43 | 44 | Returns: 45 | "process_folders": 文件夹结构正确,继续处理 46 | "error": 文件夹结构错误,终止流程 47 | """ 48 | folder_validation = state.get("folder_validation", {}) 49 | 50 | # 检查是否有17个标准文件夹 51 | if not folder_validation: 52 | return "error" 53 | 54 | folders_found = folder_validation.get("folders_found", []) 55 | if len(folders_found) < 17: 56 | return "error" 57 | 58 | return "process_folders" 59 | 60 | 61 | def should_continue_content_analysis(state: AuditState) -> str: 62 | """ 63 | 判断是否继续内容分析 64 | 65 | Returns: 66 | "analyze": 继续分析 67 | "skip_analysis": 跳过分析 68 | "error": 发生错误 69 | """ 70 | if state.get("error_message"): 71 | return "error" 72 | 73 | extracted_content = state.get("extracted_content", {}) 74 | if not extracted_content: 75 | return "skip_analysis" 76 | 77 | return "analyze" 78 | 79 | 80 | def route_to_cross_validation(state: AuditState) -> str: 81 | """ 82 | 决定是否进行交叉校验 83 | 84 | Returns: 85 | "cross_validate": 进行交叉校验 86 | "skip_cross_validation": 跳过交叉校验 87 | "error": 发生错误 88 | """ 89 | if state.get("error_message"): 90 | return "error" 91 | 92 | # 检查是否有材料校验结果 93 | material_validation = state.get("material_validation", {}) 94 | if not material_validation: 95 | return "skip_cross_validation" 96 | 97 | # 检查是否有核心信息 98 | core_info = state.get("core_info") 99 | extracted_content = 
state.get("extracted_content", {}) 100 | 101 | if not core_info and not extracted_content: 102 | return "skip_cross_validation" 103 | 104 | return "cross_validate" 105 | 106 | 107 | def should_generate_report(state: AuditState) -> str: 108 | """ 109 | 判断是否应该生成报告 110 | 111 | Returns: 112 | "generate_report": 生成报告 113 | "error": 发生错误,终止流程 114 | """ 115 | if state.get("error_message"): 116 | return "error" 117 | 118 | # 只要有任何处理结果就生成报告 119 | has_content = any([ 120 | state.get("extracted_content"), 121 | state.get("material_validation"), 122 | state.get("cross_validation"), 123 | state.get("folder_classification") 124 | ]) 125 | 126 | if has_content: 127 | return "generate_report" 128 | else: 129 | return "error" 130 | def check_pdf_extraction_for_parallel_processing(state: AuditState) -> Union[List, str]: 131 | """ 132 | PDF提取完成后,并行分发到core_info_extraction和validation节点 133 | 134 | 确保PDF提取的数据能同时进入核心信息提取和材料校验 135 | 136 | Returns: 137 | Send对象列表,发送到core_info_extraction和validation 138 | 或者在失败时返回END 139 | """ 140 | if not SEND_AVAILABLE or Send is None: 141 | print("⚠️ Send API不可用,使用传统路由") 142 | # 检查PDF提取状态 143 | status = check_pdf_extraction_status(state) 144 | if status == "pdf_extraction_success": 145 | return "core_info_extraction" # 退化到传统路由 146 | else: 147 | return "END" 148 | 149 | # 检查PDF提取状态 150 | status = check_pdf_extraction_status(state) 151 | 152 | if status == "pdf_extraction_success": 153 | print(f"🚀 PDF提取成功,并行分发到核心信息提取和校验节点") 154 | 155 | # 并行发送到两个处理节点 156 | return [ 157 | Send("core_info_extraction", state), # 核心信息提取 158 | Send("validation", state) # 直接进入校验 159 | ] 160 | else: 161 | print("❌ PDF提取失败,终止流程") 162 | return "END" 163 | 164 | 165 | def check_core_info_for_cross_validation(state: AuditState) -> str: 166 | """ 167 | 检查核心信息是否完成,决定是否进行交叉验证 168 | 169 | 注意:LangGraph不支持真正的"等待两个节点都完成"逻辑 170 | 这里简化为:只要有核心信息就进行交叉验证 171 | 172 | Returns: 173 | "proceed_cross_validation": 进行交叉验证 174 | "skip_cross_validation": 跳过交叉验证 175 | """ 176 | core_info = state.get("core_info") 177 | extracted_content = state.get("extracted_content", {}) 178 | 179 | # 只要有核心信息和提取内容就进行交叉验证 180 | if core_info is not None and extracted_content: 181 | return "proceed_cross_validation" 182 | else: 183 | return "skip_cross_validation" 184 | 185 | 186 | def check_pdf_extraction_status(state: AuditState) -> str: 187 | """ 188 | 检查PDF提取状态,确保PDF内容提取完成后才进行下一步 189 | 190 | 这是关键的状态判断函数,遵循LangGraph条件边的最佳实践 191 | 192 | Returns: 193 | "pdf_extraction_success": PDF提取成功,继续后续流程 194 | "pdf_extraction_failed": PDF提取失败,跳转到错误处理 195 | "pdf_extraction_pending": PDF提取正在进行中(理论上不应该出现) 196 | """ 197 | print("🔍 检查PDF提取状态...") 198 | 199 | # 检查当前步骤状态 200 | current_step = state.get("current_step", "") 201 | print(f"📋 当前步骤: {current_step}") 202 | 203 | # 修复被连接的状态字符串问题 204 | if "pdf_extraction_failed" in current_step: 205 | print("❌ PDF提取已标记为失败") 206 | return "pdf_extraction_failed" 207 | 208 | if "pdf_extraction_completed" in current_step: 209 | print("✅ PDF提取已标记为完成") 210 | # 检查是否有实际的提取结果 211 | pdf_extraction_results = state.get("pdf_extraction_results", {}) 212 | api_extraction_results = state.get("api_extraction_results", {}) 213 | if pdf_extraction_results or api_extraction_results: 214 | print(f"📊 找到PDF提取结果: {len(pdf_extraction_results)} 个文件夹") 215 | return "pdf_extraction_success" 216 | else: 217 | print("⚠️ PDF提取完成但没有结果数据") 218 | return "pdf_extraction_failed" 219 | 220 | # 检查是否有实际的提取结果或空文件夹结构 221 | pdf_extraction_results = state.get("pdf_extraction_results", {}) 222 | api_extraction_results = state.get("api_extraction_results", {}) 223 | 224 
| # 只要有文件夹结构就认为成功,不一定要有PDF文件 225 | if pdf_extraction_results: 226 | total_files = 0 227 | successful_files = 0 228 | 229 | for folder_name, folder_data in pdf_extraction_results.items(): 230 | files = folder_data.get("files", []) 231 | total_files += len(files) 232 | successful_files += len([f for f in files if f.get("success")]) 233 | 234 | print(f"📊 PDF提取统计: {successful_files}/{total_files} 文件成功,{len(pdf_extraction_results)}个文件夹") 235 | 236 | # 即使没有PDF文件,只要有文件夹结构就认为成功 237 | print("✅ 检测到PDF提取结果或文件夹结构") 238 | return "pdf_extraction_success" 239 | else: 240 | print("❌ 没有PDF提取结果") 241 | return "pdf_extraction_failed" 242 | 243 | # 检查错误消息 244 | error_message = state.get("error_message", "") 245 | if error_message and "pdf" in error_message.lower() and "failed" in error_message.lower(): 246 | print(f"❌ 发现PDF相关错误: {error_message}") 247 | return "pdf_extraction_failed" 248 | 249 | # 默认情况:如果状态不明确,认为是失败 250 | print("⚠️ PDF提取状态不明确,默认为失败") 251 | return "pdf_extraction_failed" 252 | 253 | 254 | def create_parallel_branches(state: AuditState) -> Union[List, str]: 255 | """ 256 | 创建并行分支:从文件处理后分发到多个并行路径 257 | 258 | 使用LangGraph的Send API实现真正的并行处理: 259 | 1. PDF提取路径 260 | 2. 规则处理路径 261 | 262 | Returns: 263 | Send对象列表,每个对象代表一个并行分支 264 | """ 265 | if not SEND_AVAILABLE or Send is None: 266 | print("⚠️ Send API不可用,使用传统路由") 267 | return "pdf_extraction" # 退化到传统路由 268 | 269 | print("🚀 创建并行分支: PDF提取 + 规则处理") 270 | 271 | # 返回多个Send对象,实现并行处理 272 | return [ 273 | Send("pdf_extraction", state), # PDF提取路径 274 | Send("load_rules", state) # 规则加载路径 275 | ] 276 | 277 | 278 | def after_rules_loaded(state: AuditState) -> str: 279 | """ 280 | 规则加载完成后的路由 281 | 282 | Returns: 283 | "extract_rules": 继续提取规则 284 | "rules_load_failed": 规则加载失败 285 | """ 286 | current_step = state.get("current_step", "") 287 | 288 | if "rules_load_failed" in current_step: 289 | print("❌ 规则加载失败") 290 | return "rules_load_failed" 291 | 292 | if "rules_loaded" in current_step: 293 | print("✅ 规则加载成功,继续提取") 294 | return "extract_rules" 295 | 296 | # 检查是否有规则数据 297 | rules_data = state.get("rules_data", []) 298 | if rules_data: 299 | print(f"✅ 发现 {len(rules_data)} 个规则数据,继续提取") 300 | return "extract_rules" 301 | 302 | print("❌ 未找到规则数据") 303 | return "rules_load_failed" 304 | 305 | 306 | def check_core_info_for_parallel_validation(state: AuditState) -> Union[List, str]: 307 | """ 308 | 核心信息提取完成后,并行分发到validation和cross_validation节点 309 | 310 | 确保PDF提取路径的数据也能进入validation节点 311 | 312 | Returns: 313 | Send对象列表,发送到validation和cross_validation 314 | """ 315 | if not SEND_AVAILABLE or Send is None: 316 | print("⚠️ Send API不可用,使用传统路由") 317 | return "validation" # 退化到传统路由 318 | 319 | core_info = state.get("core_info") 320 | extracted_content = state.get("extracted_content", {}) 321 | 322 | print(f"🚀 核心信息提取完成,分发到验证节点") 323 | print(f"📊 核心信息状态: {core_info is not None}") 324 | print(f"📊 提取内容状态: {len(extracted_content) if extracted_content else 0} 项") 325 | 326 | # 并行发送到两个验证节点 327 | return [ 328 | Send("validation", state), 329 | Send("cross_validation", state) 330 | ] 331 | 332 | 333 | def check_rules_for_validation(state: AuditState) -> Union[List, str]: 334 | """ 335 | 检查规则提取结果,决定是否可以进入验证阶段 336 | 337 | 使用Send API将规则数据发送到validation和cross_validation节点 338 | 339 | Returns: 340 | Send对象列表,发送到validation和cross_validation 341 | """ 342 | if not SEND_AVAILABLE or Send is None: 343 | print("⚠️ Send API不可用,使用传统路由") 344 | return "validation" # 退化到传统路由 345 | 346 | parsed_rules = state.get("parsed_rules", []) 347 | current_step = state.get("current_step", "") 348 | 349 | # 添加详细调试信息 
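# [Editor's note] Two remarks on the routers in this module:
# 1) In check_pdf_extraction_status above, the error_message check and the final
#    "status unclear" fallback are dead code: the preceding
#    if pdf_extraction_results: ... else: ... block returns on both branches, so
#    execution never reaches them. If the error-message check is meant to run, it
#    would need to move above that block; api_extraction_results is likewise read
#    there but never used in the decision.
# 2) Routers that return Send objects only take effect when registered as
#    conditional edges. A minimal wiring sketch, assuming the node names used in
#    this file (workflow.py remains the authoritative wiring):
#
#     from langgraph.graph import StateGraph
#     builder = StateGraph(AuditState)
#     builder.add_conditional_edges(
#         "extract_rules",                      # source node
#         check_rules_for_validation,           # router; may return [Send(...), ...]
#         ["validation", "cross_validation"],   # possible Send targets
#     )
#
#    When the router returns a list of Send objects, LangGraph executes each target
#    with the supplied state in parallel and merges their partial state updates.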
350 | print(f"🔍 check_rules_for_validation 调试信息:") 351 | print(f" current_step: {current_step}") 352 | print(f" parsed_rules 数量: {len(parsed_rules)}") 353 | print(f" parsed_rules 内容: {parsed_rules[:2] if parsed_rules else '空'}") 354 | 355 | # 修复条件判断:只要有规则就传递给validation 356 | if parsed_rules and len(parsed_rules) > 0: 357 | print(f"🚀 规则提取成功,分发到验证节点: {len(parsed_rules)} 条规则") 358 | 359 | # 将规则数据发送到两个验证节点 360 | return [ 361 | Send("validation", state), 362 | Send("cross_validation", state) 363 | ] 364 | elif "rules_extract_skipped" in current_step: 365 | print("🚨 规则提取已跳过,直接进行基础验证") 366 | return [Send("validation", state)] 367 | else: 368 | print("⚠️ 规则提取未完成或无规则数据,只进行基础验证") 369 | return [Send("validation", state)] -------------------------------------------------------------------------------- /src/agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | 主要的职称评审材料审核代理 3 | 4 | 基于LangGraph框架的完整审核系统入口 5 | 集成LangSmith调试和监控功能 6 | """ 7 | 8 | import os 9 | from typing import Dict, Any, Optional 10 | 11 | # 定义RunnableConfig为类型别名,避免对langchain_core的依赖 12 | RunnableConfig = Dict[str, Any] 13 | 14 | # 导入工作流模块 15 | try: 16 | # 优先使用绝对导入 17 | from src.graph.workflow import create_audit_workflow 18 | except ImportError: 19 | try: 20 | # 如果绝对导入失败,尝试相对导入 21 | import sys 22 | import os 23 | # 添加项目根目录到Python路径 24 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 25 | if project_root not in sys.path: 26 | sys.path.insert(0, project_root) 27 | from src.graph.workflow import create_audit_workflow 28 | except ImportError: 29 | try: 30 | # 最后尝试从当前目录导入 31 | from graph.workflow import create_audit_workflow 32 | except ImportError: 33 | raise ImportError("无法导入工作流模块,请检查项目结构") 34 | 35 | # 导入状态模块 36 | try: 37 | from src.graph.state import ( 38 | AuditState, 39 | create_initial_state 40 | ) 41 | except ImportError: 42 | try: 43 | from graph.state import ( 44 | AuditState, 45 | create_initial_state 46 | ) 47 | except ImportError: 48 | # 如果都失败,尝试使用系统路径 49 | import sys 50 | import os 51 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 52 | if project_root not in sys.path: 53 | sys.path.insert(0, project_root) 54 | try: 55 | from src.graph.state import ( 56 | AuditState, 57 | create_initial_state 58 | ) 59 | except ImportError: 60 | raise ImportError("无法导入状态模块,请检查项目结构") 61 | 62 | try: 63 | from src.config.api_config import configure_pdf_api 64 | except ImportError: 65 | try: 66 | from config.api_config import configure_pdf_api 67 | except ImportError: 68 | def configure_pdf_api(*args, **kwargs): 69 | print("⚠️ API配置模块未加载,使用默认配置") 70 | 71 | # 删除未使用的 configure_pdf_api_endpoint 导入 72 | 73 | try: 74 | from src.tools.langsmith_utils import setup_langsmith_environment 75 | except ImportError: 76 | try: 77 | from tools.langsmith_utils import setup_langsmith_environment 78 | except ImportError: 79 | def setup_langsmith_environment(): 80 | print("⚠️ LangSmith工具未加载") 81 | 82 | # 初始化主工作流 83 | if os.getenv("LANGSMITH_API_KEY"): 84 | setup_langsmith_environment() 85 | print("✅ LangSmith追踪已启用") 86 | 87 | # 使用统一的主工作流 88 | graph = create_audit_workflow() 89 | print("✅ 主审核工作流已就绪") 90 | 91 | # 导出主要接口 92 | __all__ = [ 93 | "graph", 94 | "run_audit", 95 | "run_audit_with_tracing", 96 | "debug_audit", 97 | "configure_pdf_api", 98 | "run_pdf_audit", 99 | "AuditState", 100 | "create_initial_state" 101 | ] 102 | 103 | 104 | async def run_audit(uploaded_file: str, session_id: Optional[str] = None) -> dict: 105 | """ 106 | 运行审核工作流的便捷函数(异步版本) 107 | 108 | 
Args: 109 | uploaded_file: 上传的文件路径 110 | session_id: 会话ID(可选) 111 | 112 | Returns: 113 | 审核结果 114 | """ 115 | # 创建初始状态 116 | initial_state = create_initial_state(uploaded_file, session_id) 117 | 118 | # 确保PDF API端点配置(修复:在ZIP文件审核中也设置) 119 | if not initial_state.get("pdf_api_endpoint"): 120 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 121 | initial_state["pdf_api_endpoint"] = api_endpoint 122 | print(f"🔧 为ZIP文件审核设置PDF API端点: {api_endpoint}") 123 | 124 | # 为基础审核模式创建配置 125 | config = None 126 | if session_id: 127 | config = {"configurable": {"thread_id": session_id}} 128 | 129 | try: 130 | # 执行工作流(使用异步API) 131 | print(f"🚀 开始审核流程: {uploaded_file}") 132 | if config: 133 | result = await graph.ainvoke(initial_state, config) # type: ignore 134 | else: 135 | result = await graph.ainvoke(initial_state) 136 | 137 | print(f"✅ 审核完成! 最终状态: {result.get('current_step', '未知')}") 138 | return result 139 | 140 | except Exception as e: 141 | print(f"❌ 审核失败: {str(e)}") 142 | return { 143 | "error": str(e), 144 | "current_step": "failed", 145 | "error_message": str(e) 146 | } 147 | 148 | 149 | async def run_audit_with_tracing( 150 | uploaded_file: str, 151 | session_id: Optional[str] = None, 152 | run_name: Optional[str] = None, 153 | tags: Optional[list] = None 154 | ) -> dict: 155 | """ 156 | 运行带LangSmith追踪的审核工作流(异步版本) 157 | 158 | Args: 159 | uploaded_file: 上传的文件路径 160 | session_id: 会话ID(可选) 161 | run_name: 运行名称 162 | tags: 标签列表 163 | 164 | Returns: 165 | 审核结果 166 | """ 167 | try: 168 | from src.tools.langsmith_utils import create_run_config, with_langsmith_tracing 169 | 170 | # 创建初始状态 171 | initial_state = create_initial_state(uploaded_file, session_id) 172 | 173 | # 确保PDF API端点配置(修复:在带追踪审核中也设置) 174 | if not initial_state.get("pdf_api_endpoint"): 175 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 176 | initial_state["pdf_api_endpoint"] = api_endpoint 177 | print(f"🔧 为带追踪审核设置PDF API端点: {api_endpoint}") 178 | 179 | # 创建带追踪的配置 180 | config = create_run_config( 181 | run_name=run_name or f"audit_with_tracing_{session_id or 'default'}", 182 | tags=tags or ["web", "tracing", "production"], 183 | thread_id=session_id 184 | ) 185 | 186 | print(f"🔍 开始带追踪的审核流程: {uploaded_file}") 187 | print(f"📊 运行名称: {config.get('run_name')}") 188 | print(f"🏷️ 标签: {config.get('tags', [])}") 189 | 190 | # 使用带追踪的图执行(异步版本) 191 | @with_langsmith_tracing 192 | async def traced_audit(): 193 | return await graph.ainvoke(initial_state, config) # type: ignore 194 | 195 | result = await traced_audit() 196 | 197 | print(f"✅ 带追踪审核完成! 
最终状态: {result.get('current_step', '未知')}") 198 | return result 199 | 200 | except Exception as e: 201 | print(f"❌ 带追踪审核失败: {str(e)}") 202 | return { 203 | "error": str(e), 204 | "current_step": "failed", 205 | "error_message": str(e) 206 | } 207 | 208 | 209 | async def debug_audit( 210 | uploaded_file: str, 211 | session_id: Optional[str] = None, 212 | breakpoints: Optional[list] = None 213 | ) -> dict: 214 | """ 215 | 运行调试模式的审核工作流(异步版本) 216 | 217 | Args: 218 | uploaded_file: 上传的文件路径 219 | session_id: 会话ID(可选) 220 | breakpoints: 断点列表 221 | 222 | Returns: 223 | 审核结果 224 | """ 225 | try: 226 | from src.tools.langsmith_utils import create_debug_config, event_logger 227 | 228 | # 创建初始状态 229 | initial_state = create_initial_state(uploaded_file, session_id) 230 | 231 | # 确保PDF API端点配置(修复:在调试模式中也设置) 232 | if not initial_state.get("pdf_api_endpoint"): 233 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 234 | initial_state["pdf_api_endpoint"] = api_endpoint 235 | print(f"🔧 为调试模式设置PDF API端点: {api_endpoint}") 236 | 237 | # 创建调试配置 238 | config = create_debug_config(breakpoints=breakpoints) 239 | if session_id: 240 | config["configurable"] = {"thread_id": session_id} 241 | 242 | print(f"🐛 开始调试模式审核流程: {uploaded_file}") 243 | print(f"🔧 断点: {breakpoints or ['无']}") 244 | 245 | # 清空事件日志 246 | event_logger.clear_events() 247 | 248 | # 执行工作流(异步版本) 249 | result = await graph.ainvoke(initial_state, config) # type: ignore 250 | 251 | # 收集调试信息 252 | debug_events = event_logger.get_events() 253 | 254 | print(f"✅ 调试模式审核完成! 最终状态: {result.get('current_step', '未知')}") 255 | print(f"📝 记录了 {len(debug_events)} 个调试事件") 256 | 257 | # 在结果中包含调试信息 258 | result["debug_events"] = debug_events 259 | return result 260 | 261 | except Exception as e: 262 | print(f"❌ 调试模式审核失败: {str(e)}") 263 | return { 264 | "error": str(e), 265 | "current_step": "failed", 266 | "error_message": str(e) 267 | } 268 | 269 | 270 | async def run_pdf_audit( 271 | uploaded_file: str, 272 | api_endpoint: str, 273 | session_id: Optional[str] = None, 274 | with_tracing: bool = False 275 | ) -> dict: 276 | """ 277 | 运行PDF审核工作流(异步版本) 278 | 279 | Args: 280 | uploaded_file: 上传的ZIP文件路径 281 | api_endpoint: PDF提取API端点 282 | session_id: 会话ID(可选) 283 | with_tracing: 是否启用LangSmith追踪 284 | 285 | Returns: 286 | 审核结果 287 | """ 288 | try: 289 | # 配置PDF API端点 290 | configure_pdf_api(api_endpoint) 291 | print(f"🔧 已配置PDF提取API: {api_endpoint}") 292 | 293 | # 创建初始状态 294 | initial_state = create_initial_state(uploaded_file, session_id) 295 | 296 | # 直接设置API端点(现在AuditState已经支持这个字段) 297 | initial_state["pdf_api_endpoint"] = api_endpoint 298 | 299 | # 选择执行模式 300 | if with_tracing: 301 | print(f"🔍 开始PDF审核流程(启用追踪): {uploaded_file}") 302 | return await run_audit_with_tracing( 303 | uploaded_file, 304 | session_id, 305 | run_name=f"pdf_audit_{session_id or 'default'}", 306 | tags=["pdf", "api_extraction", "production"] 307 | ) 308 | else: 309 | print(f"🚀 开始PDF审核流程: {uploaded_file}") 310 | 311 | # 为基础审核模式创建配置 312 | config = None 313 | if session_id: 314 | config = {"configurable": {"thread_id": session_id}} 315 | 316 | # 执行工作流(异步版本) 317 | if config: 318 | result = await graph.ainvoke(initial_state, config) # type: ignore 319 | else: 320 | result = await graph.ainvoke(initial_state) 321 | 322 | print(f"✅ PDF审核完成! 
最终状态: {result.get('current_step', '未知')}") 323 | return result 324 | 325 | except Exception as e: 326 | print(f"❌ PDF审核失败: {str(e)}") 327 | return { 328 | "error": str(e), 329 | "current_step": "failed", 330 | "error_message": str(e), 331 | "pdf_api_endpoint": api_endpoint 332 | } 333 | 334 | 335 | 336 | 337 | 338 | async def main_async(): 339 | """命令行入口点(异步版本)""" 340 | import sys 341 | import argparse 342 | 343 | parser = argparse.ArgumentParser(description='LangGraph 职称材料审核系统') 344 | parser.add_argument('file_path', help='要审核的ZIP文件路径') 345 | parser.add_argument('--session-id', help='会话ID(可选)') 346 | 347 | 348 | args = parser.parse_args() 349 | 350 | # 统一使用主审核函数(异步版本) 351 | result = await run_audit(args.file_path, args.session_id) 352 | print(f"✅ 审核结果: {result}") 353 | 354 | return result 355 | 356 | def main(): 357 | """命令行入口点(用于pyproject.toml脚本配置)""" 358 | import asyncio 359 | return asyncio.run(main_async()) 360 | 361 | 362 | if __name__ == "__main__": 363 | # 示例用法 364 | import os 365 | import asyncio 366 | 367 | async def example_usage(): 368 | # 检查测试数据 369 | test_file = "test_data/sample.zip" 370 | 371 | if os.path.exists(test_file): 372 | print("🧪 运行测试审核...") 373 | result = await run_audit(test_file) 374 | print(f"📊 审核结果: {result}") 375 | else: 376 | print("📋 主代理已就绪,可以通过以下方式使用:") 377 | print(" from src.agent import run_audit") 378 | print(" import asyncio") 379 | print(" result = asyncio.run(run_audit('path/to/your/file.zip'))") 380 | print("\n🔧 或者直接使用图对象:") 381 | print(" from src.agent import graph") 382 | print(" result = await graph.ainvoke(initial_state)") 383 | 384 | asyncio.run(example_usage()) -------------------------------------------------------------------------------- /static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | LangGraph 职称评审材料审核系统 7 | 8 | 9 | 10 | 11 | 12 |
[Extraction note: the markup of static/index.html was stripped during conversion, leaving only line numbers and text fragments; the stray number runs immediately before and after this note are part of the same residue. Recoverable content: page title "LangGraph 职称评审材料审核系统"; tagline "基于人工智能的智能职称材料审核,支持实时流式处理和进度追踪"; an upload form stating support for ZIP files (complete material packages) or PDF files (single documents); three feature cards — 智能处理 (AI-driven content extraction and rule validation), 实时更新 (streaming API with real-time progress tracking), 详细报告 (complete HTML audit reports); and panels for 任务列表 (task list), 审核信息 (task ID, file name, file size, session ID, start time, status), 审核进度 (progress bar, initially 0% / "准备开始..."), 工作流步骤 (workflow steps), and 实时日志 (real-time logs), plus action buttons and inline scripts whose content did not survive.]
227 | 228 | 229 | 242 | 243 | 244 | 262 | 263 | 264 | 265 | 266 | -------------------------------------------------------------------------------- /src/nodes/validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | AI驱动的规则校验节点 - 基于rules文件夹中的Excel规则 3 | """ 4 | 5 | from typing import Dict, List, Any 6 | from src.graph.state import AuditState 7 | 8 | # 导入AI工具 9 | try: 10 | from src.tools.ai_utils import validate_material_with_ai 11 | _ai_utils_available = True 12 | except ImportError: 13 | _ai_utils_available = False 14 | validate_material_with_ai = None 15 | 16 | 17 | def validation_node(state: AuditState) -> Dict[str, Any]: 18 | """ 19 | 完全无缓存的AI智能校验节点 - 每次都处理全新数据 20 | 21 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 22 | """ 23 | try: 24 | print(f"⚡ 开始无缓存AI智能校验...") 25 | 26 | # 直接获取当前状态的材料内容和规则数据 - 不使用任何缓存 27 | extracted_content = state.get("api_extraction_results", {}) or state.get("extracted_content", {}) 28 | parsed_rules = state.get("parsed_rules", []) 29 | rules_by_category = state.get("rules_by_category", {}) 30 | 31 | print(f"🔍 当前状态数据:") 32 | print(f" 材料数量: {len(extracted_content)}") 33 | print(f" 规则数量: {len(parsed_rules)}") 34 | print(f" 规则分类: {list(rules_by_category.keys())}") 35 | 36 | if not extracted_content: 37 | print("⚠️ 未找到可校验的材料内容") 38 | return { 39 | "current_step": "validation_completed", 40 | "processing_logs": ["未找到可校验的材料内容"] 41 | } 42 | 43 | # 直接处理所有材料 - 不使用队列缓存机制 44 | validation_results = [] 45 | material_validation = {} 46 | total_materials = len(extracted_content) 47 | processed_count = 0 48 | 49 | print(f"📋 开始校验{total_materials}个材料类型") 50 | 51 | # 直接遍历处理每个材料 - 完全无缓存 52 | for material_type, material_data in extracted_content.items(): 53 | processed_count += 1 54 | print(f"🔍 正在校验: {material_type} ({processed_count}/{total_materials})") 55 | 56 | try: 57 | # 数据预处理:确保是单个材料的数据 58 | if isinstance(material_data, list) and len(material_data) > 0: 59 | actual_data = material_data[0] if material_data else {} 60 | elif isinstance(material_data, dict): 61 | actual_data = material_data 62 | else: 63 | actual_data = {"content": material_data, "material_type": material_type} 64 | 65 | # 提取材料内容 66 | material_content = _extract_material_content(actual_data) 67 | 68 | # 🎯 智能规则匹配:教育经历材料只与教育经历规则集匹配 69 | matched_rules = _get_matched_rules_for_material(material_type, rules_by_category, parsed_rules) 70 | print(f"🎯 {material_type} 匹配到 {len(matched_rules)} 条相关规则") 71 | 72 | # 使用AI工具进行校验,将规则作为prompt的一部分 73 | material_results = None 74 | 75 | if _ai_utils_available and validate_material_with_ai and material_content.strip(): 76 | print(f"✅ 使用AI校验: {material_type}") 77 | 78 | try: 79 | # 使用匹配的规则进行AI校验,而不是所有规则 80 | if matched_rules and len(matched_rules) > 0: 81 | print(f"📤 向AI传递{len(matched_rules)}条匹配的{material_type}规则") 82 | 83 | ai_results = validate_material_with_ai( 84 | material_type, 85 | material_content, 86 | rules_context=matched_rules 87 | ) 88 | else: 89 | print(f"⚠️ {material_type}未找到匹配的规则,跳过AI校验") 90 | ai_results = [] 91 | 92 | if ai_results and len(ai_results) > 0: 93 | print(f"✅ AI校验成功,生成{len(ai_results)}个结果") 94 | # 转换AI结果格式 95 | converted_results = [] 96 | for ai_result in ai_results: 97 | converted_result = { 98 | "rule_name": ai_result.get("rule_name", f"{material_type}规则校验"), 99 | "result": _convert_ai_status_to_result(ai_result.get("status", "WARNING")), 100 | "details": ai_result.get("message", "校验完成"), 101 | "priority": _convert_ai_status_to_priority(ai_result.get("status", "WARNING")), 102 | "material_type": material_type, 103 | 
"rule_content": ai_result.get("rule_content", ""), 104 | "ai_powered": True, 105 | "timestamp": _get_current_timestamp() 106 | } 107 | converted_results.append(converted_result) 108 | validation_results.append(converted_result) 109 | 110 | material_results = converted_results 111 | else: 112 | print(f"⚠️ AI校验返回空结果") 113 | 114 | except Exception as ai_error: 115 | print(f"⚠️ AI校验失败: {ai_error}") 116 | else: 117 | print(f"⚠️ AI工具不可用或无内容") 118 | 119 | # 如果AI校验失败,创建基础结果 120 | if not material_results: 121 | print(f"🔧 为{material_type}创建基础校验结果") 122 | basic_result = { 123 | "rule_name": f"{material_type}基础校验", 124 | "result": "⚠️警告", 125 | "details": "未能进行AI校验,仅进行了基础检查", 126 | "priority": "中", 127 | "material_type": material_type, 128 | "rule_content": "", 129 | "ai_powered": False, 130 | "timestamp": _get_current_timestamp() 131 | } 132 | material_results = [basic_result] 133 | validation_results.append(basic_result) 134 | 135 | # 存储到material_validation中以兼容现有系统 136 | material_validation[material_type] = material_results 137 | 138 | print(f"✅ {material_type}校验完成,生成{len(material_results)}个结果") 139 | 140 | except Exception as material_error: 141 | print(f"❌ 校验{material_type}时发生错误: {str(material_error)}") 142 | # 为失败的材料创建错误记录 143 | error_result = { 144 | "rule_name": f"{material_type}校验错误", 145 | "result": "❌不通过", 146 | "details": f"校验过程发生错误: {str(material_error)}", 147 | "priority": "高", 148 | "material_type": material_type, 149 | "rule_content": "", 150 | "timestamp": _get_current_timestamp() 151 | } 152 | validation_results.append(error_result) 153 | material_validation[material_type] = [error_result] 154 | 155 | # 直接返回结果,不使用任何缓存机制 156 | print(f"✅ 无缓存规则校验完成:处理{processed_count}个材料类型,生成{len(validation_results)}项结果") 157 | 158 | # 构建详细结果与摘要(供报告使用) 159 | try: 160 | from src.models.state import ValidationResult, ValidationSummary 161 | detailed_results = [] 162 | for rd in validation_results: 163 | try: 164 | detailed_results.append(ValidationResult.from_validation_output(rd)) 165 | except Exception as conv_err: 166 | print(f"⚠️ 转换验证结果失败: {conv_err}") 167 | summary = ValidationSummary.from_validation_results(detailed_results) if detailed_results else None 168 | except Exception as model_err: 169 | print(f"⚠️ 生成验证模型失败: {model_err}") 170 | detailed_results = [] 171 | summary = None 172 | 173 | return { 174 | "material_validation": material_validation, 175 | "validation_cache": validation_results, 176 | "validation_results_detailed": [r.dict() for r in detailed_results], 177 | "validation_summary": summary.dict() if summary else None, 178 | "current_step": "validation_completed", 179 | "processing_logs": [ 180 | f"处理了{processed_count}个材料类型", 181 | f"生成了{len(validation_results)}项校验结果", 182 | "已完全取消缓存机制,确保数据全新" 183 | ] 184 | } 185 | 186 | except Exception as e: 187 | print(f"❌ 规则校验失败: {str(e)}") 188 | return { 189 | "current_step": "validation_failed", 190 | "error_message": f"规则校验失败: {str(e)}" 191 | } 192 | 193 | 194 | def _process_validation_results(material_type: str, validation_results: List, 195 | validation_cache_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 196 | """ 197 | 处理AI校验结果并存入缓存 198 | """ 199 | processed_results = [] 200 | 201 | if isinstance(validation_results, list) and len(validation_results) > 0: 202 | for result in validation_results: 203 | if isinstance(result, dict): 204 | result['timestamp'] = _get_current_timestamp() 205 | processed_results.append(result) 206 | validation_cache_results.append(result) 207 | else: 208 | # 其他类型,转换为字典 209 | result_dict = { 210 | "rule_name": 
f"{material_type}校验", 211 | "result": "⚠️警告", 212 | "details": str(result), 213 | "priority": "中", 214 | "material_type": material_type, 215 | "rule_content": "", 216 | "timestamp": _get_current_timestamp() 217 | } 218 | processed_results.append(result_dict) 219 | validation_cache_results.append(result_dict) 220 | else: 221 | # 空结果 222 | result_dict = { 223 | "rule_name": f"{material_type}校验", 224 | "result": "⚠️警告", 225 | "details": "未能生成有效的校验结果", 226 | "priority": "中", 227 | "material_type": material_type, 228 | "rule_content": "", 229 | "timestamp": _get_current_timestamp() 230 | } 231 | processed_results.append(result_dict) 232 | validation_cache_results.append(result_dict) 233 | 234 | return processed_results 235 | 236 | 237 | def _get_current_timestamp() -> str: 238 | """获取当前时间戳""" 239 | from datetime import datetime 240 | return datetime.now().isoformat() 241 | 242 | 243 | def _convert_ai_status_to_result(status: str) -> str: 244 | """将AI状态转换为结果格式""" 245 | status_upper = status.upper() 246 | if status_upper == "PASS": 247 | return "✅通过" 248 | elif status_upper == "WARNING": 249 | return "⚠️警告" 250 | elif status_upper == "ERROR": 251 | return "❌不通过" 252 | else: 253 | return "⚠️警告" # 默认 254 | 255 | 256 | def _convert_ai_status_to_priority(status: str) -> str: 257 | """将AI状态转换为优先级""" 258 | status_upper = status.upper() 259 | if status_upper == "ERROR": 260 | return "高" 261 | elif status_upper == "WARNING": 262 | return "中" 263 | elif status_upper == "PASS": 264 | return "低" 265 | else: 266 | return "中" # 默认 267 | 268 | 269 | def _get_matched_rules_for_material(material_type: str, rules_by_category: Dict[str, List[Any]], all_rules: List[Any]) -> List[Any]: 270 | """ 271 | 🎯 智能规则匹配:教育经历材料只与教育经历规则集匹配 272 | 273 | Args: 274 | material_type: 材料类型(如"教育经历") 275 | rules_by_category: 按分类组织的规则 276 | all_rules: 所有规则列表(备用) 277 | 278 | Returns: 279 | 匹配的规则列表 280 | """ 281 | try: 282 | print(f"🔍 正在为{material_type}匹配规则...") 283 | 284 | # 1-17项材料分类映射表 285 | material_to_category = { 286 | # 直接匹配数字编号 287 | "1.教育经历": "1", 288 | "2.工作经历": "2", 289 | "3.继续教育": "3", 290 | "4.学术技术兼职情况": "4", 291 | "5.获奖情况": "5", 292 | "6.获得荣誉称号情况": "6", 293 | "7.主持参与科研项目": "7", 294 | "8.主持参与工程技术项目情况": "8", 295 | "9.论文": "9", 296 | "10.著(译)作(教材)": "10", 297 | "11.专利(著作权)情况": "11", 298 | "12.主持参与指定标准情况": "12", 299 | "13.成果被批示、采纳、运用和推广情况": "13", 300 | "14.资质证书": "14", 301 | "15.奖惩情况": "15", 302 | "16.考核情况": "16", 303 | "17.申报材料附件信息": "17", 304 | 305 | # 关键词匹配 306 | "教育经历": "1", 307 | "工作经历": "2", 308 | "继续教育": "3", 309 | "培训情况": "3", 310 | "学术技术兼职": "4", 311 | "获奖": "5", 312 | "荣誉称号": "6", 313 | "科研项目": "7", 314 | "工程项目": "8", 315 | "项目经历": "8", 316 | "论文": "9", 317 | "著作": "10", 318 | "教材": "10", 319 | "专利": "11", 320 | "著作权": "11", 321 | "标准": "12", 322 | "成果": "13", 323 | "证书": "14", 324 | "资质": "14", 325 | "奖惩": "15", 326 | "考核": "16", 327 | "附件": "17" 328 | } 329 | 330 | # 首先尝试直接匹配 331 | category_id = material_to_category.get(material_type) 332 | 333 | # 如果直接匹配失败,尝试关键词匹配 334 | if not category_id: 335 | for keyword, cat_id in material_to_category.items(): 336 | if keyword in material_type and len(keyword) > 2: # 避免过短的关键词 337 | category_id = cat_id 338 | print(f"🎯 通过关键词'{keyword}'匹配到分类 {cat_id}") 339 | break 340 | 341 | # 获取匹配的规则 342 | matched_rules = [] 343 | 344 | if category_id and category_id in rules_by_category: 345 | matched_rules = rules_by_category[category_id] 346 | print(f"✅ {material_type} 匹配到分类{category_id},找到 {len(matched_rules)} 条专用规则") 347 | 348 | # 如果没有找到专用规则,查找通用规则 349 | if not matched_rules: 350 | # 
查找通用规则(如交叉检验规则、通用规则等) 351 | general_rules = [] 352 | for rule in all_rules: 353 | rule_content = getattr(rule, 'content', '') if hasattr(rule, 'content') else rule.get('content', '') 354 | source_file = getattr(rule, 'source_file', '') if hasattr(rule, 'source_file') else rule.get('source_file', '') 355 | 356 | if '通用' in source_file or '交叉' in source_file or '基础' in source_file: 357 | general_rules.append(rule) 358 | 359 | if general_rules: 360 | matched_rules = general_rules 361 | print(f"⚠️ {material_type} 未找到专用规则,使用 {len(general_rules)} 条通用规则") 362 | 363 | # 最后的备用方案:返回空列表(不使用所有规则) 364 | if not matched_rules: 365 | print(f"⚠️ {material_type} 未找到任何匹配的规则,将跳过校验") 366 | 367 | return matched_rules 368 | 369 | except Exception as e: 370 | print(f"⚠️ 规则匹配失败: {e}") 371 | return [] 372 | 373 | 374 | def _extract_material_content(actual_data: Dict[str, Any]) -> str: 375 | """从材料数据中提取文本内容""" 376 | material_content = "" 377 | if isinstance(actual_data, dict): 378 | if "content" in actual_data: 379 | content_data = actual_data["content"] 380 | if isinstance(content_data, dict): 381 | # 尝试多种可能的内容字段 382 | for key in ["md_content", "raw_markdown", "text", "content"]: 383 | if key in content_data: 384 | material_content = str(content_data[key]) 385 | break 386 | if not material_content: 387 | material_content = str(content_data) 388 | else: 389 | material_content = str(content_data) 390 | else: 391 | material_content = str(actual_data) 392 | else: 393 | material_content = str(actual_data) 394 | 395 | return material_content -------------------------------------------------------------------------------- /src/nodes/pdf_extraction.py: -------------------------------------------------------------------------------- 1 | """ 2 | PDF内容提取节点 3 | 4 | 通过FastAPI接口处理PDF文件内容提取并转换为JSON格式 5 | """ 6 | 7 | import json 8 | import asyncio 9 | from typing import Dict, Any, List, Optional 10 | from pathlib import Path 11 | import logging 12 | 13 | try: 14 | import aiohttp # type: ignore[import] 15 | from aiohttp import ClientTimeout # type: ignore[import] 16 | except ImportError: 17 | print("Warning: aiohttp not installed. 
Please install with: pip install aiohttp") 18 | aiohttp = None # type: ignore 19 | ClientTimeout = None # type: ignore 20 | 21 | try: 22 | from ..graph.state import AuditState 23 | except ImportError: 24 | from src.graph.state import AuditState 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | async def extract_pdf_via_api(pdf_file_path: str, api_endpoint: str) -> Dict[str, Any]: 30 | """ 31 | 通过FastAPI提取PDF内容为JSON 32 | 33 | 基于用户提供的工作案例,使用aiohttp实现类似requests的参数传递方式: 34 | - 基础URL和查询参数分开处理 35 | - 逐个上传PDF文件(不是压缩包) 36 | - 使用multipart/form-data格式 37 | 38 | Args: 39 | pdf_file_path: PDF文件路径 40 | api_endpoint: API端点URL(不包含查询参数) 41 | 42 | Returns: 43 | 提取的JSON内容 44 | """ 45 | if aiohttp is None: 46 | return { 47 | "success": False, 48 | "error": "aiohttp库未安装,请使用 pip install aiohttp 安装", 49 | "file_path": pdf_file_path 50 | } 51 | 52 | try: 53 | # 按照用户案例的方式设置参数 54 | params = { 55 | 'parse_method': 'auto', 56 | 'is_json_md_dump': 'false', 57 | 'output_dir': 'output', 58 | 'return_layout': 'false', 59 | 'return_info': 'false', 60 | 'return_content_list': 'false', 61 | 'return_images': 'false' 62 | } 63 | 64 | # 创建请求头 65 | headers = { 66 | "accept": "application/json", 67 | "User-Agent": "LangGraph-PDF-Extractor/1.0" 68 | } 69 | 70 | print(f"📤 正在上传PDF文件: {Path(pdf_file_path).name} 到 {api_endpoint}") 71 | 72 | async with aiohttp.ClientSession() as session: 73 | # 异步读取文件内容 74 | try: 75 | file_content = await asyncio.to_thread(lambda: open(pdf_file_path, 'rb').read()) 76 | except Exception as file_error: 77 | error_msg = f"读取PDF文件失败: {str(file_error)}" 78 | print(f"❌ {error_msg}") 79 | return { 80 | "success": False, 81 | "error": error_msg, 82 | "file_path": pdf_file_path, 83 | "api_endpoint": api_endpoint 84 | } 85 | 86 | # 按照用户案例创建文件数据 87 | data = aiohttp.FormData() 88 | data.add_field( 89 | 'pdf_file', # 与用户案例中的字段名一致 90 | file_content, 91 | filename=Path(pdf_file_path).name, 92 | content_type='application/pdf' 93 | ) 94 | 95 | # 使用params参数传递查询参数,类似requests.post(url, params=params, files=files) 96 | # 创建超时设置 97 | timeout = ClientTimeout(total=120) if ClientTimeout else aiohttp.ClientTimeout(total=120) 98 | 99 | async with session.post( 100 | api_endpoint, 101 | params=params, # 查询参数单独传递 102 | data=data, # 文件数据 103 | headers=headers, 104 | timeout=timeout 105 | ) as response: 106 | print(f"📊 API响应状态码: {response.status}") 107 | 108 | if response.status == 200: 109 | try: 110 | result = await response.json() 111 | print(f"✅ 成功提取PDF内容: {Path(pdf_file_path).name}") 112 | print(f"📋 API返回结构: {list(result.keys()) if isinstance(result, dict) else type(result)}") 113 | return { 114 | "success": True, 115 | "content": result, 116 | "file_path": pdf_file_path, 117 | "api_endpoint": str(response.url), 118 | "extraction_timestamp": None 119 | } 120 | except Exception as json_error: 121 | error_text = await response.text() 122 | print(f"⚠️ API返回非JSON格式: {json_error}") 123 | return { 124 | "success": False, 125 | "error": f"API返回非JSON格式: {json_error}", 126 | "error_details": error_text[:500], 127 | "file_path": pdf_file_path, 128 | "api_endpoint": str(response.url) 129 | } 130 | else: 131 | error_text = await response.text() 132 | print(f"❌ API返回错误状态码 {response.status}: {error_text[:200]}...") 133 | return { 134 | "success": False, 135 | "error": f"API返回错误状态码: {response.status}", 136 | "error_details": error_text, 137 | "file_path": pdf_file_path, 138 | "api_endpoint": str(response.url) 139 | } 140 | 141 | except FileNotFoundError: 142 | error_msg = f"找不到PDF文件: {pdf_file_path}" 143 | print(f"❌ 
{error_msg}") 144 | return { 145 | "success": False, 146 | "error": error_msg, 147 | "file_path": pdf_file_path, 148 | "api_endpoint": api_endpoint 149 | } 150 | except Exception as e: 151 | error_msg = f"API调用失败: {str(e)}" 152 | print(f"❌ {error_msg}") 153 | return { 154 | "success": False, 155 | "error": error_msg, 156 | "file_path": pdf_file_path, 157 | "api_endpoint": api_endpoint 158 | } 159 | 160 | 161 | async def pdf_extraction_node(state: AuditState) -> Dict[str, Any]: 162 | """ 163 | 完全无缓存的PDF内容提取节点 - 每次都处理全新数据 164 | 165 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 166 | """ 167 | try: 168 | print(f"📄 开始无缓存PDF内容提取...") 169 | 170 | # 直接获取当前状态的文件夹数据 - 不使用任何缓存 171 | folder_validation = state.get("folder_validation", {}) 172 | 173 | print(f"🔍 当前状态数据:") 174 | print(f" 文件夹验证结果: {len(folder_validation.get('folders_found', []))} 个文件夹") 175 | 176 | # 验证数据有效性 177 | if not folder_validation or not folder_validation.get("folders_found"): 178 | print("⚠️ 未找到有效的文件夹结构数据") 179 | return { 180 | "current_step": "pdf_extraction_failed", 181 | "error_message": "没有找到有效的文件夹结构", 182 | "processing_logs": ["没有找到有效的文件夹结构"] 183 | } 184 | 185 | # 获取PDF API端点配置 186 | api_endpoint = state.get("pdf_api_endpoint") 187 | if not api_endpoint: 188 | # 尝试使用默认配置 189 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 190 | print(f"⚠️ 状态中未配置PDF API端点,使用默认端点: {api_endpoint}") 191 | 192 | # 检查是否有配置文件 193 | try: 194 | from src.config.api_config import get_pdf_api_config 195 | api_config = get_pdf_api_config() 196 | configured_endpoint = api_config.get("pdf_extraction_endpoint") 197 | if configured_endpoint: 198 | api_endpoint = configured_endpoint 199 | print(f"✅ 从配置文件获取到API端点: {api_endpoint}") 200 | except ImportError: 201 | print("⚠️ 无法导入API配置模块,使用硬编码默认端点") 202 | except Exception as e: 203 | print(f"⚠️ 读取API配置失败: {e},使用硬编码默认端点") 204 | 205 | # 如果仍然没有API端点,返回错误 206 | if not api_endpoint: 207 | return { 208 | "current_step": "pdf_extraction_failed", 209 | "error_message": "未配置PDF提取API端点,请检查配置文件或环境变量" 210 | } 211 | 212 | folders_found = folder_validation["folders_found"] 213 | pdf_extraction_results = {} 214 | api_extraction_results = {} 215 | total_pdf_files = 0 216 | successful_extractions = 0 217 | 218 | # 处理每个标准文件夹中的PDF文件 219 | for folder_info in folders_found: 220 | folder_name = folder_info["name"] 221 | folder_path = folder_info["path"] 222 | 223 | print(f"📁 处理文件夹: {folder_name}") 224 | 225 | # 查找文件夹中的PDF文件(异步方式) 226 | folder_path_obj = Path(folder_path) 227 | 228 | # 使用asyncio.to_thread来异步执行文件系统操作 229 | try: 230 | pdf_files = await asyncio.to_thread(lambda: list(folder_path_obj.glob("*.pdf"))) 231 | except Exception as glob_error: 232 | print(f"❌ 扫描文件夹 {folder_name} 时发生错误: {str(glob_error)}") 233 | pdf_extraction_results[folder_name] = { 234 | "files": [], 235 | "folder_path": folder_path, 236 | "material_type": folder_name, 237 | "pdf_files_count": 0, 238 | "status": "error", 239 | "error": str(glob_error) 240 | } 241 | continue 242 | 243 | if not pdf_files: 244 | print(f"⚠️ 文件夹 {folder_name} 中没有找到PDF文件") 245 | pdf_extraction_results[folder_name] = { 246 | "files": [], 247 | "folder_path": folder_path, 248 | "material_type": folder_name, 249 | "pdf_files_count": 0, 250 | "status": "empty" 251 | } 252 | continue 253 | 254 | total_pdf_files += len(pdf_files) 255 | folder_results = [] 256 | 257 | # 使用asyncio并发处理PDF文件提取 258 | tasks = [] 259 | for pdf_file in pdf_files: 260 | task = extract_pdf_via_api(str(pdf_file), api_endpoint) 261 | tasks.append(task) 262 | 263 | # 并发执行API调用 264 | results = await 
asyncio.gather(*tasks, return_exceptions=True) 265 | 266 | for pdf_file, result in zip(pdf_files, results): 267 | if isinstance(result, Exception): 268 | print(f"❌ 处理文件 {pdf_file.name} 时发生异常: {str(result)}") 269 | folder_results.append({ 270 | "file_name": pdf_file.name, 271 | "file_path": str(pdf_file), 272 | "success": False, 273 | "error": str(result), 274 | "material_type": folder_name 275 | }) 276 | elif isinstance(result, dict) and result.get("success"): 277 | print(f"✅ 成功提取 {pdf_file.name}") 278 | successful_extractions += 1 279 | 280 | # 异步获取文件大小 281 | try: 282 | file_size = await asyncio.to_thread(lambda: pdf_file.stat().st_size) 283 | except Exception as stat_error: 284 | print(f"⚠️ 获取文件大小失败: {stat_error}") 285 | file_size = 0 286 | 287 | # 构建标准化JSON格式 288 | standardized_json = { 289 | "metadata": { 290 | "file_name": pdf_file.name, 291 | "file_path": str(pdf_file), 292 | "size_bytes": file_size, 293 | "material_type": folder_name, 294 | "extraction_method": "api" 295 | }, 296 | "content": result.get("content", {}), 297 | "validation": { 298 | "is_valid": True, 299 | "api_endpoint": api_endpoint, 300 | "extraction_timestamp": result.get("extraction_timestamp") 301 | } 302 | } 303 | 304 | folder_results.append({ 305 | "file_name": pdf_file.name, 306 | "file_path": str(pdf_file), 307 | "success": True, 308 | "json_data": standardized_json, 309 | "json_string": json.dumps(standardized_json, ensure_ascii=False, indent=2), 310 | "format": "strict_json", 311 | "size": len(json.dumps(standardized_json)), 312 | "material_type": folder_name 313 | }) 314 | 315 | # 存储API提取结果 316 | if folder_name not in api_extraction_results: 317 | api_extraction_results[folder_name] = [] 318 | api_extraction_results[folder_name].append(standardized_json) 319 | 320 | else: 321 | # 处理失败的情况 322 | error_msg = "未知错误" 323 | if isinstance(result, dict): 324 | error_msg = result.get("error", "未知错误") 325 | print(f"❌ 提取失败 {pdf_file.name}: {error_msg}") 326 | folder_results.append({ 327 | "file_name": pdf_file.name, 328 | "file_path": str(pdf_file), 329 | "success": False, 330 | "error": error_msg, 331 | "material_type": folder_name 332 | }) 333 | 334 | pdf_extraction_results[folder_name] = { 335 | "files": folder_results, 336 | "folder_path": folder_path, 337 | "material_type": folder_name, 338 | "pdf_files_count": len(pdf_files), 339 | "successful_count": len([r for r in folder_results if r.get("success")]), 340 | "status": "success" if folder_results else "empty" 341 | } 342 | 343 | success_folders = sum(1 for item in pdf_extraction_results.values() 344 | if item.get("status") in ["success", "empty"]) # 包括空文件夹 345 | total_folders = len(pdf_extraction_results) 346 | 347 | print(f"✅ PDF内容提取完成: {success_folders}/{total_folders}个文件夹,{successful_extractions}/{total_pdf_files}个PDF文件提取成功") 348 | 349 | # 即使没有PDF文件,只要有文件夹结构就认为成功 350 | if total_folders > 0: 351 | return { 352 | "pdf_extraction_results": pdf_extraction_results, 353 | "api_extraction_results": api_extraction_results, 354 | "extracted_content": api_extraction_results, # 保持兼容性 355 | "current_step": "pdf_extraction_completed", 356 | "processing_stats": { 357 | "total_folders": total_folders, 358 | "successful_folders": success_folders, 359 | "total_pdf_files": total_pdf_files, 360 | "successful_extractions": successful_extractions, 361 | "extraction_rate": successful_extractions / total_pdf_files if total_pdf_files > 0 else 0 362 | } 363 | } 364 | else: 365 | return { 366 | "current_step": "pdf_extraction_failed", 367 | "error_message": "未找到可处理的文件夹" 368 | } 369 
| 370 | except Exception as e: 371 | logger.error(f"PDF内容提取失败: {str(e)}") 372 | print(f"❌ PDF内容提取失败: {str(e)}") 373 | return { 374 | "current_step": "pdf_extraction_failed", 375 | "error_message": f"PDF内容提取失败: {str(e)}" 376 | } 377 | 378 | 379 | def configure_pdf_api_endpoint(state: AuditState, api_endpoint: str) -> Dict[str, Any]: 380 | """ 381 | 配置PDF提取API端点 382 | 383 | Args: 384 | state: 当前状态 385 | api_endpoint: API端点URL 386 | 387 | Returns: 388 | 更新的状态 389 | """ 390 | return { 391 | "pdf_api_endpoint": api_endpoint, 392 | "processing_logs": [f"已配置PDF提取API端点: {api_endpoint}"] 393 | } -------------------------------------------------------------------------------- /src/models/state.py: -------------------------------------------------------------------------------- 1 | """ 2 | 数据模型定义 3 | 4 | 定义审核流程中使用的数据模型(不包括LangGraph状态) 5 | 6 | 模型使用状态: 7 | - CoreInfo: ✅ 高度活跃 - 在多个节点中实际使用 8 | - RuleInfo: ✅ 高度活跃 - 规则处理核心模型 9 | - ValidationResult: ⚠️ 部分使用 - 主要用作类型注解 10 | - CrossValidationResult: ⚠️ 部分使用 - 主要用作类型注解 11 | - MaterialProcessingStats: ✅ 有效使用 - 在报告生成中实际使用 12 | - AuditReport: ⚠️ 部分功能未启用 - 完善但使用有限 13 | 14 | 已移除未使用模型: 15 | - FileInfo: ✖️ 已移除 - 几乎未使用 16 | - MaterialInfo: ✖️ 已移除 - 使用场景有限,可用Dict替代 17 | - ReportSummary: ✖️ 已移除 - 完全未使用 18 | """ 19 | 20 | from typing import List, Dict, Any, Optional, Union 21 | from pydantic import BaseModel, Field 22 | 23 | 24 | # ============================================================================ 25 | # 核心业务模型(高度活跃) 26 | # ============================================================================ 27 | class CoreInfo(BaseModel): 28 | """核心信息模型(简化版) - ✅ 高度活跃模型""" 29 | name: str = Field(description="姓名,统一格式,去除空格", default="") 30 | gender: str = Field(description="性别,男/女", default="") 31 | id_number: str = Field(description="身份证号,18位标准格式", default="") 32 | extracted_from: List[str] = Field(description="信息来源材料", default_factory=list) 33 | 34 | 35 | # ============================================================================ 36 | # 校验结果模型(部分使用) 37 | # ============================================================================ 38 | class ValidationResult(BaseModel): 39 | """校验结果模型 - 增强版,完整存储validation节点的所有输出信息""" 40 | rule_id: str 41 | rule_name: str 42 | status: str # PASS, WARNING, ERROR 43 | result: str # "✅通过", "⚠️警告", "❌不通过" 44 | message: str 45 | details: str = Field(description="详细描述信息") 46 | priority: str = Field(description="优先级:高/中/低") 47 | material_type: str = Field(description="材料类型") 48 | rule_content: str = Field(description="应用的规则内容", default="") 49 | ai_powered: bool = Field(description="是否AI驱动的校验", default=False) 50 | rules_applied: int = Field(description="应用的规则数量", default=0) 51 | timestamp: str = Field(description="校验时间戳") 52 | 53 | @classmethod 54 | def from_validation_output(cls, validation_dict: Dict[str, Any]) -> "ValidationResult": 55 | """从validation节点输出的字典创建ValidationResult对象""" 56 | return cls( 57 | rule_id=validation_dict.get('rule_name', '').replace(' ', '_'), 58 | rule_name=validation_dict.get('rule_name', ''), 59 | status=cls._convert_result_to_status(validation_dict.get('result', '')), 60 | result=validation_dict.get('result', ''), 61 | message=validation_dict.get('details', ''), 62 | details=validation_dict.get('details', ''), 63 | priority=validation_dict.get('priority', '中'), 64 | material_type=validation_dict.get('material_type', ''), 65 | rule_content=validation_dict.get('rule_content', ''), 66 | ai_powered=validation_dict.get('ai_powered', False), 67 | rules_applied=validation_dict.get('rules_applied', 0), 68 | 
timestamp=validation_dict.get('timestamp', '') 69 | ) 70 | 71 | @staticmethod 72 | def _convert_result_to_status(result: str) -> str: 73 | """将结果转换为状态""" 74 | if result.startswith('✅'): 75 | return 'PASS' 76 | elif result.startswith('⚠️'): 77 | return 'WARNING' 78 | elif result.startswith('❌'): 79 | return 'ERROR' 80 | else: 81 | return 'WARNING' 82 | 83 | 84 | class ValidationSummary(BaseModel): 85 | """验证结果摘要模型 - 存储validation节点的完整统计信息""" 86 | total_materials_processed: int = Field(description="处理的材料数量") 87 | total_validations: int = Field(description="总校验数量") 88 | successful_materials: int = Field(description="成功校验的材料数量") 89 | error_count: int = Field(description="错误数量") 90 | warning_count: int = Field(description="警告数量") 91 | pass_count: int = Field(description="通过数量") 92 | ai_powered_validations: int = Field(description="AI驱动的校验数量") 93 | total_rules_applied: int = Field(description="应用的规则总数") 94 | materials_by_type: Dict[str, int] = Field(description="按材料类型统计", default_factory=dict) 95 | validation_start_time: Optional[str] = Field(description="校验开始时间", default=None) 96 | validation_end_time: Optional[str] = Field(description="校验结束时间", default=None) 97 | 98 | @classmethod 99 | def from_validation_results(cls, validation_results: List[ValidationResult]) -> "ValidationSummary": 100 | """从验证结果列表创建摘要""" 101 | error_count = sum(1 for r in validation_results if r.status == 'ERROR') 102 | warning_count = sum(1 for r in validation_results if r.status == 'WARNING') 103 | pass_count = sum(1 for r in validation_results if r.status == 'PASS') 104 | ai_powered_count = sum(1 for r in validation_results if r.ai_powered) 105 | total_rules = sum(r.rules_applied for r in validation_results) 106 | 107 | materials_by_type = {} 108 | for result in validation_results: 109 | mat_type = result.material_type 110 | materials_by_type[mat_type] = materials_by_type.get(mat_type, 0) + 1 111 | 112 | return cls( 113 | total_materials_processed=len(set(r.material_type for r in validation_results)), 114 | total_validations=len(validation_results), 115 | successful_materials=len(set(r.material_type for r in validation_results if r.status != 'ERROR')), 116 | error_count=error_count, 117 | warning_count=warning_count, 118 | pass_count=pass_count, 119 | ai_powered_validations=ai_powered_count, 120 | total_rules_applied=total_rules, 121 | materials_by_type=materials_by_type 122 | ) 123 | 124 | 125 | class CrossValidationResult(BaseModel): 126 | """交叉校验结果模型 - ⚠️ 主要用作类型注解,实际多使用Dict""" 127 | validation_type: str # name_consistency, id_consistency, time_logic, data_rationality 128 | status: str # PASS, WARNING, ERROR 129 | message: str 130 | conflicts: List[str] = [] 131 | 132 | 133 | # ============================================================================ 134 | # 规则相关模型(高度活跃) 135 | # ============================================================================ 136 | class RuleInfo(BaseModel): 137 | """规则信息模型 - ✅ 高度活跃模型,在rules_processing和validation中大量使用""" 138 | rule_id: str = Field(description="规则唯一标识") 139 | content: str = Field(description="规则内容") 140 | source_file: str = Field(description="来源文件名") 141 | category: str = Field(description="1-17中的分类编号", default="17") 142 | priority: str = Field(description="优先级", default="normal") 143 | 144 | 145 | class RuleFileInfo(BaseModel): 146 | """规则文件信息模型 - ✅ 在rules_processing中使用""" 147 | file_name: str = Field(description="规则文件名") 148 | file_path: str = Field(description="文件完整路径") 149 | file_type: str = Field(description="文件类型 (.xlsx 或 .md)") 150 | size: int = 
Field(description="文件大小或规则数量") 151 | content: Optional[str] = Field(description="文件原始内容(仅Markdown文件)", default=None) 152 | extracted_rules: Optional[List[RuleInfo]] = Field(description="提取的规则列表(仅Excel文件)", default=None) 153 | 154 | 155 | # ============================================================================ 156 | # 状态管理模型 157 | # ============================================================================ 158 | class AuditState(BaseModel): 159 | """审核工作流状态定义(业务数据模型)""" 160 | 161 | # 输入文件信息 162 | uploaded_file: Optional[str] = None # 上传的文件路径 163 | file_type: str = "" # 文件类型 (zip/pdf/doc等) 164 | 165 | # 文件处理结果 166 | extracted_files: List[str] = Field(default_factory=list) # 解压后的文件列表 167 | file_classification: Dict[str, str] = Field(default_factory=dict) # 文件分类结果 168 | 169 | # PDF处理 170 | pdf_analysis: Dict[str, Any] = Field(default_factory=dict) # PDF页数分析结果 171 | pdf_chunks: Dict[str, List[str]] = Field(default_factory=dict) # PDF分片结果 172 | 173 | # 内容提取 174 | extracted_content: Dict[str, Any] = Field(default_factory=dict) # 提取的内容信息 175 | core_info: Optional[Dict[str, Any]] = None # 核心信息(姓名、身份证号) 176 | 177 | # 规则处理 178 | rules_data: List[RuleFileInfo] = Field(default_factory=list) # 加载的规则文件数据 179 | parsed_rules: List[RuleInfo] = Field(default_factory=list) # 解析后的规则列表 180 | rules_by_category: Dict[str, List[RuleInfo]] = Field(default_factory=dict) # 按1-17项分类的规则 181 | 182 | # 验证结果(完整存储) 183 | validation_results_detailed: List[ValidationResult] = Field(description="详细的验证结果列表", default_factory=list) 184 | validation_summary: Optional[ValidationSummary] = Field(description="验证结果摘要", default=None) 185 | material_validation: Dict[str, List[Any]] = Field(default_factory=dict) # 材料校验结果(兼容) 186 | cross_validation: List[Any] = Field(default_factory=list) # 交叉校验结果(并发安全) 187 | validation_results: List[Dict[str, Any]] = Field(default_factory=list) # 所有校验结果(兼容) 188 | 189 | # 报告生成 190 | audit_report: Optional["AuditReport"] = None # 生成的审核报告对象 191 | report_path: Optional[str] = None # 报告文件路径 192 | 193 | # 流程控制 194 | current_step: str = "file_processing" # 当前步骤 195 | error_message: Optional[str] = None # 错误信息 196 | warnings: List[str] = Field(default_factory=list) # 警告信息 197 | processing_logs: List[str] = Field(default_factory=list) # 处理日志 198 | is_complete: bool = False # 是否完成 199 | 200 | # Redis缓存相关 201 | session_id: Optional[str] = None # 会话ID 202 | 203 | 204 | # ============================================================================ 205 | # 报告相关模型(部分功能未启用) 206 | # ============================================================================ 207 | 208 | 209 | class AuditReport(BaseModel): 210 | """审核报告模型(增强版) - ⚠️ 完善但使用有限,主要作为类型注解""" 211 | 212 | # 报告基本信息 213 | report_id: str = Field(description="报告唯一标识") 214 | generated_at: str = Field(description="生成时间") 215 | report_version: str = Field(description="报告版本", default="v2.0") 216 | 217 | # 申报人信息 218 | applicant_info: CoreInfo = Field(description="申报人核心信息") 219 | 220 | # 审核摘要 221 | summary: Dict[str, Any] = Field(description="审核结果摘要", default_factory=dict) 222 | 223 | # 材料处理统计 224 | processing_stats: Dict[str, Any] = Field(description="处理统计信息", default_factory=dict) 225 | 226 | # 校验结果分类(按严重程度) 227 | severe_issues: List[ValidationResult] = Field(description="严重问题", default_factory=list) 228 | warnings: List[ValidationResult] = Field(description="警告问题", default_factory=list) 229 | suggestions: List[ValidationResult] = Field(description="建议优化", default_factory=list) 230 | passed_validations: List[ValidationResult] = Field(description="通过的校验", 
default_factory=list) 231 | 232 | # 交叉校验结果 233 | cross_validation_results: List[CrossValidationResult] = Field(description="交叉校验结果", default_factory=list) 234 | 235 | # 按材料分类的结果 236 | material_results: Dict[str, List[ValidationResult]] = Field(description="按材料类型分类的结果", default_factory=dict) 237 | 238 | # 规则应用统计 239 | rules_applied: Dict[str, Any] = Field(description="应用的规则统计", default_factory=dict) 240 | 241 | # HTML报告内容 242 | html_content: Optional[str] = Field(description="生成的HTML报告内容", default=None) 243 | 244 | # 报告文件路径 245 | file_path: Optional[str] = Field(description="报告文件保存路径", default=None) 246 | 247 | # 质量评分 248 | quality_score: Optional[float] = Field(description="材料质量评分(0-100)", default=None) 249 | 250 | # 合规性评估 251 | compliance_status: str = Field(description="合规性状态", default="PENDING") # PASS/WARNING/FAIL/PENDING 252 | 253 | # 建议措施 254 | recommendations: List[str] = Field(description="改进建议", default_factory=list) 255 | 256 | # 审核日志 257 | audit_logs: List[str] = Field(description="审核过程日志", default_factory=list) 258 | 259 | @classmethod 260 | def create_from_state(cls, state: Any, report_id: str) -> "AuditReport": 261 | """从审核状态创建报告""" 262 | from datetime import datetime 263 | 264 | # 获取核心信息 265 | core_info = state.get('core_info') or {} if hasattr(state, 'get') else getattr(state, 'core_info', None) or {} 266 | 267 | # 处理字典和对象访问 268 | def get_state_value(key: str, default=None): 269 | if hasattr(state, 'get'): # 字典类型 270 | return state.get(key, default) 271 | else: # 对象类型 272 | return getattr(state, key, default) 273 | 274 | applicant_info = CoreInfo( 275 | name=core_info.get('name', '') if isinstance(core_info, dict) else '', 276 | gender=core_info.get('gender', '') if isinstance(core_info, dict) else '', 277 | id_number=core_info.get('id_number', '') if isinstance(core_info, dict) else '', 278 | extracted_from=core_info.get('extracted_from', []) if isinstance(core_info, dict) else [] 279 | ) 280 | 281 | # 创建报告实例 282 | audit_logs = get_state_value('processing_logs', []) 283 | if not isinstance(audit_logs, list): 284 | audit_logs = [] 285 | 286 | return cls( 287 | report_id=report_id, 288 | generated_at=datetime.now().isoformat(), 289 | applicant_info=applicant_info, 290 | processing_stats=MaterialProcessingStats.from_state(state).dict(), 291 | audit_logs=audit_logs 292 | ) 293 | 294 | def calculate_quality_score(self) -> float: 295 | """计算质量评分""" 296 | total_validations = len(self.severe_issues) + len(self.warnings) + len(self.passed_validations) 297 | if total_validations == 0: 298 | return 100.0 299 | 300 | # 计算分数:错误扣分更多,警告扣分较少 301 | error_penalty = len(self.severe_issues) * 10 302 | warning_penalty = len(self.warnings) * 3 303 | total_penalty = error_penalty + warning_penalty 304 | 305 | score = max(0, 100 - total_penalty) 306 | return score 307 | 308 | def determine_compliance_status(self) -> str: 309 | """确定合规性状态""" 310 | if len(self.severe_issues) > 0: 311 | return "FAIL" 312 | elif len(self.warnings) > 0: 313 | return "WARNING" 314 | else: 315 | return "PASS" 316 | 317 | def get_summary_dict(self) -> Dict[str, Any]: 318 | """获取摘要字典""" 319 | return { 320 | "total_validations": len(self.severe_issues) + len(self.warnings) + len(self.passed_validations), 321 | "error_count": len(self.severe_issues), 322 | "warning_count": len(self.warnings), 323 | "passed_count": len(self.passed_validations), 324 | "cross_validation_count": len(self.cross_validation_results), 325 | "quality_score": self.quality_score or self.calculate_quality_score(), 326 | "compliance_status": 
self.compliance_status 327 | } 328 | 329 | 330 | # ============================================================================ 331 | # 统计模型(有效使用) 332 | # ============================================================================ 333 | 334 | 335 | class MaterialProcessingStats(BaseModel): 336 | """材料处理统计模型 - ✅ 在AuditReport中有实际应用""" 337 | files_extracted: int = Field(description="解压文件数量", default=0) 338 | pdfs_processed: int = Field(description="处理的PDF数量", default=0) 339 | content_extracted: bool = Field(description="内容提取成功", default=False) 340 | core_info_extracted: bool = Field(description="核心信息提取成功", default=False) 341 | categories_classified: List[str] = Field(description="已分类的材料类型", default_factory=list) 342 | 343 | @classmethod 344 | def from_state(cls, state: Any) -> "MaterialProcessingStats": 345 | """从审核状态创建处理统计""" 346 | # 处理字典和对象访问 347 | def get_state_value(key: str, default=None): 348 | if hasattr(state, 'get'): # 字典类型 349 | return state.get(key, default) 350 | else: # 对象类型 351 | return getattr(state, key, default) 352 | 353 | extracted_files = get_state_value('extracted_files', []) or [] 354 | extracted_content = get_state_value('extracted_content', {}) or {} 355 | core_info = get_state_value('core_info') 356 | 357 | return cls( 358 | files_extracted=len(extracted_files), 359 | pdfs_processed=len([f for f in extracted_files if f.lower().endswith('.pdf')]), 360 | content_extracted=len(extracted_content) > 0, 361 | core_info_extracted=bool(core_info and ( 362 | core_info.get('name') or core_info.get('id_number') 363 | if isinstance(core_info, dict) else False 364 | )), 365 | categories_classified=list(extracted_content.keys()) if extracted_content else [] 366 | ) --------------------------------------------------------------------------------
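[Editor's note] A minimal usage sketch for the report models above; the final_state dict is illustrative, not a captured run:

# Build an AuditReport from a finished workflow state and derive its scores.
from src.models.state import AuditReport, MaterialProcessingStats

final_state = {
    "core_info": {"name": "张三", "gender": "男", "id_number": "", "extracted_from": ["1.教育经历"]},
    "extracted_files": ["1.教育经历/degree.pdf"],
    "extracted_content": {"education": {}},
    "processing_logs": ["PDF提取完成", "规则校验完成"],
}

report = AuditReport.create_from_state(final_state, report_id="audit-demo-001")
report.quality_score = report.calculate_quality_score()          # 100.0: no issues recorded yet
report.compliance_status = report.determine_compliance_status()  # "PASS" while severe_issues/warnings are empty
print(report.get_summary_dict())
print(MaterialProcessingStats.from_state(final_state).dict())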