├── rules
│   ├── 9.论文规则集.xlsx
│   ├── 1.教育经历规则集.xlsx
│   ├── 16.考核规则集.xlsx
│   ├── 17.附件规则集.xlsx
│   ├── 2.工作经历规则集.xlsx
│   ├── 8.项目经历规则集.xlsx
│   ├── 交叉检验规则.md
│   ├── 14.资质证书规则集.xlsx
│   └── 11.专利(著作权)情况规则集.xlsx
├── src
│   ├── __init__.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── warning_config.py
│   │   ├── api_config.py
│   │   ├── model_config.py
│   │   └── redis.py
│   ├── models
│   │   ├── __init__.py
│   │   └── state.py
│   ├── nodes
│   │   ├── __init__.py
│   │   ├── file_processing.py
│   │   ├── cross_validation.py
│   │   ├── report_generation.py
│   │   ├── core_info_extraction.py
│   │   ├── validation.py
│   │   └── pdf_extraction.py
│   ├── graph
│   │   ├── __init__.py
│   │   ├── workflow.py
│   │   ├── state.py
│   │   └── edges.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── workflow_integration.py
│   │   ├── common_utils.py
│   │   ├── cache_manager.py
│   │   ├── langsmith_utils.py
│   │   └── file_utils.py
│   └── agent.py
├── langgraph.json
├── __init__.py
├── README.md
├── graph_def.py
├── .gitignore
├── requirements.txt
├── .env.example
├── pyproject.toml
└── static
    ├── styles.css
    └── index.html

/rules/9.论文规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/9.论文规则集.xlsx --------------------------------------------------------------------------------
/rules/1.教育经历规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/1.教育经历规则集.xlsx --------------------------------------------------------------------------------
/rules/16.考核规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/16.考核规则集.xlsx --------------------------------------------------------------------------------
/rules/17.附件规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/17.附件规则集.xlsx --------------------------------------------------------------------------------
/rules/2.工作经历规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/2.工作经历规则集.xlsx --------------------------------------------------------------------------------
/rules/8.项目经历规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/8.项目经历规则集.xlsx --------------------------------------------------------------------------------
/rules/交叉检验规则.md: -------------------------------------------------------------------------------- 1 | 1.所有材料中的主人公姓名必须一致 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | --------------------------------------------------------------------------------
/rules/14.资质证书规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/14.资质证书规则集.xlsx --------------------------------------------------------------------------------
/rules/11.专利(著作权)情况规则集.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/11.专利(著作权)情况规则集.xlsx --------------------------------------------------------------------------------
/src/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | 主要源代码目录 3 | 4 | 包含系统的核心模块: 5 | - graph: LangGraph工作流定义 6 | - nodes: 各个处理节点实现 7 | - tools: 工具函数和辅助模块 8 | - models: 数据模型和状态定义 9 | - services: 业务服务层 10 | """ -------------------------------------------------------------------------------- /langgraph.json: -------------------------------------------------------------------------------- 1 | {{ 2 | "dependencies": ["."], 3 | "graphs": {{ 4 | "audit_workflow": "graph_def:graph" 5 | }}, 6 | "dockerfile_lines": [], 7 | "python_version": "3.12", 8 | "env": ".env", 9 | "port": 8123 10 | }} -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 企业级职称评审材料审核系统 3 | 基于 LangGraph 框架的智能化审核流程 4 | 5 | 项目结构说明: 6 | - src/: 主要源代码目录 7 | - graph/: LangGraph工作流定义 8 | - nodes/: 各个处理节点实现 9 | - tools/: 工具函数和辅助模块 10 | - models/: 数据模型和状态定义 11 | - services/: 业务服务层 12 | - config/: 配置文件 13 | - tests/: 测试代码 14 | - docs/: 文档目录 15 | - data/: 数据存储目录 16 | """ 17 | 18 | # 系统版本信息 19 | __version__ = "1.0.0" 20 | __author__ = "Abruzz1" 21 | __description__ = "企业级职称评审材料审核系统" -------------------------------------------------------------------------------- /src/config/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 配置模块 3 | 4 | 包含项目所有配置相关的功能: 5 | - Redis 配置和连接管理 6 | - 环境变量配置 7 | - 其他系统配置 8 | """ 9 | 10 | from .model_config import ( 11 | model_config, 12 | setup_model_environment, 13 | setup_model_environment_sync, 14 | print_model_help 15 | ) 16 | 17 | __all__ = [ 18 | 'model_config', 19 | 'setup_model_environment', 20 | 'setup_model_environment_sync', 21 | 'print_model_help' 22 | ] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 企业级职称评审材料审核系统 2 | 3 | 基于LangGraph框架构建的智能化职称评审材料审核系统,通过AI技术自动化处理和校验职称申报材料。 4 | 5 | 🔧 **集成LangSmith调试和监控功能** - 提供完整的工作流追踪、性能监控和调试支持。 6 | 7 | ## 系统架构 8 | 9 | 系统采用LangGraph图形化工作流设计,包含以下主要模块: 10 | 11 | 1. **文件处理模块** - ZIP解压、文件分类 12 | 2. **PDF智能处理** - 页数检测、智能分片 13 | 3. **内容提取** - AI识别、17类材料分类 14 | 4. **规则校验** - 各类材料规则验证 15 | 5. **交叉校验** - 核心信息一致性检查 16 | 6. **报告生成** - HTML格式化输出 17 | 18 | ## 安装说明 19 | 1.创建虚拟环境:python -m venv venv 20 | 21 | 激活虚拟环境:venv\Scripts\activate(Windows)或 source venv/bin/activate(Linux/macOS) 22 | 23 | 2.安装依赖:pip install . 24 | 25 | 或使用 pip install -r requirements.txt 26 | 27 | 3.打开开发工具:langgraph dev 28 | 29 | 4.启动网页端:python web_app_v2.py 30 | 31 | 
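32 | ## 快速上手
33 | 
34 | 以下是一个最小调用示例(示意用法:图对象来自 graph_def.py,即 langgraph.json 注册的 audit_workflow;入口状态字段以 src/graph/state.py 中的 AuditState 为准,这里假定通过 uploaded_file 字段传入ZIP材料包路径,materials.zip 为假定的示例文件名):
35 | 
36 | ```python
37 | import asyncio
38 | from graph_def import graph  # langgraph.json 中注册的 audit_workflow 图
39 | 
40 | # 假定:初始状态只需提供ZIP路径;file_processing 节点兼容 uploaded_file / zip_file_path 两个字段
41 | state = asyncio.run(graph.ainvoke({"uploaded_file": "materials.zip"}))
42 | 
43 | # report_generation 节点会写出HTML报告,并在最终状态中返回保存路径
44 | print(state.get("report_path"))
45 | ```
46 | 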
-------------------------------------------------------------------------------- /graph_def.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | LangGraph 工作流图定义 4 | 5 | 专门用于 LangGraph Studio 的图定义文件 6 | 避免复杂的导入路径问题 7 | """ 8 | 9 | import sys 10 | import os 11 | 12 | # 确保项目路径在sys.path中 13 | project_root = os.path.dirname(os.path.abspath(__file__)) 14 | if project_root not in sys.path: 15 | sys.path.insert(0, project_root) 16 | 17 | try: 18 | # 导入工作流创建函数 19 | from src.graph.workflow import create_audit_workflow 20 | 21 | # 创建图对象 22 | graph = create_audit_workflow() 23 | 24 | print("✅ LangGraph 工作流图已成功创建") 25 | 26 | except Exception as e: 27 | print(f"❌ 创建图失败: {e}") 28 | import traceback 29 | traceback.print_exc() 30 | raise -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 数据模型包 3 | 4 | 定义系统中使用的所有数据模型: 5 | - 状态管理模型 6 | - 业务数据模型 7 | - 配置模型 8 | 9 | 模型使用状态说明: 10 | ✅ 高度活跃: CoreInfo, RuleInfo, RuleFileInfo, MaterialProcessingStats 11 | ⚠️ 部分使用: ValidationResult, CrossValidationResult, AuditReport 12 | ✖️ 已移除: FileInfo, MaterialInfo, ReportSummary 13 | """ 14 | 15 | from .state import ( 16 | CoreInfo, 17 | ValidationResult, 18 | CrossValidationResult, 19 | RuleInfo, 20 | RuleFileInfo, 21 | AuditReport, 22 | AuditState, 23 | MaterialProcessingStats 24 | ) 25 | 26 | __all__ = [ 27 | "CoreInfo", 28 | "ValidationResult", 29 | "CrossValidationResult", 30 | "RuleInfo", 31 | "RuleFileInfo", 32 | "AuditReport", 33 | "AuditState", 34 | "MaterialProcessingStats" 35 | ] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python bytecode / cache 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Virtual environments 7 | .venv/ 8 | venv/ 9 | env/ 10 | venv313/ 11 | 12 | # Test caches & coverage 13 | .pytest_cache/ 14 | .mypy_cache/ 15 | .ruff_cache/ 16 | .tox/ 17 | .nox/ 18 | .coverage* 19 | coverage.xml 20 | htmlcov/ 21 | 22 | # Packaging / build artifacts 23 | build/ 24 | dist/ 25 | .eggs/ 26 | *.egg-info/ 27 | *.egg 28 | pip-wheel-metadata/ 29 | 30 | # Jupyter 31 | .ipynb_checkpoints/ 32 | 33 | # Logs 34 | logs/ 35 | *.log 36 | 37 | # IDE / OS 38 | .vscode/ 39 | .idea/ 40 | .DS_Store 41 | Thumbs.db 42 | desktop.ini 43 | 44 | # Environment files 45 | .env 46 | !.env.example 47 | 48 | # Project-specific temporary/data dirs 49 | test_data/ 50 | temp_pdf_processing/ 51 | uploads/ 52 | extracted/ 53 | .model_cache/ 54 | .langgraph_api/ 55 | .qoder/ 56 | 57 | # Optional: front-end deps if ever used 58 | node_modules/ 59 | 60 | -------------------------------------------------------------------------------- /src/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 节点定义模块 3 | 4 | 包含LangGraph所有节点定义: 5 | - ZIP解压和文件夹处理节点 (file_processing) 6 | - PDF内容提取节点 (pdf_extraction) 7 | - 核心信息提取节点 (core_info_extraction) 8 | - 规则校验节点 (validation) 9 | - 交叉校验节点 (cross_validation) 10 | - 报告生成节点 (report_generation) 11 | - 规则集加载节点 (load_rules) 12 | - 规则集提取节点 (extract_rules) 13 | """ 14 | 15 | # 从独立的节点文件中导入各个节点 16 | from .file_processing import file_processing_node 17 | 18 | from .pdf_extraction import 
pdf_extraction_node 19 | from .core_info_extraction import core_info_extraction_node 20 | from .validation import validation_node 21 | from .cross_validation import cross_validation_node 22 | from .report_generation import report_generation_node 23 | 24 | # 规则处理节点 25 | from .rules_processing import load_rules_node, extract_rules_node 26 | 27 | 28 | __all__ = [ 29 | "file_processing_node", 30 | "pdf_extraction_node", 31 | "core_info_extraction_node", 32 | "validation_node", 33 | "cross_validation_node", 34 | "report_generation_node", 35 | "load_rules_node", 36 | "extract_rules_node" 37 | ] -------------------------------------------------------------------------------- /src/graph/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | LangGraph工作流定义模块 3 | 4 | 包含系统的主要工作流: 5 | - workflow.py: 主要的审核工作流定义 6 | - state.py: 工作流状态管理 7 | - edges.py: 边和路由逻辑定义 8 | """ 9 | 10 | from .workflow import ( 11 | create_audit_workflow, 12 | get_default_workflow 13 | ) 14 | 15 | from .state import ( 16 | AuditState, 17 | WorkflowConfig, 18 | create_initial_state, 19 | update_state_step, 20 | add_warning, 21 | set_error, 22 | mark_complete 23 | ) 24 | 25 | from .edges import ( 26 | should_continue_processing, 27 | route_folder_validation, 28 | route_to_cross_validation, 29 | should_generate_report, 30 | check_core_info_for_cross_validation, 31 | check_pdf_extraction_status 32 | ) 33 | 34 | __all__ = [ 35 | # Workflow functions (优化后的版本,只保留主工作流) 36 | "create_audit_workflow", 37 | "get_default_workflow", 38 | 39 | # State management 40 | "AuditState", 41 | "WorkflowConfig", 42 | "create_initial_state", 43 | "update_state_step", 44 | "add_warning", 45 | "set_error", 46 | "mark_complete", 47 | 48 | # Edge routing functions 49 | "should_continue_processing", 50 | "route_folder_validation", 51 | "route_to_cross_validation", 52 | "should_generate_report", 53 | "check_core_info_for_cross_validation", 54 | "check_pdf_extraction_status" 55 | ] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 基础依赖包 2 | langgraph>=0.2.0 3 | langsmith>=0.1.0 4 | langtrace-python-sdk>=2.0.0 5 | pydantic>=2.0.0 6 | typing-extensions>=4.0.0 7 | langchain-core>=0.1.0 # LangGraph核心依赖 8 | langgraph-cli>=0.1.0 # LangGraph开发工具 9 | langgraph-checkpoint-redis>=0.1.0 # Redis检查点 10 | 11 | # 谷歌AI API(新版本) 12 | google-genai>=1.33.0 13 | 14 | # 环境变量管理 15 | python-dotenv>=1.0.0 16 | 17 | # 文件处理 18 | pathlib2 19 | zipfile36>=0.1.0 20 | python-magic>=0.4.0 21 | Pillow>=10.0.0 22 | 23 | # Web框架 24 | fastapi>=0.104.0 25 | uvicorn>=0.24.0 26 | python-multipart>=0.0.6 27 | sse-starlette>=1.6.0 # Server-Sent Events支持 28 | starlette>=0.27.0 29 | 30 | # 数据处理 31 | pandas>=2.0.0 32 | numpy>=1.24.0 33 | 34 | # HTML报告生成 35 | jinja2>=3.1.0 36 | weasyprint>=60.0 37 | 38 | # 配置管理 39 | python-dotenv>=1.0.0 40 | pyyaml>=6.0 41 | 42 | # 日志和监控 43 | loguru>=0.7.0 44 | prometheus-client>=0.19.0 45 | 46 | # 测试框架 47 | pytest>=7.4.0 48 | pytest-asyncio>=0.21.0 49 | pytest-cov>=4.1.0 50 | 51 | # 开发工具 52 | black>=23.0.0 53 | isort>=5.12.0 54 | flake8>=6.0.0 55 | mypy>=1.6.0 56 | 57 | # OCR和AI 58 | pytesseract>=0.3.10 59 | opencv-python>=4.8.0 60 | 61 | # 数据库 62 | sqlalchemy>=2.0.0 63 | alembic>=1.12.0 64 | 65 | # 异步处理 66 | aiofiles>=23.2.0 67 | celery>=5.3.0 68 | redis>=5.0.0 69 | 70 | # 文档处理 71 | markdown>=3.5.0 72 | markdownify>=0.11.0 73 | 74 | # 工作流状态管理 75 | psycopg>=3.1.0 # PostgreSQL支持 76 | 
asyncpg>=0.29.0 # 异步PostgreSQL -------------------------------------------------------------------------------- /src/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 工具模块导出 3 | 4 | 按功能模块组织的工具函数导出,包括: 5 | - AI模型工具(ai_utils) 6 | - 文件处理工具(file_utils) 7 | - 通用工具(common_utils) 8 | - 工作流集成工具(workflow_integration) 9 | """ 10 | 11 | # AI模型工具 12 | from .ai_utils import ( 13 | extract_core_information_with_ai, 14 | validate_material_with_ai, 15 | cross_validate_materials_with_ai, 16 | extract_category_core_info_with_ai 17 | ) 18 | 19 | # 文件处理工具 20 | from .file_utils import ( 21 | extract_zip_file, 22 | validate_folder_structure, 23 | analyze_markdown_structure, 24 | extract_markdown_content 25 | ) 26 | 27 | # 通用工具 28 | from .common_utils import ( 29 | extract_with_regex, 30 | generate_html_report 31 | ) 32 | 33 | # 工作流集成工具 34 | from .workflow_integration import ( 35 | extract_core_information_from_json, 36 | extract_core_information, 37 | validate_material_rules 38 | ) 39 | 40 | __all__ = [ 41 | # AI模型工具 42 | "extract_core_information_with_ai", 43 | "validate_material_with_ai", 44 | "cross_validate_materials_with_ai", 45 | "extract_category_core_info_with_ai", 46 | 47 | # 文件处理工具 48 | "extract_zip_file", 49 | "validate_folder_structure", 50 | "analyze_markdown_structure", 51 | "extract_markdown_content", 52 | 53 | # 通用工具 54 | "extract_with_regex", 55 | "generate_html_report", 56 | 57 | # 工作流集成工具 58 | "extract_core_information_from_json", 59 | "extract_core_information", 60 | "validate_material_rules" 61 | ] -------------------------------------------------------------------------------- /src/config/warning_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | 警告配置管理 3 | 4 | 统一管理系统中的警告过滤器,特别针对第三方库的弃用警告 5 | """ 6 | 7 | import warnings 8 | import os 9 | 10 | 11 | def setup_warning_filters(): 12 | """ 13 | 设置系统警告过滤器 14 | 15 | 主要针对以下警告进行优化: 16 | 1. pkg_resources弃用警告(来自Marker内部) 17 | 2. 
其他第三方库的不必要警告 18 | """ 19 | 20 | # 抑制pkg_resources弃用警告 21 | # 这个警告来自Marker库内部,用户无法控制 22 | warnings.filterwarnings( 23 | "ignore", 24 | category=DeprecationWarning, 25 | module="pkg_resources" 26 | ) 27 | 28 | # 抑制setuptools相关的pkg_resources警告 29 | warnings.filterwarnings( 30 | "ignore", 31 | message=".*pkg_resources is deprecated.*", 32 | category=UserWarning 33 | ) 34 | 35 | # 抑制其他第三方库的常见警告 36 | warnings.filterwarnings( 37 | "ignore", 38 | category=DeprecationWarning, 39 | module="transformers" 40 | ) 41 | 42 | # 可选:在开发模式下显示所有警告 43 | if os.environ.get("LANGGRAPH_DEBUG", "false").lower() == "true": 44 | warnings.resetwarnings() 45 | warnings.simplefilter("always", DeprecationWarning) 46 | print("🔍 调试模式:显示所有警告信息") 47 | else: 48 | print("✅ 已配置警告过滤器,抑制第三方库不必要的警告") 49 | 50 | 51 | def suppress_marker_warnings(): 52 | """ 53 | 保持兼容性函数(已无作用) 54 | """ 55 | pass 56 | 57 | 58 | def get_warning_env_vars(): 59 | """ 60 | 获取用于抑制警告的环境变量字典 61 | 62 | Returns: 63 | 环境变量字典 64 | """ 65 | return { 66 | "PYTHONWARNINGS": "ignore::DeprecationWarning:pkg_resources", 67 | "TRANSFORMERS_VERBOSITY": "error", # 降低transformers库的输出等级 68 | "TOKENIZERS_PARALLELISM": "false", # 避免tokenizers并发警告 69 | } 70 | 71 | 72 | # 自动在模块导入时设置警告过滤器 73 | if __name__ != "__main__": 74 | setup_warning_filters() -------------------------------------------------------------------------------- /src/nodes/file_processing.py: -------------------------------------------------------------------------------- 1 | """ 2 | ZIP解压节点 3 | 4 | 专门处理ZIP压缩包解压和17个标准文件夹结构验证 5 | """ 6 | 7 | from typing import Dict, Any 8 | from pathlib import Path 9 | from src.graph.state import AuditState 10 | from src.tools import ( 11 | extract_zip_file, 12 | validate_folder_structure 13 | ) 14 | 15 | 16 | async def file_processing_node(state: AuditState) -> Dict[str, Any]: 17 | """ 18 | ZIP解压节点 - 解压ZIP文件并验证17个标准文件夹结构 19 | """ 20 | try: 21 | # 支持两种输入字段名(向后兼容) 22 | zip_path = state.get("uploaded_file") or state.get("zip_file_path") 23 | 24 | if not zip_path: 25 | return { 26 | "current_step": "zip_extraction_failed", 27 | "error_message": "未找到上传的ZIP文件路径" 28 | } 29 | 30 | print(f"📦 开始解压ZIP文件: {Path(zip_path).name}") 31 | 32 | # 解压 ZIP 文件 33 | extraction_result = await extract_zip_file(zip_path) 34 | 35 | if not extraction_result: 36 | return { 37 | "current_step": "zip_extraction_failed", 38 | "error_message": "ZIP文件解压失败" 39 | } 40 | 41 | # 获取解压后的根目录 42 | extraction_path = extraction_result.get("extraction_path") 43 | extracted_files = extraction_result.get("files", []) 44 | 45 | # 检查解压是否成功 46 | if not extraction_path: 47 | return { 48 | "current_step": "zip_extraction_failed", 49 | "error_message": "ZIP文件解压失败,无法获取解压路径" 50 | } 51 | 52 | print(f"📁 ZIP解压完成,提取到: {extraction_path}") 53 | print(f"📊 共解压 {len(extracted_files)} 个文件") 54 | 55 | # 验证17个标准文件夹结构 56 | folder_validation = await validate_folder_structure(extraction_path) 57 | 58 | return { 59 | "extraction_path": extraction_path, 60 | "extracted_files": extracted_files, 61 | "folder_validation": folder_validation, 62 | "current_step": "zip_extraction_completed", 63 | "file_type": "zip" 64 | } 65 | 66 | except Exception as e: 67 | print(f"❌ ZIP解压失败: {str(e)}") 68 | return { 69 | "current_step": "zip_extraction_failed", 70 | "error_message": f"ZIP解压失败: {str(e)}" 71 | } -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # 环境配置示例文件 2 | # 复制为 .env 并填入实际配置值 3 | 4 | # OpenAI API配置 5 | 
OPENAI_API_KEY=your_openai_api_key_here 6 | OPENAI_BASE_URL=https://api.openai.com/v1 7 | OPENAI_MODEL=gpt-4 8 | 9 | # 应用配置 10 | APP_NAME=职称评审材料审核系统 11 | APP_VERSION=1.0.0 12 | DEBUG=True 13 | LOG_LEVEL=INFO 14 | 15 | # 文件存储配置 16 | UPLOAD_DIR=./data/uploads 17 | TEMP_DIR=./data/temp 18 | OUTPUT_DIR=./data/outputs 19 | MAX_FILE_SIZE=104857600 # 100MB 20 | 21 | # 数据库配置 22 | DATABASE_URL=sqlite:///./data/audit_system.db 23 | 24 | # Redis配置(用于任务队列) 25 | REDIS_URL=redis://localhost:6379/0 26 | 27 | # API服务配置 28 | HOST=0.0.0.0 29 | PORT=8000 30 | WORKERS=4 31 | 32 | # AI处理配置 33 | MAX_CONCURRENT_TASKS=10 34 | PDF_MAX_PAGES=100 35 | CHUNK_SIZE=2048 36 | OVERLAP_SIZE=200 37 | 38 | # 规则校验配置 39 | ENABLE_STRICT_MODE=True 40 | AUTO_RETRY_COUNT=3 41 | VALIDATION_TIMEOUT=300 42 | 43 | # 报告生成配置 44 | REPORT_TEMPLATE_DIR=./templates 45 | REPORT_ASSETS_DIR=./assets 46 | ENABLE_PDF_EXPORT=True 47 | 48 | # 安全配置 49 | SECRET_KEY=your_secret_key_here 50 | ACCESS_TOKEN_EXPIRE_MINUTES=30 51 | 52 | # 监控配置 53 | ENABLE_METRICS=True 54 | METRICS_PORT=9090 55 | 56 | # Marker + Gemma AI配置 57 | # 谷歌AI API配置 58 | GOOGLE_API_KEY= 59 | 60 | 61 | 62 | # Marker配置 63 | MARKER_USE_LLM=true 64 | MARKER_OUTPUT_FORMAT=json 65 | MARKER_FORMAT_LINES=true 66 | 67 | # 设备配置 68 | TORCH_DEVICE=cuda 69 | 70 | # Hugging Face配置(如果需要) 71 | HF_TOKEN=your_huggingface_token_here 72 | 73 | # LangSmith配置(用于调试和监控) 74 | LANGSMITH_API_KEY=your_langsmith_api_key_here 75 | LANGCHAIN_TRACING_V2=true 76 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com 77 | LANGCHAIN_PROJECT=Audit_Workflow_Debug 78 | LANGSMITH_TRACING=true 79 | 80 | # LangSmith配置(可选,用于调试和监控) 81 | LANGSMITH_API_KEY=your_langsmith_api_key_here 82 | LANGCHAIN_TRACING_V2=true 83 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com 84 | LANGCHAIN_PROJECT=Audit_Workflow_Debug 85 | LANGSMITH_TRACING=true 86 | 87 | # Google AI API配置 88 | GOOGLE_API_KEY=your_google_api_key_here 89 | GEMINI_MODEL=gemini-2.5-flash # 可选值: gemini-1.5-flash, gemini-2.5-flash, gemini-pro 90 | 91 | # 应用配置 92 | ENVIRONMENT=development 93 | HOST=0.0.0.0 94 | PORT=8000 95 | DEBUG=true 96 | 97 | # Redis配置(用于缓存和任务队列) 98 | REDIS_URL=redis://localhost:6379/0 99 | REDIS_PASSWORD= 100 | REDIS_DB=0 101 | 102 | # PostgreSQL配置(可选,用于持久化存储) 103 | DATABASE_URL=postgresql://user:password@localhost:5432/langgraph_audit 104 | POSTGRES_USER=langgraph_user 105 | POSTGRES_PASSWORD=your_password 106 | POSTGRES_DB=langgraph_audit 107 | POSTGRES_HOST=localhost 108 | POSTGRES_PORT=5432 109 | 110 | # 文件处理配置 111 | MAX_FILE_SIZE=100MB 112 | ALLOWED_FILE_TYPES=.zip,.md,.txt,.pdf 113 | UPLOAD_DIR=./uploads 114 | EXTRACTED_DIR=./extracted 115 | REPORTS_DIR=./reports 116 | 117 | # 缓存配置 118 | CACHE_ENABLED=true 119 | CACHE_TTL=3600 120 | CACHE_MAX_SIZE=1000 121 | 122 | # 日志配置 123 | LOG_LEVEL=INFO 124 | LOG_FORMAT=json 125 | LOG_FILE=./logs/app.log 126 | 127 | # 安全配置 128 | SECRET_KEY=your-secret-key-here 129 | CORS_ORIGINS=* 130 | ALLOWED_HOSTS=localhost,127.0.0.1,0.0.0.0 131 | 132 | # 工作流配置 133 | WORKFLOW_TIMEOUT=300 134 | MAX_RETRIES=3 135 | CONCURRENT_TASKS=5 136 | 137 | # OCR配置 138 | TESSERACT_PATH=/usr/bin/tesseract 139 | TESSERACT_DATA_PATH=/usr/share/tesseract-ocr/4.00/tessdata 140 | 141 | # 开发工具配置 142 | LANGCHAIN_VERBOSE=false 143 | LANGCHAIN_DEBUG=false 144 | -------------------------------------------------------------------------------- /src/tools/workflow_integration.py: -------------------------------------------------------------------------------- 1 | """ 2 | 审核工作流集成工具 3 | 4 | 提供审核系统的核心集成函数,连接各个工具模块 5 | """ 6 | 
7 | from typing import List, Dict, Any 8 | from pathlib import Path 9 | from src.models.state import ValidationResult, CoreInfo 10 | from src.tools import ( 11 | extract_core_information_with_ai, 12 | validate_material_with_ai, 13 | extract_with_regex 14 | ) 15 | 16 | def extract_core_information_from_json(json_extractions: List[Dict[str, Any]]) -> CoreInfo: 17 | """使用Gemma AI从JSON提取结果中智能提取核心信息""" 18 | print("🤖 使用Gemma模型进行智能信息提取...") 19 | 20 | # 整合所有文档内容 21 | combined_content = "" 22 | extracted_from = [] 23 | 24 | for json_extraction in json_extractions: 25 | file_path = json_extraction.get("file_path", "") 26 | content_blocks = json_extraction.get("content_blocks", []) 27 | 28 | for block in content_blocks: 29 | content = block.get("content", "") 30 | if content.strip(): 31 | combined_content += content + "\n" 32 | 33 | if file_path: 34 | extracted_from.append(Path(file_path).name) 35 | 36 | if not combined_content.strip(): 37 | return CoreInfo(name="", id_number="", extracted_from=extracted_from) 38 | 39 | # 使用AI提取,失败时降级到正则表达式 40 | ai_result = extract_core_information_with_ai(combined_content, extracted_from) 41 | 42 | if ai_result: 43 | return CoreInfo( 44 | name=ai_result["name"], 45 | id_number=ai_result["id_number"], 46 | extracted_from=ai_result["extracted_from"] 47 | ) 48 | else: 49 | name, id_number = extract_with_regex(combined_content) 50 | return CoreInfo(name=name, id_number=id_number, extracted_from=extracted_from) 51 | 52 | def extract_core_information(materials: List[Dict[str, Any]]) -> CoreInfo: 53 | """提取核心信息(简化版) - 使用Dict替代MaterialInfo""" 54 | # 将Dict转换为JSON格式进行处理 55 | json_extractions = [] 56 | for material in materials: 57 | json_extraction = { 58 | "file_path": material.get("material_id", ""), 59 | "content_blocks": [{"content": material.get("content", "")}] 60 | } 61 | json_extractions.append(json_extraction) 62 | 63 | return extract_core_information_from_json(json_extractions) 64 | 65 | def validate_material_rules(material: Dict[str, Any]) -> List[ValidationResult]: 66 | """使用Gemma AI进行智能审核 - 使用Dict替代MaterialInfo""" 67 | material_type = material.get("material_type", "") 68 | content = material.get("content", "") 69 | 70 | print(f"🤖 使用Gemma模型审核材料: {material_type}") 71 | 72 | # 使用AI进行智能审核 73 | ai_results = validate_material_with_ai(material_type, content) 74 | 75 | if ai_results: 76 | results = [] 77 | for item in ai_results: 78 | if isinstance(item, dict) and "rule_name" in item: 79 | results.append(ValidationResult( 80 | rule_id=f"GEMMA_{len(results)+1:03d}", 81 | rule_name=item.get("rule_name", "智能审核"), 82 | status=item.get("status", "WARNING"), 83 | message=item.get("message", "审核完成") 84 | )) 85 | return results 86 | else: 87 | # AI失败时返回默认验证结果 88 | return [ValidationResult( 89 | rule_id="FALLBACK_001", 90 | rule_name="默认审核", 91 | status="WARNING", 92 | message="AI审核失败,使用默认审核规则" 93 | )] 94 | 95 | -------------------------------------------------------------------------------- /src/config/api_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | API配置工具 3 | 4 | 用于配置PDF提取API端点和相关参数 5 | """ 6 | 7 | from typing import Dict, Any, Optional 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | # 全局API配置 13 | _api_config = { 14 | "pdf_extraction_endpoint": "http://183.203.184.233:8888/pdf_parse_supplychain", # 用户提供的实际端点 15 | "timeout": 60, 16 | "max_file_size": 20 * 1024 * 1024, # 20MB 17 | "supported_formats": [".pdf"] 18 | } 19 | 20 | 21 | def configure_pdf_api(endpoint: str, timeout: int = 60, 
max_file_size: int = 20 * 1024 * 1024) -> None: 22 | """ 23 | 配置PDF提取API 24 | 25 | Args: 26 | endpoint: API端点URL 27 | timeout: 超时时间(秒) 28 | max_file_size: 最大文件大小(字节) 29 | """ 30 | global _api_config 31 | 32 | _api_config.update({ 33 | "pdf_extraction_endpoint": endpoint, 34 | "timeout": timeout, 35 | "max_file_size": max_file_size 36 | }) 37 | 38 | logger.info(f"PDF API已配置: {endpoint}") 39 | print(f"✅ PDF提取API已配置: {endpoint}") 40 | 41 | 42 | def get_pdf_api_config() -> Dict[str, Any]: 43 | """ 44 | 获取当前PDF API配置 45 | 46 | Returns: 47 | API配置字典 48 | """ 49 | return _api_config.copy() 50 | 51 | 52 | def is_pdf_api_configured() -> bool: 53 | """ 54 | 检查PDF API是否已配置 55 | 56 | Returns: 57 | 是否已配置 58 | """ 59 | return _api_config.get("pdf_extraction_endpoint") is not None 60 | 61 | 62 | async def validate_pdf_file(file_path: str) -> Dict[str, Any]: 63 | """ 64 | 验证PDF文件是否符合要求 65 | 66 | Args: 67 | file_path: PDF文件路径 68 | 69 | Returns: 70 | 验证结果 71 | """ 72 | import os 73 | from pathlib import Path 74 | 75 | try: 76 | import asyncio 77 | from pathlib import Path 78 | file_path_obj = Path(file_path) 79 | 80 | # 使用异步方式检查文件是否存在 81 | file_exists = await asyncio.to_thread(file_path_obj.exists) 82 | if not file_exists: 83 | return { 84 | "valid": False, 85 | "error": "文件不存在" 86 | } 87 | 88 | # 检查文件扩展名 89 | if file_path_obj.suffix.lower() not in _api_config["supported_formats"]: 90 | return { 91 | "valid": False, 92 | "error": f"不支持的文件格式: {file_path_obj.suffix}" 93 | } 94 | 95 | # 使用异步方式检查文件大小 96 | file_stat = await asyncio.to_thread(file_path_obj.stat) 97 | file_size = file_stat.st_size 98 | if file_size > _api_config["max_file_size"]: 99 | return { 100 | "valid": False, 101 | "error": f"文件过大: {file_size} > {_api_config['max_file_size']}" 102 | } 103 | 104 | return { 105 | "valid": True, 106 | "file_size": file_size, 107 | "format": file_path_obj.suffix.lower() 108 | } 109 | 110 | except Exception as e: 111 | return { 112 | "valid": False, 113 | "error": f"文件验证失败: {str(e)}" 114 | } 115 | 116 | 117 | def create_pdf_api_headers() -> Dict[str, str]: 118 | """ 119 | 创建PDF API请求头(基于提供的API示例) 120 | 121 | Returns: 122 | 请求头字典 123 | """ 124 | return { 125 | "accept": "application/json", # 与示例一致 126 | "User-Agent": "LangGraph-PDF-Extractor/1.0" 127 | # Content-Type 会由 aiohttp 自动设置为 multipart/form-data 128 | } 129 | 130 | 131 | def get_pdf_api_params() -> Dict[str, str]: 132 | """ 133 | 获取PDF API查询参数(基于提供的API示例) 134 | 135 | Returns: 136 | API查询参数字典 137 | """ 138 | return { 139 | "parse_method": "auto", 140 | "is_json_md_dump": "false", 141 | "output_dir": "output", 142 | "return_layout": "false", 143 | "return_info": "false", 144 | "return_content_list": "false", 145 | "return_images": "false" 146 | } 147 | 148 | 149 | def build_pdf_api_url(base_endpoint: str, custom_params: Optional[Dict[str, str]] = None) -> str: 150 | """ 151 | 构建完整的PDF API URL 152 | 153 | Args: 154 | base_endpoint: 基础端点URL(不包含查询参数) 155 | custom_params: 自定义参数(可选) 156 | 157 | Returns: 158 | 完整的API URL 159 | """ 160 | params = get_pdf_api_params() 161 | 162 | # 如果有自定义参数,覆盖默认参数 163 | if custom_params: 164 | params.update(custom_params) 165 | 166 | # 构建查询字符串 167 | query_string = "&".join([f"{k}={v}" for k, v in params.items()]) 168 | 169 | # 处理base_endpoint是否已经包含查询参数 170 | separator = "&" if "?" in base_endpoint else "?" 
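# 即:当 base_endpoint 已包含查询参数(带 "?")时用 "&" 续接,否则以 "?" 开始查询串,避免拼出 "??" 形式的非法URL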
171 | 172 | return f"{base_endpoint}{separator}{query_string}" -------------------------------------------------------------------------------- /src/nodes/cross_validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | 交叉校验节点 3 | 4 | 对核心信息进行交叉校验: 5 | 1. 姓名一致性校验 6 | 2. 身份证一致性校验 7 | 3. 基于rules文件夹中的交叉检验规则 8 | """ 9 | 10 | from typing import Dict, Any 11 | from src.graph.state import AuditState 12 | from src.tools.ai_utils import cross_validate_materials_with_ai 13 | 14 | 15 | def cross_validation_node(state: AuditState) -> Dict[str, Any]: 16 | """ 17 | 完全无缓存的交叉校验节点 - 每次都处理全新数据 18 | 19 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 20 | """ 21 | try: 22 | print(f"🔍 开始无缓存交叉校验节点...") 23 | 24 | # 🔍 获取核心信息(优先使用核心信息提取节点的结果) 25 | core_info = state.get("core_info") 26 | all_extracted_info = state.get("api_extraction_results", {}) or state.get("extracted_content", {}) 27 | current_step = state.get("current_step", "未知") 28 | 29 | print(f"🔍 当前状态详细信息:") 30 | print(f" 当前步骤: {current_step}") 31 | print(f" 核心信息状态: {'有效' if core_info else '无'}") 32 | print(f" 提取材料数量: {len(all_extracted_info)}") 33 | 34 | # 🚨 优先检查核心信息提取节点的结果 35 | if not core_info: 36 | print(f"⚠️ 没有找到核心信息,检查核心信息提取节点是否正常执行") 37 | raise Exception("未找到任何核心信息用于交叉校验") 38 | 39 | # 🔍 验证核心信息的数据结构 40 | if not isinstance(core_info, dict): 41 | print(f"⚠️ 核心信息格式不正确: {type(core_info)}") 42 | # 尝试转换为字典格式 43 | if hasattr(core_info, 'name') and hasattr(core_info, 'id_number'): 44 | core_info = { 45 | "attachments": { 46 | "name": getattr(core_info, 'name', ''), 47 | "id_number": getattr(core_info, 'id_number', ''), 48 | "extracted_from": getattr(core_info, 'extracted_from', []) 49 | } 50 | } 51 | else: 52 | raise Exception(f"核心信息格式不可识别: {type(core_info)}") 53 | 54 | # 🔍 统计有效的核心信息条目 55 | valid_entries = 0 56 | name_sources = [] 57 | id_sources = [] 58 | 59 | for category, info in core_info.items(): 60 | if isinstance(info, dict) and (info.get('name') or info.get('id_number')): 61 | valid_entries += 1 62 | if info.get('name'): 63 | name_sources.append(f"{category}: {info['name']}") 64 | if info.get('id_number'): 65 | id_sources.append(f"{category}: {info['id_number']}") 66 | 67 | print(f"📋 有效核心信息条目: {valid_entries}") 68 | print(f"📋 姓名信息来源: {len(name_sources)} 项") 69 | print(f"📋 身份证信息来源: {len(id_sources)} 项") 70 | 71 | if valid_entries == 0: 72 | print(f"⚠️ 所有核心信息条目都为空,无法进行交叉校验") 73 | raise Exception("所有核心信息条目都为空,无法进行交叉校验") 74 | 75 | # 🚨 直接执行交叉验证 - 不使用缓存,使用核心信息提取节点的结果 76 | cross_validation_results = cross_validate_materials_with_ai(all_extracted_info, core_info) 77 | 78 | # 直接转换AI结果为标准格式 - 不存入缓存 79 | converted_results = [] 80 | for ai_result in cross_validation_results: 81 | status = ai_result.get('status', 'WARNING') 82 | if status == 'PASS' or '✅' in status: 83 | result_status = '✅通过' 84 | elif status == 'WARNING' or '⚠️' in status: 85 | result_status = '⚠️警告' 86 | elif status == 'ERROR' or '❌' in status: 87 | result_status = '❌不通过' 88 | else: 89 | result_status = '⚠️警告' 90 | 91 | converted_result = { 92 | "rule_name": ai_result.get('rule_name', '未知规则'), 93 | "result": result_status, 94 | "details": ai_result.get('message', 'AI交叉校验完成'), 95 | "priority": ai_result.get('priority', '极高'), 96 | "material_type": "AI交叉校验", 97 | "rule_content": ai_result.get('rule_content', ''), 98 | "timestamp": _get_current_timestamp() 99 | } 100 | converted_results.append(converted_result) 101 | 102 | # 🚨 直接返回结果,不使用任何缓存机制 103 | print(f"✅ 无缓存交叉校验完成,生成{len(converted_results)}项结果") 104 | 105 | return { 106 | "cross_validation": 
converted_results, 107 | "current_step": "cross_validation_completed", 108 | "processing_logs": [ 109 | f"交叉校验完成,生成{len(converted_results)}项结果", 110 | f"基于{valid_entries}项有效核心信息进行校验", 111 | "已完全取消缓存机制,确保数据全新" 112 | ] 113 | } 114 | 115 | except Exception as e: 116 | print(f"❌ 交叉校验失败: {str(e)}") 117 | return { 118 | "current_step": "cross_validation_failed", 119 | "error_message": f"交叉校验失败: {str(e)}", 120 | "processing_logs": [f"交叉校验失败: {str(e)}"] 121 | } 122 | 123 | 124 | def _get_current_timestamp() -> str: 125 | """获取当前时间戳""" 126 | from datetime import datetime 127 | return datetime.now().isoformat() -------------------------------------------------------------------------------- /src/nodes/report_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | 报告生成节点 - 完全无缓存版本 3 | 4 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 5 | """ 6 | 7 | from typing import Dict, Any 8 | from src.graph.state import AuditState 9 | 10 | 11 | def report_generation_node(state: AuditState) -> Dict[str, Any]: 12 | """ 13 | 完全无缓存的报告生成节点 - 每次都处理全新数据 14 | 15 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 16 | """ 17 | try: 18 | print(f"📄 开始无缓存报告生成...") 19 | 20 | # 生成报告ID 21 | from datetime import datetime 22 | import uuid 23 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 24 | report_id = f"AUDIT_{timestamp}_{str(uuid.uuid4())[:8].upper()}" 25 | 26 | # 直接获取当前状态的所有数据 - 不使用任何缓存 27 | material_validation = state.get("material_validation", {}) 28 | cross_validation = state.get("cross_validation", []) 29 | 30 | print(f"🔍 当前状态数据:") 31 | print(f" 材料校验结果: {len(material_validation)} 项") 32 | print(f" 交叉校验结果: {len(cross_validation)} 项") 33 | 34 | # 直接整合所有数据 - 不做缓存检查 35 | all_results = [] 36 | 37 | # 整合material_validation数据 38 | for material_type, results in material_validation.items(): 39 | if isinstance(results, list): 40 | all_results.extend(results) 41 | elif results: 42 | all_results.append(results) 43 | 44 | # 整合cross_validation数据 45 | if isinstance(cross_validation, list): 46 | all_results.extend(cross_validation) 47 | 48 | if not all_results: 49 | print("⚠️ 未找到任何校验结果,生成空报告") 50 | 51 | print(f"📊 报告数据统计: 共{len(all_results)}项结果") 52 | 53 | # 直接生成HTML报告 - 不使用缓存的复杂逻辑 54 | html_report = _generate_html_report(all_results, report_id) 55 | 56 | # 保存报告文件 57 | report_path = f"audit_report_{timestamp}.html" 58 | 59 | if report_path and html_report: 60 | with open(report_path, 'w', encoding='utf-8') as f: 61 | f.write(html_report) 62 | 63 | print(f"✅ 报告已生成: {report_path}") 64 | else: 65 | raise Exception("报告路径或内容为空") 66 | 67 | return { 68 | "audit_report": html_report, 69 | "report_path": report_path, 70 | "current_step": "completed", 71 | "is_complete": True, 72 | "processing_logs": [ 73 | f"报告生成完成: {report_id}", 74 | f"处理了{len(all_results)}项结果", 75 | "已完全取消缓存机制,确保数据全新", 76 | f"报告已保存至: {report_path}" 77 | ] 78 | } 79 | 80 | except Exception as e: 81 | print(f"❌ 报告生成失败: {str(e)}") 82 | return { 83 | "current_step": "report_generation_failed", 84 | "error_message": f"报告生成失败: {str(e)}" 85 | } 86 | 87 | 88 | def _generate_html_report(all_results: list, report_id: str) -> str: 89 | """ 90 | 生成简化的HTML报告 - 完全无缓存机制 91 | """ 92 | from datetime import datetime 93 | 94 | print(f"📊 报告生成使用数据,共{len(all_results)}项结果") 95 | 96 | # 按材料类型分组 97 | material_groups = {} 98 | for result in all_results: 99 | material_type = result.get('material_type', '未知类型') 100 | if material_type not in material_groups: 101 | material_groups[material_type] = [] 102 | material_groups[material_type].append(result) 103 | 104 | # 统计数据 105 | 
error_count = sum(1 for r in all_results if r.get('result', '').startswith('❌')) 106 | warning_count = sum(1 for r in all_results if r.get('result', '').startswith('⚠️')) 107 | pass_count = sum(1 for r in all_results if r.get('result', '').startswith('✅')) 108 | total_validations = len(all_results) 109 | 110 | print(f"📊 统计: 错误{error_count}, 警告{warning_count}, 通过{pass_count}") 111 | 112 | # 生成基本的HTML报告结构 113 | html_template = f"""
114 | <!DOCTYPE html>
115 | <html lang="zh-CN">
116 | <head>
117 | <meta charset="UTF-8">
118 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
119 | <title>职称评审材料审核报告 - {report_id}</title>
120 | <style>
121 | body {{ font-family: "Microsoft YaHei", Arial, sans-serif; margin: 24px; color: #333; }}
122 | h1 {{ border-bottom: 2px solid #4a90d9; padding-bottom: 8px; }}
123 | .summary {{ display: flex; gap: 12px; margin: 16px 0; }}
124 | .summary-item {{ flex: 1; padding: 12px; background: #f7f9fc; border-radius: 6px; text-align: center; }}
125 | .details {{ margin-top: 24px; }}
126 | .material-group {{ margin: 16px 0; padding: 8px 12px; border: 1px solid #e0e0e0; border-radius: 6px; }}
127 | .result-item {{ padding: 8px; margin: 6px 0; border-left: 4px solid #ccc; }}
128 | .result-item.error {{ border-left-color: #c62828; }}
129 | .result-item.warning {{ border-left-color: #f9a825; }}
130 | .result-item.pass {{ border-left-color: #2e7d32; }}
131 | .result-item p {{ margin: 4px 0 0; color: #666; }}
132 | </style>
133 | </head>
134 | <body>
135 | <h1>职称评审材料审核报告</h1>
136 | <p>报告ID: {report_id}</p>
137 | <p>生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
138 |
139 | <div class="summary">
140 |
141 | <div class="summary-item">
142 | <h3>总计</h3>
143 | <p>{len(all_results)} 项检查</p>
144 | </div>
145 | <div class="summary-item">
146 | <h3>错误</h3>
147 | <p>{error_count} 项</p>
148 | </div>
149 | <div class="summary-item">
150 | <h3>警告</h3>
151 | <p>{warning_count} 项</p>
152 | </div>
153 | <div class="summary-item">
154 | <h3>通过</h3>
155 | <p>{pass_count} 项</p>
156 | </div>
157 | </div>
158 |
159 | <div class="details">
160 | <h2>详细结果</h2>
161 | """ 162 | 163 | # 添加材料组详情 164 | for material_type, results in material_groups.items(): 165 | html_template += f"""
166 | <div class="material-group">
167 | <h3>{material_type} ({len(results)} 项)</h3>
168 | """ 169 | for result in results[:10]: # 限制显示数量 170 | result_class = "error" if result.get('result', '').startswith('❌') else "warning" if result.get('result', '').startswith('⚠️') else "pass" 171 | html_template += f"""
172 | <div class="result-item {result_class}">
173 | <strong>{result.get('rule_name', '未知规则')}</strong>: {result.get('result', '未知')}
174 | <p>{result.get('details', '无详情')}</p>
175 | </div>
176 | """ 177 | html_template += "</div>" 178 | 179 | html_template += """
180 | </div>
181 | </body>
182 | </html>
183 | """ 184 | 185 | return html_template -------------------------------------------------------------------------------- /src/config/model_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | 配置管理器 3 | 4 | 用于管理OCR API配置及环境变量 5 | """ 6 | 7 | import os 8 | from pathlib import Path 9 | from typing import Dict, Optional 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class ModelConfig: 15 | """配置管理器""" 16 | 17 | def __init__(self): 18 | self.project_root = Path(__file__).parent.parent.parent 19 | self.cache_dir = self.project_root / ".model_cache" 20 | 21 | # 智能初始化:检查是否在异步环境中 22 | try: 23 | import asyncio 24 | # 尝试获取当前任务,如果成功说明在异步环境中 25 | asyncio.current_task() 26 | logger.info("🔄 检测到异步环境,将延迟创建缓存目录") 27 | except RuntimeError: 28 | # 不在异步环境中,可以安全创建目录 29 | self.setup_cache_directories_sync() 30 | except Exception: 31 | # 如果检测失败,使用同步方式(向后兼容) 32 | self.setup_cache_directories_sync() 33 | 34 | async def setup_cache_directories(self): 35 | """设置缓存目录(异步版本)""" 36 | try: 37 | import asyncio 38 | # 使用异步方式创建目录 39 | await asyncio.to_thread(self.cache_dir.mkdir, parents=True, exist_ok=True) 40 | logger.info(f"📁 缓存目录: {self.cache_dir}") 41 | 42 | except Exception as e: 43 | logger.error(f"❌ 缓存目录设置失败: {e}") 44 | 45 | def setup_cache_directories_sync(self): 46 | """设置缓存目录(同步版本,仅用于初始化)""" 47 | try: 48 | # 创建本地缓存目录 49 | self.cache_dir.mkdir(parents=True, exist_ok=True) 50 | logger.info(f"📁 缓存目录: {self.cache_dir}") 51 | 52 | except Exception as e: 53 | logger.error(f"❌ 缓存目录设置失败: {e}") 54 | 55 | async def is_models_cached(self) -> bool: 56 | """检查缓存是否存在(异步版本)""" 57 | import asyncio 58 | return await asyncio.to_thread(self.cache_dir.exists) 59 | 60 | async def get_cache_size(self) -> str: 61 | """获取缓存目录大小(异步版本)""" 62 | try: 63 | import asyncio 64 | total_size = 0 65 | 66 | # 使用异步方式遍历文件 67 | async def calculate_size(): 68 | nonlocal total_size 69 | paths = await asyncio.to_thread(list, self.cache_dir.rglob("*")) 70 | for path in paths: 71 | is_file = await asyncio.to_thread(path.is_file) 72 | if is_file: 73 | stat_result = await asyncio.to_thread(path.stat) 74 | total_size += stat_result.st_size 75 | 76 | await calculate_size() 77 | 78 | # 转换为可读格式 79 | if total_size < 1024: 80 | return f"{total_size} B" 81 | elif total_size < 1024**2: 82 | return f"{total_size/1024:.1f} KB" 83 | elif total_size < 1024**3: 84 | return f"{total_size/1024**2:.1f} MB" 85 | else: 86 | return f"{total_size/1024**3:.1f} GB" 87 | 88 | except Exception as e: 89 | logger.error(f"❌ 获取缓存大小失败: {e}") 90 | return "未知" 91 | 92 | async def clear_cache(self): 93 | """清理缓存(异步版本)""" 94 | try: 95 | import shutil 96 | import asyncio 97 | if self.cache_dir.exists(): 98 | await asyncio.to_thread(shutil.rmtree, self.cache_dir) 99 | logger.info("🧹 缓存已清理") 100 | await self.setup_cache_directories() 101 | except Exception as e: 102 | logger.error(f"❌ 清理缓存失败: {e}") 103 | 104 | def clear_cache_sync(self): 105 | """清理缓存(同步版本)""" 106 | try: 107 | import shutil 108 | if self.cache_dir.exists(): 109 | shutil.rmtree(self.cache_dir) 110 | logger.info("🧹 缓存已清理") 111 | self.setup_cache_directories_sync() 112 | except Exception as e: 113 | logger.error(f"❌ 清理缓存失败: {e}") 114 | 115 | async def get_status(self) -> Dict[str, str]: 116 | """获取配置状态(异步版本)""" 117 | cache_size = await self.get_cache_size() 118 | return { 119 | "cache_dir": str(self.cache_dir), 120 | "cache_size": cache_size, 121 | "ocr_api_enabled": "启用", 122 | } 123 | 124 | def get_status_sync(self) -> Dict[str, str]: 
125 | """获取配置状态(同步版本)""" 126 | try: 127 | total_size = 0 128 | if self.cache_dir.exists(): 129 | for path in self.cache_dir.rglob("*"): 130 | if path.is_file(): 131 | total_size += path.stat().st_size 132 | 133 | # 转换为可读格式 134 | if total_size < 1024: 135 | cache_size = f"{total_size} B" 136 | elif total_size < 1024**2: 137 | cache_size = f"{total_size/1024:.1f} KB" 138 | elif total_size < 1024**3: 139 | cache_size = f"{total_size/1024**2:.1f} MB" 140 | else: 141 | cache_size = f"{total_size/1024**3:.1f} GB" 142 | except Exception as e: 143 | logger.error(f"❌ 获取缓存大小失败: {e}") 144 | cache_size = "未知" 145 | 146 | return { 147 | "cache_dir": str(self.cache_dir), 148 | "cache_size": cache_size, 149 | "ocr_api_enabled": "启用", 150 | } 151 | 152 | 153 | # 全局配置实例 154 | model_config = ModelConfig() 155 | 156 | 157 | async def setup_model_environment(): 158 | """设置环境(在应用启动时调用,异步版本)""" 159 | logger.info("🔧 正在设置环境...") 160 | 161 | # 设置缓存目录 162 | await model_config.setup_cache_directories() 163 | 164 | # 打印状态信息 165 | status = await model_config.get_status() 166 | logger.info("📊 配置状态:") 167 | for key, value in status.items(): 168 | logger.info(f" {key}: {value}") 169 | 170 | def setup_model_environment_sync(): 171 | """设置环境(同步版本)""" 172 | logger.info("🔧 正在设置环境...") 173 | 174 | # 设置缓存目录 175 | model_config.setup_cache_directories_sync() 176 | 177 | # 打印状态信息 178 | status = model_config.get_status_sync() 179 | logger.info("📊 配置状态:") 180 | for key, value in status.items(): 181 | logger.info(f" {key}: {value}") 182 | 183 | 184 | def print_model_help(): 185 | """打印配置帮助信息""" 186 | help_text = """ 187 | 🔧 OCR API配置选项: 188 | 189 | 环境变量设置: 190 | OCR_API_BASE_URL=http://183.203.184.233:8888 # OCR API地址 191 | 192 | 使用说明: 193 | 1. 启动OCR API服务 194 | 确保您的OCR API服务正在运行 195 | 默认地址: http://183.203.184.233:8888 196 | 197 | 2. 
启动主应用 198 | python web_app_v2.py 199 | 200 | 缓存位置: {cache_dir} 201 | """.format(cache_dir=model_config.cache_dir) 202 | 203 | print(help_text) 204 | 205 | 206 | if __name__ == "__main__": 207 | print_model_help() -------------------------------------------------------------------------------- /src/tools/common_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 通用工具 3 | 4 | 提供通用的工具函数: 5 | - 正则表达式提取 6 | - 数据清理和验证 7 | - HTML报告生成 8 | - 日志记录 9 | """ 10 | 11 | import re 12 | from typing import Dict, Any, List, Optional, Union 13 | from pathlib import Path 14 | from src.models.state import CoreInfo, ValidationResult as StateValidationResult 15 | 16 | def extract_with_regex(content: str) -> tuple[str, str]: 17 | """使用正则表达式的备用提取方法(增强版)""" 18 | name = "" 19 | id_number = "" 20 | 21 | # 提取姓名(多种格式匹配) 22 | name_patterns = [ 23 | r"姓[名]*[::]\s*([^\s\n\r\t]+)", # 姓名: 24 | r"申请人[::]\s*([^\s\n\r\t]+)", # 申请人: 25 | r"姓[\s]*名[\s]*[::]\s*([^\s\n\r\t]+)", # 姓 名: 26 | r"^([\u4e00-\u9fff]{2,4})[\s]*[男女]", # 中文姓名后面跟性别 27 | ] 28 | 29 | for pattern in name_patterns: 30 | name_match = re.search(pattern, content, re.MULTILINE) 31 | if name_match: 32 | potential_name = name_match.group(1).strip() 33 | # 验证姓名的合理性(中文字符2-4个字) 34 | if re.match(r'^[\u4e00-\u9fff]{2,4}$', potential_name): 35 | name = potential_name 36 | break 37 | 38 | # 提取身份证号(多种格式匹配) 39 | id_patterns = [ 40 | r"身份证[号码]*[::]\s*(\d{17}[\dX])", # 身份证号: 41 | r"公民身份号码[::]\s*(\d{17}[\dX])", # 公民身份号码: 42 | r"ID[\s]*Number[\s]*[::]\s*(\d{17}[\dX])", # ID Number: 43 | r"(\d{17}[\dX])(?![\d])", # 直接匹配18位数字(排除更长数字) 44 | ] 45 | 46 | for pattern in id_patterns: 47 | id_match = re.search(pattern, content) 48 | if id_match: 49 | potential_id = id_match.group(1) 50 | # 验证身份证号格式 51 | if re.match(r'^\d{17}[\dX]$', potential_id): 52 | id_number = potential_id 53 | break 54 | 55 | if name or id_number: 56 | print(f"✅ 正则提取成功: 姓名='{name}', 身份证='{id_number}'") 57 | else: 58 | print("⚠️ 正则提取未找到有效信息") 59 | 60 | return name, id_number 61 | 62 | def generate_html_report(core_info: Optional[Union[CoreInfo, Dict[str, Any]]], validation_results: List[Any]) -> str: 63 | """生成HTML格式化报告""" 64 | # 处理core_info为None的情况 65 | if core_info is None: 66 | name = '未提取' 67 | id_number = '未提取' 68 | extracted_from = [] 69 | else: 70 | # 支持CoreInfo对象和Dict两种类型 71 | if isinstance(core_info, dict): 72 | name = core_info.get('name', '') or '未提取' 73 | id_number = core_info.get('id_number', '') or '未提取' 74 | extracted_from = core_info.get('extracted_from', []) or [] 75 | else: 76 | # CoreInfo对象 77 | name = getattr(core_info, 'name', None) or '未提取' 78 | id_number = getattr(core_info, 'id_number', None) or '未提取' 79 | extracted_from = getattr(core_info, 'extracted_from', []) or [] 80 | 81 | html_template = f"""
82 | <!DOCTYPE html>
83 | <html lang="zh-CN">
84 | <head>
85 | <meta charset="UTF-8">
86 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
87 | <title>职称评审材料审核报告</title>
88 | <style>
89 | body {{ font-family: "Microsoft YaHei", Arial, sans-serif; margin: 24px; color: #333; }}
90 | .container {{ max-width: 900px; margin: 0 auto; }}
91 | h1 {{ border-bottom: 2px solid #4a90d9; padding-bottom: 8px; }}
92 | .timestamp {{ color: #888; font-size: 14px; }}
93 | .section {{ margin-top: 24px; }}
94 | .info-grid {{ display: grid; grid-template-columns: 1fr 1fr; gap: 8px; }}
95 | .info-item {{ padding: 8px; background: #f7f9fc; border-radius: 4px; }}
96 | .result-item {{ padding: 10px; margin: 8px 0; border-left: 4px solid #ccc; }}
97 | .result-pass {{ border-left-color: #2e7d32; }}
98 | .result-warning {{ border-left-color: #f9a825; }}
99 | .result-error {{ border-left-color: #c62828; }}
100 | .badge {{ padding: 2px 8px; border-radius: 10px; font-size: 12px; }}
101 | .badge-pass {{ background: #e8f5e9; color: #2e7d32; }}
102 | .badge-warning {{ background: #fff8e1; color: #f9a825; }}
103 | .badge-error {{ background: #ffebee; color: #c62828; }}
104 | .badge-unknown {{ background: #eeeeee; color: #666; }}
105 | .no-results {{ color: #888; text-align: center; padding: 16px; }}
106 | </style>
107 | </head>
108 | <body>
109 | <h1>📄 职称评审材料审核报告</h1>
110 | <p class="timestamp">生成时间:</p>
111 | <hr>
112 | <div class="section">
113 | <h2>👤 核心信息</h2>
114 | <div class="info-grid">
115 | <div class="info-item">
116 | 姓名: {name}
117 | </div>
118 | <div class="info-item">
119 | 身份证号: {id_number}
120 | </div>
121 | <div class="info-item">
122 | 信息来源: {', '.join(extracted_from) if extracted_from else '无'}
123 | </div>
124 | </div>
125 | </div>
126 | <div class="section">
127 | <h2>✅ 审核结果</h2>
128 |
129 | """ 130 | 131 | if validation_results: 132 | for result in validation_results: 133 | # 处理不同的ValidationResult类型 134 | # 支持既有status属性,也支持result属性 135 | status = getattr(result, 'status', None) or getattr(result, 'result', 'UNKNOWN') 136 | rule_name = getattr(result, 'rule_name', '未知规则') 137 | message = getattr(result, 'message', None) or getattr(result, 'details', '无详细信息') 138 | 139 | # 统一处理status格式 140 | if '✅' in status or status == 'PASS': 141 | status_normalized = 'pass' 142 | status_display = '✅通过' 143 | elif '⚠️' in status or status == 'WARNING': 144 | status_normalized = 'warning' 145 | status_display = '⚠️警告' 146 | elif '❌' in status or status == 'ERROR': 147 | status_normalized = 'error' 148 | status_display = '❌不通过' 149 | else: 150 | status_normalized = 'unknown' 151 | status_display = status 152 | 153 | status_class = f"result-{status_normalized}" 154 | badge_class = f"badge-{status_normalized}" 155 | 156 | html_template += f"""
157 | <div class="result-item {status_class}">
158 | <strong>{rule_name}</strong>
159 | <span class="badge {badge_class}">{status_display}</span>
160 | <p>{message}</p>
161 | </div>
162 | """ 163 | else: 164 | html_template += "<p class='no-results'>无审核结果</p>" 165 | 166 | html_template += """
</div></body></html>
167 | 168 | 171 | 172 | """ 173 | 174 | return html_template -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "langgraph-audit-system" 7 | version = "0.1.0" 8 | description = "An enterprise-level intelligent title evaluation material review system built on the LangGraph framework" 9 | authors = [ 10 | {name = "LangGraph Audit Team", email = "team@langgraph-audit.com"} 11 | ] 12 | license = {text = "MIT"} 13 | readme = "README.md" 14 | requires-python = ">=3.10,<3.13" 15 | keywords = ["langgraph", "audit", "ai", "workflow", "title-evaluation"] 16 | classifiers = [ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Developers", 19 | "License :: OSI Approved :: MIT License", 20 | "Programming Language :: Python :: 3", 21 | "Programming Language :: Python :: 3.10", 22 | "Programming Language :: Python :: 3.11", 23 | "Programming Language :: Python :: 3.12", 24 | "Topic :: Software Development :: Libraries :: Python Modules", 25 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 26 | ] 27 | 28 | # Core dependencies for production 29 | dependencies = [ 30 | # LangGraph core dependencies 31 | "langgraph>=0.2.0", 32 | "langsmith>=0.1.0", 33 | "langtrace-python-sdk>=2.0.0", 34 | "pydantic>=2.0.0", 35 | "typing-extensions>=4.0.0", 36 | "langchain-core>=0.1.0", # LangGraph core dependency 37 | "langgraph-cli>=0.1.0", # LangGraph development tools 38 | 39 | # LangGraph Redis integration 40 | "langgraph-checkpoint-redis>=0.1.0", 41 | 42 | # PostgreSQL integration 43 | "psycopg[binary,pool]>=3.1.0", 44 | "asyncpg>=0.29.0", 45 | 46 | # AI API integration 47 | "google-generativeai>=0.3.0", 48 | 49 | # Environment and configuration 50 | "python-dotenv>=1.0.0", 51 | "pyyaml>=6.0", 52 | 53 | # File processing utilities 54 | "pathlib2", 55 | "zipfile36>=0.1.0", 56 | "python-magic>=0.4.0", 57 | "Pillow>=10.0.0", 58 | 59 | # Web framework 60 | "fastapi>=0.104.0", 61 | "uvicorn>=0.24.0", 62 | "python-multipart>=0.0.6", 63 | "sse-starlette>=1.6.0", # Server-Sent Events support 64 | "starlette>=0.27.0", 65 | 66 | # Data processing 67 | "pandas>=2.0.0", 68 | "numpy>=1.24.0", 69 | 70 | # HTML report generation 71 | "jinja2>=3.1.0", 72 | "weasyprint>=60.0", 73 | 74 | # Monitoring and logging 75 | "loguru>=0.7.0", 76 | "prometheus-client>=0.19.0", 77 | 78 | # OCR and computer vision 79 | "pytesseract>=0.3.10", 80 | "opencv-python>=4.8.0", 81 | 82 | # Database support 83 | "sqlalchemy>=2.0.0", 84 | "alembic>=1.12.0", 85 | 86 | # Async processing 87 | "aiofiles>=23.2.0", 88 | "aiohttp>=3.8.0", 89 | "celery>=5.3.0", 90 | "redis>=5.0.0", 91 | 92 | # Document processing 93 | "markdown>=3.5.0", 94 | "markdownify>=0.11.0", 95 | ] 96 | 97 | [project.optional-dependencies] 98 | # Development dependencies 99 | dev = [ 100 | "pytest>=7.4.0", 101 | "pytest-asyncio>=0.21.0", 102 | "pytest-cov>=4.1.0", 103 | "black>=23.0.0", 104 | "isort>=5.12.0", 105 | "flake8>=6.0.0", 106 | "mypy>=1.6.0", 107 | ] 108 | 109 | # Testing dependencies 110 | test = [ 111 | "pytest>=7.4.0", 112 | "pytest-asyncio>=0.21.0", 113 | "pytest-cov>=4.1.0", 114 | "pytest-mock>=3.11.0", 115 | "httpx>=0.25.0", # for testing FastAPI endpoints 116 | ] 117 | 118 | # Documentation dependencies 119 | docs = [ 120 | "mkdocs>=1.5.0", 121 | "mkdocs-material>=9.4.0", 122 | "mkdocstrings[python]>=0.23.0", 
123 | ] 124 | 125 | # Full development environment 126 | all = [ 127 | "langgraph-audit-system[dev,test,docs]" 128 | ] 129 | 130 | [project.urls] 131 | Homepage = "https://github.com/your-org/langgraph-audit-system" 132 | Documentation = "https://your-org.github.io/langgraph-audit-system" 133 | Repository = "https://github.com/your-org/langgraph-audit-system.git" 134 | Issues = "https://github.com/your-org/langgraph-audit-system/issues" 135 | 136 | [project.scripts] 137 | # Command line entry points 138 | langgraph-audit = "src.agent:main" 139 | audit-debug = "debug_langsmith:main" 140 | check-health = "check_health:main" 141 | 142 | [tool.hatch.build.targets.wheel] 143 | packages = ["src"] 144 | 145 | [tool.hatch.build.targets.sdist] 146 | include = [ 147 | "src/", 148 | "rules/", 149 | "test_data/", 150 | "README.md", 151 | "langgraph.json", 152 | "pyproject.toml", 153 | ] 154 | exclude = [ 155 | "**/__pycache__/", 156 | "**/*.pyc", 157 | "**/*.pyo", 158 | "**/*.orig", 159 | "**/*.rej", 160 | "**/*~", 161 | "**/#*#", 162 | "**/.#*", 163 | ".git/", 164 | ".pytest_cache/", 165 | ".coverage", 166 | ] 167 | 168 | # Black code formatting configuration 169 | [tool.black] 170 | line-length = 88 171 | target-version = ["py310", "py311", "py312"] 172 | include = '\.pyi?$' 173 | extend-exclude = ''' 174 | /( 175 | # directories 176 | \.eggs 177 | | \.git 178 | | \.hg 179 | | \.mypy_cache 180 | | \.tox 181 | | \.venv 182 | | _build 183 | | buck-out 184 | | build 185 | | dist 186 | )/ 187 | ''' 188 | 189 | # isort import sorting configuration 190 | [tool.isort] 191 | profile = "black" 192 | multi_line_output = 3 193 | line_length = 88 194 | known_first_party = ["src"] 195 | known_third_party = ["langgraph", "langsmith", "pydantic", "fastapi"] 196 | 197 | # Flake8 linting configuration 198 | [tool.flake8] 199 | max-line-length = 88 200 | extend-ignore = ["E203", "W503", "E501"] 201 | exclude = [ 202 | ".git", 203 | "__pycache__", 204 | "build", 205 | "dist", 206 | ".eggs", 207 | "*.egg-info", 208 | ".venv", 209 | ".pytest_cache", 210 | ] 211 | 212 | # MyPy type checking configuration 213 | [tool.mypy] 214 | python_version = "3.10" 215 | warn_return_any = true 216 | warn_unused_configs = true 217 | disallow_untyped_defs = true 218 | disallow_incomplete_defs = true 219 | check_untyped_defs = true 220 | disallow_untyped_decorators = true 221 | no_implicit_optional = true 222 | warn_redundant_casts = true 223 | warn_unused_ignores = true 224 | warn_no_return = true 225 | warn_unreachable = true 226 | strict_equality = true 227 | 228 | [[tool.mypy.overrides]] 229 | module = [ 230 | "pytesseract", 231 | "cv2", 232 | "weasyprint", 233 | "celery", 234 | "redis", 235 | ] 236 | ignore_missing_imports = true 237 | 238 | # Pytest configuration 239 | [tool.pytest.ini_options] 240 | minversion = "7.0" 241 | addopts = "-ra -q --strict-markers --strict-config" 242 | testpaths = ["tests"] 243 | python_files = ["test_*.py", "*_test.py"] 244 | python_classes = ["Test*"] 245 | python_functions = ["test_*"] 246 | markers = [ 247 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 248 | "integration: marks tests as integration tests", 249 | "unit: marks tests as unit tests", 250 | "langsmith: marks tests that require LangSmith API", 251 | ] 252 | 253 | # Coverage configuration 254 | [tool.coverage.run] 255 | source = ["src"] 256 | branch = true 257 | omit = [ 258 | "*/tests/*", 259 | "*/test_*", 260 | "*/__pycache__/*", 261 | "*/migrations/*", 262 | ] 263 | 264 | [tool.coverage.report] 265 | precision = 2 266 
| exclude_lines = [ 267 | "pragma: no cover", 268 | "def __repr__", 269 | "if self.debug:", 270 | "if settings.DEBUG", 271 | "raise AssertionError", 272 | "raise NotImplementedError", 273 | "if 0:", 274 | "if __name__ == .__main__.:", 275 | "class .*\\bProtocol\\):", 276 | "@(abc\\.)?abstractmethod", 277 | ] -------------------------------------------------------------------------------- /src/graph/workflow.py: -------------------------------------------------------------------------------- 1 | """ 2 | 主要的职称评审材料审核工作流定义 - 完全无缓存版本 3 | 4 | 🚨 已完全取消缓存机制,确保每个节点传输的信息都是全新的、一次性的 5 | 6 | 包括: 7 | 1. ZIP解压和文件夹验证 8 | 2. PDF内容提取和核心信息提取 9 | 3. 规则集加载和提取(并行处理) 10 | 4. 规则校验和交叉验证 11 | 5. 报告生成 12 | 13 | 只包含一个主工作流:create_audit_workflow() 14 | """ 15 | 16 | # LangGraph 核心导入 - 移除缓存相关的导入 17 | from langgraph.graph import StateGraph, START, END # type: ignore 18 | 19 | # 导入 RetryPolicy 20 | try: 21 | from langgraph.types import RetryPolicy # type: ignore 22 | RETRY_POLICY_AVAILABLE = True 23 | except ImportError: 24 | RetryPolicy = None 25 | RETRY_POLICY_AVAILABLE = False 26 | 27 | # 已完全移除 checkpointer 和内存存储器相关导入 28 | 29 | from .state import AuditState 30 | from .edges import ( 31 | check_pdf_extraction_status, 32 | create_parallel_branches, # 并行分支路由 33 | after_rules_loaded, # 规则加载后路由 34 | check_rules_for_validation, # 规则验证路由 35 | check_pdf_extraction_for_parallel_processing # PDF提取并行分发路由 36 | ) 37 | from src.tools.langsmith_utils import ( 38 | setup_langsmith_environment, 39 | event_logger, 40 | with_langsmith_tracing 41 | ) 42 | 43 | 44 | @with_langsmith_tracing 45 | def create_audit_workflow(): 46 | """ 47 | 创建完全无缓存的职称评审材料审核工作流 48 | 49 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 50 | 51 | 工作流程: 52 | ZIP解压 -> 并行分支: 53 | 分支1: PDF内容提取 -> 核心信息提取 -> 交叉校验 54 | 分支2: 规则集加载 -> 规则提取 -> 汇入验证 55 | 最后: 报告生成 56 | 57 | Returns: 58 | 编译后的LangGraph工作流(无缓存) 59 | """ 60 | # 延迟导入以避免循环依赖 61 | from src.nodes import ( 62 | file_processing_node, 63 | core_info_extraction_node, 64 | validation_node, 65 | report_generation_node 66 | ) 67 | from src.nodes.pdf_extraction import pdf_extraction_node 68 | from src.nodes.cross_validation import cross_validation_node 69 | from src.nodes.rules_processing import load_rules_node, extract_rules_node 70 | 71 | # 初始化LangSmith环境 72 | setup_langsmith_environment() 73 | 74 | workflow = StateGraph(AuditState) 75 | 76 | # 根据LangGraph最佳实践添加重试策略(仅在可用时) 77 | retry_policy_io = None 78 | retry_policy_ai = None 79 | retry_policy_general = None 80 | 81 | if RETRY_POLICY_AVAILABLE and RetryPolicy is not None: 82 | retry_policy_io = RetryPolicy(max_attempts=3, retry_on=[IOError, FileNotFoundError]) 83 | retry_policy_ai = RetryPolicy(max_attempts=5, retry_on=[TimeoutError, ConnectionError]) 84 | retry_policy_general = RetryPolicy(max_attempts=2) 85 | 86 | # 添加所有节点并配置重试策略 87 | workflow.add_node( 88 | "file_processing", 89 | _wrap_node_with_logging(file_processing_node, "file_processing"), 90 | retry_policy=retry_policy_io 91 | ) 92 | workflow.add_node( 93 | "pdf_extraction", 94 | _wrap_node_with_logging(pdf_extraction_node, "pdf_extraction"), 95 | retry_policy=retry_policy_ai 96 | ) 97 | workflow.add_node( 98 | "core_info_extraction", 99 | _wrap_node_with_logging(core_info_extraction_node, "core_info_extraction") 100 | ) 101 | workflow.add_node( 102 | "validation", 103 | _wrap_node_with_logging(validation_node, "validation"), 104 | retry_policy=retry_policy_ai 105 | ) 106 | workflow.add_node( 107 | "cross_validation", 108 | _wrap_node_with_logging(cross_validation_node, "cross_validation"), 109 | 
retry_policy=retry_policy_general 110 | ) 111 | workflow.add_node( 112 | "report_generation", 113 | _wrap_node_with_logging(report_generation_node, "report_generation"), 114 | retry_policy=retry_policy_general 115 | ) 116 | workflow.add_node( 117 | "load_rules", 118 | _wrap_node_with_logging(load_rules_node, "load_rules"), 119 | retry_policy=retry_policy_general 120 | ) 121 | workflow.add_node( 122 | "extract_rules", 123 | _wrap_node_with_logging(extract_rules_node, "extract_rules"), 124 | retry_policy=retry_policy_ai 125 | ) 126 | 127 | # 定义工作流边连接:添加规则集并行处理支持 128 | workflow.add_edge(START, "file_processing") 129 | 130 | # 从file_processing分叉到并行处理路径 131 | workflow.add_conditional_edges( 132 | "file_processing", 133 | create_parallel_branches, 134 | ["pdf_extraction", "load_rules"] # 支持并行分支 135 | ) 136 | 137 | # 规则处理分支 138 | workflow.add_conditional_edges( 139 | "load_rules", 140 | after_rules_loaded, 141 | { 142 | "extract_rules": "extract_rules", 143 | "rules_load_failed": END 144 | } 145 | ) 146 | 147 | # 规则提取完成后,将规则通过条件边传递给validation 148 | workflow.add_conditional_edges( 149 | "extract_rules", 150 | check_rules_for_validation, 151 | ["validation", "cross_validation"] # 支持Send API并行分发 152 | ) 153 | 154 | # PDF提取后进入核心信息提取(主流程) 155 | workflow.add_conditional_edges( 156 | "pdf_extraction", 157 | check_pdf_extraction_status, 158 | { 159 | "pdf_extraction_success": "core_info_extraction", 160 | "pdf_extraction_failed": END 161 | } 162 | ) 163 | 164 | # 🛠️ 关键修复:简化工作流连接,避免多重触发导致的缓存问题 165 | # 删除直接边,只使用条件边触发节点,确保数据一致性 166 | 167 | # validation和cross_validation完成后进入报告生成 168 | workflow.add_edge("validation", "report_generation") 169 | workflow.add_edge("cross_validation", "report_generation") 170 | workflow.add_edge("core_info_extraction", "report_generation") 171 | 172 | workflow.add_edge("report_generation", END) 173 | 174 | # 编译工作流 - 完全无缓存版本 175 | # 🚨 已移除所有checkpointer和内存存储相关的配置 176 | # 确保每个节点传输的信息都是全新的、一次性的 177 | return workflow.compile() 178 | 179 | 180 | 181 | 182 | 183 | def _wrap_node_with_logging(node_func, node_name: str): 184 | """ 185 | 包装节点函数以添加LangSmith日志记录 186 | 187 | Args: 188 | node_func: 节点函数 189 | node_name: 节点名称 190 | 191 | Returns: 192 | 包装后的节点函数 193 | """ 194 | import asyncio 195 | import inspect 196 | 197 | # 检查节点函数是否为异步函数 198 | if inspect.iscoroutinefunction(node_func): 199 | # 异步节点包装器 200 | async def async_wrapped_node(state): 201 | try: 202 | # 记录节点开始 203 | event_logger.log_node_start(node_name, state) 204 | 205 | # 执行异步节点函数 206 | result = await node_func(state) 207 | 208 | # 记录节点完成 209 | event_logger.log_node_complete(node_name, result) 210 | 211 | return result 212 | 213 | except Exception as e: 214 | # 记录节点错误 215 | event_logger.log_node_error(node_name, e) 216 | raise 217 | 218 | return async_wrapped_node 219 | else: 220 | # 同步节点包装器 221 | def sync_wrapped_node(state): 222 | try: 223 | # 记录节点开始 224 | event_logger.log_node_start(node_name, state) 225 | 226 | # 执行节点函数 227 | result = node_func(state) 228 | 229 | # 记录节点完成 230 | event_logger.log_node_complete(node_name, result) 231 | 232 | return result 233 | 234 | except Exception as e: 235 | # 记录节点错误 236 | event_logger.log_node_error(node_name, e) 237 | raise 238 | 239 | return sync_wrapped_node 240 | 241 | 242 | 243 | 244 | 245 | # 延迟创建默认工作流,避免循环导入 246 | default_workflow = None 247 | 248 | def get_default_workflow(): 249 | """获取默认工作流(延迟创建)""" 250 | global default_workflow 251 | if default_workflow is None: 252 | default_workflow = create_audit_workflow() 253 | return default_workflow 254 | 255 | 256 | 257 | 258 | 259 | 260 | 
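# --- Usage sketch (added for illustration; not part of the original module) ---
# Minimal example of compiling and invoking the graph. The archive path and
# session id below are hypothetical; note that create_audit_workflow() will
# prompt for LANGSMITH_API_KEY if it is not already set in the environment.
if __name__ == "__main__":
    import asyncio

    from src.graph.state import create_initial_state

    graph = get_default_workflow()
    state = create_initial_state("test_data/sample.zip", session_id="demo-session")

    # ainvoke() is used because several nodes are async coroutines.
    final_state = asyncio.run(graph.ainvoke(state))
    print(final_state.get("current_step"), final_state.get("is_complete"))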
261 | -------------------------------------------------------------------------------- /src/graph/state.py: -------------------------------------------------------------------------------- 1 | """ 2 | LangGraph工作流状态管理 3 | 4 | 定义审核流程中的状态结构: 5 | - AuditState: 主要的审核状态 6 | - 各个节点间的状态传递规则 7 | - 状态的序列化和反序列化 8 | - 支持并发安全的状态管理 9 | """ 10 | 11 | from typing import Dict, List, Any, Optional, TypedDict, Annotated 12 | from dataclasses import dataclass, field 13 | from pathlib import Path 14 | import operator 15 | 16 | 17 | def step_reducer(existing: str, new: str) -> str: 18 | """current_step字段的reducer函数:后写入优先,确保并发安全""" 19 | # 对于步骤状态,使用最新的值(last write wins) 20 | return new if new else existing 21 | 22 | 23 | class AuditState(TypedDict): 24 | """审核工作流状态定义(支持并发安全) 25 | 26 | 注意:此处声明的键必须覆盖所有节点读写的字段; 27 | 未在此处声明的字段在LangGraph状态合并时可能被丢弃, 28 | 因而需要在这里统一、规范地进行声明。 29 | """ 30 | 31 | # 输入文件信息 32 | uploaded_file: Optional[str] # 上传的ZIP压缩包路径 33 | file_type: str # 文件类型 (zip) 34 | extraction_path: Optional[str] # ZIP解压后的根目录 35 | extracted_files: Annotated[List[str], operator.add] # 解压得到的文件列表(并发安全) 36 | 37 | # 文件夹结构验证 38 | folder_validation: Dict[str, Any] # 17个标准文件夹验证结果 39 | folder_classification: Dict[str, List[str]] # 文件夹分类结果 {文件夹名: [.pdf文件列表]} 40 | 41 | # PDF内容提取和分析(新增) 42 | pdf_extraction_results: Dict[str, Any] # PDF文件提取结果 43 | api_extraction_results: Dict[str, Any] # 通过API提取的JSON结果 44 | 45 | # PDF API配置(新增) 46 | pdf_api_endpoint: Optional[str] # PDF提取API端点 47 | 48 | # 内容提取和分析 49 | extracted_content: Dict[str, Any] # 从PDF文件提取的内容信息 50 | content_analysis: Dict[str, Any] # AI分析的结构化内容 51 | core_info: Optional[Dict[str, Any]] # 核心信息(姓名、身份证号等) 52 | 53 | # 验证结果(使用reducer确保并发安全) 54 | material_validation: Dict[str, List[Any]] # 材料校验结果 55 | cross_validation: Annotated[List[Any], operator.add] # 交叉校验结果(并发安全) 56 | validation_results: Annotated[List[Dict[str, Any]], operator.add] # 所有校验结果(并发安全) 57 | # 详细验证结果与摘要(供报告节点直接消费) 58 | validation_results_detailed: Annotated[List[Dict[str, Any]], operator.add] # 详细验证结果 59 | validation_summary: Optional[Dict[str, Any]] # 验证摘要 60 | 61 | # 规则集处理(新增并行处理支持) 62 | rules_data: Annotated[List[Dict[str, Any]], operator.add] # 加载的规则集数据(并发安全) 63 | parsed_rules: List[Any] # 🚨 移除reducer,直接替换而不是累加规则(支持RuleInfo对象和字典格式) 64 | rules_by_category: Dict[str, List[Any]] # 按1-17项分类的规则集 65 | 66 | # 缓存管理(新增) 67 | validation_cache: Annotated[List[Dict[str, Any]], operator.add] # 验证结果缓存 68 | cross_validation_cache: Annotated[List[Dict[str, Any]], operator.add] # 交叉验证结果缓存 69 | 70 | # 报告生成 71 | audit_report: Optional[str] # 生成的审核报告 72 | report_path: Optional[str] # 报告文件路径 73 | report_summary: Optional[Dict[str, Any]] # 报告摘要(便于前端展示) 74 | quality_score: Optional[float] # 报告质量评分 75 | compliance_status: Optional[str] # 合规性状态(PASS/WARNING/FAIL) 76 | 77 | # 处理统计(可选,供调试/展示) 78 | processing_stats: Optional[Dict[str, Any]] # 处理统计信息 79 | 80 | # 流程控制(使用reducer确保并发安全) 81 | current_step: Annotated[str, step_reducer] # 当前步骤(并发安全) 82 | error_message: Optional[str] # 错误信息 83 | warnings: Annotated[List[str], operator.add] # 警告信息(并发安全) 84 | processing_logs: Annotated[List[str], operator.add] # 处理日志(并发安全) 85 | is_complete: bool # 是否完成 86 | 87 | # 会话管理(LangGraph官方持久化支持) 88 | session_id: Optional[str] # 会话ID 89 | 90 | 91 | @dataclass 92 | class WorkflowConfig: 93 | """工作流配置""" 94 | 95 | # 文件处理配置 96 | max_file_size: int = 50 * 1024 * 1024 # 50MB (ZIP压缩包) 97 | supported_formats: List[str] = field(default_factory=lambda: ['.zip']) 98 | 99 | # 文件夹验证配置 100 | required_folders: List[str] = field(default_factory=lambda: [ 101 | "1.教育经历", 
"2.工作经历", "3.继续教育(培训情况)", "4.学术技术兼职情况", 102 | "5.获奖情况", "6.获得荣誉称号情况", "7.主持参与科研项目(基金)情况", 103 | "8.主持参与工程技术项目情况", "9.论文", "10.著(译)作(教材)", 104 | "11.专利(著作权)情况", "12.主持参与指定标准情况", 105 | "13.成果被批示、采纳、运用和推广情况", "14.资质证书", 106 | "15.奖惩情况", "16.考核情况", "17.申报材料附件信息" 107 | ]) 108 | 109 | # PDF处理配置 110 | max_pdf_file_size: int = 20 * 1024 * 1024 # 20MB per PDF file 111 | pdf_api_timeout: int = 60 # PDF API提取超时时间(秒) 112 | pdf_api_endpoint: Optional[str] = None # PDF提取API端点 113 | 114 | # AI处理配置 115 | ai_timeout: int = 300 # AI处理超时时间(秒) 116 | max_retries: int = 3 # 最大重试次数 117 | 118 | # 输出配置 119 | output_dir: str = 'output' 120 | report_template: str = 'templates/audit_report.html' 121 | 122 | 123 | def create_initial_state( 124 | uploaded_file: str, 125 | session_id: Optional[str] = None 126 | ) -> AuditState: 127 | """创建初始状态(支持并发安全)""" 128 | 129 | file_path = Path(uploaded_file) 130 | file_type = file_path.suffix.lower() 131 | 132 | # 尝试从配置获取PDF API端点 133 | pdf_api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" # 默认配置 134 | try: 135 | from src.config.api_config import get_pdf_api_config 136 | api_config = get_pdf_api_config() 137 | configured_endpoint = api_config.get("pdf_extraction_endpoint") 138 | if configured_endpoint: 139 | pdf_api_endpoint = configured_endpoint 140 | print(f"✅ 从配置文件加载PDF API端点: {pdf_api_endpoint}") 141 | else: 142 | print(f"⚠️ 配置文件中未找到PDF API端点,使用默认值: {pdf_api_endpoint}") 143 | except ImportError: 144 | print(f"⚠️ 无法导入API配置模块,使用默认PDF API端点: {pdf_api_endpoint}") 145 | except Exception as e: 146 | print(f"⚠️ 读取API配置失败: {e},使用默认PDF API端点: {pdf_api_endpoint}") 147 | 148 | # 确保API端点不为空 149 | if not pdf_api_endpoint: 150 | pdf_api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 151 | print(f"🔧 强制设置默认PDF API端点: {pdf_api_endpoint}") 152 | 153 | return AuditState( 154 | # 输入文件信息 155 | uploaded_file=uploaded_file, 156 | file_type=file_type, 157 | extraction_path=None, 158 | extracted_files=[], 159 | 160 | # 文件夹结构验证 161 | folder_validation={}, 162 | folder_classification={}, 163 | 164 | # PDF内容提取和分析(新增) 165 | pdf_extraction_results={}, 166 | api_extraction_results={}, 167 | 168 | # PDF API配置 169 | pdf_api_endpoint=pdf_api_endpoint, 170 | 171 | # 内容提取和分析 172 | extracted_content={}, 173 | content_analysis={}, 174 | core_info=None, 175 | 176 | # 验证结果(初始化为空列表以支持reducer) 177 | material_validation={}, 178 | cross_validation=[], 179 | validation_results=[], 180 | validation_results_detailed=[], 181 | validation_summary=None, 182 | 183 | # 规则集处理(初始化为空列表以支持reducer) 184 | rules_data=[], 185 | parsed_rules=[], # 支持RuleInfo对象和字典格式 186 | rules_by_category={}, 187 | 188 | # 缓存管理(新增) 189 | validation_cache=[], 190 | cross_validation_cache=[], 191 | 192 | # 报告生成 193 | audit_report=None, 194 | report_path=None, 195 | report_summary=None, 196 | quality_score=None, 197 | compliance_status=None, 198 | processing_stats=None, 199 | 200 | # 流程控制 201 | current_step="zip_extraction", 202 | error_message=None, 203 | warnings=[], 204 | processing_logs=[], 205 | is_complete=False, 206 | 207 | # 会话管理 208 | session_id=session_id 209 | ) 210 | 211 | 212 | def update_state_step(state: AuditState, step: str) -> Dict[str, Any]: 213 | """更新状态步骤(并发安全)""" 214 | # 使用reducer模式更新step,避免直接赋值 215 | return {"current_step": step} 216 | 217 | 218 | def add_warning(state: AuditState, warning: str) -> Dict[str, Any]: 219 | """添加警告信息""" 220 | return {"warnings": [warning]} 221 | 222 | 223 | def set_error(state: AuditState, error: str) -> Dict[str, Any]: 224 | """设置错误信息""" 225 | return {"error_message": 
error} 226 | 227 | 228 | def mark_complete(state: AuditState) -> Dict[str, Any]: 229 | """标记流程完成(并发安全)""" 230 | return { 231 | "is_complete": True, 232 | "current_step": "completed" 233 | } 234 | -------------------------------------------------------------------------------- /src/tools/cache_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | 缓存管理工具 3 | 4 | 用于管理validation和cross_validation阶段的缓存结果: 5 | 1. 按材料类型分类整理 6 | 2. 按优先级排序(高优先级错误在前) 7 | 3. 过滤通过的结果(仅显示警告和错误) 8 | 4. 生成结构化的报告数据 9 | """ 10 | 11 | from typing import Dict, List, Any, Optional 12 | from collections import defaultdict 13 | 14 | 15 | class ValidationCacheManager: 16 | """验证缓存管理器""" 17 | 18 | def __init__(self): 19 | self.priority_order = { 20 | "极高": 1, 21 | "高": 2, 22 | "中": 3, 23 | "低": 4 24 | } 25 | 26 | self.status_order = { 27 | "❌不通过": 1, 28 | "⚠️警告": 2, 29 | "✅通过": 3 30 | } 31 | 32 | def organize_validation_cache(self, validation_cache: List[Dict[str, Any]], 33 | cross_validation_cache: List[Dict[str, Any]]) -> Dict[str, Any]: 34 | """ 35 | 整理验证缓存数据 36 | 37 | Args: 38 | validation_cache: 材料验证缓存结果 39 | cross_validation_cache: 交叉验证缓存结果 40 | 41 | Returns: 42 | 整理后的报告数据 43 | """ 44 | print("📊 开始整理验证缓存数据...") 45 | 46 | # 按材料类型分类 47 | material_groups = self._group_by_material_type(validation_cache) 48 | 49 | # 添加交叉验证结果 50 | if cross_validation_cache: 51 | material_groups["交叉校验"] = cross_validation_cache 52 | 53 | # 过滤和排序每个材料类型的结果 54 | filtered_groups = {} 55 | total_issues = 0 56 | 57 | for material_type, results in material_groups.items(): 58 | # 过滤掉通过的结果,只保留警告和错误 59 | filtered_results = self._filter_non_passing_results(results) 60 | 61 | if filtered_results: 62 | # 按优先级和状态排序 63 | sorted_results = self._sort_results_by_priority(filtered_results) 64 | filtered_groups[material_type] = sorted_results 65 | total_issues += len(sorted_results) 66 | 67 | print(f" 📋 {material_type}: {len(sorted_results)}个问题") 68 | 69 | # 生成统计信息 70 | statistics = self._generate_statistics(validation_cache, cross_validation_cache) 71 | 72 | print(f"✅ 缓存数据整理完成,共发现{total_issues}个需要关注的问题") 73 | 74 | return { 75 | "material_groups": filtered_groups, 76 | "statistics": statistics, 77 | "total_issues": total_issues, 78 | "processed_at": self._get_current_timestamp() 79 | } 80 | 81 | def _group_by_material_type(self, validation_cache: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]: 82 | """按材料类型分组""" 83 | groups = defaultdict(list) 84 | 85 | for result in validation_cache: 86 | material_type = result.get("material_type", "未知类型") 87 | groups[material_type].append(result) 88 | 89 | return dict(groups) 90 | 91 | def _filter_non_passing_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 92 | """过滤掉通过的结果,只保留警告和错误""" 93 | return [ 94 | result for result in results 95 | if result.get("result", "").strip() != "✅通过" 96 | ] 97 | 98 | def _sort_results_by_priority(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 99 | """按优先级和状态排序(高优先级、错误状态在前)""" 100 | def sort_key(result): 101 | priority = result.get("priority", "中") 102 | status = result.get("result", "⚠️警告") 103 | 104 | priority_score = self.priority_order.get(priority, 3) 105 | status_score = self.status_order.get(status, 2) 106 | 107 | return (priority_score, status_score) 108 | 109 | return sorted(results, key=sort_key) 110 | 111 | def _generate_statistics(self, validation_cache: List[Dict[str, Any]], 112 | cross_validation_cache: List[Dict[str, Any]]) -> Dict[str, Any]: 113 | """生成统计信息""" 114 | all_results = 
validation_cache + cross_validation_cache 115 | 116 | # 按状态统计 117 | status_counts = defaultdict(int) 118 | priority_counts = defaultdict(int) 119 | material_counts = defaultdict(int) 120 | 121 | for result in all_results: 122 | status = result.get("result", "⚠️警告") 123 | priority = result.get("priority", "中") 124 | material_type = result.get("material_type", "未知类型") 125 | 126 | status_counts[status] += 1 127 | priority_counts[priority] += 1 128 | material_counts[material_type] += 1 129 | 130 | return { 131 | "total_results": len(all_results), 132 | "validation_results": len(validation_cache), 133 | "cross_validation_results": len(cross_validation_cache), 134 | "status_distribution": dict(status_counts), 135 | "priority_distribution": dict(priority_counts), 136 | "material_distribution": dict(material_counts), 137 | "issues_count": len([r for r in all_results if r.get("result", "").strip() != "✅通过"]) 138 | } 139 | 140 | def get_report_summary(self, organized_data: Dict[str, Any]) -> Dict[str, Any]: 141 | """ 142 | 生成报告摘要 143 | 144 | Args: 145 | organized_data: 整理后的数据 146 | 147 | Returns: 148 | 报告摘要信息 149 | """ 150 | material_groups = organized_data.get("material_groups", {}) 151 | statistics = organized_data.get("statistics", {}) 152 | 153 | # 计算各类问题数量 154 | error_count = sum( 155 | len([r for r in results if r.get("result", "").startswith("❌")]) 156 | for results in material_groups.values() 157 | ) 158 | 159 | warning_count = sum( 160 | len([r for r in results if r.get("result", "").startswith("⚠️")]) 161 | for results in material_groups.values() 162 | ) 163 | 164 | # 最高优先级问题 165 | high_priority_issues = [] 166 | for material_type, results in material_groups.items(): 167 | for result in results: 168 | if result.get("priority") in ["极高", "高"]: 169 | high_priority_issues.append({ 170 | "material_type": material_type, 171 | "rule_name": result.get("rule_name", ""), 172 | "details": result.get("details", ""), 173 | "priority": result.get("priority", "") 174 | }) 175 | 176 | return { 177 | "total_materials_checked": len(statistics.get("material_distribution", {})), 178 | "total_issues": organized_data.get("total_issues", 0), 179 | "error_count": error_count, 180 | "warning_count": warning_count, 181 | "high_priority_count": len(high_priority_issues), 182 | "high_priority_issues": high_priority_issues[:5], # 只显示前5个 183 | "material_issue_summary": { 184 | material_type: len(results) 185 | for material_type, results in material_groups.items() 186 | } 187 | } 188 | 189 | def _get_current_timestamp(self) -> str: 190 | """获取当前时间戳""" 191 | from datetime import datetime 192 | return datetime.now().isoformat() 193 | 194 | 195 | # 全局缓存管理器实例 196 | cache_manager = ValidationCacheManager() 197 | 198 | 199 | def organize_audit_cache(state) -> Dict[str, Any]: 200 | """ 201 | 整理审核缓存数据的便捷函数 202 | 203 | Args: 204 | state: 审核状态 205 | 206 | Returns: 207 | 整理后的缓存数据 208 | """ 209 | validation_cache = state.get("validation_cache", []) 210 | cross_validation_cache = state.get("cross_validation_cache", []) 211 | 212 | return cache_manager.organize_validation_cache(validation_cache, cross_validation_cache) 213 | 214 | 215 | def get_report_data_from_cache(state) -> Dict[str, Any]: 216 | """ 217 | 从缓存中获取报告数据 218 | 219 | Args: 220 | state: 审核状态 221 | 222 | Returns: 223 | 报告数据 224 | """ 225 | organized_data = organize_audit_cache(state) 226 | summary = cache_manager.get_report_summary(organized_data) 227 | 228 | return { 229 | "organized_data": organized_data, 230 | "summary": summary, 231 | "cache_processed": True 232 | } 
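# --- Usage sketch (added for illustration; not part of the original module) ---
# Demonstrates the record shape ValidationCacheManager expects. The sample
# records below are hypothetical; in production they are appended to
# validation_cache / cross_validation_cache by the validation nodes.
if __name__ == "__main__":
    demo_state = {
        "validation_cache": [
            {"material_type": "9.论文", "rule_name": "第一作者核查",
             "result": "❌不通过", "priority": "高", "details": "示例:作者署名与申报人不符"},
            {"material_type": "1.教育经历", "rule_name": "学历证书核查",
             "result": "✅通过", "priority": "中", "details": ""},
        ],
        "cross_validation_cache": [],
    }
    report = get_report_data_from_cache(demo_state)
    # The passing record is filtered out, so only one issue remains.
    print(report["summary"]["total_issues"], report["summary"]["error_count"])  # -> 1 1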
-------------------------------------------------------------------------------- /src/tools/langsmith_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | LangSmith集成工具类 3 | 4 | 提供LangGraph项目的调试、监控和评估功能 5 | """ 6 | 7 | import os 8 | import uuid 9 | from typing import Dict, Any, Optional, List 10 | from datetime import datetime 11 | import getpass 12 | 13 | def setup_langsmith_environment(): 14 | """ 15 | 设置LangSmith环境变量 16 | 17 | 根据LangGraph最佳实践配置LangSmith追踪 18 | """ 19 | def _set_env(var: str): 20 | """安全地设置环境变量""" 21 | if not os.environ.get(var): 22 | # 优先从.env文件读取,如果没有则提示输入 23 | value = getpass.getpass(f"请输入 {var}: ") 24 | os.environ[var] = value 25 | 26 | # 设置必要的API密钥 27 | _set_env("LANGSMITH_API_KEY") 28 | 29 | # 配置LangSmith追踪 30 | os.environ["LANGCHAIN_TRACING_V2"] = "true" 31 | os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com" 32 | os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT", "Audit_Workflow_Debug") 33 | os.environ["LANGSMITH_TRACING"] = "true" 34 | 35 | print("✅ LangSmith环境配置完成") 36 | print(f"📊 项目名称: {os.environ['LANGCHAIN_PROJECT']}") 37 | 38 | 39 | def create_run_config( 40 | run_name: Optional[str] = None, 41 | tags: Optional[List[str]] = None, 42 | metadata: Optional[Dict[str, Any]] = None, 43 | thread_id: Optional[str] = None 44 | ) -> Dict[str, Any]: 45 | """ 46 | 创建LangGraph运行配置,支持LangSmith追踪 47 | 48 | Args: 49 | run_name: 运行名称 50 | tags: 标签列表 51 | metadata: 元数据 52 | thread_id: 线程ID 53 | 54 | Returns: 55 | 配置字典 56 | """ 57 | config = {} 58 | 59 | # 生成唯一的运行ID 60 | if not config.get("run_id"): 61 | config["run_id"] = str(uuid.uuid4()) 62 | 63 | # 设置运行名称 64 | if run_name: 65 | config["run_name"] = run_name 66 | else: 67 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 68 | config["run_name"] = f"audit_workflow_{timestamp}" 69 | 70 | # 设置标签 71 | default_tags = ["audit_workflow", "langgraph", "production"] 72 | if tags: 73 | config["tags"] = default_tags + tags 74 | else: 75 | config["tags"] = default_tags 76 | 77 | # 设置元数据 78 | default_metadata = { 79 | "version": "1.0.0", 80 | "environment": os.getenv("ENVIRONMENT", "development"), 81 | "project": "职称评审材料审核系统" 82 | } 83 | if metadata: 84 | default_metadata.update(metadata) 85 | config["metadata"] = default_metadata 86 | 87 | # 设置可配置参数 88 | configurable = {} 89 | if thread_id: 90 | configurable["thread_id"] = thread_id 91 | 92 | if configurable: 93 | config["configurable"] = configurable 94 | 95 | return config 96 | 97 | 98 | def log_workflow_step(step_name: str, status: str, data: Optional[Dict] = None): 99 | """ 100 | 记录工作流步骤,便于调试 101 | 102 | Args: 103 | step_name: 步骤名称 104 | status: 状态 (started, completed, failed) 105 | data: 附加数据 106 | """ 107 | timestamp = datetime.now().isoformat() 108 | log_entry = { 109 | "timestamp": timestamp, 110 | "step": step_name, 111 | "status": status, 112 | "data": data or {} 113 | } 114 | 115 | # 使用结构化日志,LangSmith可以捕获 116 | print(f"🔍 [{timestamp}] {step_name.upper()}: {status}") 117 | if data: 118 | print(f" 📝 数据: {data}") 119 | 120 | 121 | def create_debug_config(breakpoints: Optional[List[str]] = None) -> Dict[str, Any]: 122 | """ 123 | 创建调试配置 124 | 125 | Args: 126 | breakpoints: 断点列表 127 | 128 | Returns: 129 | 调试配置 130 | """ 131 | config = create_run_config( 132 | run_name="debug_session", 133 | tags=["debug", "development"], 134 | metadata={"mode": "debug"} 135 | ) 136 | 137 | if breakpoints: 138 | config["breakpoints"] = breakpoints 139 | 140 | # 启用详细追踪 141 | config["recursion_limit"] = 50 142 | 
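    # Note (added): LangGraph aborts with GraphRecursionError once a run exceeds
    # the recursion limit (default 25); 50 gives the parallel branches headroom.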
143 | return config 144 | 145 | 146 | def hide_sensitive_data(inputs: Dict[str, Any]) -> Dict[str, Any]: 147 | """ 148 | 隐藏敏感数据,避免在LangSmith中暴露 149 | 150 | Args: 151 | inputs: 输入数据 152 | 153 | Returns: 154 | 脱敏后的数据 155 | """ 156 | copied = inputs.copy() 157 | 158 | # 隐藏敏感字段 159 | sensitive_fields = ["api_key", "password", "token", "secret"] 160 | 161 | for key in copied: 162 | if any(sensitive in key.lower() for sensitive in sensitive_fields): 163 | copied[key] = "***HIDDEN***" 164 | 165 | # 隐藏长文本内容 166 | if isinstance(copied[key], str) and len(copied[key]) > 1000: 167 | copied[key] = copied[key][:100] + "...[内容过长已截断]" 168 | 169 | return copied 170 | 171 | 172 | class LangSmithEventLogger: 173 | """LangSmith事件记录器""" 174 | 175 | def __init__(self, project_name: str = "Audit_Workflow"): 176 | self.project_name = project_name 177 | self.events = [] 178 | 179 | def log_node_start(self, node_name: str, state: Dict[str, Any]): 180 | """记录节点开始""" 181 | event = { 182 | "type": "node_start", 183 | "node": node_name, 184 | "timestamp": datetime.now().isoformat(), 185 | "state_keys": list(state.keys()) 186 | } 187 | self.events.append(event) 188 | log_workflow_step(f"节点开始: {node_name}", "started") 189 | 190 | def log_node_complete(self, node_name: str, result: Dict[str, Any]): 191 | """记录节点完成""" 192 | event = { 193 | "type": "node_complete", 194 | "node": node_name, 195 | "timestamp": datetime.now().isoformat(), 196 | "result_keys": list(result.keys()) 197 | } 198 | self.events.append(event) 199 | log_workflow_step(f"节点完成: {node_name}", "completed", {"result_keys": list(result.keys())}) 200 | 201 | def log_node_error(self, node_name: str, error: Exception): 202 | """记录节点错误""" 203 | event = { 204 | "type": "node_error", 205 | "node": node_name, 206 | "timestamp": datetime.now().isoformat(), 207 | "error": str(error), 208 | "error_type": type(error).__name__ 209 | } 210 | self.events.append(event) 211 | log_workflow_step(f"节点错误: {node_name}", "failed", {"error": str(error)}) 212 | 213 | def get_events(self) -> List[Dict[str, Any]]: 214 | """获取所有事件""" 215 | return self.events 216 | 217 | def clear_events(self): 218 | """清空事件""" 219 | self.events.clear() 220 | 221 | 222 | # 全局事件记录器实例 223 | event_logger = LangSmithEventLogger() 224 | 225 | 226 | def with_langsmith_tracing(func): 227 | """ 228 | 装饰器:为函数添加LangSmith追踪 229 | """ 230 | def wrapper(*args, **kwargs): 231 | from langchain_core.tracers.context import tracing_v2_enabled 232 | from langsmith import Client 233 | 234 | # 创建LangSmith客户端,隐藏敏感数据 235 | client = Client( 236 | hide_inputs=hide_sensitive_data, 237 | hide_outputs=hide_sensitive_data 238 | ) 239 | 240 | # 在追踪上下文中执行函数 241 | with tracing_v2_enabled(client=client): 242 | return func(*args, **kwargs) 243 | 244 | return wrapper 245 | 246 | 247 | def stream_with_debug(graph, inputs: Dict[str, Any], config: Optional[Dict[str, Any]] = None): 248 | """ 249 | 流式执行图并输出调试信息 250 | 251 | Args: 252 | graph: LangGraph图实例 253 | inputs: 输入数据 254 | config: 配置信息 255 | 256 | Yields: 257 | 流式输出结果 258 | """ 259 | if not config: 260 | config = create_debug_config() 261 | 262 | print(f"🚀 开始执行工作流...") 263 | print(f"📊 运行ID: {config.get('run_id')}") 264 | print(f"🏷️ 标签: {config.get('tags', [])}") 265 | 266 | try: 267 | # 使用debug模式流式执行 268 | for chunk in graph.stream(inputs, config, stream_mode="debug"): 269 | print(f"🔍 调试信息: {chunk}") 270 | yield chunk 271 | 272 | except Exception as e: 273 | print(f"❌ 执行失败: {str(e)}") 274 | event_logger.log_node_error("workflow", e) 275 | raise 276 | 277 | 278 | if __name__ == 
"__main__": 279 | # 测试LangSmith配置 280 | setup_langsmith_environment() 281 | 282 | # 测试配置创建 283 | test_config = create_run_config( 284 | run_name="test_run", 285 | tags=["test"], 286 | metadata={"test": True} 287 | ) 288 | print(f"测试配置: {test_config}") -------------------------------------------------------------------------------- /src/config/redis.py: -------------------------------------------------------------------------------- 1 | """ 2 | Redis 配置和连接管理 3 | 4 | 提供 Redis 连接配置、健康检查和连接池管理功能 5 | """ 6 | 7 | import os 8 | import redis 9 | from typing import Optional, Dict, Any 10 | from dataclasses import dataclass 11 | from loguru import logger 12 | 13 | @dataclass 14 | class RedisConfig: 15 | """Redis 配置类""" 16 | 17 | # 连接配置 18 | host: str = "localhost" 19 | port: int = 6379 20 | db: int = 0 21 | password: Optional[str] = None 22 | 23 | # 连接池配置 24 | max_connections: int = 20 25 | retry_on_timeout: bool = True 26 | 27 | # 超时配置 28 | socket_connect_timeout: int = 5 29 | socket_timeout: int = 5 30 | 31 | # TTL 配置 (用于 LangGraph checkpointer) 32 | default_ttl: int = 3600 # 1小时,单位:秒 33 | refresh_on_read: bool = True 34 | 35 | # 键前缀 36 | checkpoint_prefix: str = "langgraph:checkpoint:" 37 | store_prefix: str = "langgraph:store:" 38 | 39 | @classmethod 40 | def from_env(cls) -> "RedisConfig": 41 | """从环境变量创建 Redis 配置""" 42 | return cls( 43 | host=os.getenv("REDIS_HOST", "localhost"), 44 | port=int(os.getenv("REDIS_PORT", "6379")), 45 | db=int(os.getenv("REDIS_DB", "0")), 46 | password=os.getenv("REDIS_PASSWORD"), 47 | max_connections=int(os.getenv("REDIS_MAX_CONNECTIONS", "20")), 48 | socket_connect_timeout=int(os.getenv("REDIS_SOCKET_CONNECT_TIMEOUT", "5")), 49 | socket_timeout=int(os.getenv("REDIS_SOCKET_TIMEOUT", "5")), 50 | default_ttl=int(os.getenv("REDIS_DEFAULT_TTL", "3600")), 51 | refresh_on_read=os.getenv("REDIS_REFRESH_ON_READ", "true").lower() == "true", 52 | checkpoint_prefix=os.getenv("REDIS_CHECKPOINT_PREFIX", "langgraph:checkpoint:"), 53 | store_prefix=os.getenv("REDIS_STORE_PREFIX", "langgraph:store:") 54 | ) 55 | 56 | def get_connection_url(self) -> str: 57 | """获取 Redis 连接 URL""" 58 | if self.password: 59 | return f"redis://:{self.password}@{self.host}:{self.port}/{self.db}" 60 | return f"redis://{self.host}:{self.port}/{self.db}" 61 | 62 | def get_ttl_config(self) -> Dict[str, Any]: 63 | """获取 TTL 配置字典""" 64 | return { 65 | "default_ttl": self.default_ttl // 60, # LangGraph Redis 期望分钟 66 | "refresh_on_read": self.refresh_on_read 67 | } 68 | 69 | 70 | class RedisManager: 71 | """Redis 连接管理器""" 72 | 73 | def __init__(self, config: Optional[RedisConfig] = None): 74 | self.config = config or RedisConfig.from_env() 75 | self._redis_client: Optional[redis.Redis] = None 76 | 77 | @property 78 | def redis_client(self) -> redis.Redis: 79 | """获取 Redis 客户端(单例模式)""" 80 | if self._redis_client is None: 81 | # 创建连接池 82 | pool = redis.ConnectionPool( 83 | host=self.config.host, 84 | port=self.config.port, 85 | db=self.config.db, 86 | password=self.config.password, 87 | max_connections=self.config.max_connections, 88 | retry_on_timeout=self.config.retry_on_timeout, 89 | socket_connect_timeout=self.config.socket_connect_timeout, 90 | socket_timeout=self.config.socket_timeout, 91 | decode_responses=True 92 | ) 93 | # 显式创建同步 Redis 客户端 94 | self._redis_client = redis.Redis(connection_pool=pool) 95 | return self._redis_client 96 | 97 | def test_connection(self) -> bool: 98 | """测试 Redis 连接""" 99 | try: 100 | # 直接调用 ping(),同步客户端应该返回布尔值 101 | result = self.redis_client.ping() 102 | # 确保结果是布尔值 103 | 
success = bool(result) 104 | if success: 105 | logger.info(f"✅ Redis 连接成功: {self.config.host}:{self.config.port}") 106 | else: 107 | logger.error(f"❌ Redis ping 返回 False") 108 | return success 109 | except Exception as e: 110 | logger.error(f"❌ Redis 连接失败: {e}") 111 | return False 112 | 113 | def get_info(self) -> Dict[str, Any]: 114 | """获取 Redis 服务器信息""" 115 | try: 116 | info = self.redis_client.info() 117 | # 确保返回的是字典类型 118 | if isinstance(info, dict): 119 | return info 120 | else: 121 | logger.warning(f"Redis info() 返回了非字典类型: {type(info)}") 122 | return {} 123 | except Exception as e: 124 | logger.error(f"获取 Redis 信息失败: {e}") 125 | return {} 126 | 127 | def clear_cache(self, pattern: str = "*") -> int: 128 | """清理缓存""" 129 | try: 130 | keys = self.redis_client.keys(pattern) 131 | # 确保 keys 是列表类型 132 | if isinstance(keys, (list, tuple)) and keys: 133 | deleted = self.redis_client.delete(*keys) 134 | # 安全处理 deleted 的类型转换 135 | if isinstance(deleted, int): 136 | deleted_count = deleted 137 | else: 138 | # 对于非整数类型(包括异步类型),返回 0 139 | logger.warning(f"Redis delete() 返回了非整数类型: {type(deleted)},默认为 0") 140 | deleted_count = 0 141 | 142 | logger.info(f"清理了 {deleted_count} 个缓存键") 143 | return deleted_count 144 | elif isinstance(keys, (list, tuple)): 145 | # 空列表 146 | return 0 147 | else: 148 | logger.warning(f"Redis keys() 返回了非列表类型: {type(keys)}") 149 | return 0 150 | except Exception as e: 151 | logger.error(f"清理缓存失败: {e}") 152 | return 0 153 | 154 | def get_memory_usage(self) -> Dict[str, Any]: 155 | """获取内存使用情况""" 156 | try: 157 | # 直接调用 info() 方法,同步 Redis 客户端应该返回字典 158 | info = self.redis_client.info("memory") 159 | 160 | # 确保返回的是字典类型 161 | if isinstance(info, dict): 162 | return { 163 | "used_memory": info.get("used_memory", 0), 164 | "used_memory_human": info.get("used_memory_human", "0B"), 165 | "used_memory_peak": info.get("used_memory_peak", 0), 166 | "used_memory_peak_human": info.get("used_memory_peak_human", "0B"), 167 | "total_system_memory": info.get("total_system_memory", 0), 168 | "total_system_memory_human": info.get("total_system_memory_human", "0B") 169 | } 170 | else: 171 | # 如果不是字典类型,记录警告并返回空字典 172 | logger.warning(f"Redis info() 返回了非字典类型: {type(info)}") 173 | return {} 174 | except Exception as e: 175 | logger.error(f"获取内存使用情况失败: {e}") 176 | return {} 177 | 178 | def close(self): 179 | """关闭 Redis 连接""" 180 | if self._redis_client: 181 | self._redis_client.close() 182 | self._redis_client = None 183 | logger.info("Redis 连接已关闭") 184 | 185 | 186 | # 全局 Redis 管理器实例 187 | _redis_manager: Optional[RedisManager] = None 188 | 189 | 190 | def get_redis_manager() -> RedisManager: 191 | """获取全局 Redis 管理器实例""" 192 | global _redis_manager 193 | if _redis_manager is None: 194 | _redis_manager = RedisManager() 195 | return _redis_manager 196 | 197 | 198 | def get_redis_config() -> RedisConfig: 199 | """获取 Redis 配置""" 200 | return get_redis_manager().config 201 | 202 | 203 | def test_redis_connection() -> bool: 204 | """测试 Redis 连接""" 205 | return get_redis_manager().test_connection() 206 | 207 | 208 | # 健康检查函数 209 | def redis_health_check() -> Dict[str, Any]: 210 | """Redis 健康检查""" 211 | manager = get_redis_manager() 212 | 213 | health_info = { 214 | "service": "redis", 215 | "status": "unknown", 216 | "details": {} 217 | } 218 | 219 | try: 220 | # 测试连接 221 | if manager.test_connection(): 222 | health_info["status"] = "healthy" 223 | health_info["details"]["connection"] = "ok" 224 | 225 | # 获取服务器信息 226 | info = manager.get_info() 227 | if isinstance(info, dict): 228 | 
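                # Note (added): these keys come from the Redis INFO command;
                # uptime_in_seconds is, as the name says, reported in seconds.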
health_info["details"]["version"] = info.get("redis_version", "unknown") 229 | health_info["details"]["uptime"] = info.get("uptime_in_seconds", 0) 230 | 231 | # 获取内存使用情况 232 | memory_info = manager.get_memory_usage() 233 | health_info["details"]["memory"] = memory_info 234 | 235 | else: 236 | health_info["status"] = "unhealthy" 237 | health_info["details"]["error"] = "connection_failed" 238 | 239 | except Exception as e: 240 | health_info["status"] = "unhealthy" 241 | health_info["details"]["error"] = str(e) 242 | 243 | return health_info -------------------------------------------------------------------------------- /static/styles.css: -------------------------------------------------------------------------------- 1 | /* LangGraph 职称评审系统 - 样式文件 */ 2 | 3 | :root { 4 | --primary-color: #0d6efd; 5 | --secondary-color: #6c757d; 6 | --success-color: #198754; 7 | --danger-color: #dc3545; 8 | --warning-color: #ffc107; 9 | --info-color: #0dcaf0; 10 | --light-color: #f8f9fa; 11 | --dark-color: #212529; 12 | } 13 | 14 | body { 15 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; 16 | background-color: #f5f7fa; 17 | color: var(--dark-color); 18 | } 19 | 20 | .navbar-brand { 21 | font-weight: 600; 22 | font-size: 1.25rem; 23 | } 24 | 25 | .card { 26 | border: none; 27 | border-radius: 12px; 28 | transition: all 0.3s ease; 29 | } 30 | 31 | .card:hover { 32 | transform: translateY(-2px); 33 | } 34 | 35 | .card-header { 36 | border-radius: 12px 12px 0 0 !important; 37 | border: none; 38 | font-weight: 600; 39 | } 40 | 41 | .btn { 42 | border-radius: 8px; 43 | font-weight: 500; 44 | transition: all 0.3s ease; 45 | } 46 | 47 | .btn:hover { 48 | transform: translateY(-1px); 49 | } 50 | 51 | /* 特色卡片样式 */ 52 | .feature-card { 53 | text-align: center; 54 | padding: 2rem 1rem; 55 | border-radius: 12px; 56 | background: white; 57 | box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); 58 | transition: all 0.3s ease; 59 | margin-bottom: 1.5rem; 60 | } 61 | 62 | .feature-card:hover { 63 | transform: translateY(-4px); 64 | box-shadow: 0 8px 15px rgba(0, 0, 0, 0.15); 65 | } 66 | 67 | .feature-card i { 68 | font-size: 2.5rem; 69 | margin-bottom: 1rem; 70 | } 71 | 72 | .feature-card h5 { 73 | color: var(--dark-color); 74 | margin-bottom: 0.5rem; 75 | font-weight: 600; 76 | } 77 | 78 | .feature-card p { 79 | color: var(--secondary-color); 80 | margin: 0; 81 | font-size: 0.9rem; 82 | } 83 | 84 | /* 进度条样式 */ 85 | .progress { 86 | border-radius: 10px; 87 | background-color: #e9ecef; 88 | } 89 | 90 | .progress-bar { 91 | border-radius: 10px; 92 | transition: width 0.6s ease; 93 | } 94 | 95 | /* 工作流步骤样式 */ 96 | .workflow-step { 97 | display: flex; 98 | align-items: center; 99 | padding: 1rem; 100 | margin-bottom: 0.5rem; 101 | border-radius: 8px; 102 | background: white; 103 | border-left: 4px solid #e9ecef; 104 | transition: all 0.3s ease; 105 | } 106 | 107 | .workflow-step.active { 108 | border-left-color: var(--primary-color); 109 | background: rgba(13, 110, 253, 0.05); 110 | } 111 | 112 | .workflow-step.completed { 113 | border-left-color: var(--success-color); 114 | background: rgba(25, 135, 84, 0.05); 115 | } 116 | 117 | .workflow-step.error { 118 | border-left-color: var(--danger-color); 119 | background: rgba(220, 53, 69, 0.05); 120 | } 121 | 122 | .workflow-step-icon { 123 | width: 40px; 124 | height: 40px; 125 | border-radius: 50%; 126 | display: flex; 127 | align-items: center; 128 | justify-content: center; 129 | margin-right: 1rem; 130 | font-size: 1.2rem; 131 | background: #e9ecef; 132 | color: 
var(--secondary-color); 133 | transition: all 0.3s ease; 134 | } 135 | 136 | .workflow-step.active .workflow-step-icon { 137 | background: var(--primary-color); 138 | color: white; 139 | } 140 | 141 | .workflow-step.completed .workflow-step-icon { 142 | background: var(--success-color); 143 | color: white; 144 | } 145 | 146 | .workflow-step.error .workflow-step-icon { 147 | background: var(--danger-color); 148 | color: white; 149 | } 150 | 151 | .workflow-step-content h6 { 152 | margin: 0 0 0.25rem 0; 153 | font-weight: 600; 154 | } 155 | 156 | .workflow-step-content p { 157 | margin: 0; 158 | color: var(--secondary-color); 159 | font-size: 0.9rem; 160 | } 161 | 162 | /* 日志容器样式 */ 163 | .log-container { 164 | height: 300px; 165 | overflow-y: auto; 166 | background: #2c3e50; 167 | color: #ecf0f1; 168 | font-family: 'Courier New', monospace; 169 | font-size: 0.85rem; 170 | padding: 1rem; 171 | border-radius: 8px; 172 | } 173 | 174 | .log-entry { 175 | margin-bottom: 0.5rem; 176 | padding: 0.25rem 0.5rem; 177 | border-radius: 4px; 178 | word-wrap: break-word; 179 | } 180 | 181 | .log-entry.started { 182 | background: rgba(13, 110, 253, 0.2); 183 | border-left: 3px solid var(--primary-color); 184 | } 185 | 186 | .log-entry.progress { 187 | background: rgba(255, 193, 7, 0.2); 188 | border-left: 3px solid var(--warning-color); 189 | } 190 | 191 | .log-entry.completed { 192 | background: rgba(25, 135, 84, 0.2); 193 | border-left: 3px solid var(--success-color); 194 | } 195 | 196 | .log-entry.error { 197 | background: rgba(220, 53, 69, 0.2); 198 | border-left: 3px solid var(--danger-color); 199 | } 200 | 201 | .log-timestamp { 202 | color: #95a5a6; 203 | font-size: 0.75rem; 204 | margin-right: 0.5rem; 205 | } 206 | 207 | /* 任务列表样式 */ 208 | .task-item { 209 | padding: 1rem; 210 | border-bottom: 1px solid #e9ecef; 211 | cursor: pointer; 212 | transition: all 0.3s ease; 213 | } 214 | 215 | .task-item:hover { 216 | background: rgba(13, 110, 253, 0.05); 217 | } 218 | 219 | .task-item.active { 220 | background: rgba(13, 110, 253, 0.1); 221 | border-left: 4px solid var(--primary-color); 222 | } 223 | 224 | .task-status { 225 | font-size: 0.75rem; 226 | padding: 0.25rem 0.5rem; 227 | border-radius: 12px; 228 | font-weight: 600; 229 | text-transform: uppercase; 230 | } 231 | 232 | .task-status.started { 233 | background: rgba(13, 110, 253, 0.1); 234 | color: var(--primary-color); 235 | } 236 | 237 | .task-status.processing { 238 | background: rgba(255, 193, 7, 0.1); 239 | color: #996404; 240 | } 241 | 242 | .task-status.completed { 243 | background: rgba(25, 135, 84, 0.1); 244 | color: var(--success-color); 245 | } 246 | 247 | .task-status.failed { 248 | background: rgba(220, 53, 69, 0.1); 249 | color: var(--danger-color); 250 | } 251 | 252 | /* 状态徽章样式 */ 253 | .badge.bg-primary { 254 | background-color: var(--primary-color) !important; 255 | } 256 | 257 | .badge.bg-warning { 258 | background-color: var(--warning-color) !important; 259 | color: var(--dark-color) !important; 260 | } 261 | 262 | .badge.bg-success { 263 | background-color: var(--success-color) !important; 264 | } 265 | 266 | .badge.bg-danger { 267 | background-color: var(--danger-color) !important; 268 | } 269 | 270 | /* 文件输入样式 */ 271 | .form-control:focus { 272 | border-color: var(--primary-color); 273 | box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25); 274 | } 275 | 276 | .form-select:focus { 277 | border-color: var(--primary-color); 278 | box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25); 279 | } 280 | 281 | /* 动画效果 */ 282 | @keyframes 
pulse { 283 | 0% { 284 | transform: scale(1); 285 | } 286 | 50% { 287 | transform: scale(1.05); 288 | } 289 | 100% { 290 | transform: scale(1); 291 | } 292 | } 293 | 294 | .pulse { 295 | animation: pulse 2s infinite; 296 | } 297 | 298 | @keyframes fadeInUp { 299 | from { 300 | opacity: 0; 301 | transform: translateY(30px); 302 | } 303 | to { 304 | opacity: 1; 305 | transform: translateY(0); 306 | } 307 | } 308 | 309 | .fade-in-up { 310 | animation: fadeInUp 0.6s ease-out; 311 | } 312 | 313 | /* 响应式设计 */ 314 | @media (max-width: 768px) { 315 | .feature-card { 316 | padding: 1.5rem 1rem; 317 | } 318 | 319 | .workflow-step { 320 | flex-direction: column; 321 | text-align: center; 322 | } 323 | 324 | .workflow-step-icon { 325 | margin-right: 0; 326 | margin-bottom: 0.5rem; 327 | } 328 | 329 | .log-container { 330 | height: 200px; 331 | font-size: 0.8rem; 332 | } 333 | } 334 | 335 | /* 滚动条样式 */ 336 | .log-container::-webkit-scrollbar { 337 | width: 8px; 338 | } 339 | 340 | .log-container::-webkit-scrollbar-track { 341 | background: #34495e; 342 | border-radius: 4px; 343 | } 344 | 345 | .log-container::-webkit-scrollbar-thumb { 346 | background: #7f8c8d; 347 | border-radius: 4px; 348 | } 349 | 350 | .log-container::-webkit-scrollbar-thumb:hover { 351 | background: #95a5a6; 352 | } 353 | 354 | /* 加载动画 */ 355 | .spinner-border-sm { 356 | width: 1rem; 357 | height: 1rem; 358 | } 359 | 360 | /* 工具提示样式 */ 361 | .tooltip { 362 | font-size: 0.8rem; 363 | } 364 | 365 | /* 模态框样式 */ 366 | .modal-content { 367 | border-radius: 12px; 368 | border: none; 369 | box-shadow: 0 20px 25px rgba(0, 0, 0, 0.15); 370 | } 371 | 372 | .modal-header { 373 | border-radius: 12px 12px 0 0; 374 | border-bottom: 1px solid #e9ecef; 375 | } 376 | 377 | /* 代码块样式 */ 378 | pre { 379 | background: #2c3e50 !important; 380 | color: #ecf0f1 !important; 381 | border: none; 382 | border-radius: 8px; 383 | font-family: 'Courier New', monospace; 384 | font-size: 0.85rem; 385 | max-height: 400px; 386 | overflow-y: auto; 387 | } 388 | 389 | /* 连接状态指示器 */ 390 | .connection-status { 391 | position: fixed; 392 | top: 20px; 393 | right: 20px; 394 | z-index: 1050; 395 | padding: 0.5rem 1rem; 396 | border-radius: 20px; 397 | font-size: 0.8rem; 398 | font-weight: 600; 399 | transition: all 0.3s ease; 400 | } 401 | 402 | .connection-status.connected { 403 | background: rgba(25, 135, 84, 0.9); 404 | color: white; 405 | } 406 | 407 | .connection-status.disconnected { 408 | background: rgba(220, 53, 69, 0.9); 409 | color: white; 410 | } 411 | 412 | .connection-status.connecting { 413 | background: rgba(255, 193, 7, 0.9); 414 | color: var(--dark-color); 415 | } -------------------------------------------------------------------------------- /src/tools/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 文件处理工具 3 | 4 | 提供文件处理相关的工具函数: 5 | - ZIP文件解压 6 | - 17个标准文件夹结构验证 7 | - Markdown文件处理 8 | - 文件路径处理 9 | """ 10 | 11 | import zipfile 12 | import re 13 | import markdown 14 | from pathlib import Path 15 | from typing import List, Dict, Any, Optional 16 | 17 | async def extract_zip_file(zip_path: str) -> Dict[str, Any]: 18 | """ 19 | 解压ZIP文件并返回解压结果 20 | 21 | Args: 22 | zip_path: ZIP文件路径 23 | 24 | Returns: 25 | 解压结果字典,包含解压路径和文件列表 26 | """ 27 | try: 28 | import asyncio 29 | 30 | # 使用异步方式处理ZIP文件 31 | def _extract_zip(): 32 | with zipfile.ZipFile(zip_path, 'r') as zip_ref: 33 | # 解压到当前目录的 extracted 文件夹 34 | extract_dir = Path(zip_path).parent / "extracted" 35 | extract_dir.mkdir(exist_ok=True) 36 | 37 | 
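                # Note (added): zipfile already strips ".." and absolute member
                # paths, but extractall() enforces no size limits - for untrusted
                # uploads, a total-size/compression-ratio check here would block
                # zip bombs.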
zip_ref.extractall(extract_dir) 38 | 39 | # 收集所有解压的文件 40 | extracted_files = [] 41 | import os 42 | # 使用os.walk代替rglob来避免阻塞调用 43 | for root, dirs, files in os.walk(extract_dir): 44 | for file in files: 45 | file_path = os.path.join(root, file) 46 | extracted_files.append(file_path) 47 | return extract_dir, extracted_files 48 | 49 | extract_dir, extracted_files = await asyncio.to_thread(_extract_zip) 50 | 51 | return { 52 | "extraction_path": str(extract_dir), 53 | "files": extracted_files, 54 | "success": True 55 | } 56 | 57 | except Exception as e: 58 | print(f"解压失败: {e}") 59 | return { 60 | "extraction_path": None, 61 | "files": [], 62 | "success": False, 63 | "error": str(e) 64 | } 65 | 66 | async def validate_folder_structure(extraction_path: str) -> Dict[str, Any]: 67 | """ 68 | 验证17个标准文件夹结构 69 | 70 | 支持文件夹在根目录或下一层子目录中 71 | 72 | Args: 73 | extraction_path: 解压后的根目录路径 74 | 75 | Returns: 76 | 验证结果字典 77 | """ 78 | # 17个标准文件夹名称 79 | standard_folders = [ 80 | "1.教育经历", 81 | "2.工作经历", 82 | "3.继续教育(培训情况)", 83 | "4.学术技术兼职情况", 84 | "5.获奖情况", 85 | "6.获得荣誉称号情况", 86 | "7.主持参与科研项目(基金)情况", 87 | "8.主持参与工程技术项目情况", 88 | "9.论文", 89 | "10.著(译)作(教材)", 90 | "11.专利(著作权)情况", 91 | "12.主持参与指定标准情况", 92 | "13.成果被批示、采纳、运用和推广情况", 93 | "14.资质证书", 94 | "15.奖惩情况", 95 | "16.考核情况", 96 | "17.申报材料附件信息" 97 | ] 98 | 99 | extraction_dir = Path(extraction_path) 100 | 101 | # 递归查找17个标准文件夹(在根目录或下一层子目录中) 102 | async def find_folders_recursively(search_dir: Path, max_depth: int = 2) -> Dict[str, str]: 103 | """ 104 | 递归查找标准文件夹 105 | 106 | Args: 107 | search_dir: 搜索目录 108 | max_depth: 最大搜索深度(1=仅根目录,2=根目录+一层子目录) 109 | 110 | Returns: 111 | 找到的文件夹映射 {文件夹名: 路径} 112 | """ 113 | found_folders = {} 114 | 115 | async def _search_directory(current_dir: Path, current_depth: int): 116 | if current_depth > max_depth: 117 | return 118 | 119 | try: 120 | # 使用异步方式读取目录 - 使用os.scandir避免阻塞 121 | import asyncio 122 | import os 123 | 124 | try: 125 | # 使用asyncio.to_thread包装os.scandir调用 126 | def _list_directory(): 127 | return list(os.scandir(current_dir)) 128 | 129 | entries = await asyncio.to_thread(_list_directory) 130 | 131 | for entry in entries: 132 | # 检查是否是目录 133 | def _check_is_dir(): 134 | return entry.is_dir() 135 | 136 | if await asyncio.to_thread(_check_is_dir): 137 | folder_name = entry.name 138 | # 检查是否是标准文件夹 139 | if folder_name in standard_folders and folder_name not in found_folders: 140 | found_folders[folder_name] = str(entry.path) 141 | print(f"📁 找到标准文件夹: {folder_name} -> {entry.path}") 142 | 143 | # 如果还没达到最大深度,继续递归搜索 144 | if current_depth < max_depth: 145 | from pathlib import Path 146 | await _search_directory(Path(entry.path), current_depth + 1) 147 | except OSError as e: 148 | print(f"⚠️ 无法扫描目录 {current_dir}: {e}") 149 | except PermissionError: 150 | print(f"⚠️ 无法访问目录: {current_dir}") 151 | 152 | await _search_directory(search_dir, 1) 153 | return found_folders 154 | 155 | print(f"🔍 开始递归查找17个标准文件夹(最大深度2层)...") 156 | found_folder_paths = await find_folders_recursively(extraction_dir, max_depth=2) 157 | 158 | # 构建文件夹信息 159 | folders_found = [] 160 | missing_folders = [] 161 | 162 | for standard_folder in standard_folders: 163 | if standard_folder in found_folder_paths: 164 | folders_found.append({ 165 | "name": standard_folder, 166 | "path": found_folder_paths[standard_folder], 167 | "exists": True 168 | }) 169 | else: 170 | missing_folders.append(standard_folder) 171 | 172 | # 获取所有实际存在的文件夹(用于检查额外文件夹) 173 | import asyncio 174 | import os 175 | all_actual_folders = [] 176 | 177 | async def collect_folders(): 178 | # 
使用os.walk代替rglob来避免阻塞调用 179 | def _walk_dirs(): 180 | folders = [] 181 | for root, dirs, files in os.walk(extraction_dir): 182 | for dir_name in dirs: 183 | folders.append(dir_name) 184 | return folders 185 | 186 | folder_names = await asyncio.to_thread(_walk_dirs) 187 | all_actual_folders.extend(folder_names) 188 | 189 | await collect_folders() 190 | 191 | # 检查额外的文件夹 192 | extra_folders = [] 193 | for actual_folder in set(all_actual_folders): 194 | if actual_folder not in standard_folders: 195 | extra_folders.append(actual_folder) 196 | 197 | # 判断是否合规 198 | is_valid = len(missing_folders) == 0 199 | 200 | print(f"📊 文件夹验证结果: 找到 {len(folders_found)}/{len(standard_folders)} 个标准文件夹") 201 | if missing_folders: 202 | print(f"⚠️ 缺失的文件夹: {missing_folders}") 203 | 204 | return { 205 | "is_valid": is_valid, 206 | "folders_found": folders_found, 207 | "missing_folders": missing_folders, 208 | "extra_folders": extra_folders, 209 | "total_standard_folders": len(standard_folders), 210 | "found_count": len(folders_found) 211 | } 212 | 213 | 214 | def analyze_markdown_structure(md_content: str) -> Dict[str, Any]: 215 | """ 216 | 分析Markdown文件结构 217 | 218 | Args: 219 | md_content: Markdown内容 220 | 221 | Returns: 222 | 结构分析结果 223 | """ 224 | import datetime 225 | 226 | try: 227 | # 基本统计信息 228 | lines = md_content.split('\n') 229 | 230 | # 提取标题 231 | headers = [] 232 | for line in lines: 233 | if line.strip().startswith('#'): 234 | level = len(line) - len(line.lstrip('#')) 235 | title = line.strip('#').strip() 236 | headers.append({ 237 | "level": level, 238 | "title": title 239 | }) 240 | 241 | # 提取列表项 242 | list_items = [] 243 | for line in lines: 244 | stripped = line.strip() 245 | if stripped.startswith('-') or stripped.startswith('*') or re.match(r'^\d+\.', stripped): 246 | list_items.append(stripped) 247 | 248 | return { 249 | "total_lines": len(lines), 250 | "total_chars": len(md_content), 251 | "headers": headers, 252 | "list_items": list_items, 253 | "has_content": len(md_content.strip()) > 0, 254 | "extraction_timestamp": datetime.datetime.now().isoformat() 255 | } 256 | 257 | except Exception as e: 258 | return { 259 | "total_lines": 0, 260 | "total_chars": 0, 261 | "headers": [], 262 | "list_items": [], 263 | "has_content": False, 264 | "error": str(e) 265 | } 266 | 267 | 268 | async def extract_markdown_content(md_file_path: str) -> Dict[str, Any]: 269 | """ 270 | 提取Markdown文件内容 271 | 272 | Args: 273 | md_file_path: Markdown文件路径 274 | 275 | Returns: 276 | 提取结果 277 | """ 278 | try: 279 | import asyncio 280 | # 使用异步方式读取文件 281 | def _read_file(): 282 | with open(md_file_path, 'r', encoding='utf-8') as f: 283 | return f.read() 284 | 285 | content = await asyncio.to_thread(_read_file) 286 | 287 | structure = analyze_markdown_structure(content) 288 | 289 | return { 290 | "file_path": md_file_path, 291 | "content": content, 292 | "structure": structure, 293 | "success": True 294 | } 295 | 296 | except Exception as e: 297 | return { 298 | "file_path": md_file_path, 299 | "content": "", 300 | "structure": {}, 301 | "success": False, 302 | "error": str(e) 303 | } -------------------------------------------------------------------------------- /src/nodes/core_info_extraction.py: -------------------------------------------------------------------------------- 1 | """ 2 | 核心信息提取节点 3 | 4 | 从1-17项材料中分别提取各自的核心信息: 5 | - 每项材料提取相应的关键字段 6 | - 输出17个字段的结构化信息 7 | - 支持AI增强的信息提取 8 | """ 9 | 10 | from typing import Dict, Any, Optional 11 | from src.graph.state import AuditState 12 | from src.tools.ai_utils import 
extract_core_information_with_ai, extract_category_core_info_with_ai 13 | 14 | 15 | def core_info_extraction_node(state: AuditState) -> Dict[str, Any]: 16 | """ 17 | 完全无缓存的核心信息提取节点 - 每次都处理全新数据 18 | 19 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 20 | """ 21 | try: 22 | print(f"🎯 开始无缓存核心信息提取...") 23 | 24 | # 直接获取当前状态的数据 - 不使用任何缓存 25 | api_extraction_results = state.get("api_extraction_results", {}) 26 | extracted_content = state.get("extracted_content", {}) 27 | 28 | print(f"🔍 当前状态数据:") 29 | print(f" API提取结果: {len(api_extraction_results)} 项") 30 | print(f" 备用提取内容: {len(extracted_content)} 项") 31 | 32 | # 确定使用哪个数据源 - 直接判断,不做缓存检查 33 | if api_extraction_results: 34 | data_source = api_extraction_results 35 | print(f"✅ 使用API提取结果: {len(api_extraction_results)} 项") 36 | elif extracted_content: 37 | data_source = extracted_content 38 | print(f"⚠️ 使用备用提取内容: {len(extracted_content)} 项") 39 | else: 40 | print("⚠️ 没有找到提取的内容,跳过核心信息提取") 41 | return { 42 | "core_info": _create_empty_core_info_structure(), 43 | "current_step": "core_info_extraction_skipped", 44 | "processing_logs": ["未找到有效数据,跳过核心信息提取"] 45 | } 46 | 47 | # 直接创建17项核心信息结构 - 不使用缓存 48 | core_info_structure = _create_empty_core_info_structure() 49 | 50 | # 1-17项材料分类映射 51 | material_categories = { 52 | "1.教育经历": "education", 53 | "教育经历": "education", 54 | "2.工作经历": "work_experience", 55 | "工作经历": "work_experience", 56 | "3.继续教育(培训情况)": "continuing_education", 57 | "继续教育": "continuing_education", 58 | "培训情况": "continuing_education", 59 | "4.学术技术兼职情况": "academic_positions", 60 | "学术技术兼职情况": "academic_positions", 61 | "5.获奖情况": "awards", 62 | "获奖情况": "awards", 63 | "6.获得荣誉称号情况": "honors", 64 | "荣誉称号": "honors", 65 | "7.主持参与科研项目(基金)情况": "research_projects", 66 | "科研项目": "research_projects", 67 | "8.主持参与工程技术项目情况": "engineering_projects", 68 | "工程项目": "engineering_projects", 69 | "9.论文": "papers", 70 | "论文": "papers", 71 | "10.著(译)作(教材)": "publications", 72 | "著作": "publications", 73 | "教材": "publications", 74 | "11.专利(著作权)情况": "patents", 75 | "专利": "patents", 76 | "12.主持参与指定标准情况": "standards", 77 | "标准制定": "standards", 78 | "13.成果被批示、采纳、运用和推广情况": "achievements", 79 | "成果应用": "achievements", 80 | "14.资质证书": "certificates", 81 | "资质证书": "certificates", 82 | "15.奖惩情况": "rewards_punishments", 83 | "奖惩情况": "rewards_punishments", 84 | "16.考核情况": "evaluations", 85 | "考核情况": "evaluations", 86 | "17.申报材料附件信息": "attachments", 87 | "附件信息": "attachments" 88 | } 89 | 90 | print(f"📁 发现 {len(data_source)} 个材料类型需要提取核心信息") 91 | 92 | # 处理每个材料类型 93 | for folder_name, folder_data in data_source.items(): 94 | print(f"🔍 正在处理: {folder_name}") 95 | 96 | # 确定材料类别 97 | category_key = None 98 | for key, category in material_categories.items(): 99 | if key in folder_name or folder_name in key: 100 | category_key = category 101 | break 102 | 103 | if not category_key: 104 | print(f"⚠️ 未识别的材料类型: {folder_name},归类为附件信息") 105 | category_key = "attachments" 106 | 107 | # 提取材料内容 108 | material_content = _extract_material_content_from_folder(folder_data) 109 | 110 | if not material_content.strip(): 111 | print(f"⚠️ {folder_name} 没有有效内容") 112 | continue 113 | 114 | # 使用AI提取该材料类型的核心信息 115 | try: 116 | extracted_info = extract_category_core_info_with_ai( 117 | category_key, folder_name, material_content 118 | ) 119 | 120 | if extracted_info: 121 | core_info_structure[category_key] = extracted_info 122 | print(f"✅ {folder_name} 核心信息提取成功") 123 | else: 124 | print(f"⚠️ {folder_name} 核心信息提取失败") 125 | 126 | except Exception as e: 127 | print(f"⚠️ {folder_name} 信息提取异常: {e}") 128 | # 创建默认结构,保持数据一致性 129 | 
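                # Note (added): per-category failures are recorded below instead
                # of re-raised, so one bad folder cannot abort the whole extraction.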
core_info_structure[category_key] = {
130 |                     "name": None,
131 |                     "id_number": None,
132 |                     "extracted_from": [folder_name],
133 |                     "content_summary": None,
134 |                     "key_info": {
135 |                         "category": category_key,
136 |                         "folder_name": folder_name,
137 |                         "error": str(e),
138 |                         "extracted_at": _get_current_timestamp()
139 |                     }
140 |                 }
141 |                 continue
142 | 
143 |         # 统计提取结果
144 |         extracted_categories = []
145 |         name_count = 0
146 |         id_count = 0
147 | 
148 |         for category, info in core_info_structure.items():
149 |             if info and info.get('name'):
150 |                 name_count += 1
151 |             if info and info.get('id_number'):
152 |                 id_count += 1
153 |             if info and (info.get('name') or info.get('id_number') or info.get('content_summary')):
154 |                 extracted_categories.append(category)
155 | 
156 |         print(f"✅ 核心信息提取完成:")
157 |         print(f"   成功处理 {len(extracted_categories)} 项材料")
158 |         print(f"   提取到姓名的材料: {name_count} 项")
159 |         print(f"   提取到身份证号的材料: {id_count} 项")
160 | 
161 |         # 🚨 确保数据结构符合交叉校验节点的期望
162 |         return {
163 |             "core_info": core_info_structure,
164 |             "current_step": "core_info_extraction_completed",
165 |             "processing_logs": [
166 |                 f"核心信息提取完成: 成功处理{len(extracted_categories)}项材料",
167 |                 f"提取到姓名的材料: {name_count}项",
168 |                 f"提取到身份证号的材料: {id_count}项"
169 |             ]
170 |         }
171 | 
172 |     except Exception as e:
173 |         print(f"❌ 核心信息提取失败: {str(e)}")
174 |         # 🚨 即使失败也要返回有效的空结构,确保后续节点能正常处理
175 |         return {
176 |             "core_info": _create_empty_core_info_structure(),
177 |             "current_step": "core_info_extraction_failed",
178 |             "error_message": f"核心信息提取失败: {str(e)}",
179 |             "processing_logs": [f"核心信息提取失败: {str(e)}"]
180 |         }
181 | 
182 | 
183 | def _create_empty_core_info_structure() -> Dict[str, Any]:
184 |     """创建空的1-17项核心信息结构,每项都包含姓名和身份证号用于交叉校验"""
185 |     import copy  # 修复:必须深拷贝,否则 .copy() 浅拷贝会让17项材料共享同一个 extracted_from 列表和 key_info 字典
186 |     base_structure = {
187 |         "name": None,             # 姓名(从该项材料中提取)
188 |         "id_number": None,        # 身份证号(从该项材料中提取)
189 |         "extracted_from": [],     # 信息来源文件
190 |         "content_summary": None,  # 内容摘要
191 |         "key_info": {}            # 该项材料的关键信息
192 |     }
193 | 
194 |     return {
195 |         # 1-17项材料,每项都包含姓名和身份证号用于交叉校验
196 |         "education": copy.deepcopy(base_structure),               # 1.教育经历
197 |         "work_experience": copy.deepcopy(base_structure),         # 2.工作经历
198 |         "continuing_education": copy.deepcopy(base_structure),    # 3.继续教育(培训情况)
199 |         "academic_positions": copy.deepcopy(base_structure),      # 4.学术技术兼职情况
200 |         "awards": copy.deepcopy(base_structure),                  # 5.获奖情况
201 |         "honors": copy.deepcopy(base_structure),                  # 6.获得荣誉称号情况
202 |         "research_projects": copy.deepcopy(base_structure),       # 7.主持参与科研项目(基金)情况
203 |         "engineering_projects": copy.deepcopy(base_structure),    # 8.主持参与工程技术项目情况
204 |         "papers": copy.deepcopy(base_structure),                  # 9.论文
205 |         "publications": copy.deepcopy(base_structure),            # 10.著(译)作(教材)
206 |         "patents": copy.deepcopy(base_structure),                 # 11.专利(著作权)情况
207 |         "standards": copy.deepcopy(base_structure),               # 12.主持参与指定标准情况
208 |         "achievements": copy.deepcopy(base_structure),            # 13.成果被批示、采纳、运用和推广情况
209 |         "certificates": copy.deepcopy(base_structure),            # 14.资质证书
210 |         "rewards_punishments": copy.deepcopy(base_structure),     # 15.奖惩情况
211 |         "evaluations": copy.deepcopy(base_structure),             # 16.考核情况
212 |         "attachments": copy.deepcopy(base_structure)              # 17.申报材料附件信息
213 |     }
214 | 
215 | 
216 | def _extract_material_content_from_folder(folder_data: Any) -> str:
217 |     """从文件夹数据中提取材料内容"""
218 |     material_content = ""
219 | 
220 |     if isinstance(folder_data, list):
221 |         # 处理api_extraction_results格式
222 |         for json_item in folder_data:
223 |             if isinstance(json_item, dict):
224 |                 content = json_item.get("content", {})
225 |                 if isinstance(content, dict):
226 |                     # 尝试多种可能的内容字段
227 |                     for key in ["md_content", "raw_markdown", "text", "content"]:
228 |                         if key in content:
229 |                             text_content = str(content[key])
230 |                             if text_content.strip():
231 | material_content += text_content + "\n\n" 232 | break 233 | if not material_content: 234 | material_content += str(content) + "\n\n" 235 | else: 236 | material_content += str(content) + "\n\n" 237 | 238 | elif isinstance(folder_data, dict): 239 | # 处理extracted_content格式 240 | content_list = folder_data.get("content", []) 241 | if isinstance(content_list, list): 242 | for item in content_list: 243 | if isinstance(item, dict): 244 | if "json_data" in item: 245 | json_data = item["json_data"] 246 | content = json_data.get("content", {}) 247 | if isinstance(content, dict): 248 | for key in ["md_content", "raw_markdown", "text", "content"]: 249 | if key in content: 250 | text_content = str(content[key]) 251 | if text_content.strip(): 252 | material_content += text_content + "\n\n" 253 | break 254 | else: 255 | material_content += str(content) + "\n\n" 256 | elif "content" in item: 257 | material_content += str(item["content"]) + "\n\n" 258 | else: 259 | material_content += str(item) + "\n\n" 260 | 261 | return material_content.strip() 262 | 263 | 264 | def _get_current_timestamp() -> str: 265 | """获取当前时间戳""" 266 | from datetime import datetime 267 | return datetime.now().isoformat() -------------------------------------------------------------------------------- /src/graph/edges.py: -------------------------------------------------------------------------------- 1 | """ 2 | LangGraph边和路由逻辑定义 3 | 4 | 包含工作流中的条件边和路由函数: 5 | - 根据PDF页数决定处理策略的路由 6 | - 根据材料类型决定校验规则的路由 7 | - 根据校验结果决定后续流程的路由 8 | - 支持Send API实现的并行分支 9 | """ 10 | 11 | from typing import Dict, Any, List, Union 12 | from .state import AuditState 13 | 14 | # 导入Send API用于并行处理 15 | try: 16 | from langgraph.types import Send 17 | SEND_AVAILABLE = True 18 | except ImportError: 19 | Send = None 20 | SEND_AVAILABLE = False 21 | 22 | 23 | def should_continue_processing(state: AuditState) -> str: 24 | """ 25 | 判断是否继续处理流程 26 | 27 | Returns: 28 | "continue": 继续处理 29 | "error": 发生错误,终止流程 30 | """ 31 | if state.get("error_message"): 32 | return "error" 33 | 34 | if not state.get("uploaded_file"): 35 | return "error" 36 | 37 | return "continue" 38 | 39 | 40 | def route_folder_validation(state: AuditState) -> str: 41 | """ 42 | 根据文件夹结构验证结果决定处理策略 43 | 44 | Returns: 45 | "process_folders": 文件夹结构正确,继续处理 46 | "error": 文件夹结构错误,终止流程 47 | """ 48 | folder_validation = state.get("folder_validation", {}) 49 | 50 | # 检查是否有17个标准文件夹 51 | if not folder_validation: 52 | return "error" 53 | 54 | folders_found = folder_validation.get("folders_found", []) 55 | if len(folders_found) < 17: 56 | return "error" 57 | 58 | return "process_folders" 59 | 60 | 61 | def should_continue_content_analysis(state: AuditState) -> str: 62 | """ 63 | 判断是否继续内容分析 64 | 65 | Returns: 66 | "analyze": 继续分析 67 | "skip_analysis": 跳过分析 68 | "error": 发生错误 69 | """ 70 | if state.get("error_message"): 71 | return "error" 72 | 73 | extracted_content = state.get("extracted_content", {}) 74 | if not extracted_content: 75 | return "skip_analysis" 76 | 77 | return "analyze" 78 | 79 | 80 | def route_to_cross_validation(state: AuditState) -> str: 81 | """ 82 | 决定是否进行交叉校验 83 | 84 | Returns: 85 | "cross_validate": 进行交叉校验 86 | "skip_cross_validation": 跳过交叉校验 87 | "error": 发生错误 88 | """ 89 | if state.get("error_message"): 90 | return "error" 91 | 92 | # 检查是否有材料校验结果 93 | material_validation = state.get("material_validation", {}) 94 | if not material_validation: 95 | return "skip_cross_validation" 96 | 97 | # 检查是否有核心信息 98 | core_info = state.get("core_info") 99 | extracted_content = 
state.get("extracted_content", {}) 100 | 101 | if not core_info and not extracted_content: 102 | return "skip_cross_validation" 103 | 104 | return "cross_validate" 105 | 106 | 107 | def should_generate_report(state: AuditState) -> str: 108 | """ 109 | 判断是否应该生成报告 110 | 111 | Returns: 112 | "generate_report": 生成报告 113 | "error": 发生错误,终止流程 114 | """ 115 | if state.get("error_message"): 116 | return "error" 117 | 118 | # 只要有任何处理结果就生成报告 119 | has_content = any([ 120 | state.get("extracted_content"), 121 | state.get("material_validation"), 122 | state.get("cross_validation"), 123 | state.get("folder_classification") 124 | ]) 125 | 126 | if has_content: 127 | return "generate_report" 128 | else: 129 | return "error" 130 | def check_pdf_extraction_for_parallel_processing(state: AuditState) -> Union[List, str]: 131 | """ 132 | PDF提取完成后,并行分发到core_info_extraction和validation节点 133 | 134 | 确保PDF提取的数据能同时进入核心信息提取和材料校验 135 | 136 | Returns: 137 | Send对象列表,发送到core_info_extraction和validation 138 | 或者在失败时返回END 139 | """ 140 | if not SEND_AVAILABLE or Send is None: 141 | print("⚠️ Send API不可用,使用传统路由") 142 | # 检查PDF提取状态 143 | status = check_pdf_extraction_status(state) 144 | if status == "pdf_extraction_success": 145 | return "core_info_extraction" # 退化到传统路由 146 | else: 147 | return "END" 148 | 149 | # 检查PDF提取状态 150 | status = check_pdf_extraction_status(state) 151 | 152 | if status == "pdf_extraction_success": 153 | print(f"🚀 PDF提取成功,并行分发到核心信息提取和校验节点") 154 | 155 | # 并行发送到两个处理节点 156 | return [ 157 | Send("core_info_extraction", state), # 核心信息提取 158 | Send("validation", state) # 直接进入校验 159 | ] 160 | else: 161 | print("❌ PDF提取失败,终止流程") 162 | return "END" 163 | 164 | 165 | def check_core_info_for_cross_validation(state: AuditState) -> str: 166 | """ 167 | 检查核心信息是否完成,决定是否进行交叉验证 168 | 169 | 注意:LangGraph不支持真正的"等待两个节点都完成"逻辑 170 | 这里简化为:只要有核心信息就进行交叉验证 171 | 172 | Returns: 173 | "proceed_cross_validation": 进行交叉验证 174 | "skip_cross_validation": 跳过交叉验证 175 | """ 176 | core_info = state.get("core_info") 177 | extracted_content = state.get("extracted_content", {}) 178 | 179 | # 只要有核心信息和提取内容就进行交叉验证 180 | if core_info is not None and extracted_content: 181 | return "proceed_cross_validation" 182 | else: 183 | return "skip_cross_validation" 184 | 185 | 186 | def check_pdf_extraction_status(state: AuditState) -> str: 187 | """ 188 | 检查PDF提取状态,确保PDF内容提取完成后才进行下一步 189 | 190 | 这是关键的状态判断函数,遵循LangGraph条件边的最佳实践 191 | 192 | Returns: 193 | "pdf_extraction_success": PDF提取成功,继续后续流程 194 | "pdf_extraction_failed": PDF提取失败,跳转到错误处理 195 | "pdf_extraction_pending": PDF提取正在进行中(理论上不应该出现) 196 | """ 197 | print("🔍 检查PDF提取状态...") 198 | 199 | # 检查当前步骤状态 200 | current_step = state.get("current_step", "") 201 | print(f"📋 当前步骤: {current_step}") 202 | 203 | # 修复被连接的状态字符串问题 204 | if "pdf_extraction_failed" in current_step: 205 | print("❌ PDF提取已标记为失败") 206 | return "pdf_extraction_failed" 207 | 208 | if "pdf_extraction_completed" in current_step: 209 | print("✅ PDF提取已标记为完成") 210 | # 检查是否有实际的提取结果 211 | pdf_extraction_results = state.get("pdf_extraction_results", {}) 212 | api_extraction_results = state.get("api_extraction_results", {}) 213 | if pdf_extraction_results or api_extraction_results: 214 | print(f"📊 找到PDF提取结果: {len(pdf_extraction_results)} 个文件夹") 215 | return "pdf_extraction_success" 216 | else: 217 | print("⚠️ PDF提取完成但没有结果数据") 218 | return "pdf_extraction_failed" 219 | 220 | # 检查是否有实际的提取结果或空文件夹结构 221 | pdf_extraction_results = state.get("pdf_extraction_results", {}) 222 | api_extraction_results = state.get("api_extraction_results", {}) 223 | 224 
| # 只要有文件夹结构就认为成功,不一定要有PDF文件 225 | if pdf_extraction_results: 226 | total_files = 0 227 | successful_files = 0 228 | 229 | for folder_name, folder_data in pdf_extraction_results.items(): 230 | files = folder_data.get("files", []) 231 | total_files += len(files) 232 | successful_files += len([f for f in files if f.get("success")]) 233 | 234 | print(f"📊 PDF提取统计: {successful_files}/{total_files} 文件成功,{len(pdf_extraction_results)}个文件夹") 235 | 236 | # 即使没有PDF文件,只要有文件夹结构就认为成功 237 | print("✅ 检测到PDF提取结果或文件夹结构") 238 | return "pdf_extraction_success" 239 | else: 240 | print("❌ 没有PDF提取结果") 241 | return "pdf_extraction_failed" 242 | 243 | # 检查错误消息 244 | error_message = state.get("error_message", "") 245 | if error_message and "pdf" in error_message.lower() and "failed" in error_message.lower(): 246 | print(f"❌ 发现PDF相关错误: {error_message}") 247 | return "pdf_extraction_failed" 248 | 249 | # 默认情况:如果状态不明确,认为是失败 250 | print("⚠️ PDF提取状态不明确,默认为失败") 251 | return "pdf_extraction_failed" 252 | 253 | 254 | def create_parallel_branches(state: AuditState) -> Union[List, str]: 255 | """ 256 | 创建并行分支:从文件处理后分发到多个并行路径 257 | 258 | 使用LangGraph的Send API实现真正的并行处理: 259 | 1. PDF提取路径 260 | 2. 规则处理路径 261 | 262 | Returns: 263 | Send对象列表,每个对象代表一个并行分支 264 | """ 265 | if not SEND_AVAILABLE or Send is None: 266 | print("⚠️ Send API不可用,使用传统路由") 267 | return "pdf_extraction" # 退化到传统路由 268 | 269 | print("🚀 创建并行分支: PDF提取 + 规则处理") 270 | 271 | # 返回多个Send对象,实现并行处理 272 | return [ 273 | Send("pdf_extraction", state), # PDF提取路径 274 | Send("load_rules", state) # 规则加载路径 275 | ] 276 | 277 | 278 | def after_rules_loaded(state: AuditState) -> str: 279 | """ 280 | 规则加载完成后的路由 281 | 282 | Returns: 283 | "extract_rules": 继续提取规则 284 | "rules_load_failed": 规则加载失败 285 | """ 286 | current_step = state.get("current_step", "") 287 | 288 | if "rules_load_failed" in current_step: 289 | print("❌ 规则加载失败") 290 | return "rules_load_failed" 291 | 292 | if "rules_loaded" in current_step: 293 | print("✅ 规则加载成功,继续提取") 294 | return "extract_rules" 295 | 296 | # 检查是否有规则数据 297 | rules_data = state.get("rules_data", []) 298 | if rules_data: 299 | print(f"✅ 发现 {len(rules_data)} 个规则数据,继续提取") 300 | return "extract_rules" 301 | 302 | print("❌ 未找到规则数据") 303 | return "rules_load_failed" 304 | 305 | 306 | def check_core_info_for_parallel_validation(state: AuditState) -> Union[List, str]: 307 | """ 308 | 核心信息提取完成后,并行分发到validation和cross_validation节点 309 | 310 | 确保PDF提取路径的数据也能进入validation节点 311 | 312 | Returns: 313 | Send对象列表,发送到validation和cross_validation 314 | """ 315 | if not SEND_AVAILABLE or Send is None: 316 | print("⚠️ Send API不可用,使用传统路由") 317 | return "validation" # 退化到传统路由 318 | 319 | core_info = state.get("core_info") 320 | extracted_content = state.get("extracted_content", {}) 321 | 322 | print(f"🚀 核心信息提取完成,分发到验证节点") 323 | print(f"📊 核心信息状态: {core_info is not None}") 324 | print(f"📊 提取内容状态: {len(extracted_content) if extracted_content else 0} 项") 325 | 326 | # 并行发送到两个验证节点 327 | return [ 328 | Send("validation", state), 329 | Send("cross_validation", state) 330 | ] 331 | 332 | 333 | def check_rules_for_validation(state: AuditState) -> Union[List, str]: 334 | """ 335 | 检查规则提取结果,决定是否可以进入验证阶段 336 | 337 | 使用Send API将规则数据发送到validation和cross_validation节点 338 | 339 | Returns: 340 | Send对象列表,发送到validation和cross_validation 341 | """ 342 | if not SEND_AVAILABLE or Send is None: 343 | print("⚠️ Send API不可用,使用传统路由") 344 | return "validation" # 退化到传统路由 345 | 346 | parsed_rules = state.get("parsed_rules", []) 347 | current_step = state.get("current_step", "") 348 | 349 | # 添加详细调试信息 
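# [Editor's note] Two remarks on the routers in this module:
# 1) In check_pdf_extraction_status above, the error_message check and the final
#    "status unclear" fallback are dead code: the preceding
#    if pdf_extraction_results: ... else: ... block returns on both branches, so
#    execution never reaches them. If the error-message check is meant to run, it
#    would need to move above that block; api_extraction_results is likewise read
#    there but never used in the decision.
# 2) Routers that return Send objects only take effect when registered as
#    conditional edges. A minimal wiring sketch, assuming the node names used in
#    this file (workflow.py remains the authoritative wiring):
#
#     from langgraph.graph import StateGraph
#     builder = StateGraph(AuditState)
#     builder.add_conditional_edges(
#         "extract_rules",                      # source node
#         check_rules_for_validation,           # router; may return [Send(...), ...]
#         ["validation", "cross_validation"],   # possible Send targets
#     )
#
#    When the router returns a list of Send objects, LangGraph executes each target
#    with the supplied state in parallel and merges their partial state updates.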
350 | print(f"🔍 check_rules_for_validation 调试信息:") 351 | print(f" current_step: {current_step}") 352 | print(f" parsed_rules 数量: {len(parsed_rules)}") 353 | print(f" parsed_rules 内容: {parsed_rules[:2] if parsed_rules else '空'}") 354 | 355 | # 修复条件判断:只要有规则就传递给validation 356 | if parsed_rules and len(parsed_rules) > 0: 357 | print(f"🚀 规则提取成功,分发到验证节点: {len(parsed_rules)} 条规则") 358 | 359 | # 将规则数据发送到两个验证节点 360 | return [ 361 | Send("validation", state), 362 | Send("cross_validation", state) 363 | ] 364 | elif "rules_extract_skipped" in current_step: 365 | print("🚨 规则提取已跳过,直接进行基础验证") 366 | return [Send("validation", state)] 367 | else: 368 | print("⚠️ 规则提取未完成或无规则数据,只进行基础验证") 369 | return [Send("validation", state)] -------------------------------------------------------------------------------- /src/agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | 主要的职称评审材料审核代理 3 | 4 | 基于LangGraph框架的完整审核系统入口 5 | 集成LangSmith调试和监控功能 6 | """ 7 | 8 | import os 9 | from typing import Dict, Any, Optional 10 | 11 | # 定义RunnableConfig为类型别名,避免对langchain_core的依赖 12 | RunnableConfig = Dict[str, Any] 13 | 14 | # 导入工作流模块 15 | try: 16 | # 优先使用绝对导入 17 | from src.graph.workflow import create_audit_workflow 18 | except ImportError: 19 | try: 20 | # 如果绝对导入失败,尝试相对导入 21 | import sys 22 | import os 23 | # 添加项目根目录到Python路径 24 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 25 | if project_root not in sys.path: 26 | sys.path.insert(0, project_root) 27 | from src.graph.workflow import create_audit_workflow 28 | except ImportError: 29 | try: 30 | # 最后尝试从当前目录导入 31 | from graph.workflow import create_audit_workflow 32 | except ImportError: 33 | raise ImportError("无法导入工作流模块,请检查项目结构") 34 | 35 | # 导入状态模块 36 | try: 37 | from src.graph.state import ( 38 | AuditState, 39 | create_initial_state 40 | ) 41 | except ImportError: 42 | try: 43 | from graph.state import ( 44 | AuditState, 45 | create_initial_state 46 | ) 47 | except ImportError: 48 | # 如果都失败,尝试使用系统路径 49 | import sys 50 | import os 51 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 52 | if project_root not in sys.path: 53 | sys.path.insert(0, project_root) 54 | try: 55 | from src.graph.state import ( 56 | AuditState, 57 | create_initial_state 58 | ) 59 | except ImportError: 60 | raise ImportError("无法导入状态模块,请检查项目结构") 61 | 62 | try: 63 | from src.config.api_config import configure_pdf_api 64 | except ImportError: 65 | try: 66 | from config.api_config import configure_pdf_api 67 | except ImportError: 68 | def configure_pdf_api(*args, **kwargs): 69 | print("⚠️ API配置模块未加载,使用默认配置") 70 | 71 | # 删除未使用的 configure_pdf_api_endpoint 导入 72 | 73 | try: 74 | from src.tools.langsmith_utils import setup_langsmith_environment 75 | except ImportError: 76 | try: 77 | from tools.langsmith_utils import setup_langsmith_environment 78 | except ImportError: 79 | def setup_langsmith_environment(): 80 | print("⚠️ LangSmith工具未加载") 81 | 82 | # 初始化主工作流 83 | if os.getenv("LANGSMITH_API_KEY"): 84 | setup_langsmith_environment() 85 | print("✅ LangSmith追踪已启用") 86 | 87 | # 使用统一的主工作流 88 | graph = create_audit_workflow() 89 | print("✅ 主审核工作流已就绪") 90 | 91 | # 导出主要接口 92 | __all__ = [ 93 | "graph", 94 | "run_audit", 95 | "run_audit_with_tracing", 96 | "debug_audit", 97 | "configure_pdf_api", 98 | "run_pdf_audit", 99 | "AuditState", 100 | "create_initial_state" 101 | ] 102 | 103 | 104 | async def run_audit(uploaded_file: str, session_id: Optional[str] = None) -> dict: 105 | """ 106 | 运行审核工作流的便捷函数(异步版本) 107 | 108 | 
Args: 109 | uploaded_file: 上传的文件路径 110 | session_id: 会话ID(可选) 111 | 112 | Returns: 113 | 审核结果 114 | """ 115 | # 创建初始状态 116 | initial_state = create_initial_state(uploaded_file, session_id) 117 | 118 | # 确保PDF API端点配置(修复:在ZIP文件审核中也设置) 119 | if not initial_state.get("pdf_api_endpoint"): 120 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 121 | initial_state["pdf_api_endpoint"] = api_endpoint 122 | print(f"🔧 为ZIP文件审核设置PDF API端点: {api_endpoint}") 123 | 124 | # 为基础审核模式创建配置 125 | config = None 126 | if session_id: 127 | config = {"configurable": {"thread_id": session_id}} 128 | 129 | try: 130 | # 执行工作流(使用异步API) 131 | print(f"🚀 开始审核流程: {uploaded_file}") 132 | if config: 133 | result = await graph.ainvoke(initial_state, config) # type: ignore 134 | else: 135 | result = await graph.ainvoke(initial_state) 136 | 137 | print(f"✅ 审核完成! 最终状态: {result.get('current_step', '未知')}") 138 | return result 139 | 140 | except Exception as e: 141 | print(f"❌ 审核失败: {str(e)}") 142 | return { 143 | "error": str(e), 144 | "current_step": "failed", 145 | "error_message": str(e) 146 | } 147 | 148 | 149 | async def run_audit_with_tracing( 150 | uploaded_file: str, 151 | session_id: Optional[str] = None, 152 | run_name: Optional[str] = None, 153 | tags: Optional[list] = None 154 | ) -> dict: 155 | """ 156 | 运行带LangSmith追踪的审核工作流(异步版本) 157 | 158 | Args: 159 | uploaded_file: 上传的文件路径 160 | session_id: 会话ID(可选) 161 | run_name: 运行名称 162 | tags: 标签列表 163 | 164 | Returns: 165 | 审核结果 166 | """ 167 | try: 168 | from src.tools.langsmith_utils import create_run_config, with_langsmith_tracing 169 | 170 | # 创建初始状态 171 | initial_state = create_initial_state(uploaded_file, session_id) 172 | 173 | # 确保PDF API端点配置(修复:在带追踪审核中也设置) 174 | if not initial_state.get("pdf_api_endpoint"): 175 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 176 | initial_state["pdf_api_endpoint"] = api_endpoint 177 | print(f"🔧 为带追踪审核设置PDF API端点: {api_endpoint}") 178 | 179 | # 创建带追踪的配置 180 | config = create_run_config( 181 | run_name=run_name or f"audit_with_tracing_{session_id or 'default'}", 182 | tags=tags or ["web", "tracing", "production"], 183 | thread_id=session_id 184 | ) 185 | 186 | print(f"🔍 开始带追踪的审核流程: {uploaded_file}") 187 | print(f"📊 运行名称: {config.get('run_name')}") 188 | print(f"🏷️ 标签: {config.get('tags', [])}") 189 | 190 | # 使用带追踪的图执行(异步版本) 191 | @with_langsmith_tracing 192 | async def traced_audit(): 193 | return await graph.ainvoke(initial_state, config) # type: ignore 194 | 195 | result = await traced_audit() 196 | 197 | print(f"✅ 带追踪审核完成! 
最终状态: {result.get('current_step', '未知')}") 198 | return result 199 | 200 | except Exception as e: 201 | print(f"❌ 带追踪审核失败: {str(e)}") 202 | return { 203 | "error": str(e), 204 | "current_step": "failed", 205 | "error_message": str(e) 206 | } 207 | 208 | 209 | async def debug_audit( 210 | uploaded_file: str, 211 | session_id: Optional[str] = None, 212 | breakpoints: Optional[list] = None 213 | ) -> dict: 214 | """ 215 | 运行调试模式的审核工作流(异步版本) 216 | 217 | Args: 218 | uploaded_file: 上传的文件路径 219 | session_id: 会话ID(可选) 220 | breakpoints: 断点列表 221 | 222 | Returns: 223 | 审核结果 224 | """ 225 | try: 226 | from src.tools.langsmith_utils import create_debug_config, event_logger 227 | 228 | # 创建初始状态 229 | initial_state = create_initial_state(uploaded_file, session_id) 230 | 231 | # 确保PDF API端点配置(修复:在调试模式中也设置) 232 | if not initial_state.get("pdf_api_endpoint"): 233 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 234 | initial_state["pdf_api_endpoint"] = api_endpoint 235 | print(f"🔧 为调试模式设置PDF API端点: {api_endpoint}") 236 | 237 | # 创建调试配置 238 | config = create_debug_config(breakpoints=breakpoints) 239 | if session_id: 240 | config["configurable"] = {"thread_id": session_id} 241 | 242 | print(f"🐛 开始调试模式审核流程: {uploaded_file}") 243 | print(f"🔧 断点: {breakpoints or ['无']}") 244 | 245 | # 清空事件日志 246 | event_logger.clear_events() 247 | 248 | # 执行工作流(异步版本) 249 | result = await graph.ainvoke(initial_state, config) # type: ignore 250 | 251 | # 收集调试信息 252 | debug_events = event_logger.get_events() 253 | 254 | print(f"✅ 调试模式审核完成! 最终状态: {result.get('current_step', '未知')}") 255 | print(f"📝 记录了 {len(debug_events)} 个调试事件") 256 | 257 | # 在结果中包含调试信息 258 | result["debug_events"] = debug_events 259 | return result 260 | 261 | except Exception as e: 262 | print(f"❌ 调试模式审核失败: {str(e)}") 263 | return { 264 | "error": str(e), 265 | "current_step": "failed", 266 | "error_message": str(e) 267 | } 268 | 269 | 270 | async def run_pdf_audit( 271 | uploaded_file: str, 272 | api_endpoint: str, 273 | session_id: Optional[str] = None, 274 | with_tracing: bool = False 275 | ) -> dict: 276 | """ 277 | 运行PDF审核工作流(异步版本) 278 | 279 | Args: 280 | uploaded_file: 上传的ZIP文件路径 281 | api_endpoint: PDF提取API端点 282 | session_id: 会话ID(可选) 283 | with_tracing: 是否启用LangSmith追踪 284 | 285 | Returns: 286 | 审核结果 287 | """ 288 | try: 289 | # 配置PDF API端点 290 | configure_pdf_api(api_endpoint) 291 | print(f"🔧 已配置PDF提取API: {api_endpoint}") 292 | 293 | # 创建初始状态 294 | initial_state = create_initial_state(uploaded_file, session_id) 295 | 296 | # 直接设置API端点(现在AuditState已经支持这个字段) 297 | initial_state["pdf_api_endpoint"] = api_endpoint 298 | 299 | # 选择执行模式 300 | if with_tracing: 301 | print(f"🔍 开始PDF审核流程(启用追踪): {uploaded_file}") 302 | return await run_audit_with_tracing( 303 | uploaded_file, 304 | session_id, 305 | run_name=f"pdf_audit_{session_id or 'default'}", 306 | tags=["pdf", "api_extraction", "production"] 307 | ) 308 | else: 309 | print(f"🚀 开始PDF审核流程: {uploaded_file}") 310 | 311 | # 为基础审核模式创建配置 312 | config = None 313 | if session_id: 314 | config = {"configurable": {"thread_id": session_id}} 315 | 316 | # 执行工作流(异步版本) 317 | if config: 318 | result = await graph.ainvoke(initial_state, config) # type: ignore 319 | else: 320 | result = await graph.ainvoke(initial_state) 321 | 322 | print(f"✅ PDF审核完成! 
最终状态: {result.get('current_step', '未知')}") 323 | return result 324 | 325 | except Exception as e: 326 | print(f"❌ PDF审核失败: {str(e)}") 327 | return { 328 | "error": str(e), 329 | "current_step": "failed", 330 | "error_message": str(e), 331 | "pdf_api_endpoint": api_endpoint 332 | } 333 | 334 | 335 | 336 | 337 | 338 | async def main_async(): 339 | """命令行入口点(异步版本)""" 340 | import sys 341 | import argparse 342 | 343 | parser = argparse.ArgumentParser(description='LangGraph 职称材料审核系统') 344 | parser.add_argument('file_path', help='要审核的ZIP文件路径') 345 | parser.add_argument('--session-id', help='会话ID(可选)') 346 | 347 | 348 | args = parser.parse_args() 349 | 350 | # 统一使用主审核函数(异步版本) 351 | result = await run_audit(args.file_path, args.session_id) 352 | print(f"✅ 审核结果: {result}") 353 | 354 | return result 355 | 356 | def main(): 357 | """命令行入口点(用于pyproject.toml脚本配置)""" 358 | import asyncio 359 | return asyncio.run(main_async()) 360 | 361 | 362 | if __name__ == "__main__": 363 | # 示例用法 364 | import os 365 | import asyncio 366 | 367 | async def example_usage(): 368 | # 检查测试数据 369 | test_file = "test_data/sample.zip" 370 | 371 | if os.path.exists(test_file): 372 | print("🧪 运行测试审核...") 373 | result = await run_audit(test_file) 374 | print(f"📊 审核结果: {result}") 375 | else: 376 | print("📋 主代理已就绪,可以通过以下方式使用:") 377 | print(" from src.agent import run_audit") 378 | print(" import asyncio") 379 | print(" result = asyncio.run(run_audit('path/to/your/file.zip'))") 380 | print("\n🔧 或者直接使用图对象:") 381 | print(" from src.agent import graph") 382 | print(" result = await graph.ainvoke(initial_state)") 383 | 384 | asyncio.run(example_usage()) -------------------------------------------------------------------------------- /static/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | LangGraph 职称评审材料审核系统 7 | 8 | 9 | 10 | 11 | 12 |
[Extraction note: the markup of static/index.html was stripped during conversion, leaving only line numbers and text fragments; the stray number runs immediately before and after this note are part of the same residue. Recoverable content: page title "LangGraph 职称评审材料审核系统"; tagline "基于人工智能的智能职称材料审核,支持实时流式处理和进度追踪"; an upload form stating support for ZIP files (complete material packages) or PDF files (single documents); three feature cards — 智能处理 (AI-driven content extraction and rule validation), 实时更新 (streaming API with real-time progress tracking), 详细报告 (complete HTML audit reports); and panels for 任务列表 (task list), 审核信息 (task ID, file name, file size, session ID, start time, status), 审核进度 (progress bar, initially 0% / "准备开始..."), 工作流步骤 (workflow steps), and 实时日志 (real-time logs), plus action buttons and inline scripts whose content did not survive.]
227 | 228 | 229 | 242 | 243 | 244 | 262 | 263 | 264 | 265 | 266 | -------------------------------------------------------------------------------- /src/nodes/validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | AI驱动的规则校验节点 - 基于rules文件夹中的Excel规则 3 | """ 4 | 5 | from typing import Dict, List, Any 6 | from src.graph.state import AuditState 7 | 8 | # 导入AI工具 9 | try: 10 | from src.tools.ai_utils import validate_material_with_ai 11 | _ai_utils_available = True 12 | except ImportError: 13 | _ai_utils_available = False 14 | validate_material_with_ai = None 15 | 16 | 17 | def validation_node(state: AuditState) -> Dict[str, Any]: 18 | """ 19 | 完全无缓存的AI智能校验节点 - 每次都处理全新数据 20 | 21 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 22 | """ 23 | try: 24 | print(f"⚡ 开始无缓存AI智能校验...") 25 | 26 | # 直接获取当前状态的材料内容和规则数据 - 不使用任何缓存 27 | extracted_content = state.get("api_extraction_results", {}) or state.get("extracted_content", {}) 28 | parsed_rules = state.get("parsed_rules", []) 29 | rules_by_category = state.get("rules_by_category", {}) 30 | 31 | print(f"🔍 当前状态数据:") 32 | print(f" 材料数量: {len(extracted_content)}") 33 | print(f" 规则数量: {len(parsed_rules)}") 34 | print(f" 规则分类: {list(rules_by_category.keys())}") 35 | 36 | if not extracted_content: 37 | print("⚠️ 未找到可校验的材料内容") 38 | return { 39 | "current_step": "validation_completed", 40 | "processing_logs": ["未找到可校验的材料内容"] 41 | } 42 | 43 | # 直接处理所有材料 - 不使用队列缓存机制 44 | validation_results = [] 45 | material_validation = {} 46 | total_materials = len(extracted_content) 47 | processed_count = 0 48 | 49 | print(f"📋 开始校验{total_materials}个材料类型") 50 | 51 | # 直接遍历处理每个材料 - 完全无缓存 52 | for material_type, material_data in extracted_content.items(): 53 | processed_count += 1 54 | print(f"🔍 正在校验: {material_type} ({processed_count}/{total_materials})") 55 | 56 | try: 57 | # 数据预处理:确保是单个材料的数据 58 | if isinstance(material_data, list) and len(material_data) > 0: 59 | actual_data = material_data[0] if material_data else {} 60 | elif isinstance(material_data, dict): 61 | actual_data = material_data 62 | else: 63 | actual_data = {"content": material_data, "material_type": material_type} 64 | 65 | # 提取材料内容 66 | material_content = _extract_material_content(actual_data) 67 | 68 | # 🎯 智能规则匹配:教育经历材料只与教育经历规则集匹配 69 | matched_rules = _get_matched_rules_for_material(material_type, rules_by_category, parsed_rules) 70 | print(f"🎯 {material_type} 匹配到 {len(matched_rules)} 条相关规则") 71 | 72 | # 使用AI工具进行校验,将规则作为prompt的一部分 73 | material_results = None 74 | 75 | if _ai_utils_available and validate_material_with_ai and material_content.strip(): 76 | print(f"✅ 使用AI校验: {material_type}") 77 | 78 | try: 79 | # 使用匹配的规则进行AI校验,而不是所有规则 80 | if matched_rules and len(matched_rules) > 0: 81 | print(f"📤 向AI传递{len(matched_rules)}条匹配的{material_type}规则") 82 | 83 | ai_results = validate_material_with_ai( 84 | material_type, 85 | material_content, 86 | rules_context=matched_rules 87 | ) 88 | else: 89 | print(f"⚠️ {material_type}未找到匹配的规则,跳过AI校验") 90 | ai_results = [] 91 | 92 | if ai_results and len(ai_results) > 0: 93 | print(f"✅ AI校验成功,生成{len(ai_results)}个结果") 94 | # 转换AI结果格式 95 | converted_results = [] 96 | for ai_result in ai_results: 97 | converted_result = { 98 | "rule_name": ai_result.get("rule_name", f"{material_type}规则校验"), 99 | "result": _convert_ai_status_to_result(ai_result.get("status", "WARNING")), 100 | "details": ai_result.get("message", "校验完成"), 101 | "priority": _convert_ai_status_to_priority(ai_result.get("status", "WARNING")), 102 | "material_type": material_type, 103 | 
"rule_content": ai_result.get("rule_content", ""), 104 | "ai_powered": True, 105 | "timestamp": _get_current_timestamp() 106 | } 107 | converted_results.append(converted_result) 108 | validation_results.append(converted_result) 109 | 110 | material_results = converted_results 111 | else: 112 | print(f"⚠️ AI校验返回空结果") 113 | 114 | except Exception as ai_error: 115 | print(f"⚠️ AI校验失败: {ai_error}") 116 | else: 117 | print(f"⚠️ AI工具不可用或无内容") 118 | 119 | # 如果AI校验失败,创建基础结果 120 | if not material_results: 121 | print(f"🔧 为{material_type}创建基础校验结果") 122 | basic_result = { 123 | "rule_name": f"{material_type}基础校验", 124 | "result": "⚠️警告", 125 | "details": "未能进行AI校验,仅进行了基础检查", 126 | "priority": "中", 127 | "material_type": material_type, 128 | "rule_content": "", 129 | "ai_powered": False, 130 | "timestamp": _get_current_timestamp() 131 | } 132 | material_results = [basic_result] 133 | validation_results.append(basic_result) 134 | 135 | # 存储到material_validation中以兼容现有系统 136 | material_validation[material_type] = material_results 137 | 138 | print(f"✅ {material_type}校验完成,生成{len(material_results)}个结果") 139 | 140 | except Exception as material_error: 141 | print(f"❌ 校验{material_type}时发生错误: {str(material_error)}") 142 | # 为失败的材料创建错误记录 143 | error_result = { 144 | "rule_name": f"{material_type}校验错误", 145 | "result": "❌不通过", 146 | "details": f"校验过程发生错误: {str(material_error)}", 147 | "priority": "高", 148 | "material_type": material_type, 149 | "rule_content": "", 150 | "timestamp": _get_current_timestamp() 151 | } 152 | validation_results.append(error_result) 153 | material_validation[material_type] = [error_result] 154 | 155 | # 直接返回结果,不使用任何缓存机制 156 | print(f"✅ 无缓存规则校验完成:处理{processed_count}个材料类型,生成{len(validation_results)}项结果") 157 | 158 | # 构建详细结果与摘要(供报告使用) 159 | try: 160 | from src.models.state import ValidationResult, ValidationSummary 161 | detailed_results = [] 162 | for rd in validation_results: 163 | try: 164 | detailed_results.append(ValidationResult.from_validation_output(rd)) 165 | except Exception as conv_err: 166 | print(f"⚠️ 转换验证结果失败: {conv_err}") 167 | summary = ValidationSummary.from_validation_results(detailed_results) if detailed_results else None 168 | except Exception as model_err: 169 | print(f"⚠️ 生成验证模型失败: {model_err}") 170 | detailed_results = [] 171 | summary = None 172 | 173 | return { 174 | "material_validation": material_validation, 175 | "validation_cache": validation_results, 176 | "validation_results_detailed": [r.dict() for r in detailed_results], 177 | "validation_summary": summary.dict() if summary else None, 178 | "current_step": "validation_completed", 179 | "processing_logs": [ 180 | f"处理了{processed_count}个材料类型", 181 | f"生成了{len(validation_results)}项校验结果", 182 | "已完全取消缓存机制,确保数据全新" 183 | ] 184 | } 185 | 186 | except Exception as e: 187 | print(f"❌ 规则校验失败: {str(e)}") 188 | return { 189 | "current_step": "validation_failed", 190 | "error_message": f"规则校验失败: {str(e)}" 191 | } 192 | 193 | 194 | def _process_validation_results(material_type: str, validation_results: List, 195 | validation_cache_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 196 | """ 197 | 处理AI校验结果并存入缓存 198 | """ 199 | processed_results = [] 200 | 201 | if isinstance(validation_results, list) and len(validation_results) > 0: 202 | for result in validation_results: 203 | if isinstance(result, dict): 204 | result['timestamp'] = _get_current_timestamp() 205 | processed_results.append(result) 206 | validation_cache_results.append(result) 207 | else: 208 | # 其他类型,转换为字典 209 | result_dict = { 210 | "rule_name": 
f"{material_type}校验", 211 | "result": "⚠️警告", 212 | "details": str(result), 213 | "priority": "中", 214 | "material_type": material_type, 215 | "rule_content": "", 216 | "timestamp": _get_current_timestamp() 217 | } 218 | processed_results.append(result_dict) 219 | validation_cache_results.append(result_dict) 220 | else: 221 | # 空结果 222 | result_dict = { 223 | "rule_name": f"{material_type}校验", 224 | "result": "⚠️警告", 225 | "details": "未能生成有效的校验结果", 226 | "priority": "中", 227 | "material_type": material_type, 228 | "rule_content": "", 229 | "timestamp": _get_current_timestamp() 230 | } 231 | processed_results.append(result_dict) 232 | validation_cache_results.append(result_dict) 233 | 234 | return processed_results 235 | 236 | 237 | def _get_current_timestamp() -> str: 238 | """获取当前时间戳""" 239 | from datetime import datetime 240 | return datetime.now().isoformat() 241 | 242 | 243 | def _convert_ai_status_to_result(status: str) -> str: 244 | """将AI状态转换为结果格式""" 245 | status_upper = status.upper() 246 | if status_upper == "PASS": 247 | return "✅通过" 248 | elif status_upper == "WARNING": 249 | return "⚠️警告" 250 | elif status_upper == "ERROR": 251 | return "❌不通过" 252 | else: 253 | return "⚠️警告" # 默认 254 | 255 | 256 | def _convert_ai_status_to_priority(status: str) -> str: 257 | """将AI状态转换为优先级""" 258 | status_upper = status.upper() 259 | if status_upper == "ERROR": 260 | return "高" 261 | elif status_upper == "WARNING": 262 | return "中" 263 | elif status_upper == "PASS": 264 | return "低" 265 | else: 266 | return "中" # 默认 267 | 268 | 269 | def _get_matched_rules_for_material(material_type: str, rules_by_category: Dict[str, List[Any]], all_rules: List[Any]) -> List[Any]: 270 | """ 271 | 🎯 智能规则匹配:教育经历材料只与教育经历规则集匹配 272 | 273 | Args: 274 | material_type: 材料类型(如"教育经历") 275 | rules_by_category: 按分类组织的规则 276 | all_rules: 所有规则列表(备用) 277 | 278 | Returns: 279 | 匹配的规则列表 280 | """ 281 | try: 282 | print(f"🔍 正在为{material_type}匹配规则...") 283 | 284 | # 1-17项材料分类映射表 285 | material_to_category = { 286 | # 直接匹配数字编号 287 | "1.教育经历": "1", 288 | "2.工作经历": "2", 289 | "3.继续教育": "3", 290 | "4.学术技术兼职情况": "4", 291 | "5.获奖情况": "5", 292 | "6.获得荣誉称号情况": "6", 293 | "7.主持参与科研项目": "7", 294 | "8.主持参与工程技术项目情况": "8", 295 | "9.论文": "9", 296 | "10.著(译)作(教材)": "10", 297 | "11.专利(著作权)情况": "11", 298 | "12.主持参与指定标准情况": "12", 299 | "13.成果被批示、采纳、运用和推广情况": "13", 300 | "14.资质证书": "14", 301 | "15.奖惩情况": "15", 302 | "16.考核情况": "16", 303 | "17.申报材料附件信息": "17", 304 | 305 | # 关键词匹配 306 | "教育经历": "1", 307 | "工作经历": "2", 308 | "继续教育": "3", 309 | "培训情况": "3", 310 | "学术技术兼职": "4", 311 | "获奖": "5", 312 | "荣誉称号": "6", 313 | "科研项目": "7", 314 | "工程项目": "8", 315 | "项目经历": "8", 316 | "论文": "9", 317 | "著作": "10", 318 | "教材": "10", 319 | "专利": "11", 320 | "著作权": "11", 321 | "标准": "12", 322 | "成果": "13", 323 | "证书": "14", 324 | "资质": "14", 325 | "奖惩": "15", 326 | "考核": "16", 327 | "附件": "17" 328 | } 329 | 330 | # 首先尝试直接匹配 331 | category_id = material_to_category.get(material_type) 332 | 333 | # 如果直接匹配失败,尝试关键词匹配 334 | if not category_id: 335 | for keyword, cat_id in material_to_category.items(): 336 | if keyword in material_type and len(keyword) > 2: # 避免过短的关键词 337 | category_id = cat_id 338 | print(f"🎯 通过关键词'{keyword}'匹配到分类 {cat_id}") 339 | break 340 | 341 | # 获取匹配的规则 342 | matched_rules = [] 343 | 344 | if category_id and category_id in rules_by_category: 345 | matched_rules = rules_by_category[category_id] 346 | print(f"✅ {material_type} 匹配到分类{category_id},找到 {len(matched_rules)} 条专用规则") 347 | 348 | # 如果没有找到专用规则,查找通用规则 349 | if not matched_rules: 350 | # 
查找通用规则(如交叉检验规则、通用规则等) 351 | general_rules = [] 352 | for rule in all_rules: 353 | rule_content = getattr(rule, 'content', '') if hasattr(rule, 'content') else rule.get('content', '') 354 | source_file = getattr(rule, 'source_file', '') if hasattr(rule, 'source_file') else rule.get('source_file', '') 355 | 356 | if '通用' in source_file or '交叉' in source_file or '基础' in source_file: 357 | general_rules.append(rule) 358 | 359 | if general_rules: 360 | matched_rules = general_rules 361 | print(f"⚠️ {material_type} 未找到专用规则,使用 {len(general_rules)} 条通用规则") 362 | 363 | # 最后的备用方案:返回空列表(不使用所有规则) 364 | if not matched_rules: 365 | print(f"⚠️ {material_type} 未找到任何匹配的规则,将跳过校验") 366 | 367 | return matched_rules 368 | 369 | except Exception as e: 370 | print(f"⚠️ 规则匹配失败: {e}") 371 | return [] 372 | 373 | 374 | def _extract_material_content(actual_data: Dict[str, Any]) -> str: 375 | """从材料数据中提取文本内容""" 376 | material_content = "" 377 | if isinstance(actual_data, dict): 378 | if "content" in actual_data: 379 | content_data = actual_data["content"] 380 | if isinstance(content_data, dict): 381 | # 尝试多种可能的内容字段 382 | for key in ["md_content", "raw_markdown", "text", "content"]: 383 | if key in content_data: 384 | material_content = str(content_data[key]) 385 | break 386 | if not material_content: 387 | material_content = str(content_data) 388 | else: 389 | material_content = str(content_data) 390 | else: 391 | material_content = str(actual_data) 392 | else: 393 | material_content = str(actual_data) 394 | 395 | return material_content -------------------------------------------------------------------------------- /src/nodes/pdf_extraction.py: -------------------------------------------------------------------------------- 1 | """ 2 | PDF内容提取节点 3 | 4 | 通过FastAPI接口处理PDF文件内容提取并转换为JSON格式 5 | """ 6 | 7 | import json 8 | import asyncio 9 | from typing import Dict, Any, List, Optional 10 | from pathlib import Path 11 | import logging 12 | 13 | try: 14 | import aiohttp # type: ignore[import] 15 | from aiohttp import ClientTimeout # type: ignore[import] 16 | except ImportError: 17 | print("Warning: aiohttp not installed. 
Please install with: pip install aiohttp") 18 | aiohttp = None # type: ignore 19 | ClientTimeout = None # type: ignore 20 | 21 | try: 22 | from ..graph.state import AuditState 23 | except ImportError: 24 | from src.graph.state import AuditState 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | 29 | async def extract_pdf_via_api(pdf_file_path: str, api_endpoint: str) -> Dict[str, Any]: 30 | """ 31 | 通过FastAPI提取PDF内容为JSON 32 | 33 | 基于用户提供的工作案例,使用aiohttp实现类似requests的参数传递方式: 34 | - 基础URL和查询参数分开处理 35 | - 逐个上传PDF文件(不是压缩包) 36 | - 使用multipart/form-data格式 37 | 38 | Args: 39 | pdf_file_path: PDF文件路径 40 | api_endpoint: API端点URL(不包含查询参数) 41 | 42 | Returns: 43 | 提取的JSON内容 44 | """ 45 | if aiohttp is None: 46 | return { 47 | "success": False, 48 | "error": "aiohttp库未安装,请使用 pip install aiohttp 安装", 49 | "file_path": pdf_file_path 50 | } 51 | 52 | try: 53 | # 按照用户案例的方式设置参数 54 | params = { 55 | 'parse_method': 'auto', 56 | 'is_json_md_dump': 'false', 57 | 'output_dir': 'output', 58 | 'return_layout': 'false', 59 | 'return_info': 'false', 60 | 'return_content_list': 'false', 61 | 'return_images': 'false' 62 | } 63 | 64 | # 创建请求头 65 | headers = { 66 | "accept": "application/json", 67 | "User-Agent": "LangGraph-PDF-Extractor/1.0" 68 | } 69 | 70 | print(f"📤 正在上传PDF文件: {Path(pdf_file_path).name} 到 {api_endpoint}") 71 | 72 | async with aiohttp.ClientSession() as session: 73 | # 异步读取文件内容 74 | try: 75 | file_content = await asyncio.to_thread(lambda: open(pdf_file_path, 'rb').read()) 76 | except Exception as file_error: 77 | error_msg = f"读取PDF文件失败: {str(file_error)}" 78 | print(f"❌ {error_msg}") 79 | return { 80 | "success": False, 81 | "error": error_msg, 82 | "file_path": pdf_file_path, 83 | "api_endpoint": api_endpoint 84 | } 85 | 86 | # 按照用户案例创建文件数据 87 | data = aiohttp.FormData() 88 | data.add_field( 89 | 'pdf_file', # 与用户案例中的字段名一致 90 | file_content, 91 | filename=Path(pdf_file_path).name, 92 | content_type='application/pdf' 93 | ) 94 | 95 | # 使用params参数传递查询参数,类似requests.post(url, params=params, files=files) 96 | # 创建超时设置 97 | timeout = ClientTimeout(total=120) if ClientTimeout else aiohttp.ClientTimeout(total=120) 98 | 99 | async with session.post( 100 | api_endpoint, 101 | params=params, # 查询参数单独传递 102 | data=data, # 文件数据 103 | headers=headers, 104 | timeout=timeout 105 | ) as response: 106 | print(f"📊 API响应状态码: {response.status}") 107 | 108 | if response.status == 200: 109 | try: 110 | result = await response.json() 111 | print(f"✅ 成功提取PDF内容: {Path(pdf_file_path).name}") 112 | print(f"📋 API返回结构: {list(result.keys()) if isinstance(result, dict) else type(result)}") 113 | return { 114 | "success": True, 115 | "content": result, 116 | "file_path": pdf_file_path, 117 | "api_endpoint": str(response.url), 118 | "extraction_timestamp": None 119 | } 120 | except Exception as json_error: 121 | error_text = await response.text() 122 | print(f"⚠️ API返回非JSON格式: {json_error}") 123 | return { 124 | "success": False, 125 | "error": f"API返回非JSON格式: {json_error}", 126 | "error_details": error_text[:500], 127 | "file_path": pdf_file_path, 128 | "api_endpoint": str(response.url) 129 | } 130 | else: 131 | error_text = await response.text() 132 | print(f"❌ API返回错误状态码 {response.status}: {error_text[:200]}...") 133 | return { 134 | "success": False, 135 | "error": f"API返回错误状态码: {response.status}", 136 | "error_details": error_text, 137 | "file_path": pdf_file_path, 138 | "api_endpoint": str(response.url) 139 | } 140 | 141 | except FileNotFoundError: 142 | error_msg = f"找不到PDF文件: {pdf_file_path}" 143 | print(f"❌ 
{error_msg}") 144 | return { 145 | "success": False, 146 | "error": error_msg, 147 | "file_path": pdf_file_path, 148 | "api_endpoint": api_endpoint 149 | } 150 | except Exception as e: 151 | error_msg = f"API调用失败: {str(e)}" 152 | print(f"❌ {error_msg}") 153 | return { 154 | "success": False, 155 | "error": error_msg, 156 | "file_path": pdf_file_path, 157 | "api_endpoint": api_endpoint 158 | } 159 | 160 | 161 | async def pdf_extraction_node(state: AuditState) -> Dict[str, Any]: 162 | """ 163 | 完全无缓存的PDF内容提取节点 - 每次都处理全新数据 164 | 165 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的 166 | """ 167 | try: 168 | print(f"📄 开始无缓存PDF内容提取...") 169 | 170 | # 直接获取当前状态的文件夹数据 - 不使用任何缓存 171 | folder_validation = state.get("folder_validation", {}) 172 | 173 | print(f"🔍 当前状态数据:") 174 | print(f" 文件夹验证结果: {len(folder_validation.get('folders_found', []))} 个文件夹") 175 | 176 | # 验证数据有效性 177 | if not folder_validation or not folder_validation.get("folders_found"): 178 | print("⚠️ 未找到有效的文件夹结构数据") 179 | return { 180 | "current_step": "pdf_extraction_failed", 181 | "error_message": "没有找到有效的文件夹结构", 182 | "processing_logs": ["没有找到有效的文件夹结构"] 183 | } 184 | 185 | # 获取PDF API端点配置 186 | api_endpoint = state.get("pdf_api_endpoint") 187 | if not api_endpoint: 188 | # 尝试使用默认配置 189 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" 190 | print(f"⚠️ 状态中未配置PDF API端点,使用默认端点: {api_endpoint}") 191 | 192 | # 检查是否有配置文件 193 | try: 194 | from src.config.api_config import get_pdf_api_config 195 | api_config = get_pdf_api_config() 196 | configured_endpoint = api_config.get("pdf_extraction_endpoint") 197 | if configured_endpoint: 198 | api_endpoint = configured_endpoint 199 | print(f"✅ 从配置文件获取到API端点: {api_endpoint}") 200 | except ImportError: 201 | print("⚠️ 无法导入API配置模块,使用硬编码默认端点") 202 | except Exception as e: 203 | print(f"⚠️ 读取API配置失败: {e},使用硬编码默认端点") 204 | 205 | # 如果仍然没有API端点,返回错误 206 | if not api_endpoint: 207 | return { 208 | "current_step": "pdf_extraction_failed", 209 | "error_message": "未配置PDF提取API端点,请检查配置文件或环境变量" 210 | } 211 | 212 | folders_found = folder_validation["folders_found"] 213 | pdf_extraction_results = {} 214 | api_extraction_results = {} 215 | total_pdf_files = 0 216 | successful_extractions = 0 217 | 218 | # 处理每个标准文件夹中的PDF文件 219 | for folder_info in folders_found: 220 | folder_name = folder_info["name"] 221 | folder_path = folder_info["path"] 222 | 223 | print(f"📁 处理文件夹: {folder_name}") 224 | 225 | # 查找文件夹中的PDF文件(异步方式) 226 | folder_path_obj = Path(folder_path) 227 | 228 | # 使用asyncio.to_thread来异步执行文件系统操作 229 | try: 230 | pdf_files = await asyncio.to_thread(lambda: list(folder_path_obj.glob("*.pdf"))) 231 | except Exception as glob_error: 232 | print(f"❌ 扫描文件夹 {folder_name} 时发生错误: {str(glob_error)}") 233 | pdf_extraction_results[folder_name] = { 234 | "files": [], 235 | "folder_path": folder_path, 236 | "material_type": folder_name, 237 | "pdf_files_count": 0, 238 | "status": "error", 239 | "error": str(glob_error) 240 | } 241 | continue 242 | 243 | if not pdf_files: 244 | print(f"⚠️ 文件夹 {folder_name} 中没有找到PDF文件") 245 | pdf_extraction_results[folder_name] = { 246 | "files": [], 247 | "folder_path": folder_path, 248 | "material_type": folder_name, 249 | "pdf_files_count": 0, 250 | "status": "empty" 251 | } 252 | continue 253 | 254 | total_pdf_files += len(pdf_files) 255 | folder_results = [] 256 | 257 | # 使用asyncio并发处理PDF文件提取 258 | tasks = [] 259 | for pdf_file in pdf_files: 260 | task = extract_pdf_via_api(str(pdf_file), api_endpoint) 261 | tasks.append(task) 262 | 263 | # 并发执行API调用 264 | results = await 
asyncio.gather(*tasks, return_exceptions=True) 265 | 266 | for pdf_file, result in zip(pdf_files, results): 267 | if isinstance(result, Exception): 268 | print(f"❌ 处理文件 {pdf_file.name} 时发生异常: {str(result)}") 269 | folder_results.append({ 270 | "file_name": pdf_file.name, 271 | "file_path": str(pdf_file), 272 | "success": False, 273 | "error": str(result), 274 | "material_type": folder_name 275 | }) 276 | elif isinstance(result, dict) and result.get("success"): 277 | print(f"✅ 成功提取 {pdf_file.name}") 278 | successful_extractions += 1 279 | 280 | # 异步获取文件大小 281 | try: 282 | file_size = await asyncio.to_thread(lambda: pdf_file.stat().st_size) 283 | except Exception as stat_error: 284 | print(f"⚠️ 获取文件大小失败: {stat_error}") 285 | file_size = 0 286 | 287 | # 构建标准化JSON格式 288 | standardized_json = { 289 | "metadata": { 290 | "file_name": pdf_file.name, 291 | "file_path": str(pdf_file), 292 | "size_bytes": file_size, 293 | "material_type": folder_name, 294 | "extraction_method": "api" 295 | }, 296 | "content": result.get("content", {}), 297 | "validation": { 298 | "is_valid": True, 299 | "api_endpoint": api_endpoint, 300 | "extraction_timestamp": result.get("extraction_timestamp") 301 | } 302 | } 303 | 304 | folder_results.append({ 305 | "file_name": pdf_file.name, 306 | "file_path": str(pdf_file), 307 | "success": True, 308 | "json_data": standardized_json, 309 | "json_string": json.dumps(standardized_json, ensure_ascii=False, indent=2), 310 | "format": "strict_json", 311 | "size": len(json.dumps(standardized_json)), 312 | "material_type": folder_name 313 | }) 314 | 315 | # 存储API提取结果 316 | if folder_name not in api_extraction_results: 317 | api_extraction_results[folder_name] = [] 318 | api_extraction_results[folder_name].append(standardized_json) 319 | 320 | else: 321 | # 处理失败的情况 322 | error_msg = "未知错误" 323 | if isinstance(result, dict): 324 | error_msg = result.get("error", "未知错误") 325 | print(f"❌ 提取失败 {pdf_file.name}: {error_msg}") 326 | folder_results.append({ 327 | "file_name": pdf_file.name, 328 | "file_path": str(pdf_file), 329 | "success": False, 330 | "error": error_msg, 331 | "material_type": folder_name 332 | }) 333 | 334 | pdf_extraction_results[folder_name] = { 335 | "files": folder_results, 336 | "folder_path": folder_path, 337 | "material_type": folder_name, 338 | "pdf_files_count": len(pdf_files), 339 | "successful_count": len([r for r in folder_results if r.get("success")]), 340 | "status": "success" if folder_results else "empty" 341 | } 342 | 343 | success_folders = sum(1 for item in pdf_extraction_results.values() 344 | if item.get("status") in ["success", "empty"]) # 包括空文件夹 345 | total_folders = len(pdf_extraction_results) 346 | 347 | print(f"✅ PDF内容提取完成: {success_folders}/{total_folders}个文件夹,{successful_extractions}/{total_pdf_files}个PDF文件提取成功") 348 | 349 | # 即使没有PDF文件,只要有文件夹结构就认为成功 350 | if total_folders > 0: 351 | return { 352 | "pdf_extraction_results": pdf_extraction_results, 353 | "api_extraction_results": api_extraction_results, 354 | "extracted_content": api_extraction_results, # 保持兼容性 355 | "current_step": "pdf_extraction_completed", 356 | "processing_stats": { 357 | "total_folders": total_folders, 358 | "successful_folders": success_folders, 359 | "total_pdf_files": total_pdf_files, 360 | "successful_extractions": successful_extractions, 361 | "extraction_rate": successful_extractions / total_pdf_files if total_pdf_files > 0 else 0 362 | } 363 | } 364 | else: 365 | return { 366 | "current_step": "pdf_extraction_failed", 367 | "error_message": "未找到可处理的文件夹" 368 | } 369 
| 370 | except Exception as e: 371 | logger.error(f"PDF内容提取失败: {str(e)}") 372 | print(f"❌ PDF内容提取失败: {str(e)}") 373 | return { 374 | "current_step": "pdf_extraction_failed", 375 | "error_message": f"PDF内容提取失败: {str(e)}" 376 | } 377 | 378 | 379 | def configure_pdf_api_endpoint(state: AuditState, api_endpoint: str) -> Dict[str, Any]: 380 | """ 381 | 配置PDF提取API端点 382 | 383 | Args: 384 | state: 当前状态 385 | api_endpoint: API端点URL 386 | 387 | Returns: 388 | 更新的状态 389 | """ 390 | return { 391 | "pdf_api_endpoint": api_endpoint, 392 | "processing_logs": [f"已配置PDF提取API端点: {api_endpoint}"] 393 | } -------------------------------------------------------------------------------- /src/models/state.py: -------------------------------------------------------------------------------- 1 | """ 2 | 数据模型定义 3 | 4 | 定义审核流程中使用的数据模型(不包括LangGraph状态) 5 | 6 | 模型使用状态: 7 | - CoreInfo: ✅ 高度活跃 - 在多个节点中实际使用 8 | - RuleInfo: ✅ 高度活跃 - 规则处理核心模型 9 | - ValidationResult: ⚠️ 部分使用 - 主要用作类型注解 10 | - CrossValidationResult: ⚠️ 部分使用 - 主要用作类型注解 11 | - MaterialProcessingStats: ✅ 有效使用 - 在报告生成中实际使用 12 | - AuditReport: ⚠️ 部分功能未启用 - 完善但使用有限 13 | 14 | 已移除未使用模型: 15 | - FileInfo: ✖️ 已移除 - 几乎未使用 16 | - MaterialInfo: ✖️ 已移除 - 使用场景有限,可用Dict替代 17 | - ReportSummary: ✖️ 已移除 - 完全未使用 18 | """ 19 | 20 | from typing import List, Dict, Any, Optional, Union 21 | from pydantic import BaseModel, Field 22 | 23 | 24 | # ============================================================================ 25 | # 核心业务模型(高度活跃) 26 | # ============================================================================ 27 | class CoreInfo(BaseModel): 28 | """核心信息模型(简化版) - ✅ 高度活跃模型""" 29 | name: str = Field(description="姓名,统一格式,去除空格", default="") 30 | gender: str = Field(description="性别,男/女", default="") 31 | id_number: str = Field(description="身份证号,18位标准格式", default="") 32 | extracted_from: List[str] = Field(description="信息来源材料", default_factory=list) 33 | 34 | 35 | # ============================================================================ 36 | # 校验结果模型(部分使用) 37 | # ============================================================================ 38 | class ValidationResult(BaseModel): 39 | """校验结果模型 - 增强版,完整存储validation节点的所有输出信息""" 40 | rule_id: str 41 | rule_name: str 42 | status: str # PASS, WARNING, ERROR 43 | result: str # "✅通过", "⚠️警告", "❌不通过" 44 | message: str 45 | details: str = Field(description="详细描述信息") 46 | priority: str = Field(description="优先级:高/中/低") 47 | material_type: str = Field(description="材料类型") 48 | rule_content: str = Field(description="应用的规则内容", default="") 49 | ai_powered: bool = Field(description="是否AI驱动的校验", default=False) 50 | rules_applied: int = Field(description="应用的规则数量", default=0) 51 | timestamp: str = Field(description="校验时间戳") 52 | 53 | @classmethod 54 | def from_validation_output(cls, validation_dict: Dict[str, Any]) -> "ValidationResult": 55 | """从validation节点输出的字典创建ValidationResult对象""" 56 | return cls( 57 | rule_id=validation_dict.get('rule_name', '').replace(' ', '_'), 58 | rule_name=validation_dict.get('rule_name', ''), 59 | status=cls._convert_result_to_status(validation_dict.get('result', '')), 60 | result=validation_dict.get('result', ''), 61 | message=validation_dict.get('details', ''), 62 | details=validation_dict.get('details', ''), 63 | priority=validation_dict.get('priority', '中'), 64 | material_type=validation_dict.get('material_type', ''), 65 | rule_content=validation_dict.get('rule_content', ''), 66 | ai_powered=validation_dict.get('ai_powered', False), 67 | rules_applied=validation_dict.get('rules_applied', 0), 68 | 
timestamp=validation_dict.get('timestamp', '') 69 | ) 70 | 71 | @staticmethod 72 | def _convert_result_to_status(result: str) -> str: 73 | """将结果转换为状态""" 74 | if result.startswith('✅'): 75 | return 'PASS' 76 | elif result.startswith('⚠️'): 77 | return 'WARNING' 78 | elif result.startswith('❌'): 79 | return 'ERROR' 80 | else: 81 | return 'WARNING' 82 | 83 | 84 | class ValidationSummary(BaseModel): 85 | """验证结果摘要模型 - 存储validation节点的完整统计信息""" 86 | total_materials_processed: int = Field(description="处理的材料数量") 87 | total_validations: int = Field(description="总校验数量") 88 | successful_materials: int = Field(description="成功校验的材料数量") 89 | error_count: int = Field(description="错误数量") 90 | warning_count: int = Field(description="警告数量") 91 | pass_count: int = Field(description="通过数量") 92 | ai_powered_validations: int = Field(description="AI驱动的校验数量") 93 | total_rules_applied: int = Field(description="应用的规则总数") 94 | materials_by_type: Dict[str, int] = Field(description="按材料类型统计", default_factory=dict) 95 | validation_start_time: Optional[str] = Field(description="校验开始时间", default=None) 96 | validation_end_time: Optional[str] = Field(description="校验结束时间", default=None) 97 | 98 | @classmethod 99 | def from_validation_results(cls, validation_results: List[ValidationResult]) -> "ValidationSummary": 100 | """从验证结果列表创建摘要""" 101 | error_count = sum(1 for r in validation_results if r.status == 'ERROR') 102 | warning_count = sum(1 for r in validation_results if r.status == 'WARNING') 103 | pass_count = sum(1 for r in validation_results if r.status == 'PASS') 104 | ai_powered_count = sum(1 for r in validation_results if r.ai_powered) 105 | total_rules = sum(r.rules_applied for r in validation_results) 106 | 107 | materials_by_type = {} 108 | for result in validation_results: 109 | mat_type = result.material_type 110 | materials_by_type[mat_type] = materials_by_type.get(mat_type, 0) + 1 111 | 112 | return cls( 113 | total_materials_processed=len(set(r.material_type for r in validation_results)), 114 | total_validations=len(validation_results), 115 | successful_materials=len(set(r.material_type for r in validation_results if r.status != 'ERROR')), 116 | error_count=error_count, 117 | warning_count=warning_count, 118 | pass_count=pass_count, 119 | ai_powered_validations=ai_powered_count, 120 | total_rules_applied=total_rules, 121 | materials_by_type=materials_by_type 122 | ) 123 | 124 | 125 | class CrossValidationResult(BaseModel): 126 | """交叉校验结果模型 - ⚠️ 主要用作类型注解,实际多使用Dict""" 127 | validation_type: str # name_consistency, id_consistency, time_logic, data_rationality 128 | status: str # PASS, WARNING, ERROR 129 | message: str 130 | conflicts: List[str] = [] 131 | 132 | 133 | # ============================================================================ 134 | # 规则相关模型(高度活跃) 135 | # ============================================================================ 136 | class RuleInfo(BaseModel): 137 | """规则信息模型 - ✅ 高度活跃模型,在rules_processing和validation中大量使用""" 138 | rule_id: str = Field(description="规则唯一标识") 139 | content: str = Field(description="规则内容") 140 | source_file: str = Field(description="来源文件名") 141 | category: str = Field(description="1-17中的分类编号", default="17") 142 | priority: str = Field(description="优先级", default="normal") 143 | 144 | 145 | class RuleFileInfo(BaseModel): 146 | """规则文件信息模型 - ✅ 在rules_processing中使用""" 147 | file_name: str = Field(description="规则文件名") 148 | file_path: str = Field(description="文件完整路径") 149 | file_type: str = Field(description="文件类型 (.xlsx 或 .md)") 150 | size: int = 
Field(description="文件大小或规则数量") 151 | content: Optional[str] = Field(description="文件原始内容(仅Markdown文件)", default=None) 152 | extracted_rules: Optional[List[RuleInfo]] = Field(description="提取的规则列表(仅Excel文件)", default=None) 153 | 154 | 155 | # ============================================================================ 156 | # 状态管理模型 157 | # ============================================================================ 158 | class AuditState(BaseModel): 159 | """审核工作流状态定义(业务数据模型)""" 160 | 161 | # 输入文件信息 162 | uploaded_file: Optional[str] = None # 上传的文件路径 163 | file_type: str = "" # 文件类型 (zip/pdf/doc等) 164 | 165 | # 文件处理结果 166 | extracted_files: List[str] = Field(default_factory=list) # 解压后的文件列表 167 | file_classification: Dict[str, str] = Field(default_factory=dict) # 文件分类结果 168 | 169 | # PDF处理 170 | pdf_analysis: Dict[str, Any] = Field(default_factory=dict) # PDF页数分析结果 171 | pdf_chunks: Dict[str, List[str]] = Field(default_factory=dict) # PDF分片结果 172 | 173 | # 内容提取 174 | extracted_content: Dict[str, Any] = Field(default_factory=dict) # 提取的内容信息 175 | core_info: Optional[Dict[str, Any]] = None # 核心信息(姓名、身份证号) 176 | 177 | # 规则处理 178 | rules_data: List[RuleFileInfo] = Field(default_factory=list) # 加载的规则文件数据 179 | parsed_rules: List[RuleInfo] = Field(default_factory=list) # 解析后的规则列表 180 | rules_by_category: Dict[str, List[RuleInfo]] = Field(default_factory=dict) # 按1-17项分类的规则 181 | 182 | # 验证结果(完整存储) 183 | validation_results_detailed: List[ValidationResult] = Field(description="详细的验证结果列表", default_factory=list) 184 | validation_summary: Optional[ValidationSummary] = Field(description="验证结果摘要", default=None) 185 | material_validation: Dict[str, List[Any]] = Field(default_factory=dict) # 材料校验结果(兼容) 186 | cross_validation: List[Any] = Field(default_factory=list) # 交叉校验结果(并发安全) 187 | validation_results: List[Dict[str, Any]] = Field(default_factory=list) # 所有校验结果(兼容) 188 | 189 | # 报告生成 190 | audit_report: Optional["AuditReport"] = None # 生成的审核报告对象 191 | report_path: Optional[str] = None # 报告文件路径 192 | 193 | # 流程控制 194 | current_step: str = "file_processing" # 当前步骤 195 | error_message: Optional[str] = None # 错误信息 196 | warnings: List[str] = Field(default_factory=list) # 警告信息 197 | processing_logs: List[str] = Field(default_factory=list) # 处理日志 198 | is_complete: bool = False # 是否完成 199 | 200 | # Redis缓存相关 201 | session_id: Optional[str] = None # 会话ID 202 | 203 | 204 | # ============================================================================ 205 | # 报告相关模型(部分功能未启用) 206 | # ============================================================================ 207 | 208 | 209 | class AuditReport(BaseModel): 210 | """审核报告模型(增强版) - ⚠️ 完善但使用有限,主要作为类型注解""" 211 | 212 | # 报告基本信息 213 | report_id: str = Field(description="报告唯一标识") 214 | generated_at: str = Field(description="生成时间") 215 | report_version: str = Field(description="报告版本", default="v2.0") 216 | 217 | # 申报人信息 218 | applicant_info: CoreInfo = Field(description="申报人核心信息") 219 | 220 | # 审核摘要 221 | summary: Dict[str, Any] = Field(description="审核结果摘要", default_factory=dict) 222 | 223 | # 材料处理统计 224 | processing_stats: Dict[str, Any] = Field(description="处理统计信息", default_factory=dict) 225 | 226 | # 校验结果分类(按严重程度) 227 | severe_issues: List[ValidationResult] = Field(description="严重问题", default_factory=list) 228 | warnings: List[ValidationResult] = Field(description="警告问题", default_factory=list) 229 | suggestions: List[ValidationResult] = Field(description="建议优化", default_factory=list) 230 | passed_validations: List[ValidationResult] = Field(description="通过的校验", 
default_factory=list) 231 | 232 | # 交叉校验结果 233 | cross_validation_results: List[CrossValidationResult] = Field(description="交叉校验结果", default_factory=list) 234 | 235 | # 按材料分类的结果 236 | material_results: Dict[str, List[ValidationResult]] = Field(description="按材料类型分类的结果", default_factory=dict) 237 | 238 | # 规则应用统计 239 | rules_applied: Dict[str, Any] = Field(description="应用的规则统计", default_factory=dict) 240 | 241 | # HTML报告内容 242 | html_content: Optional[str] = Field(description="生成的HTML报告内容", default=None) 243 | 244 | # 报告文件路径 245 | file_path: Optional[str] = Field(description="报告文件保存路径", default=None) 246 | 247 | # 质量评分 248 | quality_score: Optional[float] = Field(description="材料质量评分(0-100)", default=None) 249 | 250 | # 合规性评估 251 | compliance_status: str = Field(description="合规性状态", default="PENDING") # PASS/WARNING/FAIL/PENDING 252 | 253 | # 建议措施 254 | recommendations: List[str] = Field(description="改进建议", default_factory=list) 255 | 256 | # 审核日志 257 | audit_logs: List[str] = Field(description="审核过程日志", default_factory=list) 258 | 259 | @classmethod 260 | def create_from_state(cls, state: Any, report_id: str) -> "AuditReport": 261 | """从审核状态创建报告""" 262 | from datetime import datetime 263 | 264 | # 获取核心信息 265 | core_info = state.get('core_info') or {} if hasattr(state, 'get') else getattr(state, 'core_info', None) or {} 266 | 267 | # 处理字典和对象访问 268 | def get_state_value(key: str, default=None): 269 | if hasattr(state, 'get'): # 字典类型 270 | return state.get(key, default) 271 | else: # 对象类型 272 | return getattr(state, key, default) 273 | 274 | applicant_info = CoreInfo( 275 | name=core_info.get('name', '') if isinstance(core_info, dict) else '', 276 | gender=core_info.get('gender', '') if isinstance(core_info, dict) else '', 277 | id_number=core_info.get('id_number', '') if isinstance(core_info, dict) else '', 278 | extracted_from=core_info.get('extracted_from', []) if isinstance(core_info, dict) else [] 279 | ) 280 | 281 | # 创建报告实例 282 | audit_logs = get_state_value('processing_logs', []) 283 | if not isinstance(audit_logs, list): 284 | audit_logs = [] 285 | 286 | return cls( 287 | report_id=report_id, 288 | generated_at=datetime.now().isoformat(), 289 | applicant_info=applicant_info, 290 | processing_stats=MaterialProcessingStats.from_state(state).dict(), 291 | audit_logs=audit_logs 292 | ) 293 | 294 | def calculate_quality_score(self) -> float: 295 | """计算质量评分""" 296 | total_validations = len(self.severe_issues) + len(self.warnings) + len(self.passed_validations) 297 | if total_validations == 0: 298 | return 100.0 299 | 300 | # 计算分数:错误扣分更多,警告扣分较少 301 | error_penalty = len(self.severe_issues) * 10 302 | warning_penalty = len(self.warnings) * 3 303 | total_penalty = error_penalty + warning_penalty 304 | 305 | score = max(0, 100 - total_penalty) 306 | return score 307 | 308 | def determine_compliance_status(self) -> str: 309 | """确定合规性状态""" 310 | if len(self.severe_issues) > 0: 311 | return "FAIL" 312 | elif len(self.warnings) > 0: 313 | return "WARNING" 314 | else: 315 | return "PASS" 316 | 317 | def get_summary_dict(self) -> Dict[str, Any]: 318 | """获取摘要字典""" 319 | return { 320 | "total_validations": len(self.severe_issues) + len(self.warnings) + len(self.passed_validations), 321 | "error_count": len(self.severe_issues), 322 | "warning_count": len(self.warnings), 323 | "passed_count": len(self.passed_validations), 324 | "cross_validation_count": len(self.cross_validation_results), 325 | "quality_score": self.quality_score or self.calculate_quality_score(), 326 | "compliance_status": 
self.compliance_status 327 | } 328 | 329 | 330 | # ============================================================================ 331 | # 统计模型(有效使用) 332 | # ============================================================================ 333 | 334 | 335 | class MaterialProcessingStats(BaseModel): 336 | """材料处理统计模型 - ✅ 在AuditReport中有实际应用""" 337 | files_extracted: int = Field(description="解压文件数量", default=0) 338 | pdfs_processed: int = Field(description="处理的PDF数量", default=0) 339 | content_extracted: bool = Field(description="内容提取成功", default=False) 340 | core_info_extracted: bool = Field(description="核心信息提取成功", default=False) 341 | categories_classified: List[str] = Field(description="已分类的材料类型", default_factory=list) 342 | 343 | @classmethod 344 | def from_state(cls, state: Any) -> "MaterialProcessingStats": 345 | """从审核状态创建处理统计""" 346 | # 处理字典和对象访问 347 | def get_state_value(key: str, default=None): 348 | if hasattr(state, 'get'): # 字典类型 349 | return state.get(key, default) 350 | else: # 对象类型 351 | return getattr(state, key, default) 352 | 353 | extracted_files = get_state_value('extracted_files', []) or [] 354 | extracted_content = get_state_value('extracted_content', {}) or {} 355 | core_info = get_state_value('core_info') 356 | 357 | return cls( 358 | files_extracted=len(extracted_files), 359 | pdfs_processed=len([f for f in extracted_files if f.lower().endswith('.pdf')]), 360 | content_extracted=len(extracted_content) > 0, 361 | core_info_extracted=bool(core_info and ( 362 | core_info.get('name') or core_info.get('id_number') 363 | if isinstance(core_info, dict) else False 364 | )), 365 | categories_classified=list(extracted_content.keys()) if extracted_content else [] 366 | ) --------------------------------------------------------------------------------
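[Editor's note] A minimal usage sketch for the report models above; the final_state dict is illustrative, not a captured run:

# Build an AuditReport from a finished workflow state and derive its scores.
from src.models.state import AuditReport, MaterialProcessingStats

final_state = {
    "core_info": {"name": "张三", "gender": "男", "id_number": "", "extracted_from": ["1.教育经历"]},
    "extracted_files": ["1.教育经历/degree.pdf"],
    "extracted_content": {"education": {}},
    "processing_logs": ["PDF提取完成", "规则校验完成"],
}

report = AuditReport.create_from_state(final_state, report_id="audit-demo-001")
report.quality_score = report.calculate_quality_score()          # 100.0: no issues recorded yet
report.compliance_status = report.determine_compliance_status()  # "PASS" while severe_issues/warnings are empty
print(report.get_summary_dict())
print(MaterialProcessingStats.from_state(final_state).dict())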