├── rules
│   ├── 9.论文规则集.xlsx
│   ├── 1.教育经历规则集.xlsx
│   ├── 16.考核规则集.xlsx
│   ├── 17.附件规则集.xlsx
│   ├── 2.工作经历规则集.xlsx
│   ├── 8.项目经历规则集.xlsx
│   ├── 交叉检验规则.md
│   ├── 14.资质证书规则集.xlsx
│   └── 11.专利(著作权)情况规则集.xlsx
├── src
│   ├── __init__.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── warning_config.py
│   │   ├── api_config.py
│   │   ├── model_config.py
│   │   └── redis.py
│   ├── models
│   │   ├── __init__.py
│   │   └── state.py
│   ├── nodes
│   │   ├── __init__.py
│   │   ├── file_processing.py
│   │   ├── cross_validation.py
│   │   ├── report_generation.py
│   │   ├── core_info_extraction.py
│   │   ├── validation.py
│   │   └── pdf_extraction.py
│   ├── graph
│   │   ├── __init__.py
│   │   ├── workflow.py
│   │   ├── state.py
│   │   └── edges.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── workflow_integration.py
│   │   ├── common_utils.py
│   │   ├── cache_manager.py
│   │   ├── langsmith_utils.py
│   │   └── file_utils.py
│   └── agent.py
├── langgraph.json
├── __init__.py
├── README.md
├── graph_def.py
├── .gitignore
├── requirements.txt
├── .env.example
├── pyproject.toml
└── static
    ├── styles.css
    └── index.html
/rules/9.论文规则集.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/9.论文规则集.xlsx
--------------------------------------------------------------------------------
/rules/1.教育经历规则集.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/1.教育经历规则集.xlsx
--------------------------------------------------------------------------------
/rules/16.考核规则集.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/16.考核规则集.xlsx
--------------------------------------------------------------------------------
/rules/17.附件规则集.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/17.附件规则集.xlsx
--------------------------------------------------------------------------------
/rules/2.工作经历规则集.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/2.工作经历规则集.xlsx
--------------------------------------------------------------------------------
/rules/8.项目经历规则集.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/8.项目经历规则集.xlsx
--------------------------------------------------------------------------------
/rules/交叉检验规则.md:
--------------------------------------------------------------------------------
1 | 1.所有材料中的主人公姓名必须一致
--------------------------------------------------------------------------------
/rules/14.资质证书规则集.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/14.资质证书规则集.xlsx
--------------------------------------------------------------------------------
/rules/11.专利(著作权)情况规则集.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/a1594834522-coder/Langgraph-checking/HEAD/rules/11.专利(著作权)情况规则集.xlsx
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 主要源代码目录
3 |
4 | 包含系统的核心模块:
5 | - graph: LangGraph工作流定义
6 | - nodes: 各个处理节点实现
7 | - tools: 工具函数和辅助模块
8 | - models: 数据模型和状态定义
9 | - services: 业务服务层
10 | """
--------------------------------------------------------------------------------
/langgraph.json:
--------------------------------------------------------------------------------
1 | {
2 | "dependencies": ["."],
3 | "graphs": {
4 | "audit_workflow": "graph_def:graph"
5 | },
6 | "dockerfile_lines": [],
7 | "python_version": "3.12",
8 | "env": ".env",
9 | "port": 8123
10 | }
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 企业级职称评审材料审核系统
3 | 基于 LangGraph 框架的智能化审核流程
4 |
5 | 项目结构说明:
6 | - src/: 主要源代码目录
7 | - graph/: LangGraph工作流定义
8 | - nodes/: 各个处理节点实现
9 | - tools/: 工具函数和辅助模块
10 | - models/: 数据模型和状态定义
11 | - services/: 业务服务层
12 | - config/: 配置文件
13 | - tests/: 测试代码
14 | - docs/: 文档目录
15 | - data/: 数据存储目录
16 | """
17 |
18 | # 系统版本信息
19 | __version__ = "1.0.0"
20 | __author__ = "Abruzz1"
21 | __description__ = "企业级职称评审材料审核系统"
--------------------------------------------------------------------------------
/src/config/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 配置模块
3 |
4 | 包含项目所有配置相关的功能:
5 | - Redis 配置和连接管理
6 | - 环境变量配置
7 | - 其他系统配置
8 | """
9 |
10 | from .model_config import (
11 | model_config,
12 | setup_model_environment,
13 | setup_model_environment_sync,
14 | print_model_help
15 | )
16 |
17 | __all__ = [
18 | 'model_config',
19 | 'setup_model_environment',
20 | 'setup_model_environment_sync',
21 | 'print_model_help'
22 | ]
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Enterprise-Grade Professional Title Evaluation Material Review System
  2 | 
  3 | An intelligent review system for professional-title application materials, built on the LangGraph framework. It uses AI to automatically process and validate the submitted materials.
  4 | 
  5 | 🔧 **Integrated LangSmith debugging and monitoring** - complete workflow tracing, performance monitoring, and debugging support.
  6 | 
  7 | ## System Architecture
  8 | 
  9 | The system is designed as a LangGraph workflow with the following main modules:
 10 | 
 11 | 1. **File processing** - ZIP extraction, file classification
 12 | 2. **Intelligent PDF processing** - page-count detection, smart chunking
 13 | 3. **Content extraction** - AI recognition, classification into 17 material categories
 14 | 4. **Rule validation** - per-category rule checks
 15 | 5. **Cross-validation** - consistency checks on core information
 16 | 6. **Report generation** - formatted HTML output
 17 | 
 18 | ## Installation
 19 | 
 20 | 1. Create a virtual environment: `python -m venv venv`
 21 | 
 22 |    Activate it: `venv\Scripts\activate` on Windows, or `source venv/bin/activate` on Linux/macOS
 23 | 
 24 | 2. Install the dependencies: `pip install .` (or `pip install -r requirements.txt`)
 25 | 
 26 | 3. Open the development tools: `langgraph dev`
 27 | 
 28 | 4. Start the web frontend: `python web_app_v2.py`
--------------------------------------------------------------------------------
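The whole pipeline can also be driven programmatically. A minimal sketch, assuming the repository root is the working directory and dependencies are installed; `uploaded_file` is the state key that file_processing_node reads:

```python
import asyncio
from src.graph.workflow import create_audit_workflow

async def main() -> None:
    graph = create_audit_workflow()
    # ainvoke because several nodes (e.g. file_processing_node) are async;
    # "materials.zip" is a placeholder path for an uploaded archive
    final_state = await graph.ainvoke({"uploaded_file": "materials.zip"})
    print(final_state.get("current_step"))  # "completed" on success
    print(final_state.get("report_path"))   # path of the generated HTML report

if __name__ == "__main__":
    asyncio.run(main())
```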
/graph_def.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | LangGraph workflow graph definition.
  4 | 
  5 | Graph definition file dedicated to LangGraph Studio;
  6 | kept at the project root to avoid complicated import paths.
  7 | """
  8 | 
  9 | import sys
 10 | import os
 11 | 
 12 | # Make sure the project root is on sys.path
 13 | project_root = os.path.dirname(os.path.abspath(__file__))
 14 | if project_root not in sys.path:
 15 |     sys.path.insert(0, project_root)
 16 | 
 17 | try:
 18 |     # Import the workflow factory
 19 |     from src.graph.workflow import create_audit_workflow
 20 | 
 21 |     # Create the graph object
 22 |     graph = create_audit_workflow()
 23 | 
 24 |     print("✅ LangGraph 工作流图已成功创建")
 25 | 
 26 | except Exception as e:
 27 |     print(f"❌ 创建图失败: {e}")
 28 |     import traceback
 29 |     traceback.print_exc()
 30 |     raise
--------------------------------------------------------------------------------
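For reference, the `"audit_workflow": "graph_def:graph"` entry in langgraph.json points the LangGraph server at this module's `graph` attribute; the same object can be imported directly. A sketch, assuming `get_graph()` is available on compiled graphs in the installed langgraph release:

```python
# Equivalent to what `langgraph dev` loads via "graph_def:graph"
from graph_def import graph

# Inspect the compiled graph's node names
print(list(graph.get_graph().nodes))
```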
/src/models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 数据模型包
3 |
4 | 定义系统中使用的所有数据模型:
5 | - 状态管理模型
6 | - 业务数据模型
7 | - 配置模型
8 |
9 | 模型使用状态说明:
10 | ✅ 高度活跃: CoreInfo, RuleInfo, RuleFileInfo, MaterialProcessingStats
11 | ⚠️ 部分使用: ValidationResult, CrossValidationResult, AuditReport
12 | ✖️ 已移除: FileInfo, MaterialInfo, ReportSummary
13 | """
14 |
15 | from .state import (
16 | CoreInfo,
17 | ValidationResult,
18 | CrossValidationResult,
19 | RuleInfo,
20 | RuleFileInfo,
21 | AuditReport,
22 | AuditState,
23 | MaterialProcessingStats
24 | )
25 |
26 | __all__ = [
27 | "CoreInfo",
28 | "ValidationResult",
29 | "CrossValidationResult",
30 | "RuleInfo",
31 | "RuleFileInfo",
32 | "AuditReport",
33 | "AuditState",
34 | "MaterialProcessingStats"
35 | ]
--------------------------------------------------------------------------------
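A quick construction sketch for `CoreInfo`; the keyword arguments match how workflow_integration.py instantiates it (the values here are made-up placeholders):

```python
from src.models import CoreInfo

# Fabricated example values; extracted_from lists the source file names
info = CoreInfo(name="张三", id_number="110101199001011234", extracted_from=["resume.pdf"])
print(info.name, info.extracted_from)
```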
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python bytecode / cache
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # Virtual environments
7 | .venv/
8 | venv/
9 | env/
10 | venv313/
11 |
12 | # Test caches & coverage
13 | .pytest_cache/
14 | .mypy_cache/
15 | .ruff_cache/
16 | .tox/
17 | .nox/
18 | .coverage*
19 | coverage.xml
20 | htmlcov/
21 |
22 | # Packaging / build artifacts
23 | build/
24 | dist/
25 | .eggs/
26 | *.egg-info/
27 | *.egg
28 | pip-wheel-metadata/
29 |
30 | # Jupyter
31 | .ipynb_checkpoints/
32 |
33 | # Logs
34 | logs/
35 | *.log
36 |
37 | # IDE / OS
38 | .vscode/
39 | .idea/
40 | .DS_Store
41 | Thumbs.db
42 | desktop.ini
43 |
44 | # Environment files
45 | .env
46 | !.env.example
47 |
48 | # Project-specific temporary/data dirs
49 | test_data/
50 | temp_pdf_processing/
51 | uploads/
52 | extracted/
53 | .model_cache/
54 | .langgraph_api/
55 | .qoder/
56 |
57 | # Optional: front-end deps if ever used
58 | node_modules/
59 |
60 |
--------------------------------------------------------------------------------
/src/nodes/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 节点定义模块
3 |
4 | 包含LangGraph所有节点定义:
5 | - ZIP解压和文件夹处理节点 (file_processing)
6 | - PDF内容提取节点 (pdf_extraction)
7 | - 核心信息提取节点 (core_info_extraction)
8 | - 规则校验节点 (validation)
9 | - 交叉校验节点 (cross_validation)
10 | - 报告生成节点 (report_generation)
11 | - 规则集加载节点 (load_rules)
12 | - 规则集提取节点 (extract_rules)
13 | """
14 |
15 | # 从独立的节点文件中导入各个节点
16 | from .file_processing import file_processing_node
17 |
18 | from .pdf_extraction import pdf_extraction_node
19 | from .core_info_extraction import core_info_extraction_node
20 | from .validation import validation_node
21 | from .cross_validation import cross_validation_node
22 | from .report_generation import report_generation_node
23 |
24 | # 规则处理节点
25 | from .rules_processing import load_rules_node, extract_rules_node
26 |
27 |
28 | __all__ = [
29 | "file_processing_node",
30 | "pdf_extraction_node",
31 | "core_info_extraction_node",
32 | "validation_node",
33 | "cross_validation_node",
34 | "report_generation_node",
35 | "load_rules_node",
36 | "extract_rules_node"
37 | ]
--------------------------------------------------------------------------------
/src/graph/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | LangGraph工作流定义模块
3 |
4 | 包含系统的主要工作流:
5 | - workflow.py: 主要的审核工作流定义
6 | - state.py: 工作流状态管理
7 | - edges.py: 边和路由逻辑定义
8 | """
9 |
10 | from .workflow import (
11 | create_audit_workflow,
12 | get_default_workflow
13 | )
14 |
15 | from .state import (
16 | AuditState,
17 | WorkflowConfig,
18 | create_initial_state,
19 | update_state_step,
20 | add_warning,
21 | set_error,
22 | mark_complete
23 | )
24 |
25 | from .edges import (
26 | should_continue_processing,
27 | route_folder_validation,
28 | route_to_cross_validation,
29 | should_generate_report,
30 | check_core_info_for_cross_validation,
31 | check_pdf_extraction_status
32 | )
33 |
34 | __all__ = [
 35 |     # Workflow functions (streamlined: only the main workflow remains)
36 | "create_audit_workflow",
37 | "get_default_workflow",
38 |
39 | # State management
40 | "AuditState",
41 | "WorkflowConfig",
42 | "create_initial_state",
43 | "update_state_step",
44 | "add_warning",
45 | "set_error",
46 | "mark_complete",
47 |
48 | # Edge routing functions
49 | "should_continue_processing",
50 | "route_folder_validation",
51 | "route_to_cross_validation",
52 | "should_generate_report",
53 | "check_core_info_for_cross_validation",
54 | "check_pdf_extraction_status"
55 | ]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | # Core dependencies
  2 | langgraph>=0.2.0
  3 | langsmith>=0.1.0
  4 | langtrace-python-sdk>=2.0.0
  5 | pydantic>=2.0.0
  6 | typing-extensions>=4.0.0
  7 | langchain-core>=0.1.0  # LangGraph core dependency
  8 | langgraph-cli>=0.1.0  # LangGraph development tools
  9 | langgraph-checkpoint-redis>=0.1.0  # Redis checkpointing
 10 | 
 11 | # Google AI API (new SDK)
 12 | google-genai>=1.33.0
 13 | 
 14 | # Environment variable management
 15 | python-dotenv>=1.0.0
 16 | 
 17 | # File handling
 18 | pathlib2
 19 | zipfile36>=0.1.0
 20 | python-magic>=0.4.0
 21 | Pillow>=10.0.0
 22 | 
 23 | # Web framework
 24 | fastapi>=0.104.0
 25 | uvicorn>=0.24.0
 26 | python-multipart>=0.0.6
 27 | sse-starlette>=1.6.0  # Server-Sent Events support
 28 | starlette>=0.27.0
 29 | 
 30 | # Data processing
 31 | pandas>=2.0.0
 32 | numpy>=1.24.0
 33 | 
 34 | # HTML report generation
 35 | jinja2>=3.1.0
 36 | weasyprint>=60.0
 37 | 
 38 | # Configuration management
 39 | pyyaml>=6.0
 40 | 
 41 | # Logging and monitoring
 42 | loguru>=0.7.0
 43 | prometheus-client>=0.19.0
 44 | 
 45 | # Test framework
 46 | pytest>=7.4.0
 47 | pytest-asyncio>=0.21.0
 48 | pytest-cov>=4.1.0
 49 | 
 50 | # Development tools
 51 | black>=23.0.0
 52 | isort>=5.12.0
 53 | flake8>=6.0.0
 54 | mypy>=1.6.0
 55 | 
 56 | # OCR and AI
 57 | pytesseract>=0.3.10
 58 | opencv-python>=4.8.0
 59 | 
 60 | # Database
 61 | sqlalchemy>=2.0.0
 62 | alembic>=1.12.0
 63 | 
 64 | # Async processing
 65 | aiofiles>=23.2.0
 66 | celery>=5.3.0
 67 | redis>=5.0.0
 68 | 
 69 | # Document processing
 70 | markdown>=3.5.0
 71 | markdownify>=0.11.0
 72 | 
 73 | # Workflow state management
 74 | psycopg>=3.1.0  # PostgreSQL support
 75 | asyncpg>=0.29.0  # async PostgreSQL
--------------------------------------------------------------------------------
/src/tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 工具模块导出
3 |
4 | 按功能模块组织的工具函数导出,包括:
5 | - AI模型工具(ai_utils)
6 | - 文件处理工具(file_utils)
7 | - 通用工具(common_utils)
8 | - 工作流集成工具(workflow_integration)
9 | """
10 |
11 | # AI模型工具
12 | from .ai_utils import (
13 | extract_core_information_with_ai,
14 | validate_material_with_ai,
15 | cross_validate_materials_with_ai,
16 | extract_category_core_info_with_ai
17 | )
18 |
19 | # 文件处理工具
20 | from .file_utils import (
21 | extract_zip_file,
22 | validate_folder_structure,
23 | analyze_markdown_structure,
24 | extract_markdown_content
25 | )
26 |
27 | # 通用工具
28 | from .common_utils import (
29 | extract_with_regex,
30 | generate_html_report
31 | )
32 |
33 | # 工作流集成工具
34 | from .workflow_integration import (
35 | extract_core_information_from_json,
36 | extract_core_information,
37 | validate_material_rules
38 | )
39 |
40 | __all__ = [
41 | # AI模型工具
42 | "extract_core_information_with_ai",
43 | "validate_material_with_ai",
44 | "cross_validate_materials_with_ai",
45 | "extract_category_core_info_with_ai",
46 |
47 | # 文件处理工具
48 | "extract_zip_file",
49 | "validate_folder_structure",
50 | "analyze_markdown_structure",
51 | "extract_markdown_content",
52 |
53 | # 通用工具
54 | "extract_with_regex",
55 | "generate_html_report",
56 |
57 | # 工作流集成工具
58 | "extract_core_information_from_json",
59 | "extract_core_information",
60 | "validate_material_rules"
61 | ]
--------------------------------------------------------------------------------
/src/config/warning_config.py:
--------------------------------------------------------------------------------
1 | """
2 | 警告配置管理
3 |
4 | 统一管理系统中的警告过滤器,特别针对第三方库的弃用警告
5 | """
6 |
7 | import warnings
8 | import os
9 |
10 |
11 | def setup_warning_filters():
12 | """
13 | 设置系统警告过滤器
14 |
15 | 主要针对以下警告进行优化:
16 | 1. pkg_resources弃用警告(来自Marker内部)
17 | 2. 其他第三方库的不必要警告
18 | """
19 |
20 | # 抑制pkg_resources弃用警告
21 | # 这个警告来自Marker库内部,用户无法控制
22 | warnings.filterwarnings(
23 | "ignore",
24 | category=DeprecationWarning,
25 | module="pkg_resources"
26 | )
27 |
28 | # 抑制setuptools相关的pkg_resources警告
29 | warnings.filterwarnings(
30 | "ignore",
31 | message=".*pkg_resources is deprecated.*",
32 | category=UserWarning
33 | )
34 |
35 | # 抑制其他第三方库的常见警告
36 | warnings.filterwarnings(
37 | "ignore",
38 | category=DeprecationWarning,
39 | module="transformers"
40 | )
41 |
42 | # 可选:在开发模式下显示所有警告
43 | if os.environ.get("LANGGRAPH_DEBUG", "false").lower() == "true":
44 | warnings.resetwarnings()
45 | warnings.simplefilter("always", DeprecationWarning)
46 | print("🔍 调试模式:显示所有警告信息")
47 | else:
48 | print("✅ 已配置警告过滤器,抑制第三方库不必要的警告")
49 |
50 |
51 | def suppress_marker_warnings():
52 | """
53 | 保持兼容性函数(已无作用)
54 | """
55 | pass
56 |
57 |
58 | def get_warning_env_vars():
59 | """
60 | 获取用于抑制警告的环境变量字典
61 |
62 | Returns:
63 | 环境变量字典
64 | """
65 | return {
66 | "PYTHONWARNINGS": "ignore::DeprecationWarning:pkg_resources",
67 | "TRANSFORMERS_VERBOSITY": "error", # 降低transformers库的输出等级
68 | "TOKENIZERS_PARALLELISM": "false", # 避免tokenizers并发警告
69 | }
70 |
71 |
72 | # 自动在模块导入时设置警告过滤器
73 | if __name__ != "__main__":
74 | setup_warning_filters()
--------------------------------------------------------------------------------
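A short usage sketch for `get_warning_env_vars`: merging the returned variables into a child process's environment so the worker also starts with quiet warnings. The worker script name is a made-up placeholder:

```python
import os
import subprocess

from src.config.warning_config import get_warning_env_vars

# Merge the suppression variables into the inherited environment
env = {**os.environ, **get_warning_env_vars()}
subprocess.run(["python", "worker.py"], env=env, check=True)  # "worker.py" is hypothetical
```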
/src/nodes/file_processing.py:
--------------------------------------------------------------------------------
1 | """
2 | ZIP解压节点
3 |
4 | 专门处理ZIP压缩包解压和17个标准文件夹结构验证
5 | """
6 |
7 | from typing import Dict, Any
8 | from pathlib import Path
9 | from src.graph.state import AuditState
10 | from src.tools import (
11 | extract_zip_file,
12 | validate_folder_structure
13 | )
14 |
15 |
16 | async def file_processing_node(state: AuditState) -> Dict[str, Any]:
17 | """
18 | ZIP解压节点 - 解压ZIP文件并验证17个标准文件夹结构
19 | """
20 | try:
21 | # 支持两种输入字段名(向后兼容)
22 | zip_path = state.get("uploaded_file") or state.get("zip_file_path")
23 |
24 | if not zip_path:
25 | return {
26 | "current_step": "zip_extraction_failed",
27 | "error_message": "未找到上传的ZIP文件路径"
28 | }
29 |
30 | print(f"📦 开始解压ZIP文件: {Path(zip_path).name}")
31 |
32 | # 解压 ZIP 文件
33 | extraction_result = await extract_zip_file(zip_path)
34 |
35 | if not extraction_result:
36 | return {
37 | "current_step": "zip_extraction_failed",
38 | "error_message": "ZIP文件解压失败"
39 | }
40 |
41 | # 获取解压后的根目录
42 | extraction_path = extraction_result.get("extraction_path")
43 | extracted_files = extraction_result.get("files", [])
44 |
45 | # 检查解压是否成功
46 | if not extraction_path:
47 | return {
48 | "current_step": "zip_extraction_failed",
49 | "error_message": "ZIP文件解压失败,无法获取解压路径"
50 | }
51 |
52 | print(f"📁 ZIP解压完成,提取到: {extraction_path}")
53 | print(f"📊 共解压 {len(extracted_files)} 个文件")
54 |
55 | # 验证17个标准文件夹结构
56 | folder_validation = await validate_folder_structure(extraction_path)
57 |
58 | return {
59 | "extraction_path": extraction_path,
60 | "extracted_files": extracted_files,
61 | "folder_validation": folder_validation,
62 | "current_step": "zip_extraction_completed",
63 | "file_type": "zip"
64 | }
65 |
66 | except Exception as e:
67 | print(f"❌ ZIP解压失败: {str(e)}")
68 | return {
69 | "current_step": "zip_extraction_failed",
70 | "error_message": f"ZIP解压失败: {str(e)}"
71 | }
--------------------------------------------------------------------------------
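The node can be exercised in isolation. A minimal sketch: a plain dict stands in for `AuditState` (the nodes only use `state.get(...)`), and the archive path is a made-up placeholder:

```python
import asyncio
from src.nodes.file_processing import file_processing_node

# "test_data/sample.zip" is hypothetical; point it at any real test archive
result = asyncio.run(file_processing_node({"uploaded_file": "test_data/sample.zip"}))
print(result["current_step"])  # "zip_extraction_completed" on success
```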
/.env.example:
--------------------------------------------------------------------------------
  1 | # Example environment configuration
  2 | # Copy to .env and fill in real values
  3 | 
  4 | # OpenAI API configuration
  5 | OPENAI_API_KEY=your_openai_api_key_here
  6 | OPENAI_BASE_URL=https://api.openai.com/v1
  7 | OPENAI_MODEL=gpt-4
  8 | 
  9 | # Google AI API configuration
 10 | GOOGLE_API_KEY=your_google_api_key_here
 11 | GEMINI_MODEL=gemini-2.5-flash  # options: gemini-1.5-flash, gemini-2.5-flash, gemini-pro
 12 | 
 13 | # Application configuration
 14 | APP_NAME=职称评审材料审核系统
 15 | APP_VERSION=1.0.0
 16 | ENVIRONMENT=development
 17 | DEBUG=true
 18 | LOG_LEVEL=INFO
 19 | LOG_FORMAT=json
 20 | LOG_FILE=./logs/app.log
 21 | 
 22 | # File storage configuration
 23 | UPLOAD_DIR=./uploads
 24 | EXTRACTED_DIR=./extracted
 25 | REPORTS_DIR=./reports
 26 | TEMP_DIR=./data/temp
 27 | OUTPUT_DIR=./data/outputs
 28 | MAX_FILE_SIZE=104857600  # 100MB, in bytes
 29 | ALLOWED_FILE_TYPES=.zip,.md,.txt,.pdf
 30 | 
 31 | # Database configuration (SQLite by default; PostgreSQL below is optional)
 32 | DATABASE_URL=sqlite:///./data/audit_system.db
 33 | 
 34 | # PostgreSQL configuration (optional, for persistent storage)
 35 | # DATABASE_URL=postgresql://user:password@localhost:5432/langgraph_audit
 36 | POSTGRES_USER=langgraph_user
 37 | POSTGRES_PASSWORD=your_password
 38 | POSTGRES_DB=langgraph_audit
 39 | POSTGRES_HOST=localhost
 40 | POSTGRES_PORT=5432
 41 | 
 42 | # Redis configuration (caching and task queue)
 43 | REDIS_URL=redis://localhost:6379/0
 44 | REDIS_PASSWORD=
 45 | REDIS_DB=0
 46 | 
 47 | # API service configuration
 48 | HOST=0.0.0.0
 49 | PORT=8000
 50 | WORKERS=4
 51 | 
 52 | # AI processing configuration
 53 | MAX_CONCURRENT_TASKS=10
 54 | PDF_MAX_PAGES=100
 55 | CHUNK_SIZE=2048
 56 | OVERLAP_SIZE=200
 57 | 
 58 | # Rule validation configuration
 59 | ENABLE_STRICT_MODE=True
 60 | AUTO_RETRY_COUNT=3
 61 | VALIDATION_TIMEOUT=300
 62 | 
 63 | # Report generation configuration
 64 | REPORT_TEMPLATE_DIR=./templates
 65 | REPORT_ASSETS_DIR=./assets
 66 | ENABLE_PDF_EXPORT=True
 67 | 
 68 | # Security configuration
 69 | SECRET_KEY=your_secret_key_here
 70 | ACCESS_TOKEN_EXPIRE_MINUTES=30
 71 | CORS_ORIGINS=*
 72 | ALLOWED_HOSTS=localhost,127.0.0.1,0.0.0.0
 73 | 
 74 | # Monitoring configuration
 75 | ENABLE_METRICS=True
 76 | METRICS_PORT=9090
 77 | 
 78 | # Marker configuration
 79 | MARKER_USE_LLM=true
 80 | MARKER_OUTPUT_FORMAT=json
 81 | MARKER_FORMAT_LINES=true
 82 | 
 83 | # Device configuration
 84 | TORCH_DEVICE=cuda
 85 | 
 86 | # Hugging Face configuration (if needed)
 87 | HF_TOKEN=your_huggingface_token_here
 88 | 
 89 | # LangSmith configuration (optional, for debugging and monitoring)
 90 | LANGSMITH_API_KEY=your_langsmith_api_key_here
 91 | LANGCHAIN_TRACING_V2=true
 92 | LANGCHAIN_ENDPOINT=https://api.smith.langchain.com
 93 | LANGCHAIN_PROJECT=Audit_Workflow_Debug
 94 | LANGSMITH_TRACING=true
 95 | 
 96 | # Cache configuration
 97 | CACHE_ENABLED=true
 98 | CACHE_TTL=3600
 99 | CACHE_MAX_SIZE=1000
100 | 
101 | # Workflow configuration
102 | WORKFLOW_TIMEOUT=300
103 | MAX_RETRIES=3
104 | CONCURRENT_TASKS=5
105 | 
106 | # OCR configuration
107 | TESSERACT_PATH=/usr/bin/tesseract
108 | TESSERACT_DATA_PATH=/usr/share/tesseract-ocr/4.00/tessdata
109 | 
110 | # Development tool configuration
111 | LANGCHAIN_VERBOSE=false
112 | LANGCHAIN_DEBUG=false
--------------------------------------------------------------------------------
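A minimal sketch of loading this file with python-dotenv (already in requirements.txt). Note that when a key appears more than once in a .env file, the last occurrence wins, which is why the example above keeps each key exactly once:

```python
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
print(os.getenv("GEMINI_MODEL", "gemini-2.5-flash"))
```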
/src/tools/workflow_integration.py:
--------------------------------------------------------------------------------
1 | """
2 | 审核工作流集成工具
3 |
4 | 提供审核系统的核心集成函数,连接各个工具模块
5 | """
6 |
7 | from typing import List, Dict, Any
8 | from pathlib import Path
9 | from src.models.state import ValidationResult, CoreInfo
10 | from src.tools import (
11 | extract_core_information_with_ai,
12 | validate_material_with_ai,
13 | extract_with_regex
14 | )
15 |
16 | def extract_core_information_from_json(json_extractions: List[Dict[str, Any]]) -> CoreInfo:
17 | """使用Gemma AI从JSON提取结果中智能提取核心信息"""
18 | print("🤖 使用Gemma模型进行智能信息提取...")
19 |
20 | # 整合所有文档内容
21 | combined_content = ""
22 | extracted_from = []
23 |
24 | for json_extraction in json_extractions:
25 | file_path = json_extraction.get("file_path", "")
26 | content_blocks = json_extraction.get("content_blocks", [])
27 |
28 | for block in content_blocks:
29 | content = block.get("content", "")
30 | if content.strip():
31 | combined_content += content + "\n"
32 |
33 | if file_path:
34 | extracted_from.append(Path(file_path).name)
35 |
36 | if not combined_content.strip():
37 | return CoreInfo(name="", id_number="", extracted_from=extracted_from)
38 |
39 | # 使用AI提取,失败时降级到正则表达式
40 | ai_result = extract_core_information_with_ai(combined_content, extracted_from)
41 |
42 | if ai_result:
43 | return CoreInfo(
44 | name=ai_result["name"],
45 | id_number=ai_result["id_number"],
46 | extracted_from=ai_result["extracted_from"]
47 | )
48 | else:
49 | name, id_number = extract_with_regex(combined_content)
50 | return CoreInfo(name=name, id_number=id_number, extracted_from=extracted_from)
51 |
52 | def extract_core_information(materials: List[Dict[str, Any]]) -> CoreInfo:
53 | """提取核心信息(简化版) - 使用Dict替代MaterialInfo"""
54 | # 将Dict转换为JSON格式进行处理
55 | json_extractions = []
56 | for material in materials:
57 | json_extraction = {
58 | "file_path": material.get("material_id", ""),
59 | "content_blocks": [{"content": material.get("content", "")}]
60 | }
61 | json_extractions.append(json_extraction)
62 |
63 | return extract_core_information_from_json(json_extractions)
64 |
65 | def validate_material_rules(material: Dict[str, Any]) -> List[ValidationResult]:
66 | """使用Gemma AI进行智能审核 - 使用Dict替代MaterialInfo"""
67 | material_type = material.get("material_type", "")
68 | content = material.get("content", "")
69 |
70 | print(f"🤖 使用Gemma模型审核材料: {material_type}")
71 |
72 | # 使用AI进行智能审核
73 | ai_results = validate_material_with_ai(material_type, content)
74 |
75 | if ai_results:
76 | results = []
77 | for item in ai_results:
78 | if isinstance(item, dict) and "rule_name" in item:
79 | results.append(ValidationResult(
80 | rule_id=f"GEMMA_{len(results)+1:03d}",
81 | rule_name=item.get("rule_name", "智能审核"),
82 | status=item.get("status", "WARNING"),
83 | message=item.get("message", "审核完成")
84 | ))
85 | return results
86 | else:
87 | # AI失败时返回默认验证结果
88 | return [ValidationResult(
89 | rule_id="FALLBACK_001",
90 | rule_name="默认审核",
91 | status="WARNING",
92 | message="AI审核失败,使用默认审核规则"
93 | )]
94 |
95 |
--------------------------------------------------------------------------------
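A quick sketch of the material dict shape these helpers expect. The field names come from the functions above; the values are made up, and the AI calls require a configured API key:

```python
from src.tools.workflow_integration import (
    extract_core_information,
    validate_material_rules,
)

# Fabricated example material
material = {
    "material_id": "1.教育经历/degree.pdf",
    "material_type": "教育经历",
    "content": "姓名:张三 身份证号:110101199001011234 ...",
}

core = extract_core_information([material])
print(core.name, core.id_number)

for result in validate_material_rules(material):
    print(result.rule_id, result.status, result.message)
```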
/src/config/api_config.py:
--------------------------------------------------------------------------------
1 | """
2 | API配置工具
3 |
4 | 用于配置PDF提取API端点和相关参数
5 | """
6 |
7 | from typing import Dict, Any, Optional
8 | import logging
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 | # 全局API配置
13 | _api_config = {
14 | "pdf_extraction_endpoint": "http://183.203.184.233:8888/pdf_parse_supplychain", # 用户提供的实际端点
15 | "timeout": 60,
16 | "max_file_size": 20 * 1024 * 1024, # 20MB
17 | "supported_formats": [".pdf"]
18 | }
19 |
20 |
21 | def configure_pdf_api(endpoint: str, timeout: int = 60, max_file_size: int = 20 * 1024 * 1024) -> None:
22 | """
23 | 配置PDF提取API
24 |
25 | Args:
26 | endpoint: API端点URL
27 | timeout: 超时时间(秒)
28 | max_file_size: 最大文件大小(字节)
29 | """
30 | global _api_config
31 |
32 | _api_config.update({
33 | "pdf_extraction_endpoint": endpoint,
34 | "timeout": timeout,
35 | "max_file_size": max_file_size
36 | })
37 |
38 | logger.info(f"PDF API已配置: {endpoint}")
39 | print(f"✅ PDF提取API已配置: {endpoint}")
40 |
41 |
42 | def get_pdf_api_config() -> Dict[str, Any]:
43 | """
44 | 获取当前PDF API配置
45 |
46 | Returns:
47 | API配置字典
48 | """
49 | return _api_config.copy()
50 |
51 |
52 | def is_pdf_api_configured() -> bool:
53 | """
54 | 检查PDF API是否已配置
55 |
56 | Returns:
57 | 是否已配置
58 | """
59 | return _api_config.get("pdf_extraction_endpoint") is not None
60 |
61 |
62 | async def validate_pdf_file(file_path: str) -> Dict[str, Any]:
63 | """
64 | 验证PDF文件是否符合要求
65 |
66 | Args:
67 | file_path: PDF文件路径
68 |
69 | Returns:
70 | 验证结果
71 | """
72 | import os
73 | from pathlib import Path
74 |
75 | try:
76 | import asyncio
77 | from pathlib import Path
78 | file_path_obj = Path(file_path)
79 |
80 | # 使用异步方式检查文件是否存在
81 | file_exists = await asyncio.to_thread(file_path_obj.exists)
82 | if not file_exists:
83 | return {
84 | "valid": False,
85 | "error": "文件不存在"
86 | }
87 |
88 | # 检查文件扩展名
89 | if file_path_obj.suffix.lower() not in _api_config["supported_formats"]:
90 | return {
91 | "valid": False,
92 | "error": f"不支持的文件格式: {file_path_obj.suffix}"
93 | }
94 |
95 | # 使用异步方式检查文件大小
96 | file_stat = await asyncio.to_thread(file_path_obj.stat)
97 | file_size = file_stat.st_size
98 | if file_size > _api_config["max_file_size"]:
99 | return {
100 | "valid": False,
101 | "error": f"文件过大: {file_size} > {_api_config['max_file_size']}"
102 | }
103 |
104 | return {
105 | "valid": True,
106 | "file_size": file_size,
107 | "format": file_path_obj.suffix.lower()
108 | }
109 |
110 | except Exception as e:
111 | return {
112 | "valid": False,
113 | "error": f"文件验证失败: {str(e)}"
114 | }
115 |
116 |
117 | def create_pdf_api_headers() -> Dict[str, str]:
118 | """
119 | 创建PDF API请求头(基于提供的API示例)
120 |
121 | Returns:
122 | 请求头字典
123 | """
124 | return {
125 | "accept": "application/json", # 与示例一致
126 | "User-Agent": "LangGraph-PDF-Extractor/1.0"
127 | # Content-Type 会由 aiohttp 自动设置为 multipart/form-data
128 | }
129 |
130 |
131 | def get_pdf_api_params() -> Dict[str, str]:
132 | """
133 | 获取PDF API查询参数(基于提供的API示例)
134 |
135 | Returns:
136 | API查询参数字典
137 | """
138 | return {
139 | "parse_method": "auto",
140 | "is_json_md_dump": "false",
141 | "output_dir": "output",
142 | "return_layout": "false",
143 | "return_info": "false",
144 | "return_content_list": "false",
145 | "return_images": "false"
146 | }
147 |
148 |
149 | def build_pdf_api_url(base_endpoint: str, custom_params: Optional[Dict[str, str]] = None) -> str:
150 | """
151 | 构建完整的PDF API URL
152 |
153 | Args:
154 | base_endpoint: 基础端点URL(不包含查询参数)
155 | custom_params: 自定义参数(可选)
156 |
157 | Returns:
158 | 完整的API URL
159 | """
160 | params = get_pdf_api_params()
161 |
162 | # 如果有自定义参数,覆盖默认参数
163 | if custom_params:
164 | params.update(custom_params)
165 |
166 | # 构建查询字符串
167 | query_string = "&".join([f"{k}={v}" for k, v in params.items()])
168 |
169 | # 处理base_endpoint是否已经包含查询参数
170 | separator = "&" if "?" in base_endpoint else "?"
171 |
172 | return f"{base_endpoint}{separator}{query_string}"
--------------------------------------------------------------------------------
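What `build_pdf_api_url` produces for the configured endpoint; the override shows how a custom parameter replaces a default:

```python
from src.config.api_config import build_pdf_api_url, get_pdf_api_config

endpoint = get_pdf_api_config()["pdf_extraction_endpoint"]
url = build_pdf_api_url(endpoint, custom_params={"return_content_list": "true"})
print(url)
# http://183.203.184.233:8888/pdf_parse_supplychain?parse_method=auto&...&return_content_list=true
```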
/src/nodes/cross_validation.py:
--------------------------------------------------------------------------------
1 | """
2 | 交叉校验节点
3 |
4 | 对核心信息进行交叉校验:
5 | 1. 姓名一致性校验
6 | 2. 身份证一致性校验
7 | 3. 基于rules文件夹中的交叉检验规则
8 | """
9 |
10 | from typing import Dict, Any
11 | from src.graph.state import AuditState
12 | from src.tools.ai_utils import cross_validate_materials_with_ai
13 |
14 |
15 | def cross_validation_node(state: AuditState) -> Dict[str, Any]:
16 | """
17 | 完全无缓存的交叉校验节点 - 每次都处理全新数据
18 |
19 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的
20 | """
21 | try:
22 | print(f"🔍 开始无缓存交叉校验节点...")
23 |
24 | # 🔍 获取核心信息(优先使用核心信息提取节点的结果)
25 | core_info = state.get("core_info")
26 | all_extracted_info = state.get("api_extraction_results", {}) or state.get("extracted_content", {})
27 | current_step = state.get("current_step", "未知")
28 |
29 | print(f"🔍 当前状态详细信息:")
30 | print(f" 当前步骤: {current_step}")
31 | print(f" 核心信息状态: {'有效' if core_info else '无'}")
32 | print(f" 提取材料数量: {len(all_extracted_info)}")
33 |
34 | # 🚨 优先检查核心信息提取节点的结果
35 | if not core_info:
36 | print(f"⚠️ 没有找到核心信息,检查核心信息提取节点是否正常执行")
37 | raise Exception("未找到任何核心信息用于交叉校验")
38 |
39 | # 🔍 验证核心信息的数据结构
40 | if not isinstance(core_info, dict):
41 | print(f"⚠️ 核心信息格式不正确: {type(core_info)}")
42 | # 尝试转换为字典格式
43 | if hasattr(core_info, 'name') and hasattr(core_info, 'id_number'):
44 | core_info = {
45 | "attachments": {
46 | "name": getattr(core_info, 'name', ''),
47 | "id_number": getattr(core_info, 'id_number', ''),
48 | "extracted_from": getattr(core_info, 'extracted_from', [])
49 | }
50 | }
51 | else:
52 | raise Exception(f"核心信息格式不可识别: {type(core_info)}")
53 |
54 | # 🔍 统计有效的核心信息条目
55 | valid_entries = 0
56 | name_sources = []
57 | id_sources = []
58 |
59 | for category, info in core_info.items():
60 | if isinstance(info, dict) and (info.get('name') or info.get('id_number')):
61 | valid_entries += 1
62 | if info.get('name'):
63 | name_sources.append(f"{category}: {info['name']}")
64 | if info.get('id_number'):
65 | id_sources.append(f"{category}: {info['id_number']}")
66 |
67 | print(f"📋 有效核心信息条目: {valid_entries}")
68 | print(f"📋 姓名信息来源: {len(name_sources)} 项")
69 | print(f"📋 身份证信息来源: {len(id_sources)} 项")
70 |
71 | if valid_entries == 0:
72 | print(f"⚠️ 所有核心信息条目都为空,无法进行交叉校验")
73 | raise Exception("所有核心信息条目都为空,无法进行交叉校验")
74 |
75 | # 🚨 直接执行交叉验证 - 不使用缓存,使用核心信息提取节点的结果
76 | cross_validation_results = cross_validate_materials_with_ai(all_extracted_info, core_info)
77 |
78 | # 直接转换AI结果为标准格式 - 不存入缓存
79 | converted_results = []
80 | for ai_result in cross_validation_results:
81 | status = ai_result.get('status', 'WARNING')
82 | if status == 'PASS' or '✅' in status:
83 | result_status = '✅通过'
84 | elif status == 'WARNING' or '⚠️' in status:
85 | result_status = '⚠️警告'
86 | elif status == 'ERROR' or '❌' in status:
87 | result_status = '❌不通过'
88 | else:
89 | result_status = '⚠️警告'
90 |
91 | converted_result = {
92 | "rule_name": ai_result.get('rule_name', '未知规则'),
93 | "result": result_status,
94 | "details": ai_result.get('message', 'AI交叉校验完成'),
95 | "priority": ai_result.get('priority', '极高'),
96 | "material_type": "AI交叉校验",
97 | "rule_content": ai_result.get('rule_content', ''),
98 | "timestamp": _get_current_timestamp()
99 | }
100 | converted_results.append(converted_result)
101 |
102 | # 🚨 直接返回结果,不使用任何缓存机制
103 | print(f"✅ 无缓存交叉校验完成,生成{len(converted_results)}项结果")
104 |
105 | return {
106 | "cross_validation": converted_results,
107 | "current_step": "cross_validation_completed",
108 | "processing_logs": [
109 | f"交叉校验完成,生成{len(converted_results)}项结果",
110 | f"基于{valid_entries}项有效核心信息进行校验",
111 | "已完全取消缓存机制,确保数据全新"
112 | ]
113 | }
114 |
115 | except Exception as e:
116 | print(f"❌ 交叉校验失败: {str(e)}")
117 | return {
118 | "current_step": "cross_validation_failed",
119 | "error_message": f"交叉校验失败: {str(e)}",
120 | "processing_logs": [f"交叉校验失败: {str(e)}"]
121 | }
122 |
123 |
124 | def _get_current_timestamp() -> str:
125 | """获取当前时间戳"""
126 | from datetime import datetime
127 | return datetime.now().isoformat()
--------------------------------------------------------------------------------
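A minimal sketch of the state shape this node consumes. The category keys and values are fabricated, and running it end-to-end requires a configured AI backend (e.g. GOOGLE_API_KEY):

```python
from src.nodes.cross_validation import cross_validation_node

# Fabricated state: per-category core-information entries
state = {
    "current_step": "core_info_extraction_completed",
    "core_info": {
        "教育经历": {"name": "张三", "id_number": "110101199001011234", "extracted_from": ["degree.pdf"]},
        "工作经历": {"name": "张三", "id_number": "", "extracted_from": ["employment.pdf"]},
    },
    "api_extraction_results": {},
}

update = cross_validation_node(state)
print(update["current_step"])  # "cross_validation_completed" or "cross_validation_failed"
```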
/src/nodes/report_generation.py:
--------------------------------------------------------------------------------
1 | """
2 | 报告生成节点 - 完全无缓存版本
3 |
4 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的
5 | """
6 |
7 | from typing import Dict, Any
8 | from src.graph.state import AuditState
9 |
10 |
11 | def report_generation_node(state: AuditState) -> Dict[str, Any]:
12 | """
13 | 完全无缓存的报告生成节点 - 每次都处理全新数据
14 |
15 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的
16 | """
17 | try:
18 | print(f"📄 开始无缓存报告生成...")
19 |
20 | # 生成报告ID
21 | from datetime import datetime
22 | import uuid
23 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
24 | report_id = f"AUDIT_{timestamp}_{str(uuid.uuid4())[:8].upper()}"
25 |
26 | # 直接获取当前状态的所有数据 - 不使用任何缓存
27 | material_validation = state.get("material_validation", {})
28 | cross_validation = state.get("cross_validation", [])
29 |
30 | print(f"🔍 当前状态数据:")
31 | print(f" 材料校验结果: {len(material_validation)} 项")
32 | print(f" 交叉校验结果: {len(cross_validation)} 项")
33 |
34 | # 直接整合所有数据 - 不做缓存检查
35 | all_results = []
36 |
37 | # 整合material_validation数据
38 | for material_type, results in material_validation.items():
39 | if isinstance(results, list):
40 | all_results.extend(results)
41 | elif results:
42 | all_results.append(results)
43 |
44 | # 整合cross_validation数据
45 | if isinstance(cross_validation, list):
46 | all_results.extend(cross_validation)
47 |
48 | if not all_results:
49 | print("⚠️ 未找到任何校验结果,生成空报告")
50 |
51 | print(f"📊 报告数据统计: 共{len(all_results)}项结果")
52 |
53 | # 直接生成HTML报告 - 不使用缓存的复杂逻辑
54 | html_report = _generate_html_report(all_results, report_id)
55 |
56 | # 保存报告文件
57 | report_path = f"audit_report_{timestamp}.html"
58 |
59 | if report_path and html_report:
60 | with open(report_path, 'w', encoding='utf-8') as f:
61 | f.write(html_report)
62 |
63 | print(f"✅ 报告已生成: {report_path}")
64 | else:
65 | raise Exception("报告路径或内容为空")
66 |
67 | return {
68 | "audit_report": html_report,
69 | "report_path": report_path,
70 | "current_step": "completed",
71 | "is_complete": True,
72 | "processing_logs": [
73 | f"报告生成完成: {report_id}",
74 | f"处理了{len(all_results)}项结果",
75 | "已完全取消缓存机制,确保数据全新",
76 | f"报告已保存至: {report_path}"
77 | ]
78 | }
79 |
80 | except Exception as e:
81 | print(f"❌ 报告生成失败: {str(e)}")
82 | return {
83 | "current_step": "report_generation_failed",
84 | "error_message": f"报告生成失败: {str(e)}"
85 | }
86 |
87 |
88 | def _generate_html_report(all_results: list, report_id: str) -> str:
89 | """
90 | 生成简化的HTML报告 - 完全无缓存机制
91 | """
92 | from datetime import datetime
93 |
94 | print(f"📊 报告生成使用数据,共{len(all_results)}项结果")
95 |
96 | # 按材料类型分组
97 | material_groups = {}
98 | for result in all_results:
99 | material_type = result.get('material_type', '未知类型')
100 | if material_type not in material_groups:
101 | material_groups[material_type] = []
102 | material_groups[material_type].append(result)
103 |
104 | # 统计数据
105 | error_count = sum(1 for r in all_results if r.get('result', '').startswith('❌'))
106 | warning_count = sum(1 for r in all_results if r.get('result', '').startswith('⚠️'))
107 | pass_count = sum(1 for r in all_results if r.get('result', '').startswith('✅'))
108 | total_validations = len(all_results)
109 |
110 | print(f"📊 统计: 错误{error_count}, 警告{warning_count}, 通过{pass_count}")
111 |
112 | # 生成基本的HTML报告结构
113 | html_template = f"""
114 |
115 |
116 |
117 |
118 |
119 | 职称评审材料审核报告 - {report_id}
120 |
132 |
133 |
134 |
139 |
140 |
141 |
142 |
总计
143 |
{len(all_results)} 项检查
144 |
145 |
146 |
错误
147 |
{error_count} 项
148 |
149 |
150 |
警告
151 |
{warning_count} 项
152 |
153 |
154 |
通过
155 |
{pass_count} 项
156 |
157 |
158 |
159 |
160 |
详细结果
161 | """
162 |
163 | # 添加材料组详情
164 | for material_type, results in material_groups.items():
165 | html_template += f"""
166 |
167 |
168 | """
169 | for result in results[:10]: # 限制显示数量
170 | result_class = "error" if result.get('result', '').startswith('❌') else "warning" if result.get('result', '').startswith('⚠️') else "pass"
171 | html_template += f"""
172 |
173 | {result.get('rule_name', '未知规则')}: {result.get('result', '未知')}
174 | {result.get('details', '无详情')}
175 |
176 | """
177 | html_template += "
"
178 |
179 | html_template += """
180 |
181 |
182 |
183 | """
184 |
185 | return html_template
--------------------------------------------------------------------------------
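A small sketch rendering a report from fabricated validation results (calling the private helper directly, just for illustration):

```python
from src.nodes.report_generation import _generate_html_report

# Made-up results in the dict shape the report expects
results = [
    {"material_type": "教育经历", "rule_name": "学历证书齐全", "result": "✅通过", "details": "检测到学位证书"},
    {"material_type": "AI交叉校验", "rule_name": "姓名一致性", "result": "❌不通过", "details": "两份材料姓名不一致"},
]

html = _generate_html_report(results, report_id="AUDIT_DEMO_0001")
with open("demo_report.html", "w", encoding="utf-8") as f:
    f.write(html)
```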
/src/config/model_config.py:
--------------------------------------------------------------------------------
1 | """
2 | 配置管理器
3 |
4 | 用于管理OCR API配置及环境变量
5 | """
6 |
7 | import os
8 | from pathlib import Path
9 | from typing import Dict, Optional
10 | import logging
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 | class ModelConfig:
15 | """配置管理器"""
16 |
17 | def __init__(self):
18 | self.project_root = Path(__file__).parent.parent.parent
19 | self.cache_dir = self.project_root / ".model_cache"
20 |
21 | # 智能初始化:检查是否在异步环境中
22 | try:
23 | import asyncio
24 | # 尝试获取当前任务,如果成功说明在异步环境中
25 | asyncio.current_task()
26 | logger.info("🔄 检测到异步环境,将延迟创建缓存目录")
27 | except RuntimeError:
28 | # 不在异步环境中,可以安全创建目录
29 | self.setup_cache_directories_sync()
30 | except Exception:
31 | # 如果检测失败,使用同步方式(向后兼容)
32 | self.setup_cache_directories_sync()
33 |
34 | async def setup_cache_directories(self):
35 | """设置缓存目录(异步版本)"""
36 | try:
37 | import asyncio
38 | # 使用异步方式创建目录
39 | await asyncio.to_thread(self.cache_dir.mkdir, parents=True, exist_ok=True)
40 | logger.info(f"📁 缓存目录: {self.cache_dir}")
41 |
42 | except Exception as e:
43 | logger.error(f"❌ 缓存目录设置失败: {e}")
44 |
45 | def setup_cache_directories_sync(self):
46 | """设置缓存目录(同步版本,仅用于初始化)"""
47 | try:
48 | # 创建本地缓存目录
49 | self.cache_dir.mkdir(parents=True, exist_ok=True)
50 | logger.info(f"📁 缓存目录: {self.cache_dir}")
51 |
52 | except Exception as e:
53 | logger.error(f"❌ 缓存目录设置失败: {e}")
54 |
55 | async def is_models_cached(self) -> bool:
56 | """检查缓存是否存在(异步版本)"""
57 | import asyncio
58 | return await asyncio.to_thread(self.cache_dir.exists)
59 |
60 | async def get_cache_size(self) -> str:
61 | """获取缓存目录大小(异步版本)"""
62 | try:
63 | import asyncio
64 | total_size = 0
65 |
66 | # 使用异步方式遍历文件
67 | async def calculate_size():
68 | nonlocal total_size
69 | paths = await asyncio.to_thread(list, self.cache_dir.rglob("*"))
70 | for path in paths:
71 | is_file = await asyncio.to_thread(path.is_file)
72 | if is_file:
73 | stat_result = await asyncio.to_thread(path.stat)
74 | total_size += stat_result.st_size
75 |
76 | await calculate_size()
77 |
78 | # 转换为可读格式
79 | if total_size < 1024:
80 | return f"{total_size} B"
81 | elif total_size < 1024**2:
82 | return f"{total_size/1024:.1f} KB"
83 | elif total_size < 1024**3:
84 | return f"{total_size/1024**2:.1f} MB"
85 | else:
86 | return f"{total_size/1024**3:.1f} GB"
87 |
88 | except Exception as e:
89 | logger.error(f"❌ 获取缓存大小失败: {e}")
90 | return "未知"
91 |
92 | async def clear_cache(self):
93 | """清理缓存(异步版本)"""
94 | try:
95 | import shutil
96 | import asyncio
97 | if self.cache_dir.exists():
98 | await asyncio.to_thread(shutil.rmtree, self.cache_dir)
99 | logger.info("🧹 缓存已清理")
100 | await self.setup_cache_directories()
101 | except Exception as e:
102 | logger.error(f"❌ 清理缓存失败: {e}")
103 |
104 | def clear_cache_sync(self):
105 | """清理缓存(同步版本)"""
106 | try:
107 | import shutil
108 | if self.cache_dir.exists():
109 | shutil.rmtree(self.cache_dir)
110 | logger.info("🧹 缓存已清理")
111 | self.setup_cache_directories_sync()
112 | except Exception as e:
113 | logger.error(f"❌ 清理缓存失败: {e}")
114 |
115 | async def get_status(self) -> Dict[str, str]:
116 | """获取配置状态(异步版本)"""
117 | cache_size = await self.get_cache_size()
118 | return {
119 | "cache_dir": str(self.cache_dir),
120 | "cache_size": cache_size,
121 | "ocr_api_enabled": "启用",
122 | }
123 |
124 | def get_status_sync(self) -> Dict[str, str]:
125 | """获取配置状态(同步版本)"""
126 | try:
127 | total_size = 0
128 | if self.cache_dir.exists():
129 | for path in self.cache_dir.rglob("*"):
130 | if path.is_file():
131 | total_size += path.stat().st_size
132 |
133 | # 转换为可读格式
134 | if total_size < 1024:
135 | cache_size = f"{total_size} B"
136 | elif total_size < 1024**2:
137 | cache_size = f"{total_size/1024:.1f} KB"
138 | elif total_size < 1024**3:
139 | cache_size = f"{total_size/1024**2:.1f} MB"
140 | else:
141 | cache_size = f"{total_size/1024**3:.1f} GB"
142 | except Exception as e:
143 | logger.error(f"❌ 获取缓存大小失败: {e}")
144 | cache_size = "未知"
145 |
146 | return {
147 | "cache_dir": str(self.cache_dir),
148 | "cache_size": cache_size,
149 | "ocr_api_enabled": "启用",
150 | }
151 |
152 |
153 | # 全局配置实例
154 | model_config = ModelConfig()
155 |
156 |
157 | async def setup_model_environment():
158 | """设置环境(在应用启动时调用,异步版本)"""
159 | logger.info("🔧 正在设置环境...")
160 |
161 | # 设置缓存目录
162 | await model_config.setup_cache_directories()
163 |
164 | # 打印状态信息
165 | status = await model_config.get_status()
166 | logger.info("📊 配置状态:")
167 | for key, value in status.items():
168 | logger.info(f" {key}: {value}")
169 |
170 | def setup_model_environment_sync():
171 | """设置环境(同步版本)"""
172 | logger.info("🔧 正在设置环境...")
173 |
174 | # 设置缓存目录
175 | model_config.setup_cache_directories_sync()
176 |
177 | # 打印状态信息
178 | status = model_config.get_status_sync()
179 | logger.info("📊 配置状态:")
180 | for key, value in status.items():
181 | logger.info(f" {key}: {value}")
182 |
183 |
184 | def print_model_help():
185 | """打印配置帮助信息"""
186 | help_text = """
187 | 🔧 OCR API配置选项:
188 |
189 | 环境变量设置:
190 | OCR_API_BASE_URL=http://183.203.184.233:8888 # OCR API地址
191 |
192 | 使用说明:
193 | 1. 启动OCR API服务
194 | 确保您的OCR API服务正在运行
195 | 默认地址: http://183.203.184.233:8888
196 |
197 | 2. 启动主应用
198 | python web_app_v2.py
199 |
200 | 缓存位置: {cache_dir}
201 | """.format(cache_dir=model_config.cache_dir)
202 |
203 | print(help_text)
204 |
205 |
206 | if __name__ == "__main__":
207 | print_model_help()
--------------------------------------------------------------------------------
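A minimal sketch of initializing the model environment at application startup (async path; the sync variant is `setup_model_environment_sync()`). Note that importing the module already instantiates the global `model_config`:

```python
import asyncio
from src.config.model_config import model_config, setup_model_environment

async def main() -> None:
    await setup_model_environment()
    print(await model_config.get_cache_size())  # e.g. "0 B" on a fresh checkout

asyncio.run(main())
```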
/src/tools/common_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | 通用工具
3 |
4 | 提供通用的工具函数:
5 | - 正则表达式提取
6 | - 数据清理和验证
7 | - HTML报告生成
8 | - 日志记录
9 | """
10 |
11 | import re
12 | from typing import Dict, Any, List, Optional, Union
13 | from pathlib import Path
14 | from src.models.state import CoreInfo, ValidationResult as StateValidationResult
15 |
16 | def extract_with_regex(content: str) -> tuple[str, str]:
17 | """使用正则表达式的备用提取方法(增强版)"""
18 | name = ""
19 | id_number = ""
20 |
21 | # 提取姓名(多种格式匹配)
22 | name_patterns = [
23 | r"姓[名]*[::]\s*([^\s\n\r\t]+)", # 姓名:
24 | r"申请人[::]\s*([^\s\n\r\t]+)", # 申请人:
25 | r"姓[\s]*名[\s]*[::]\s*([^\s\n\r\t]+)", # 姓 名:
26 | r"^([\u4e00-\u9fff]{2,4})[\s]*[男女]", # 中文姓名后面跟性别
27 | ]
28 |
29 | for pattern in name_patterns:
30 | name_match = re.search(pattern, content, re.MULTILINE)
31 | if name_match:
32 | potential_name = name_match.group(1).strip()
33 | # 验证姓名的合理性(中文字符2-4个字)
34 | if re.match(r'^[\u4e00-\u9fff]{2,4}$', potential_name):
35 | name = potential_name
36 | break
37 |
38 | # 提取身份证号(多种格式匹配)
39 | id_patterns = [
40 | r"身份证[号码]*[::]\s*(\d{17}[\dX])", # 身份证号:
41 | r"公民身份号码[::]\s*(\d{17}[\dX])", # 公民身份号码:
42 | r"ID[\s]*Number[\s]*[::]\s*(\d{17}[\dX])", # ID Number:
43 | r"(\d{17}[\dX])(?![\d])", # 直接匹配18位数字(排除更长数字)
44 | ]
45 |
46 | for pattern in id_patterns:
47 | id_match = re.search(pattern, content)
48 | if id_match:
49 | potential_id = id_match.group(1)
50 | # 验证身份证号格式
51 | if re.match(r'^\d{17}[\dX]$', potential_id):
52 | id_number = potential_id
53 | break
54 |
55 | if name or id_number:
56 | print(f"✅ 正则提取成功: 姓名='{name}', 身份证='{id_number}'")
57 | else:
58 | print("⚠️ 正则提取未找到有效信息")
59 |
60 | return name, id_number
61 |
62 | def generate_html_report(core_info: Optional[Union[CoreInfo, Dict[str, Any]]], validation_results: List[Any]) -> str:
63 | """生成HTML格式化报告"""
64 | # 处理core_info为None的情况
65 | if core_info is None:
66 | name = '未提取'
67 | id_number = '未提取'
68 | extracted_from = []
69 | else:
70 | # 支持CoreInfo对象和Dict两种类型
71 | if isinstance(core_info, dict):
72 | name = core_info.get('name', '') or '未提取'
73 | id_number = core_info.get('id_number', '') or '未提取'
74 | extracted_from = core_info.get('extracted_from', []) or []
75 | else:
76 | # CoreInfo对象
77 | name = getattr(core_info, 'name', None) or '未提取'
78 | id_number = getattr(core_info, 'id_number', None) or '未提取'
79 | extracted_from = getattr(core_info, 'extracted_from', []) or []
80 |
81 | html_template = f"""
82 |
83 |
84 |
85 |
86 |
87 | 职称评审材料审核报告
88 |
106 |
107 |
108 |
112 |
113 |
114 |
👤 核心信息
115 |
116 |
117 | 姓名: {name}
118 |
119 |
120 | 身份证号: {id_number}
121 |
122 |
123 | 信息来源: {', '.join(extracted_from) if extracted_from else '无'}
124 |
125 |
126 |
127 |
128 |
129 |
✅ 审核结果
"""
130 |
131 | if validation_results:
132 | for result in validation_results:
133 | # 处理不同的ValidationResult类型
134 | # 支持既有status属性,也支持result属性
135 | status = getattr(result, 'status', None) or getattr(result, 'result', 'UNKNOWN')
136 | rule_name = getattr(result, 'rule_name', '未知规则')
137 | message = getattr(result, 'message', None) or getattr(result, 'details', '无详细信息')
138 |
139 | # 统一处理status格式
140 | if '✅' in status or status == 'PASS':
141 | status_normalized = 'pass'
142 | status_display = '✅通过'
143 | elif '⚠️' in status or status == 'WARNING':
144 | status_normalized = 'warning'
145 | status_display = '⚠️警告'
146 | elif '❌' in status or status == 'ERROR':
147 | status_normalized = 'error'
148 | status_display = '❌不通过'
149 | else:
150 | status_normalized = 'unknown'
151 | status_display = status
152 |
153 | status_class = f"result-{status_normalized}"
154 | badge_class = f"badge-{status_normalized}"
155 |
156 | html_template += f"""
157 |
158 |
{rule_name}
159 |
{status_display}
160 |
{message}
161 |
"""
162 | else:
163 | html_template += "
无审核结果
"
164 |
165 | html_template += """
166 |
167 |
168 |
171 |
172 | """
173 |
174 | return html_template
--------------------------------------------------------------------------------
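`extract_with_regex` in action on a fabricated snippet (the ID number below is a made-up placeholder):

```python
from src.tools.common_utils import extract_with_regex

text = "姓名:李四\n公民身份号码:11010119900101123X"
name, id_number = extract_with_regex(text)
print(name, id_number)  # 李四 11010119900101123X
```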
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "langgraph-audit-system"
7 | version = "0.1.0"
8 | description = "An enterprise-level intelligent title evaluation material review system built on the LangGraph framework"
9 | authors = [
10 | {name = "LangGraph Audit Team", email = "team@langgraph-audit.com"}
11 | ]
12 | license = {text = "MIT"}
13 | readme = "README.md"
14 | requires-python = ">=3.10,<3.13"
15 | keywords = ["langgraph", "audit", "ai", "workflow", "title-evaluation"]
16 | classifiers = [
17 | "Development Status :: 4 - Beta",
18 | "Intended Audience :: Developers",
19 | "License :: OSI Approved :: MIT License",
20 | "Programming Language :: Python :: 3",
21 | "Programming Language :: Python :: 3.10",
22 | "Programming Language :: Python :: 3.11",
23 | "Programming Language :: Python :: 3.12",
24 | "Topic :: Software Development :: Libraries :: Python Modules",
25 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
26 | ]
27 |
28 | # Core dependencies for production
29 | dependencies = [
30 | # LangGraph core dependencies
31 | "langgraph>=0.2.0",
32 | "langsmith>=0.1.0",
33 | "langtrace-python-sdk>=2.0.0",
34 | "pydantic>=2.0.0",
35 | "typing-extensions>=4.0.0",
36 | "langchain-core>=0.1.0", # LangGraph core dependency
37 | "langgraph-cli>=0.1.0", # LangGraph development tools
38 |
39 | # LangGraph Redis integration
40 | "langgraph-checkpoint-redis>=0.1.0",
41 |
42 | # PostgreSQL integration
43 | "psycopg[binary,pool]>=3.1.0",
44 | "asyncpg>=0.29.0",
45 |
46 | # AI API integration
 47 |     "google-genai>=1.33.0",
48 |
49 | # Environment and configuration
50 | "python-dotenv>=1.0.0",
51 | "pyyaml>=6.0",
52 |
53 | # File processing utilities
54 | "pathlib2",
55 | "zipfile36>=0.1.0",
56 | "python-magic>=0.4.0",
57 | "Pillow>=10.0.0",
58 |
59 | # Web framework
60 | "fastapi>=0.104.0",
61 | "uvicorn>=0.24.0",
62 | "python-multipart>=0.0.6",
63 | "sse-starlette>=1.6.0", # Server-Sent Events support
64 | "starlette>=0.27.0",
65 |
66 | # Data processing
67 | "pandas>=2.0.0",
68 | "numpy>=1.24.0",
69 |
70 | # HTML report generation
71 | "jinja2>=3.1.0",
72 | "weasyprint>=60.0",
73 |
74 | # Monitoring and logging
75 | "loguru>=0.7.0",
76 | "prometheus-client>=0.19.0",
77 |
78 | # OCR and computer vision
79 | "pytesseract>=0.3.10",
80 | "opencv-python>=4.8.0",
81 |
82 | # Database support
83 | "sqlalchemy>=2.0.0",
84 | "alembic>=1.12.0",
85 |
86 | # Async processing
87 | "aiofiles>=23.2.0",
88 | "aiohttp>=3.8.0",
89 | "celery>=5.3.0",
90 | "redis>=5.0.0",
91 |
92 | # Document processing
93 | "markdown>=3.5.0",
94 | "markdownify>=0.11.0",
95 | ]
96 |
97 | [project.optional-dependencies]
98 | # Development dependencies
99 | dev = [
100 | "pytest>=7.4.0",
101 | "pytest-asyncio>=0.21.0",
102 | "pytest-cov>=4.1.0",
103 | "black>=23.0.0",
104 | "isort>=5.12.0",
105 | "flake8>=6.0.0",
106 | "mypy>=1.6.0",
107 | ]
108 |
109 | # Testing dependencies
110 | test = [
111 | "pytest>=7.4.0",
112 | "pytest-asyncio>=0.21.0",
113 | "pytest-cov>=4.1.0",
114 | "pytest-mock>=3.11.0",
115 | "httpx>=0.25.0", # for testing FastAPI endpoints
116 | ]
117 |
118 | # Documentation dependencies
119 | docs = [
120 | "mkdocs>=1.5.0",
121 | "mkdocs-material>=9.4.0",
122 | "mkdocstrings[python]>=0.23.0",
123 | ]
124 |
125 | # Full development environment
126 | all = [
127 | "langgraph-audit-system[dev,test,docs]"
128 | ]
129 |
130 | [project.urls]
131 | Homepage = "https://github.com/your-org/langgraph-audit-system"
132 | Documentation = "https://your-org.github.io/langgraph-audit-system"
133 | Repository = "https://github.com/your-org/langgraph-audit-system.git"
134 | Issues = "https://github.com/your-org/langgraph-audit-system/issues"
135 |
136 | [project.scripts]
137 | # Command line entry points
138 | langgraph-audit = "src.agent:main"
139 | audit-debug = "debug_langsmith:main"
140 | check-health = "check_health:main"
141 |
142 | [tool.hatch.build.targets.wheel]
143 | packages = ["src"]
144 |
145 | [tool.hatch.build.targets.sdist]
146 | include = [
147 | "src/",
148 | "rules/",
149 | "test_data/",
150 | "README.md",
151 | "langgraph.json",
152 | "pyproject.toml",
153 | ]
154 | exclude = [
155 | "**/__pycache__/",
156 | "**/*.pyc",
157 | "**/*.pyo",
158 | "**/*.orig",
159 | "**/*.rej",
160 | "**/*~",
161 | "**/#*#",
162 | "**/.#*",
163 | ".git/",
164 | ".pytest_cache/",
165 | ".coverage",
166 | ]
167 |
168 | # Black code formatting configuration
169 | [tool.black]
170 | line-length = 88
171 | target-version = ["py310", "py311", "py312"]
172 | include = '\.pyi?$'
173 | extend-exclude = '''
174 | /(
175 | # directories
176 | \.eggs
177 | | \.git
178 | | \.hg
179 | | \.mypy_cache
180 | | \.tox
181 | | \.venv
182 | | _build
183 | | buck-out
184 | | build
185 | | dist
186 | )/
187 | '''
188 |
189 | # isort import sorting configuration
190 | [tool.isort]
191 | profile = "black"
192 | multi_line_output = 3
193 | line_length = 88
194 | known_first_party = ["src"]
195 | known_third_party = ["langgraph", "langsmith", "pydantic", "fastapi"]
196 |
197 | # Flake8 linting configuration
198 | [tool.flake8]
199 | max-line-length = 88
200 | extend-ignore = ["E203", "W503", "E501"]
201 | exclude = [
202 | ".git",
203 | "__pycache__",
204 | "build",
205 | "dist",
206 | ".eggs",
207 | "*.egg-info",
208 | ".venv",
209 | ".pytest_cache",
210 | ]
211 |
212 | # MyPy type checking configuration
213 | [tool.mypy]
214 | python_version = "3.10"
215 | warn_return_any = true
216 | warn_unused_configs = true
217 | disallow_untyped_defs = true
218 | disallow_incomplete_defs = true
219 | check_untyped_defs = true
220 | disallow_untyped_decorators = true
221 | no_implicit_optional = true
222 | warn_redundant_casts = true
223 | warn_unused_ignores = true
224 | warn_no_return = true
225 | warn_unreachable = true
226 | strict_equality = true
227 |
228 | [[tool.mypy.overrides]]
229 | module = [
230 | "pytesseract",
231 | "cv2",
232 | "weasyprint",
233 | "celery",
234 | "redis",
235 | ]
236 | ignore_missing_imports = true
237 |
238 | # Pytest configuration
239 | [tool.pytest.ini_options]
240 | minversion = "7.0"
241 | addopts = "-ra -q --strict-markers --strict-config"
242 | testpaths = ["tests"]
243 | python_files = ["test_*.py", "*_test.py"]
244 | python_classes = ["Test*"]
245 | python_functions = ["test_*"]
246 | markers = [
247 | "slow: marks tests as slow (deselect with '-m \"not slow\"')",
248 | "integration: marks tests as integration tests",
249 | "unit: marks tests as unit tests",
250 | "langsmith: marks tests that require LangSmith API",
251 | ]
252 |
253 | # Coverage configuration
254 | [tool.coverage.run]
255 | source = ["src"]
256 | branch = true
257 | omit = [
258 | "*/tests/*",
259 | "*/test_*",
260 | "*/__pycache__/*",
261 | "*/migrations/*",
262 | ]
263 |
264 | [tool.coverage.report]
265 | precision = 2
266 | exclude_lines = [
267 | "pragma: no cover",
268 | "def __repr__",
269 | "if self.debug:",
270 | "if settings.DEBUG",
271 | "raise AssertionError",
272 | "raise NotImplementedError",
273 | "if 0:",
274 | "if __name__ == .__main__.:",
275 | "class .*\\bProtocol\\):",
276 | "@(abc\\.)?abstractmethod",
277 | ]
--------------------------------------------------------------------------------
/src/graph/workflow.py:
--------------------------------------------------------------------------------
1 | """
2 | 主要的职称评审材料审核工作流定义 - 完全无缓存版本
3 |
4 | 🚨 已完全取消缓存机制,确保每个节点传输的信息都是全新的、一次性的
5 |
6 | 包括:
7 | 1. ZIP解压和文件夹验证
8 | 2. PDF内容提取和核心信息提取
9 | 3. 规则集加载和提取(并行处理)
10 | 4. 规则校验和交叉验证
11 | 5. 报告生成
12 |
13 | 只包含一个主工作流:create_audit_workflow()
14 | """
15 |
16 | # LangGraph 核心导入 - 移除缓存相关的导入
17 | from langgraph.graph import StateGraph, START, END # type: ignore
18 |
19 | # 导入 RetryPolicy
20 | try:
21 | from langgraph.types import RetryPolicy # type: ignore
22 | RETRY_POLICY_AVAILABLE = True
23 | except ImportError:
24 | RetryPolicy = None
25 | RETRY_POLICY_AVAILABLE = False
26 |
27 | # 已完全移除 checkpointer 和内存存储器相关导入
28 |
29 | from .state import AuditState
30 | from .edges import (
31 | check_pdf_extraction_status,
32 | create_parallel_branches, # 并行分支路由
33 | after_rules_loaded, # 规则加载后路由
34 | check_rules_for_validation, # 规则验证路由
35 | check_pdf_extraction_for_parallel_processing # PDF提取并行分发路由
36 | )
37 | from src.tools.langsmith_utils import (
38 | setup_langsmith_environment,
39 | event_logger,
40 | with_langsmith_tracing
41 | )
42 |
43 |
44 | @with_langsmith_tracing
45 | def create_audit_workflow():
46 | """
47 | 创建完全无缓存的职称评审材料审核工作流
48 |
49 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的
50 |
51 | 工作流程:
52 | ZIP解压 -> 并行分支:
53 | 分支1: PDF内容提取 -> 核心信息提取 -> 交叉校验
54 | 分支2: 规则集加载 -> 规则提取 -> 汇入验证
55 | 最后: 报告生成
56 |
57 | Returns:
58 | 编译后的LangGraph工作流(无缓存)
59 | """
60 | # 延迟导入以避免循环依赖
61 | from src.nodes import (
62 | file_processing_node,
63 | core_info_extraction_node,
64 | validation_node,
65 | report_generation_node
66 | )
67 | from src.nodes.pdf_extraction import pdf_extraction_node
68 | from src.nodes.cross_validation import cross_validation_node
69 | from src.nodes.rules_processing import load_rules_node, extract_rules_node
70 |
71 | # 初始化LangSmith环境
72 | setup_langsmith_environment()
73 |
74 | workflow = StateGraph(AuditState)
75 |
76 | # 根据LangGraph最佳实践添加重试策略(仅在可用时)
77 | retry_policy_io = None
78 | retry_policy_ai = None
79 | retry_policy_general = None
80 |
81 | if RETRY_POLICY_AVAILABLE and RetryPolicy is not None:
82 | retry_policy_io = RetryPolicy(max_attempts=3, retry_on=[IOError, FileNotFoundError])
83 | retry_policy_ai = RetryPolicy(max_attempts=5, retry_on=[TimeoutError, ConnectionError])
84 | retry_policy_general = RetryPolicy(max_attempts=2)
85 |
86 | # 添加所有节点并配置重试策略
87 | workflow.add_node(
88 | "file_processing",
89 | _wrap_node_with_logging(file_processing_node, "file_processing"),
90 | retry_policy=retry_policy_io
91 | )
92 | workflow.add_node(
93 | "pdf_extraction",
94 | _wrap_node_with_logging(pdf_extraction_node, "pdf_extraction"),
95 | retry_policy=retry_policy_ai
96 | )
97 | workflow.add_node(
98 | "core_info_extraction",
99 | _wrap_node_with_logging(core_info_extraction_node, "core_info_extraction")
100 | )
101 | workflow.add_node(
102 | "validation",
103 | _wrap_node_with_logging(validation_node, "validation"),
104 | retry_policy=retry_policy_ai
105 | )
106 | workflow.add_node(
107 | "cross_validation",
108 | _wrap_node_with_logging(cross_validation_node, "cross_validation"),
109 | retry_policy=retry_policy_general
110 | )
111 | workflow.add_node(
112 | "report_generation",
113 | _wrap_node_with_logging(report_generation_node, "report_generation"),
114 | retry_policy=retry_policy_general
115 | )
116 | workflow.add_node(
117 | "load_rules",
118 | _wrap_node_with_logging(load_rules_node, "load_rules"),
119 | retry_policy=retry_policy_general
120 | )
121 | workflow.add_node(
122 | "extract_rules",
123 | _wrap_node_with_logging(extract_rules_node, "extract_rules"),
124 | retry_policy=retry_policy_ai
125 | )
126 |
127 |     # Define the workflow edges: adds parallel rule-set processing support
128 | workflow.add_edge(START, "file_processing")
129 |
130 |     # Fan out from file_processing into the parallel processing paths
131 | workflow.add_conditional_edges(
132 | "file_processing",
133 | create_parallel_branches,
134 |         ["pdf_extraction", "load_rules"] # supports parallel branches
135 | )
136 |
137 |     # Rule-processing branch
138 | workflow.add_conditional_edges(
139 | "load_rules",
140 | after_rules_loaded,
141 | {
142 | "extract_rules": "extract_rules",
143 | "rules_load_failed": END
144 | }
145 | )
146 |
147 |     # After rule extraction, hand the rules to validation via a conditional edge
148 | workflow.add_conditional_edges(
149 | "extract_rules",
150 | check_rules_for_validation,
151 |         ["validation", "cross_validation"] # supports parallel dispatch via the Send API
152 | )
153 |
154 |     # After PDF extraction, proceed to core information extraction (main path)
155 | workflow.add_conditional_edges(
156 | "pdf_extraction",
157 | check_pdf_extraction_status,
158 | {
159 | "pdf_extraction_success": "core_info_extraction",
160 | "pdf_extraction_failed": END
161 | }
162 | )
163 |
164 |     # 🛠️ Key fix: simplify the workflow wiring to avoid cache issues caused by multiple triggers
165 |     # Direct edges were removed; nodes are triggered only via conditional edges to keep data consistent
166 |
167 |     # Once validation and cross_validation finish, move on to report generation
168 | workflow.add_edge("validation", "report_generation")
169 | workflow.add_edge("cross_validation", "report_generation")
170 | workflow.add_edge("core_info_extraction", "report_generation")
171 |
172 | workflow.add_edge("report_generation", END)
173 |
174 |     # Compile the workflow - fully cache-free version
175 |     # 🚨 All checkpointer and in-memory store configuration has been removed,
176 |     # so the data passed between nodes is always fresh and single-use
177 | return workflow.compile()
178 |
179 |
180 |
181 |
182 |
183 | def _wrap_node_with_logging(node_func, node_name: str):
184 | """
185 |     Wrap a node function with LangSmith logging
186 | 
187 |     Args:
188 |         node_func: the node function
189 |         node_name: the node name
190 | 
191 |     Returns:
192 |         The wrapped node function
193 | """
194 | import asyncio
195 | import inspect
196 |
197 |     # Check whether the node function is async
198 |     if inspect.iscoroutinefunction(node_func):
199 |         # Async node wrapper
200 | async def async_wrapped_node(state):
201 | try:
202 |                 # Log node start
203 | event_logger.log_node_start(node_name, state)
204 |
205 |                 # Run the async node function
206 | result = await node_func(state)
207 |
208 |                 # Log node completion
209 | event_logger.log_node_complete(node_name, result)
210 |
211 | return result
212 |
213 | except Exception as e:
214 |                 # Log node error
215 | event_logger.log_node_error(node_name, e)
216 | raise
217 |
218 | return async_wrapped_node
219 | else:
220 |         # Sync node wrapper
221 | def sync_wrapped_node(state):
222 | try:
223 |                 # Log node start
224 | event_logger.log_node_start(node_name, state)
225 |
226 |                 # Run the node function
227 | result = node_func(state)
228 |
229 |                 # Log node completion
230 | event_logger.log_node_complete(node_name, result)
231 |
232 | return result
233 |
234 | except Exception as e:
235 |                 # Log node error
236 | event_logger.log_node_error(node_name, e)
237 | raise
238 |
239 | return sync_wrapped_node
240 |
241 |
242 |
243 |
244 |
245 | # Create the default workflow lazily to avoid circular imports
246 | default_workflow = None
247 |
248 | def get_default_workflow():
249 |     """Return the default workflow, creating it lazily."""
250 | global default_workflow
251 | if default_workflow is None:
252 | default_workflow = create_audit_workflow()
253 | return default_workflow
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
--------------------------------------------------------------------------------
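The fan-out used in `create_audit_workflow` relies on a LangGraph feature that is easy to miss: a conditional-edge router that returns a list of node names runs all of those nodes in parallel within one superstep. A minimal, self-contained sketch of the same wiring; all names (`DemoState`, `prepare`, `branch_a`, `branch_b`, `fan_out`) are invented for illustration:

```python
import operator
from typing import Annotated, TypedDict

from langgraph.graph import StateGraph, START, END


class DemoState(TypedDict):
    # operator.add merges updates from parallel branches instead of overwriting.
    logs: Annotated[list[str], operator.add]


def prepare(state: DemoState) -> dict:
    return {"logs": ["prepare done"]}


def branch_a(state: DemoState) -> dict:
    return {"logs": ["branch_a done"]}


def branch_b(state: DemoState) -> dict:
    return {"logs": ["branch_b done"]}


def fan_out(state: DemoState) -> list[str]:
    # Returning a list of node names runs both branches in parallel,
    # mirroring the ["pdf_extraction", "load_rules"] edge above.
    return ["branch_a", "branch_b"]


g = StateGraph(DemoState)
g.add_node("prepare", prepare)
g.add_node("branch_a", branch_a)
g.add_node("branch_b", branch_b)
g.add_edge(START, "prepare")
g.add_conditional_edges("prepare", fan_out, ["branch_a", "branch_b"])
g.add_edge("branch_a", END)
g.add_edge("branch_b", END)

app = g.compile()
print(app.invoke({"logs": []}))  # logs contains entries from both branches
```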
/src/graph/state.py:
--------------------------------------------------------------------------------
1 | """
2 | LangGraph workflow state management
3 | 
4 | Defines the state structures used by the audit flow:
5 | - AuditState: the main audit state
6 | - State hand-off rules between nodes
7 | - State serialization and deserialization
8 | - Concurrency-safe state management
9 | """
10 |
11 | from typing import Dict, List, Any, Optional, TypedDict, Annotated
12 | from dataclasses import dataclass, field
13 | from pathlib import Path
14 | import operator
15 |
16 |
17 | def step_reducer(existing: str, new: str) -> str:
18 |     """Reducer for the current_step field: last write wins, keeping it concurrency-safe."""
19 |     # For the step status, keep the most recent value (last write wins)
20 | return new if new else existing
21 |
22 |
23 | class AuditState(TypedDict):
24 |     """Audit workflow state definition (concurrency-safe).
25 | 
26 |     Note: the keys declared here must cover every field any node reads or writes;
27 |     fields not declared here may be dropped when LangGraph merges state,
28 |     so they all need to be declared here, once and consistently.
29 | """
30 |
31 |     # Input file information
32 |     uploaded_file: Optional[str] # path to the uploaded ZIP archive
33 |     file_type: str # file type (zip)
34 |     extraction_path: Optional[str] # root directory after ZIP extraction
35 |     extracted_files: Annotated[List[str], operator.add] # extracted file list (concurrency-safe)
36 |
37 |     # Folder structure validation
38 |     folder_validation: Dict[str, Any] # validation result for the 17 standard folders
39 |     folder_classification: Dict[str, List[str]] # folder classification {folder name: [.pdf file list]}
40 |
41 |     # PDF content extraction and analysis (new)
42 |     pdf_extraction_results: Dict[str, Any] # PDF extraction results
43 |     api_extraction_results: Dict[str, Any] # JSON results extracted via the API
44 |
45 |     # PDF API configuration (new)
46 |     pdf_api_endpoint: Optional[str] # PDF extraction API endpoint
47 |
48 |     # Content extraction and analysis
49 |     extracted_content: Dict[str, Any] # content extracted from the PDF files
50 |     content_analysis: Dict[str, Any] # AI-analyzed structured content
51 |     core_info: Optional[Dict[str, Any]] # core information (name, ID number, etc.)
52 |
53 |     # Validation results (reducers keep them concurrency-safe)
54 |     material_validation: Dict[str, List[Any]] # material validation results
55 |     cross_validation: Annotated[List[Any], operator.add] # cross-validation results (concurrency-safe)
56 |     validation_results: Annotated[List[Dict[str, Any]], operator.add] # all validation results (concurrency-safe)
57 |     # Detailed validation results and summary (consumed directly by the report node)
58 |     validation_results_detailed: Annotated[List[Dict[str, Any]], operator.add] # detailed validation results
59 |     validation_summary: Optional[Dict[str, Any]] # validation summary
60 |
61 |     # Rule-set processing (new parallel-processing support)
62 |     rules_data: Annotated[List[Dict[str, Any]], operator.add] # loaded rule-set data (concurrency-safe)
63 |     parsed_rules: List[Any] # 🚨 no reducer: rules are replaced rather than accumulated (supports RuleInfo objects and dict formats)
64 |     rules_by_category: Dict[str, List[Any]] # rule sets grouped into categories 1-17
65 |
66 |     # Cache management (new)
67 |     validation_cache: Annotated[List[Dict[str, Any]], operator.add] # validation result cache
68 |     cross_validation_cache: Annotated[List[Dict[str, Any]], operator.add] # cross-validation result cache
69 |
70 |     # Report generation
71 |     audit_report: Optional[str] # the generated audit report
72 |     report_path: Optional[str] # report file path
73 |     report_summary: Optional[Dict[str, Any]] # report summary (for front-end display)
74 |     quality_score: Optional[float] # report quality score
75 |     compliance_status: Optional[str] # compliance status (PASS/WARNING/FAIL)
76 |
77 |     # Processing statistics (optional, for debugging/display)
78 |     processing_stats: Optional[Dict[str, Any]] # processing statistics
79 |
80 |     # Flow control (reducers keep it concurrency-safe)
81 |     current_step: Annotated[str, step_reducer] # current step (concurrency-safe)
82 |     error_message: Optional[str] # error message
83 |     warnings: Annotated[List[str], operator.add] # warnings (concurrency-safe)
84 |     processing_logs: Annotated[List[str], operator.add] # processing logs (concurrency-safe)
85 |     is_complete: bool # whether the flow is complete
86 |
87 |     # Session management (LangGraph's official persistence support)
88 |     session_id: Optional[str] # session ID
89 |
90 |
91 | @dataclass
92 | class WorkflowConfig:
93 |     """Workflow configuration."""
94 |
95 |     # File handling configuration
96 |     max_file_size: int = 50 * 1024 * 1024 # 50MB (ZIP archive)
97 | supported_formats: List[str] = field(default_factory=lambda: ['.zip'])
98 |
99 |     # Folder validation configuration
100 | required_folders: List[str] = field(default_factory=lambda: [
101 | "1.教育经历", "2.工作经历", "3.继续教育(培训情况)", "4.学术技术兼职情况",
102 | "5.获奖情况", "6.获得荣誉称号情况", "7.主持参与科研项目(基金)情况",
103 | "8.主持参与工程技术项目情况", "9.论文", "10.著(译)作(教材)",
104 | "11.专利(著作权)情况", "12.主持参与指定标准情况",
105 | "13.成果被批示、采纳、运用和推广情况", "14.资质证书",
106 | "15.奖惩情况", "16.考核情况", "17.申报材料附件信息"
107 | ])
108 |
109 |     # PDF processing configuration
110 |     max_pdf_file_size: int = 20 * 1024 * 1024 # 20MB per PDF file
111 |     pdf_api_timeout: int = 60 # PDF API extraction timeout (seconds)
112 |     pdf_api_endpoint: Optional[str] = None # PDF extraction API endpoint
113 |
114 |     # AI processing configuration
115 |     ai_timeout: int = 300 # AI processing timeout (seconds)
116 |     max_retries: int = 3 # maximum number of retries
117 |
118 |     # Output configuration
119 | output_dir: str = 'output'
120 | report_template: str = 'templates/audit_report.html'
121 |
122 |
123 | def create_initial_state(
124 | uploaded_file: str,
125 | session_id: Optional[str] = None
126 | ) -> AuditState:
127 |     """Create the initial state (concurrency-safe)."""
128 |
129 | file_path = Path(uploaded_file)
130 | file_type = file_path.suffix.lower()
131 |
132 |     # Try to get the PDF API endpoint from config
133 |     pdf_api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain" # default
134 | try:
135 | from src.config.api_config import get_pdf_api_config
136 | api_config = get_pdf_api_config()
137 | configured_endpoint = api_config.get("pdf_extraction_endpoint")
138 | if configured_endpoint:
139 | pdf_api_endpoint = configured_endpoint
140 |             print(f"✅ Loaded PDF API endpoint from config: {pdf_api_endpoint}")
141 |         else:
142 |             print(f"⚠️ No PDF API endpoint found in config; using default: {pdf_api_endpoint}")
143 |     except ImportError:
144 |         print(f"⚠️ Could not import the API config module; using the default PDF API endpoint: {pdf_api_endpoint}")
145 |     except Exception as e:
146 |         print(f"⚠️ Failed to read the API config: {e}; using the default PDF API endpoint: {pdf_api_endpoint}")
147 |
148 |     # Make sure the API endpoint is not empty
149 | if not pdf_api_endpoint:
150 | pdf_api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain"
151 |         print(f"🔧 Forcing the default PDF API endpoint: {pdf_api_endpoint}")
152 |
153 | return AuditState(
154 |         # Input file information
155 | uploaded_file=uploaded_file,
156 | file_type=file_type,
157 | extraction_path=None,
158 | extracted_files=[],
159 |
160 |         # Folder structure validation
161 | folder_validation={},
162 | folder_classification={},
163 |
164 |         # PDF content extraction and analysis (new)
165 | pdf_extraction_results={},
166 | api_extraction_results={},
167 |
168 |         # PDF API configuration
169 | pdf_api_endpoint=pdf_api_endpoint,
170 |
171 |         # Content extraction and analysis
172 | extracted_content={},
173 | content_analysis={},
174 | core_info=None,
175 |
176 |         # Validation results (initialized as empty lists to support the reducers)
177 | material_validation={},
178 | cross_validation=[],
179 | validation_results=[],
180 | validation_results_detailed=[],
181 | validation_summary=None,
182 |
183 |         # Rule-set processing (initialized as empty lists to support the reducers)
184 | rules_data=[],
185 |         parsed_rules=[], # supports RuleInfo objects and dict formats
186 | rules_by_category={},
187 |
188 |         # Cache management (new)
189 | validation_cache=[],
190 | cross_validation_cache=[],
191 |
192 |         # Report generation
193 | audit_report=None,
194 | report_path=None,
195 | report_summary=None,
196 | quality_score=None,
197 | compliance_status=None,
198 | processing_stats=None,
199 |
200 |         # Flow control
201 | current_step="zip_extraction",
202 | error_message=None,
203 | warnings=[],
204 | processing_logs=[],
205 | is_complete=False,
206 |
207 |         # Session management
208 | session_id=session_id
209 | )
210 |
211 |
212 | def update_state_step(state: AuditState, step: str) -> Dict[str, Any]:
213 |     """Update the current step (concurrency-safe)."""
214 |     # Update the step via the reducer pattern instead of assigning directly
215 | return {"current_step": step}
216 |
217 |
218 | def add_warning(state: AuditState, warning: str) -> Dict[str, Any]:
219 |     """Add a warning message."""
220 | return {"warnings": [warning]}
221 |
222 |
223 | def set_error(state: AuditState, error: str) -> Dict[str, Any]:
224 |     """Set the error message."""
225 | return {"error_message": error}
226 |
227 |
228 | def mark_complete(state: AuditState) -> Dict[str, Any]:
229 |     """Mark the flow as complete (concurrency-safe)."""
230 | return {
231 | "is_complete": True,
232 | "current_step": "completed"
233 | }
234 |
--------------------------------------------------------------------------------
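The `Annotated[..., operator.add]` reducers above are what make the parallel branches safe: nodes return partial updates, and LangGraph merges same-key updates with the declared reducer instead of overwriting. A short usage sketch of the helpers above; the ZIP path and session ID are placeholders, and `create_initial_state` will print which PDF API endpoint it picked:

```python
from src.graph.state import (
    create_initial_state, update_state_step, add_warning, mark_complete,
)

state = create_initial_state("/tmp/materials.zip", session_id="demo-session")
print(state["current_step"])  # "zip_extraction"
print(state["file_type"])     # ".zip"

# Nodes never mutate the state in place; they return partial updates that
# LangGraph merges via the declared reducers. These helpers build such updates:
print(update_state_step(state, "pdf_extraction"))  # {'current_step': 'pdf_extraction'}
print(add_warning(state, "missing folder 5"))      # {'warnings': ['missing folder 5']}
print(mark_complete(state))  # {'is_complete': True, 'current_step': 'completed'}
```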
/src/tools/cache_manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Cache management utilities
3 | 
4 | Manage the cached results from the validation and cross_validation stages:
5 | 1. Group them by material type
6 | 2. Sort them by priority (high-priority errors first)
7 | 3. Filter out passing results (only warnings and errors are shown)
8 | 4. Produce structured report data
9 | """
10 |
11 | from typing import Dict, List, Any, Optional
12 | from collections import defaultdict
13 |
14 |
15 | class ValidationCacheManager:
16 |     """Validation cache manager."""
17 |
18 | def __init__(self):
19 | self.priority_order = {
20 | "极高": 1,
21 | "高": 2,
22 | "中": 3,
23 | "低": 4
24 | }
25 |
26 | self.status_order = {
27 | "❌不通过": 1,
28 | "⚠️警告": 2,
29 | "✅通过": 3
30 | }
31 |
32 | def organize_validation_cache(self, validation_cache: List[Dict[str, Any]],
33 | cross_validation_cache: List[Dict[str, Any]]) -> Dict[str, Any]:
34 | """
35 |         Organize the cached validation data
36 | 
37 |         Args:
38 |             validation_cache: cached material-validation results
39 |             cross_validation_cache: cached cross-validation results
40 | 
41 |         Returns:
42 |             The organized report data
43 | """
44 |         print("📊 Organizing cached validation data...")
45 |
46 |         # Group by material type
47 | material_groups = self._group_by_material_type(validation_cache)
48 |
49 |         # Append the cross-validation results
50 | if cross_validation_cache:
51 | material_groups["交叉校验"] = cross_validation_cache
52 |
53 |         # Filter and sort the results for each material type
54 | filtered_groups = {}
55 | total_issues = 0
56 |
57 | for material_type, results in material_groups.items():
58 |             # Drop passing results; keep only warnings and errors
59 | filtered_results = self._filter_non_passing_results(results)
60 |
61 | if filtered_results:
62 |                 # Sort by priority and status
63 | sorted_results = self._sort_results_by_priority(filtered_results)
64 | filtered_groups[material_type] = sorted_results
65 | total_issues += len(sorted_results)
66 |
67 |                 print(f"    📋 {material_type}: {len(sorted_results)} issue(s)")
68 |
69 |         # Generate statistics
70 | statistics = self._generate_statistics(validation_cache, cross_validation_cache)
71 |
72 |         print(f"✅ Cached data organized; found {total_issues} issue(s) that need attention")
73 |
74 | return {
75 | "material_groups": filtered_groups,
76 | "statistics": statistics,
77 | "total_issues": total_issues,
78 | "processed_at": self._get_current_timestamp()
79 | }
80 |
81 | def _group_by_material_type(self, validation_cache: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
82 |         """Group results by material type."""
83 | groups = defaultdict(list)
84 |
85 | for result in validation_cache:
86 | material_type = result.get("material_type", "未知类型")
87 | groups[material_type].append(result)
88 |
89 | return dict(groups)
90 |
91 | def _filter_non_passing_results(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
92 |         """Drop passing results, keeping only warnings and errors."""
93 | return [
94 | result for result in results
95 | if result.get("result", "").strip() != "✅通过"
96 | ]
97 |
98 | def _sort_results_by_priority(self, results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
99 |         """Sort by priority and status (high priority and errors first)."""
100 | def sort_key(result):
101 | priority = result.get("priority", "中")
102 | status = result.get("result", "⚠️警告")
103 |
104 | priority_score = self.priority_order.get(priority, 3)
105 | status_score = self.status_order.get(status, 2)
106 |
107 | return (priority_score, status_score)
108 |
109 | return sorted(results, key=sort_key)
110 |
111 | def _generate_statistics(self, validation_cache: List[Dict[str, Any]],
112 | cross_validation_cache: List[Dict[str, Any]]) -> Dict[str, Any]:
113 |         """Generate statistics."""
114 | all_results = validation_cache + cross_validation_cache
115 |
116 |         # Tally by status
117 | status_counts = defaultdict(int)
118 | priority_counts = defaultdict(int)
119 | material_counts = defaultdict(int)
120 |
121 | for result in all_results:
122 | status = result.get("result", "⚠️警告")
123 | priority = result.get("priority", "中")
124 | material_type = result.get("material_type", "未知类型")
125 |
126 | status_counts[status] += 1
127 | priority_counts[priority] += 1
128 | material_counts[material_type] += 1
129 |
130 | return {
131 | "total_results": len(all_results),
132 | "validation_results": len(validation_cache),
133 | "cross_validation_results": len(cross_validation_cache),
134 | "status_distribution": dict(status_counts),
135 | "priority_distribution": dict(priority_counts),
136 | "material_distribution": dict(material_counts),
137 | "issues_count": len([r for r in all_results if r.get("result", "").strip() != "✅通过"])
138 | }
139 |
140 | def get_report_summary(self, organized_data: Dict[str, Any]) -> Dict[str, Any]:
141 | """
142 |         Build the report summary
143 | 
144 |         Args:
145 |             organized_data: the organized data
146 | 
147 |         Returns:
148 |             Report summary information
149 | """
150 | material_groups = organized_data.get("material_groups", {})
151 | statistics = organized_data.get("statistics", {})
152 |
153 |         # Count each kind of issue
154 | error_count = sum(
155 | len([r for r in results if r.get("result", "").startswith("❌")])
156 | for results in material_groups.values()
157 | )
158 |
159 | warning_count = sum(
160 | len([r for r in results if r.get("result", "").startswith("⚠️")])
161 | for results in material_groups.values()
162 | )
163 |
164 |         # Highest-priority issues
165 | high_priority_issues = []
166 | for material_type, results in material_groups.items():
167 | for result in results:
168 | if result.get("priority") in ["极高", "高"]:
169 | high_priority_issues.append({
170 | "material_type": material_type,
171 | "rule_name": result.get("rule_name", ""),
172 | "details": result.get("details", ""),
173 | "priority": result.get("priority", "")
174 | })
175 |
176 | return {
177 | "total_materials_checked": len(statistics.get("material_distribution", {})),
178 | "total_issues": organized_data.get("total_issues", 0),
179 | "error_count": error_count,
180 | "warning_count": warning_count,
181 | "high_priority_count": len(high_priority_issues),
182 |             "high_priority_issues": high_priority_issues[:5], # only the first 5 are shown
183 | "material_issue_summary": {
184 | material_type: len(results)
185 | for material_type, results in material_groups.items()
186 | }
187 | }
188 |
189 | def _get_current_timestamp(self) -> str:
190 |         """Return the current timestamp."""
191 | from datetime import datetime
192 | return datetime.now().isoformat()
193 |
194 |
195 | # Global cache-manager instance
196 | cache_manager = ValidationCacheManager()
197 |
198 |
199 | def organize_audit_cache(state) -> Dict[str, Any]:
200 | """
201 |     Convenience function for organizing the audit cache data
202 | 
203 |     Args:
204 |         state: the audit state
205 | 
206 |     Returns:
207 |         The organized cache data
208 | """
209 | validation_cache = state.get("validation_cache", [])
210 | cross_validation_cache = state.get("cross_validation_cache", [])
211 |
212 | return cache_manager.organize_validation_cache(validation_cache, cross_validation_cache)
213 |
214 |
215 | def get_report_data_from_cache(state) -> Dict[str, Any]:
216 | """
217 |     Fetch the report data from the cache
218 | 
219 |     Args:
220 |         state: the audit state
221 | 
222 |     Returns:
223 |         The report data
224 | """
225 | organized_data = organize_audit_cache(state)
226 | summary = cache_manager.get_report_summary(organized_data)
227 |
228 | return {
229 | "organized_data": organized_data,
230 | "summary": summary,
231 | "cache_processed": True
232 | }
--------------------------------------------------------------------------------
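How the manager filters and orders results is easiest to see with toy records; the samples below are invented but use the key names (`material_type`, `result`, `priority`, `rule_name`, `details`) and the status/priority values the code above reads:

```python
from src.tools.cache_manager import cache_manager

validation_cache = [
    {"material_type": "9.论文", "result": "✅通过", "priority": "低",
     "rule_name": "格式检查", "details": "OK"},
    {"material_type": "9.论文", "result": "❌不通过", "priority": "极高",
     "rule_name": "署名一致性", "details": "first-author name mismatch"},
    {"material_type": "1.教育经历", "result": "⚠️警告", "priority": "中",
     "rule_name": "学位时间", "details": "graduation date missing"},
]
cross_validation_cache = [
    {"material_type": "交叉校验", "result": "❌不通过", "priority": "高",
     "rule_name": "姓名一致性", "details": "name differs across materials"},
]

organized = cache_manager.organize_validation_cache(validation_cache,
                                                    cross_validation_cache)
# The passing 论文 record is dropped; within each group, 极高 errors sort first.
print(organized["total_issues"])  # 3
print(cache_manager.get_report_summary(organized)["error_count"])  # 2
```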
/src/tools/langsmith_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | LangSmith integration utilities
3 | 
4 | Provide debugging, monitoring, and evaluation support for the LangGraph project
5 | """
6 |
7 | import os
8 | import uuid
9 | from typing import Dict, Any, Optional, List
10 | from datetime import datetime
11 | import getpass
12 |
13 | def setup_langsmith_environment():
14 | """
15 |     Set up the LangSmith environment variables
16 | 
17 |     Configures LangSmith tracing following LangGraph best practices
18 | """
19 | def _set_env(var: str):
20 |         """Set an environment variable safely."""
21 | if not os.environ.get(var):
22 |             # Not already set in the environment (e.g. via .env); prompt for input
23 |             value = getpass.getpass(f"Enter {var}: ")
24 | os.environ[var] = value
25 |
26 |     # Set the required API key
27 | _set_env("LANGSMITH_API_KEY")
28 |
29 |     # Configure LangSmith tracing
30 | os.environ["LANGCHAIN_TRACING_V2"] = "true"
31 | os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
32 | os.environ["LANGCHAIN_PROJECT"] = os.getenv("LANGCHAIN_PROJECT", "Audit_Workflow_Debug")
33 | os.environ["LANGSMITH_TRACING"] = "true"
34 |
35 |     print("✅ LangSmith environment configured")
36 |     print(f"📊 Project name: {os.environ['LANGCHAIN_PROJECT']}")
37 |
38 |
39 | def create_run_config(
40 | run_name: Optional[str] = None,
41 | tags: Optional[List[str]] = None,
42 | metadata: Optional[Dict[str, Any]] = None,
43 | thread_id: Optional[str] = None
44 | ) -> Dict[str, Any]:
45 | """
46 |     Create a LangGraph run config with LangSmith tracing support
47 | 
48 |     Args:
49 |         run_name: run name
50 |         tags: list of tags
51 |         metadata: metadata
52 |         thread_id: thread ID
53 | 
54 |     Returns:
55 |         Config dictionary
56 | """
57 | config = {}
58 |
59 |     # Generate a unique run ID
60 | if not config.get("run_id"):
61 | config["run_id"] = str(uuid.uuid4())
62 |
63 |     # Set the run name
64 | if run_name:
65 | config["run_name"] = run_name
66 | else:
67 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
68 | config["run_name"] = f"audit_workflow_{timestamp}"
69 |
70 |     # Set the tags
71 | default_tags = ["audit_workflow", "langgraph", "production"]
72 | if tags:
73 | config["tags"] = default_tags + tags
74 | else:
75 | config["tags"] = default_tags
76 |
77 |     # Set the metadata
78 | default_metadata = {
79 | "version": "1.0.0",
80 | "environment": os.getenv("ENVIRONMENT", "development"),
81 | "project": "职称评审材料审核系统"
82 | }
83 | if metadata:
84 | default_metadata.update(metadata)
85 | config["metadata"] = default_metadata
86 |
87 |     # Set the configurable parameters
88 | configurable = {}
89 | if thread_id:
90 | configurable["thread_id"] = thread_id
91 |
92 | if configurable:
93 | config["configurable"] = configurable
94 |
95 | return config
96 |
97 |
98 | def log_workflow_step(step_name: str, status: str, data: Optional[Dict] = None):
99 | """
100 |     Log a workflow step, to make debugging easier
101 | 
102 |     Args:
103 |         step_name: step name
104 |         status: status (started, completed, failed)
105 |         data: extra data
106 | """
107 | timestamp = datetime.now().isoformat()
108 | log_entry = {
109 | "timestamp": timestamp,
110 | "step": step_name,
111 | "status": status,
112 | "data": data or {}
113 | }
114 |
115 |     # Structured logging that LangSmith can capture
116 | print(f"🔍 [{timestamp}] {step_name.upper()}: {status}")
117 | if data:
118 |         print(f"    📝 data: {data}")
119 |
120 |
121 | def create_debug_config(breakpoints: Optional[List[str]] = None) -> Dict[str, Any]:
122 | """
123 |     Create a debug configuration
124 | 
125 |     Args:
126 |         breakpoints: list of breakpoints
127 | 
128 |     Returns:
129 |         Debug configuration
130 | """
131 | config = create_run_config(
132 | run_name="debug_session",
133 | tags=["debug", "development"],
134 | metadata={"mode": "debug"}
135 | )
136 |
137 | if breakpoints:
138 | config["breakpoints"] = breakpoints
139 |
140 |     # Raise the recursion limit so longer runs can be traced in full
141 | config["recursion_limit"] = 50
142 |
143 | return config
144 |
145 |
146 | def hide_sensitive_data(inputs: Dict[str, Any]) -> Dict[str, Any]:
147 | """
148 |     Hide sensitive data so it is not exposed in LangSmith
149 | 
150 |     Args:
151 |         inputs: input data
152 | 
153 |     Returns:
154 |         The sanitized data
155 | """
156 | copied = inputs.copy()
157 |
158 |     # Hide sensitive fields
159 | sensitive_fields = ["api_key", "password", "token", "secret"]
160 |
161 | for key in copied:
162 | if any(sensitive in key.lower() for sensitive in sensitive_fields):
163 | copied[key] = "***HIDDEN***"
164 |
165 |         # Truncate very long text content
166 |         if isinstance(copied[key], str) and len(copied[key]) > 1000:
167 |             copied[key] = copied[key][:100] + "...[content truncated]"
168 |
169 | return copied
170 |
171 |
172 | class LangSmithEventLogger:
173 |     """LangSmith event logger."""
174 |
175 | def __init__(self, project_name: str = "Audit_Workflow"):
176 | self.project_name = project_name
177 | self.events = []
178 |
179 | def log_node_start(self, node_name: str, state: Dict[str, Any]):
180 |         """Log node start."""
181 | event = {
182 | "type": "node_start",
183 | "node": node_name,
184 | "timestamp": datetime.now().isoformat(),
185 | "state_keys": list(state.keys())
186 | }
187 | self.events.append(event)
188 |         log_workflow_step(f"node start: {node_name}", "started")
189 |
190 | def log_node_complete(self, node_name: str, result: Dict[str, Any]):
191 |         """Log node completion."""
192 | event = {
193 | "type": "node_complete",
194 | "node": node_name,
195 | "timestamp": datetime.now().isoformat(),
196 | "result_keys": list(result.keys())
197 | }
198 | self.events.append(event)
199 |         log_workflow_step(f"node complete: {node_name}", "completed", {"result_keys": list(result.keys())})
200 |
201 | def log_node_error(self, node_name: str, error: Exception):
202 |         """Log a node error."""
203 | event = {
204 | "type": "node_error",
205 | "node": node_name,
206 | "timestamp": datetime.now().isoformat(),
207 | "error": str(error),
208 | "error_type": type(error).__name__
209 | }
210 | self.events.append(event)
211 |         log_workflow_step(f"node error: {node_name}", "failed", {"error": str(error)})
212 |
213 | def get_events(self) -> List[Dict[str, Any]]:
214 |         """Return all recorded events."""
215 | return self.events
216 |
217 | def clear_events(self):
218 |         """Clear all events."""
219 | self.events.clear()
220 |
221 |
222 | # Global event-logger instance
223 | event_logger = LangSmithEventLogger()
224 |
225 |
226 | def with_langsmith_tracing(func):
227 | """
228 |     Decorator: add LangSmith tracing to a function
229 | """
230 | def wrapper(*args, **kwargs):
231 | from langchain_core.tracers.context import tracing_v2_enabled
232 | from langsmith import Client
233 |
234 |         # Create a LangSmith client that hides sensitive data
235 | client = Client(
236 | hide_inputs=hide_sensitive_data,
237 | hide_outputs=hide_sensitive_data
238 | )
239 |
240 |         # Run the function inside the tracing context
241 | with tracing_v2_enabled(client=client):
242 | return func(*args, **kwargs)
243 |
244 | return wrapper
245 |
246 |
247 | def stream_with_debug(graph, inputs: Dict[str, Any], config: Optional[Dict[str, Any]] = None):
248 | """
249 |     Stream-execute the graph and print debug information
250 | 
251 |     Args:
252 |         graph: LangGraph graph instance
253 |         inputs: input data
254 |         config: configuration
255 | 
256 |     Yields:
257 |         Streamed output chunks
258 | """
259 | if not config:
260 | config = create_debug_config()
261 |
262 |     print(f"🚀 Starting workflow execution...")
263 |     print(f"📊 Run ID: {config.get('run_id')}")
264 |     print(f"🏷️ Tags: {config.get('tags', [])}")
265 |
266 | try:
267 | # 使用debug模式流式执行
268 | for chunk in graph.stream(inputs, config, stream_mode="debug"):
269 | print(f"🔍 调试信息: {chunk}")
270 | yield chunk
271 |
272 | except Exception as e:
273 |         print(f"❌ Execution failed: {str(e)}")
274 | event_logger.log_node_error("workflow", e)
275 | raise
276 |
277 |
278 | if __name__ == "__main__":
279 |     # Test the LangSmith configuration
280 | setup_langsmith_environment()
281 |
282 |     # Test config creation
283 | test_config = create_run_config(
284 | run_name="test_run",
285 | tags=["test"],
286 | metadata={"test": True}
287 | )
288 |     print(f"Test config: {test_config}")
--------------------------------------------------------------------------------
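A sketch of how these helpers are meant to compose with a compiled graph; the inputs dict is illustrative, and a `LANGSMITH_API_KEY` will be prompted for if it is not already set:

```python
from src.graph.workflow import create_audit_workflow
from src.tools.langsmith_utils import create_run_config, stream_with_debug

graph = create_audit_workflow()

config = create_run_config(
    run_name="audit_demo",
    tags=["demo"],                        # appended to the default tags
    metadata={"operator": "reviewer-1"},  # merged into the default metadata
    thread_id="session-42",               # lands under config["configurable"]
)

# Placeholder inputs; in practice this is the state built by create_initial_state.
inputs = {"uploaded_file": "/tmp/materials.zip", "file_type": ".zip"}
for chunk in stream_with_debug(graph, inputs, config):
    pass  # stream_with_debug already prints each debug chunk
```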
/src/config/redis.py:
--------------------------------------------------------------------------------
1 | """
2 | Redis configuration and connection management
3 | 
4 | Provides Redis connection configuration, health checks, and connection-pool management
5 | """
6 |
7 | import os
8 | import redis
9 | from typing import Optional, Dict, Any
10 | from dataclasses import dataclass
11 | from loguru import logger
12 |
13 | @dataclass
14 | class RedisConfig:
15 |     """Redis configuration."""
16 |
17 |     # Connection settings
18 | host: str = "localhost"
19 | port: int = 6379
20 | db: int = 0
21 | password: Optional[str] = None
22 |
23 |     # Connection-pool settings
24 | max_connections: int = 20
25 | retry_on_timeout: bool = True
26 |
27 |     # Timeout settings
28 | socket_connect_timeout: int = 5
29 | socket_timeout: int = 5
30 |
31 |     # TTL settings (for the LangGraph checkpointer)
32 |     default_ttl: int = 3600 # 1 hour, in seconds
33 | refresh_on_read: bool = True
34 |
35 |     # Key prefixes
36 | checkpoint_prefix: str = "langgraph:checkpoint:"
37 | store_prefix: str = "langgraph:store:"
38 |
39 | @classmethod
40 | def from_env(cls) -> "RedisConfig":
41 |         """Build a Redis config from environment variables."""
42 | return cls(
43 | host=os.getenv("REDIS_HOST", "localhost"),
44 | port=int(os.getenv("REDIS_PORT", "6379")),
45 | db=int(os.getenv("REDIS_DB", "0")),
46 | password=os.getenv("REDIS_PASSWORD"),
47 | max_connections=int(os.getenv("REDIS_MAX_CONNECTIONS", "20")),
48 | socket_connect_timeout=int(os.getenv("REDIS_SOCKET_CONNECT_TIMEOUT", "5")),
49 | socket_timeout=int(os.getenv("REDIS_SOCKET_TIMEOUT", "5")),
50 | default_ttl=int(os.getenv("REDIS_DEFAULT_TTL", "3600")),
51 | refresh_on_read=os.getenv("REDIS_REFRESH_ON_READ", "true").lower() == "true",
52 | checkpoint_prefix=os.getenv("REDIS_CHECKPOINT_PREFIX", "langgraph:checkpoint:"),
53 | store_prefix=os.getenv("REDIS_STORE_PREFIX", "langgraph:store:")
54 | )
55 |
56 | def get_connection_url(self) -> str:
57 |         """Return the Redis connection URL."""
58 | if self.password:
59 | return f"redis://:{self.password}@{self.host}:{self.port}/{self.db}"
60 | return f"redis://{self.host}:{self.port}/{self.db}"
61 |
62 | def get_ttl_config(self) -> Dict[str, Any]:
63 |         """Return the TTL config dict."""
64 | return {
65 |             "default_ttl": self.default_ttl // 60, # LangGraph's Redis integration expects minutes
66 | "refresh_on_read": self.refresh_on_read
67 | }
68 |
69 |
70 | class RedisManager:
71 |     """Redis connection manager."""
72 |
73 | def __init__(self, config: Optional[RedisConfig] = None):
74 | self.config = config or RedisConfig.from_env()
75 | self._redis_client: Optional[redis.Redis] = None
76 |
77 | @property
78 | def redis_client(self) -> redis.Redis:
79 |         """Return the Redis client (singleton)."""
80 | if self._redis_client is None:
81 |             # Create the connection pool
82 | pool = redis.ConnectionPool(
83 | host=self.config.host,
84 | port=self.config.port,
85 | db=self.config.db,
86 | password=self.config.password,
87 | max_connections=self.config.max_connections,
88 | retry_on_timeout=self.config.retry_on_timeout,
89 | socket_connect_timeout=self.config.socket_connect_timeout,
90 | socket_timeout=self.config.socket_timeout,
91 | decode_responses=True
92 | )
93 |             # Explicitly create a synchronous Redis client
94 | self._redis_client = redis.Redis(connection_pool=pool)
95 | return self._redis_client
96 |
97 | def test_connection(self) -> bool:
98 |         """Test the Redis connection."""
99 | try:
100 |             # Call ping() directly; the synchronous client returns a boolean
101 | result = self.redis_client.ping()
102 |             # Coerce the result to a boolean to be safe
103 | success = bool(result)
104 | if success:
105 |                 logger.info(f"✅ Redis connected: {self.config.host}:{self.config.port}")
106 | else:
107 |                 logger.error(f"❌ Redis ping returned False")
108 | return success
109 | except Exception as e:
110 |             logger.error(f"❌ Redis connection failed: {e}")
111 | return False
112 |
113 | def get_info(self) -> Dict[str, Any]:
114 |         """Fetch Redis server info."""
115 | try:
116 | info = self.redis_client.info()
117 |             # Make sure a dict was returned
118 | if isinstance(info, dict):
119 | return info
120 | else:
121 |                 logger.warning(f"Redis info() returned a non-dict type: {type(info)}")
122 | return {}
123 | except Exception as e:
124 |             logger.error(f"Failed to fetch Redis info: {e}")
125 | return {}
126 |
127 | def clear_cache(self, pattern: str = "*") -> int:
128 |         """Clear cached keys matching the pattern."""
129 | try:
130 | keys = self.redis_client.keys(pattern)
131 |             # Make sure keys is a list
132 | if isinstance(keys, (list, tuple)) and keys:
133 | deleted = self.redis_client.delete(*keys)
134 |                 # Safely handle the type of the delete result
135 | if isinstance(deleted, int):
136 | deleted_count = deleted
137 | else:
138 |                     # For non-integer results (including async types), fall back to 0
139 |                     logger.warning(f"Redis delete() returned a non-integer type: {type(deleted)}; defaulting to 0")
140 | deleted_count = 0
141 |
142 |                 logger.info(f"Cleared {deleted_count} cached key(s)")
143 | return deleted_count
144 | elif isinstance(keys, (list, tuple)):
145 |                 # Empty list
146 | return 0
147 | else:
148 |                 logger.warning(f"Redis keys() returned a non-list type: {type(keys)}")
149 | return 0
150 | except Exception as e:
151 |             logger.error(f"Failed to clear cache: {e}")
152 | return 0
153 |
154 | def get_memory_usage(self) -> Dict[str, Any]:
155 |         """Fetch memory usage information."""
156 | try:
157 |             # Call info() directly; the synchronous Redis client returns a dict
158 | info = self.redis_client.info("memory")
159 |
160 |             # Make sure a dict was returned
161 | if isinstance(info, dict):
162 | return {
163 | "used_memory": info.get("used_memory", 0),
164 | "used_memory_human": info.get("used_memory_human", "0B"),
165 | "used_memory_peak": info.get("used_memory_peak", 0),
166 | "used_memory_peak_human": info.get("used_memory_peak_human", "0B"),
167 | "total_system_memory": info.get("total_system_memory", 0),
168 | "total_system_memory_human": info.get("total_system_memory_human", "0B")
169 | }
170 | else:
171 |                 # Not a dict: log a warning and return an empty dict
172 |                 logger.warning(f"Redis info() returned a non-dict type: {type(info)}")
173 | return {}
174 |             logger.error(f"Failed to fetch memory usage: {e}")
175 | logger.error(f"获取内存使用情况失败: {e}")
176 | return {}
177 |
178 | def close(self):
179 |         """Close the Redis connection."""
180 | if self._redis_client:
181 | self._redis_client.close()
182 | self._redis_client = None
183 |             logger.info("Redis connection closed")
184 |
185 |
186 | # Global Redis manager instance
187 | _redis_manager: Optional[RedisManager] = None
188 |
189 |
190 | def get_redis_manager() -> RedisManager:
191 |     """Return the global Redis manager instance."""
192 | global _redis_manager
193 | if _redis_manager is None:
194 | _redis_manager = RedisManager()
195 | return _redis_manager
196 |
197 |
198 | def get_redis_config() -> RedisConfig:
199 |     """Return the Redis config."""
200 | return get_redis_manager().config
201 |
202 |
203 | def test_redis_connection() -> bool:
204 |     """Test the Redis connection."""
205 | return get_redis_manager().test_connection()
206 |
207 |
208 | # Health-check function
209 | def redis_health_check() -> Dict[str, Any]:
210 |     """Redis health check."""
211 | manager = get_redis_manager()
212 |
213 | health_info = {
214 | "service": "redis",
215 | "status": "unknown",
216 | "details": {}
217 | }
218 |
219 | try:
220 |         # Test the connection
221 | if manager.test_connection():
222 | health_info["status"] = "healthy"
223 | health_info["details"]["connection"] = "ok"
224 |
225 |             # Fetch server info
226 | info = manager.get_info()
227 | if isinstance(info, dict):
228 | health_info["details"]["version"] = info.get("redis_version", "unknown")
229 | health_info["details"]["uptime"] = info.get("uptime_in_seconds", 0)
230 |
231 |             # Fetch memory usage
232 | memory_info = manager.get_memory_usage()
233 | health_info["details"]["memory"] = memory_info
234 |
235 | else:
236 | health_info["status"] = "unhealthy"
237 | health_info["details"]["error"] = "connection_failed"
238 |
239 | except Exception as e:
240 | health_info["status"] = "unhealthy"
241 | health_info["details"]["error"] = str(e)
242 |
243 | return health_info
--------------------------------------------------------------------------------
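A small sketch of the intended configuration flow; the environment values are placeholders, and the final health check only reports "healthy" against a reachable Redis server:

```python
import os
from src.config.redis import RedisConfig, redis_health_check

# Placeholder values; in deployment these come from the real environment/.env.
os.environ.setdefault("REDIS_HOST", "localhost")
os.environ.setdefault("REDIS_DEFAULT_TTL", "1800")  # seconds; exposed as minutes below

config = RedisConfig.from_env()
print(config.get_connection_url())  # redis://localhost:6379/0
print(config.get_ttl_config())      # {'default_ttl': 30, 'refresh_on_read': True}

# Full health check (connection, version, memory); needs a running Redis.
print(redis_health_check()["status"])
```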
/static/styles.css:
--------------------------------------------------------------------------------
1 | /* LangGraph professional-title review system - stylesheet */
2 |
3 | :root {
4 | --primary-color: #0d6efd;
5 | --secondary-color: #6c757d;
6 | --success-color: #198754;
7 | --danger-color: #dc3545;
8 | --warning-color: #ffc107;
9 | --info-color: #0dcaf0;
10 | --light-color: #f8f9fa;
11 | --dark-color: #212529;
12 | }
13 |
14 | body {
15 | font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
16 | background-color: #f5f7fa;
17 | color: var(--dark-color);
18 | }
19 |
20 | .navbar-brand {
21 | font-weight: 600;
22 | font-size: 1.25rem;
23 | }
24 |
25 | .card {
26 | border: none;
27 | border-radius: 12px;
28 | transition: all 0.3s ease;
29 | }
30 |
31 | .card:hover {
32 | transform: translateY(-2px);
33 | }
34 |
35 | .card-header {
36 | border-radius: 12px 12px 0 0 !important;
37 | border: none;
38 | font-weight: 600;
39 | }
40 |
41 | .btn {
42 | border-radius: 8px;
43 | font-weight: 500;
44 | transition: all 0.3s ease;
45 | }
46 |
47 | .btn:hover {
48 | transform: translateY(-1px);
49 | }
50 |
51 | /* Feature card styles */
52 | .feature-card {
53 | text-align: center;
54 | padding: 2rem 1rem;
55 | border-radius: 12px;
56 | background: white;
57 | box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
58 | transition: all 0.3s ease;
59 | margin-bottom: 1.5rem;
60 | }
61 |
62 | .feature-card:hover {
63 | transform: translateY(-4px);
64 | box-shadow: 0 8px 15px rgba(0, 0, 0, 0.15);
65 | }
66 |
67 | .feature-card i {
68 | font-size: 2.5rem;
69 | margin-bottom: 1rem;
70 | }
71 |
72 | .feature-card h5 {
73 | color: var(--dark-color);
74 | margin-bottom: 0.5rem;
75 | font-weight: 600;
76 | }
77 |
78 | .feature-card p {
79 | color: var(--secondary-color);
80 | margin: 0;
81 | font-size: 0.9rem;
82 | }
83 |
84 | /* Progress bar styles */
85 | .progress {
86 | border-radius: 10px;
87 | background-color: #e9ecef;
88 | }
89 |
90 | .progress-bar {
91 | border-radius: 10px;
92 | transition: width 0.6s ease;
93 | }
94 |
95 | /* Workflow step styles */
96 | .workflow-step {
97 | display: flex;
98 | align-items: center;
99 | padding: 1rem;
100 | margin-bottom: 0.5rem;
101 | border-radius: 8px;
102 | background: white;
103 | border-left: 4px solid #e9ecef;
104 | transition: all 0.3s ease;
105 | }
106 |
107 | .workflow-step.active {
108 | border-left-color: var(--primary-color);
109 | background: rgba(13, 110, 253, 0.05);
110 | }
111 |
112 | .workflow-step.completed {
113 | border-left-color: var(--success-color);
114 | background: rgba(25, 135, 84, 0.05);
115 | }
116 |
117 | .workflow-step.error {
118 | border-left-color: var(--danger-color);
119 | background: rgba(220, 53, 69, 0.05);
120 | }
121 |
122 | .workflow-step-icon {
123 | width: 40px;
124 | height: 40px;
125 | border-radius: 50%;
126 | display: flex;
127 | align-items: center;
128 | justify-content: center;
129 | margin-right: 1rem;
130 | font-size: 1.2rem;
131 | background: #e9ecef;
132 | color: var(--secondary-color);
133 | transition: all 0.3s ease;
134 | }
135 |
136 | .workflow-step.active .workflow-step-icon {
137 | background: var(--primary-color);
138 | color: white;
139 | }
140 |
141 | .workflow-step.completed .workflow-step-icon {
142 | background: var(--success-color);
143 | color: white;
144 | }
145 |
146 | .workflow-step.error .workflow-step-icon {
147 | background: var(--danger-color);
148 | color: white;
149 | }
150 |
151 | .workflow-step-content h6 {
152 | margin: 0 0 0.25rem 0;
153 | font-weight: 600;
154 | }
155 |
156 | .workflow-step-content p {
157 | margin: 0;
158 | color: var(--secondary-color);
159 | font-size: 0.9rem;
160 | }
161 |
162 | /* Log container styles */
163 | .log-container {
164 | height: 300px;
165 | overflow-y: auto;
166 | background: #2c3e50;
167 | color: #ecf0f1;
168 | font-family: 'Courier New', monospace;
169 | font-size: 0.85rem;
170 | padding: 1rem;
171 | border-radius: 8px;
172 | }
173 |
174 | .log-entry {
175 | margin-bottom: 0.5rem;
176 | padding: 0.25rem 0.5rem;
177 | border-radius: 4px;
178 | word-wrap: break-word;
179 | }
180 |
181 | .log-entry.started {
182 | background: rgba(13, 110, 253, 0.2);
183 | border-left: 3px solid var(--primary-color);
184 | }
185 |
186 | .log-entry.progress {
187 | background: rgba(255, 193, 7, 0.2);
188 | border-left: 3px solid var(--warning-color);
189 | }
190 |
191 | .log-entry.completed {
192 | background: rgba(25, 135, 84, 0.2);
193 | border-left: 3px solid var(--success-color);
194 | }
195 |
196 | .log-entry.error {
197 | background: rgba(220, 53, 69, 0.2);
198 | border-left: 3px solid var(--danger-color);
199 | }
200 |
201 | .log-timestamp {
202 | color: #95a5a6;
203 | font-size: 0.75rem;
204 | margin-right: 0.5rem;
205 | }
206 |
207 | /* Task list styles */
208 | .task-item {
209 | padding: 1rem;
210 | border-bottom: 1px solid #e9ecef;
211 | cursor: pointer;
212 | transition: all 0.3s ease;
213 | }
214 |
215 | .task-item:hover {
216 | background: rgba(13, 110, 253, 0.05);
217 | }
218 |
219 | .task-item.active {
220 | background: rgba(13, 110, 253, 0.1);
221 | border-left: 4px solid var(--primary-color);
222 | }
223 |
224 | .task-status {
225 | font-size: 0.75rem;
226 | padding: 0.25rem 0.5rem;
227 | border-radius: 12px;
228 | font-weight: 600;
229 | text-transform: uppercase;
230 | }
231 |
232 | .task-status.started {
233 | background: rgba(13, 110, 253, 0.1);
234 | color: var(--primary-color);
235 | }
236 |
237 | .task-status.processing {
238 | background: rgba(255, 193, 7, 0.1);
239 | color: #996404;
240 | }
241 |
242 | .task-status.completed {
243 | background: rgba(25, 135, 84, 0.1);
244 | color: var(--success-color);
245 | }
246 |
247 | .task-status.failed {
248 | background: rgba(220, 53, 69, 0.1);
249 | color: var(--danger-color);
250 | }
251 |
252 | /* Status badge styles */
253 | .badge.bg-primary {
254 | background-color: var(--primary-color) !important;
255 | }
256 |
257 | .badge.bg-warning {
258 | background-color: var(--warning-color) !important;
259 | color: var(--dark-color) !important;
260 | }
261 |
262 | .badge.bg-success {
263 | background-color: var(--success-color) !important;
264 | }
265 |
266 | .badge.bg-danger {
267 | background-color: var(--danger-color) !important;
268 | }
269 |
270 | /* File input styles */
271 | .form-control:focus {
272 | border-color: var(--primary-color);
273 | box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
274 | }
275 |
276 | .form-select:focus {
277 | border-color: var(--primary-color);
278 | box-shadow: 0 0 0 0.2rem rgba(13, 110, 253, 0.25);
279 | }
280 |
281 | /* Animations */
282 | @keyframes pulse {
283 | 0% {
284 | transform: scale(1);
285 | }
286 | 50% {
287 | transform: scale(1.05);
288 | }
289 | 100% {
290 | transform: scale(1);
291 | }
292 | }
293 |
294 | .pulse {
295 | animation: pulse 2s infinite;
296 | }
297 |
298 | @keyframes fadeInUp {
299 | from {
300 | opacity: 0;
301 | transform: translateY(30px);
302 | }
303 | to {
304 | opacity: 1;
305 | transform: translateY(0);
306 | }
307 | }
308 |
309 | .fade-in-up {
310 | animation: fadeInUp 0.6s ease-out;
311 | }
312 |
313 | /* Responsive design */
314 | @media (max-width: 768px) {
315 | .feature-card {
316 | padding: 1.5rem 1rem;
317 | }
318 |
319 | .workflow-step {
320 | flex-direction: column;
321 | text-align: center;
322 | }
323 |
324 | .workflow-step-icon {
325 | margin-right: 0;
326 | margin-bottom: 0.5rem;
327 | }
328 |
329 | .log-container {
330 | height: 200px;
331 | font-size: 0.8rem;
332 | }
333 | }
334 |
335 | /* Scrollbar styles */
336 | .log-container::-webkit-scrollbar {
337 | width: 8px;
338 | }
339 |
340 | .log-container::-webkit-scrollbar-track {
341 | background: #34495e;
342 | border-radius: 4px;
343 | }
344 |
345 | .log-container::-webkit-scrollbar-thumb {
346 | background: #7f8c8d;
347 | border-radius: 4px;
348 | }
349 |
350 | .log-container::-webkit-scrollbar-thumb:hover {
351 | background: #95a5a6;
352 | }
353 |
354 | /* Loading spinner */
355 | .spinner-border-sm {
356 | width: 1rem;
357 | height: 1rem;
358 | }
359 |
360 | /* Tooltip styles */
361 | .tooltip {
362 | font-size: 0.8rem;
363 | }
364 |
365 | /* Modal styles */
366 | .modal-content {
367 | border-radius: 12px;
368 | border: none;
369 | box-shadow: 0 20px 25px rgba(0, 0, 0, 0.15);
370 | }
371 |
372 | .modal-header {
373 | border-radius: 12px 12px 0 0;
374 | border-bottom: 1px solid #e9ecef;
375 | }
376 |
377 | /* Code block styles */
378 | pre {
379 | background: #2c3e50 !important;
380 | color: #ecf0f1 !important;
381 | border: none;
382 | border-radius: 8px;
383 | font-family: 'Courier New', monospace;
384 | font-size: 0.85rem;
385 | max-height: 400px;
386 | overflow-y: auto;
387 | }
388 |
389 | /* Connection status indicator */
390 | .connection-status {
391 | position: fixed;
392 | top: 20px;
393 | right: 20px;
394 | z-index: 1050;
395 | padding: 0.5rem 1rem;
396 | border-radius: 20px;
397 | font-size: 0.8rem;
398 | font-weight: 600;
399 | transition: all 0.3s ease;
400 | }
401 |
402 | .connection-status.connected {
403 | background: rgba(25, 135, 84, 0.9);
404 | color: white;
405 | }
406 |
407 | .connection-status.disconnected {
408 | background: rgba(220, 53, 69, 0.9);
409 | color: white;
410 | }
411 |
412 | .connection-status.connecting {
413 | background: rgba(255, 193, 7, 0.9);
414 | color: var(--dark-color);
415 | }
--------------------------------------------------------------------------------
/src/tools/file_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | File-handling utilities
3 | 
4 | Utility functions for file processing:
5 | - ZIP extraction
6 | - Validation of the 17-standard-folder structure
7 | - Markdown file handling
8 | - File path handling
9 | """
10 |
11 | import zipfile
12 | import re
13 | import markdown
14 | from pathlib import Path
15 | from typing import List, Dict, Any, Optional
16 |
17 | async def extract_zip_file(zip_path: str) -> Dict[str, Any]:
18 | """
19 |     Extract a ZIP file and return the result
20 | 
21 |     Args:
22 |         zip_path: path to the ZIP file
23 | 
24 |     Returns:
25 |         Result dict containing the extraction path and file list
26 | """
27 | try:
28 | import asyncio
29 |
30 |         # Handle the ZIP file asynchronously
31 | def _extract_zip():
32 | with zipfile.ZipFile(zip_path, 'r') as zip_ref:
33 |                 # Extract into an "extracted" folder next to the archive
34 | extract_dir = Path(zip_path).parent / "extracted"
35 | extract_dir.mkdir(exist_ok=True)
36 |
37 | zip_ref.extractall(extract_dir)
38 |
39 |                 # Collect all extracted files
40 | extracted_files = []
41 | import os
42 |                 # Use os.walk instead of rglob to avoid blocking calls
43 | for root, dirs, files in os.walk(extract_dir):
44 | for file in files:
45 | file_path = os.path.join(root, file)
46 | extracted_files.append(file_path)
47 | return extract_dir, extracted_files
48 |
49 | extract_dir, extracted_files = await asyncio.to_thread(_extract_zip)
50 |
51 | return {
52 | "extraction_path": str(extract_dir),
53 | "files": extracted_files,
54 | "success": True
55 | }
56 |
57 | except Exception as e:
58 |         print(f"Extraction failed: {e}")
59 | return {
60 | "extraction_path": None,
61 | "files": [],
62 | "success": False,
63 | "error": str(e)
64 | }
65 |
66 | async def validate_folder_structure(extraction_path: str) -> Dict[str, Any]:
67 | """
68 |     Validate the 17-standard-folder structure
69 | 
70 |     Folders may sit in the root directory or one level below it
71 | 
72 |     Args:
73 |         extraction_path: root directory after extraction
74 | 
75 |     Returns:
76 |         Validation result dict
77 |     """
78 |     # The 17 standard folder names
79 | standard_folders = [
80 | "1.教育经历",
81 | "2.工作经历",
82 | "3.继续教育(培训情况)",
83 | "4.学术技术兼职情况",
84 | "5.获奖情况",
85 | "6.获得荣誉称号情况",
86 | "7.主持参与科研项目(基金)情况",
87 | "8.主持参与工程技术项目情况",
88 | "9.论文",
89 | "10.著(译)作(教材)",
90 | "11.专利(著作权)情况",
91 | "12.主持参与指定标准情况",
92 | "13.成果被批示、采纳、运用和推广情况",
93 | "14.资质证书",
94 | "15.奖惩情况",
95 | "16.考核情况",
96 | "17.申报材料附件信息"
97 | ]
98 |
99 | extraction_dir = Path(extraction_path)
100 |
101 |     # Recursively look for the 17 standard folders (in the root or one level down)
102 | async def find_folders_recursively(search_dir: Path, max_depth: int = 2) -> Dict[str, str]:
103 | """
104 |         Recursively look for the standard folders
105 | 
106 |         Args:
107 |             search_dir: directory to search
108 |             max_depth: maximum search depth (1 = root only, 2 = root plus one sublevel)
109 | 
110 |         Returns:
111 |             Mapping of found folders {folder name: path}
112 | """
113 | found_folders = {}
114 |
115 | async def _search_directory(current_dir: Path, current_depth: int):
116 | if current_depth > max_depth:
117 | return
118 |
119 | try:
120 |                 # Read the directory asynchronously - os.scandir avoids blocking
121 | import asyncio
122 | import os
123 |
124 | try:
125 |                     # Wrap the os.scandir call in asyncio.to_thread
126 | def _list_directory():
127 | return list(os.scandir(current_dir))
128 |
129 | entries = await asyncio.to_thread(_list_directory)
130 |
131 | for entry in entries:
132 |                         # Check whether the entry is a directory
133 | def _check_is_dir():
134 | return entry.is_dir()
135 |
136 | if await asyncio.to_thread(_check_is_dir):
137 | folder_name = entry.name
138 |                             # Check whether it is one of the standard folders
139 | if folder_name in standard_folders and folder_name not in found_folders:
140 | found_folders[folder_name] = str(entry.path)
141 |                                 print(f"📁 Found standard folder: {folder_name} -> {entry.path}")
142 |
143 |                             # Recurse further if the maximum depth has not been reached yet
144 | if current_depth < max_depth:
145 | from pathlib import Path
146 | await _search_directory(Path(entry.path), current_depth + 1)
147 | except OSError as e:
148 |                 print(f"⚠️ Could not scan directory {current_dir}: {e}")
149 | except PermissionError:
150 |                 print(f"⚠️ Could not access directory: {current_dir}")
151 |
152 | await _search_directory(search_dir, 1)
153 | return found_folders
154 |
155 |     print(f"🔍 Recursively searching for the 17 standard folders (max depth 2)...")
156 | found_folder_paths = await find_folders_recursively(extraction_dir, max_depth=2)
157 |
158 | # 构建文件夹信息
159 | folders_found = []
160 | missing_folders = []
161 |
162 | for standard_folder in standard_folders:
163 | if standard_folder in found_folder_paths:
164 | folders_found.append({
165 | "name": standard_folder,
166 | "path": found_folder_paths[standard_folder],
167 | "exists": True
168 | })
169 | else:
170 | missing_folders.append(standard_folder)
171 |
172 |     # Collect every folder that actually exists (to detect extras)
173 | import asyncio
174 | import os
175 | all_actual_folders = []
176 |
177 | async def collect_folders():
178 |         # Use os.walk instead of rglob to avoid blocking calls
179 | def _walk_dirs():
180 | folders = []
181 | for root, dirs, files in os.walk(extraction_dir):
182 | for dir_name in dirs:
183 | folders.append(dir_name)
184 | return folders
185 |
186 | folder_names = await asyncio.to_thread(_walk_dirs)
187 | all_actual_folders.extend(folder_names)
188 |
189 | await collect_folders()
190 |
191 |     # Check for extra folders
192 | extra_folders = []
193 | for actual_folder in set(all_actual_folders):
194 | if actual_folder not in standard_folders:
195 | extra_folders.append(actual_folder)
196 |
197 |     # Determine compliance
198 | is_valid = len(missing_folders) == 0
199 |
200 |     print(f"📊 Folder validation result: found {len(folders_found)}/{len(standard_folders)} standard folders")
201 | if missing_folders:
202 |         print(f"⚠️ Missing folders: {missing_folders}")
203 |
204 | return {
205 | "is_valid": is_valid,
206 | "folders_found": folders_found,
207 | "missing_folders": missing_folders,
208 | "extra_folders": extra_folders,
209 | "total_standard_folders": len(standard_folders),
210 | "found_count": len(folders_found)
211 | }
212 |
213 |
214 | def analyze_markdown_structure(md_content: str) -> Dict[str, Any]:
215 | """
216 |     Analyze the structure of a Markdown file
217 | 
218 |     Args:
219 |         md_content: the Markdown content
220 | 
221 |     Returns:
222 |         Structure analysis result
223 | """
224 | import datetime
225 |
226 | try:
227 |         # Basic statistics
228 | lines = md_content.split('\n')
229 |
230 |         # Extract headings
231 | headers = []
232 | for line in lines:
233 | if line.strip().startswith('#'):
234 | level = len(line) - len(line.lstrip('#'))
235 | title = line.strip('#').strip()
236 | headers.append({
237 | "level": level,
238 | "title": title
239 | })
240 |
241 |         # Extract list items
242 | list_items = []
243 | for line in lines:
244 | stripped = line.strip()
245 | if stripped.startswith('-') or stripped.startswith('*') or re.match(r'^\d+\.', stripped):
246 | list_items.append(stripped)
247 |
248 | return {
249 | "total_lines": len(lines),
250 | "total_chars": len(md_content),
251 | "headers": headers,
252 | "list_items": list_items,
253 | "has_content": len(md_content.strip()) > 0,
254 | "extraction_timestamp": datetime.datetime.now().isoformat()
255 | }
256 |
257 | except Exception as e:
258 | return {
259 | "total_lines": 0,
260 | "total_chars": 0,
261 | "headers": [],
262 | "list_items": [],
263 | "has_content": False,
264 | "error": str(e)
265 | }
266 |
267 |
268 | async def extract_markdown_content(md_file_path: str) -> Dict[str, Any]:
269 | """
270 |     Extract the content of a Markdown file
271 | 
272 |     Args:
273 |         md_file_path: path to the Markdown file
274 | 
275 |     Returns:
276 |         Extraction result
277 | """
278 | try:
279 | import asyncio
280 |         # Read the file asynchronously
281 | def _read_file():
282 | with open(md_file_path, 'r', encoding='utf-8') as f:
283 | return f.read()
284 |
285 | content = await asyncio.to_thread(_read_file)
286 |
287 | structure = analyze_markdown_structure(content)
288 |
289 | return {
290 | "file_path": md_file_path,
291 | "content": content,
292 | "structure": structure,
293 | "success": True
294 | }
295 |
296 | except Exception as e:
297 | return {
298 | "file_path": md_file_path,
299 | "content": "",
300 | "structure": {},
301 | "success": False,
302 | "error": str(e)
303 | }
--------------------------------------------------------------------------------
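An end-to-end sketch of the two async helpers above; the archive path is a placeholder:

```python
import asyncio

from src.tools.file_utils import extract_zip_file, validate_folder_structure


async def main() -> None:
    # Placeholder path; any ZIP laid out with the 17 standard folders works.
    result = await extract_zip_file("/tmp/materials.zip")
    if not result["success"]:
        print(result["error"])
        return

    validation = await validate_folder_structure(result["extraction_path"])
    print(validation["found_count"], "of", validation["total_standard_folders"])
    if not validation["is_valid"]:
        print("missing:", validation["missing_folders"])


asyncio.run(main())
```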
/src/nodes/core_info_extraction.py:
--------------------------------------------------------------------------------
1 | """
2 | Core information extraction node
3 | 
4 | Extracts the core information from each of materials 1-17:
5 | - pulls the relevant key fields from each material
6 | - outputs structured information across 17 fields
7 | - supports AI-enhanced information extraction
8 | """
9 |
10 | from typing import Dict, Any, Optional
11 | from src.graph.state import AuditState
12 | from src.tools.ai_utils import extract_core_information_with_ai, extract_category_core_info_with_ai
13 |
14 |
15 | def core_info_extraction_node(state: AuditState) -> Dict[str, Any]:
16 | """
17 |     Fully cache-free core information extraction node - processes fresh data every time
18 | 
19 |     🚨 Caching has been removed entirely; the information passed along is always fresh and single-use
20 |     """
21 |     try:
22 |         print(f"🎯 Starting cache-free core information extraction...")
23 |
24 |         # Read the current state directly - no caching of any kind
25 | api_extraction_results = state.get("api_extraction_results", {})
26 | extracted_content = state.get("extracted_content", {})
27 |
28 |         print(f"🔍 Current state data:")
29 |         print(f"   API extraction results: {len(api_extraction_results)} item(s)")
30 |         print(f"   Fallback extracted content: {len(extracted_content)} item(s)")
31 |
32 |         # Pick the data source directly - no cache checks
33 | if api_extraction_results:
34 | data_source = api_extraction_results
35 |             print(f"✅ Using API extraction results: {len(api_extraction_results)} item(s)")
36 | elif extracted_content:
37 | data_source = extracted_content
38 |             print(f"⚠️ Using fallback extracted content: {len(extracted_content)} item(s)")
39 | else:
40 |             print("⚠️ No extracted content found; skipping core information extraction")
41 | return {
42 | "core_info": _create_empty_core_info_structure(),
43 | "current_step": "core_info_extraction_skipped",
44 |                 "processing_logs": ["No valid data found; core information extraction skipped"]
45 | }
46 |
47 |         # Build the 17-item core info structure directly - no cache
48 | core_info_structure = _create_empty_core_info_structure()
49 |
50 |         # Category mapping for materials 1-17
51 | material_categories = {
52 | "1.教育经历": "education",
53 | "教育经历": "education",
54 | "2.工作经历": "work_experience",
55 | "工作经历": "work_experience",
56 | "3.继续教育(培训情况)": "continuing_education",
57 | "继续教育": "continuing_education",
58 | "培训情况": "continuing_education",
59 | "4.学术技术兼职情况": "academic_positions",
60 | "学术技术兼职情况": "academic_positions",
61 | "5.获奖情况": "awards",
62 | "获奖情况": "awards",
63 | "6.获得荣誉称号情况": "honors",
64 | "荣誉称号": "honors",
65 | "7.主持参与科研项目(基金)情况": "research_projects",
66 | "科研项目": "research_projects",
67 | "8.主持参与工程技术项目情况": "engineering_projects",
68 | "工程项目": "engineering_projects",
69 | "9.论文": "papers",
70 | "论文": "papers",
71 | "10.著(译)作(教材)": "publications",
72 | "著作": "publications",
73 | "教材": "publications",
74 | "11.专利(著作权)情况": "patents",
75 | "专利": "patents",
76 | "12.主持参与指定标准情况": "standards",
77 | "标准制定": "standards",
78 | "13.成果被批示、采纳、运用和推广情况": "achievements",
79 | "成果应用": "achievements",
80 | "14.资质证书": "certificates",
81 | "资质证书": "certificates",
82 | "15.奖惩情况": "rewards_punishments",
83 | "奖惩情况": "rewards_punishments",
84 | "16.考核情况": "evaluations",
85 | "考核情况": "evaluations",
86 | "17.申报材料附件信息": "attachments",
87 | "附件信息": "attachments"
88 | }
89 |
90 |         print(f"📁 Found {len(data_source)} material type(s) needing core info extraction")
91 |
92 |         # Process each material type
93 |         for folder_name, folder_data in data_source.items():
94 |             print(f"🔍 Processing: {folder_name}")
95 |
96 |             # Determine the material category
97 | category_key = None
98 | for key, category in material_categories.items():
99 | if key in folder_name or folder_name in key:
100 | category_key = category
101 | break
102 |
103 | if not category_key:
104 |                 print(f"⚠️ Unrecognized material type: {folder_name}; filing under attachments")
105 | category_key = "attachments"
106 |
107 |             # Extract the material content
108 | material_content = _extract_material_content_from_folder(folder_data)
109 |
110 | if not material_content.strip():
111 |                 print(f"⚠️ {folder_name} has no usable content")
112 | continue
113 |
114 |             # Use AI to extract this material type's core information
115 | try:
116 | extracted_info = extract_category_core_info_with_ai(
117 | category_key, folder_name, material_content
118 | )
119 |
120 | if extracted_info:
121 | core_info_structure[category_key] = extracted_info
122 |                     print(f"✅ {folder_name} core info extracted successfully")
123 | else:
124 |                     print(f"⚠️ {folder_name} core info extraction failed")
125 |
126 | except Exception as e:
127 |                 print(f"⚠️ {folder_name} info extraction raised an exception: {e}")
128 |                 # Create a default structure to keep the data consistent
129 | core_info_structure[category_key] = {
130 | "name": None,
131 | "id_number": None,
132 | "extracted_from": [folder_name],
133 | "content_summary": None,
134 | "key_info": {
135 | "category": category_key,
136 | "folder_name": folder_name,
137 | "error": str(e),
138 | "extracted_at": _get_current_timestamp()
139 | }
140 | }
141 | continue
142 |
143 |         # Tally the extraction results
144 | extracted_categories = []
145 | name_count = 0
146 | id_count = 0
147 |
148 | for category, info in core_info_structure.items():
149 | if info and info.get('name'):
150 | name_count += 1
151 | if info and info.get('id_number'):
152 | id_count += 1
153 | if info and (info.get('name') or info.get('id_number') or info.get('content_summary')):
154 | extracted_categories.append(category)
155 |
156 |         print(f"✅ Core information extraction finished:")
157 |         print(f"   Successfully processed {len(extracted_categories)} material type(s)")
158 |         print(f"   Materials with a name extracted: {name_count}")
159 |         print(f"   Materials with an ID number extracted: {id_count}")
160 |
161 |         # 🚨 Make sure the structure matches what the cross-validation node expects
162 | return {
163 | "core_info": core_info_structure,
164 | "current_step": "core_info_extraction_completed",
165 | "processing_logs": [
166 |                 f"Core info extraction finished: {len(extracted_categories)} material type(s) processed",
167 |                 f"Materials with a name extracted: {name_count}",
168 |                 f"Materials with an ID number extracted: {id_count}"
169 | ]
170 | }
171 |
172 | except Exception as e:
173 |         print(f"❌ Core information extraction failed: {str(e)}")
174 |         # 🚨 Return a valid empty structure even on failure so downstream nodes can proceed
175 | return {
176 | "core_info": _create_empty_core_info_structure(),
177 | "current_step": "core_info_extraction_failed",
178 |             "error_message": f"Core information extraction failed: {str(e)}",
179 |             "processing_logs": [f"Core information extraction failed: {str(e)}"]
180 | }
181 |
182 |
183 | def _create_empty_core_info_structure() -> Dict[str, Any]:
184 |     """Create the empty core info structure for items 1-17; each carries name and ID number for cross-validation."""
185 |     import copy  # deepcopy below: a shallow dict.copy() would share the nested list/dict across all 17 categories
186 |     base_structure = {
187 |         "name": None,  # name (extracted from this material)
188 |         "id_number": None,  # ID number (extracted from this material)
189 |         "extracted_from": [],  # source files for the information
190 |         "content_summary": None,  # content summary
191 |         "key_info": {}  # key information for this material
192 | }
193 |
194 | return {
195 |         # Materials 1-17; each carries name and ID number for cross-validation
196 |         "education": copy.deepcopy(base_structure),  # 1.教育经历
197 |         "work_experience": copy.deepcopy(base_structure),  # 2.工作经历
198 |         "continuing_education": copy.deepcopy(base_structure),  # 3.继续教育(培训情况)
199 |         "academic_positions": copy.deepcopy(base_structure),  # 4.学术技术兼职情况
200 |         "awards": copy.deepcopy(base_structure),  # 5.获奖情况
201 |         "honors": copy.deepcopy(base_structure),  # 6.获得荣誉称号情况
202 |         "research_projects": copy.deepcopy(base_structure),  # 7.主持参与科研项目(基金)情况
203 |         "engineering_projects": copy.deepcopy(base_structure),  # 8.主持参与工程技术项目情况
204 |         "papers": copy.deepcopy(base_structure),  # 9.论文
205 |         "publications": copy.deepcopy(base_structure),  # 10.著(译)作(教材)
206 |         "patents": copy.deepcopy(base_structure),  # 11.专利(著作权)情况
207 |         "standards": copy.deepcopy(base_structure),  # 12.主持参与指定标准情况
208 |         "achievements": copy.deepcopy(base_structure),  # 13.成果被批示、采纳、运用和推广情况
209 |         "certificates": copy.deepcopy(base_structure),  # 14.资质证书
210 |         "rewards_punishments": copy.deepcopy(base_structure),  # 15.奖惩情况
211 |         "evaluations": copy.deepcopy(base_structure),  # 16.考核情况
212 |         "attachments": copy.deepcopy(base_structure)  # 17.申报材料附件信息
214 |
215 |
216 | def _extract_material_content_from_folder(folder_data: Any) -> str:
217 | """从文件夹数据中提取材料内容"""
218 | material_content = ""
219 |
220 | if isinstance(folder_data, list):
221 | # 处理api_extraction_results格式
222 | for json_item in folder_data:
223 | if isinstance(json_item, dict):
224 | content = json_item.get("content", {})
225 | if isinstance(content, dict):
226 | # 尝试多种可能的内容字段
227 | for key in ["md_content", "raw_markdown", "text", "content"]:
228 | if key in content:
229 | text_content = str(content[key])
230 | if text_content.strip():
231 | material_content += text_content + "\n\n"
232 | break
233 | if not material_content:
234 | material_content += str(content) + "\n\n"
235 | else:
236 | material_content += str(content) + "\n\n"
237 |
238 | elif isinstance(folder_data, dict):
239 | # 处理extracted_content格式
240 | content_list = folder_data.get("content", [])
241 | if isinstance(content_list, list):
242 | for item in content_list:
243 | if isinstance(item, dict):
244 | if "json_data" in item:
245 | json_data = item["json_data"]
246 | content = json_data.get("content", {})
247 | if isinstance(content, dict):
248 | for key in ["md_content", "raw_markdown", "text", "content"]:
249 | if key in content:
250 | text_content = str(content[key])
251 | if text_content.strip():
252 | material_content += text_content + "\n\n"
253 | break
254 | else:
255 | material_content += str(content) + "\n\n"
256 | elif "content" in item:
257 | material_content += str(item["content"]) + "\n\n"
258 | else:
259 | material_content += str(item) + "\n\n"
260 |
261 | return material_content.strip()
262 |
263 |
264 | def _get_current_timestamp() -> str:
265 | """获取当前时间戳"""
266 | from datetime import datetime
267 | return datetime.now().isoformat()
--------------------------------------------------------------------------------
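The extraction node above guarantees a fixed 17-key dict even on failure, so downstream consumers can iterate it without key checks. A minimal sketch of such a consumer (a hypothetical helper, not part of the repo), assuming only the `name`/`id_number` shape defined in `_create_empty_core_info_structure`:

```python
# Hypothetical consumer sketch: walk the 17-key core_info structure and
# collect every (category, name, id_number) triple that was actually
# extracted, ready for pairwise consistency checks in cross-validation.
from typing import Any, Dict, List, Tuple


def collect_identity_claims(core_info: Dict[str, Any]) -> List[Tuple[str, str, str]]:
    """Return (category, name, id_number) for each category with any identity data."""
    claims = []
    for category, info in core_info.items():
        if not isinstance(info, dict):
            continue  # tolerate malformed entries instead of failing the audit
        name = info.get("name") or ""
        id_number = info.get("id_number") or ""
        if name or id_number:
            claims.append((category, name, id_number))
    return claims


# Example: two categories agree on the name, one extracted nothing.
demo = {
    "education": {"name": "张三", "id_number": "110101199001011234"},
    "papers": {"name": "张三", "id_number": None},
    "awards": {"name": None, "id_number": None},
}
print(collect_identity_claims(demo))
# [('education', '张三', '110101199001011234'), ('papers', '张三', '')]
```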
/src/graph/edges.py:
--------------------------------------------------------------------------------
1 | """
2 | LangGraph边和路由逻辑定义
3 |
4 | 包含工作流中的条件边和路由函数:
5 | - 根据PDF页数决定处理策略的路由
6 | - 根据材料类型决定校验规则的路由
7 | - 根据校验结果决定后续流程的路由
8 | - 支持Send API实现的并行分支
9 | """
10 |
11 | from typing import Dict, Any, List, Union
12 | from .state import AuditState
13 |
14 | # 导入Send API用于并行处理
15 | try:
16 | from langgraph.types import Send
17 | SEND_AVAILABLE = True
18 | except ImportError:
19 | Send = None
20 | SEND_AVAILABLE = False
21 |
22 |
23 | def should_continue_processing(state: AuditState) -> str:
24 | """
25 | 判断是否继续处理流程
26 |
27 | Returns:
28 | "continue": 继续处理
29 | "error": 发生错误,终止流程
30 | """
31 | if state.get("error_message"):
32 | return "error"
33 |
34 | if not state.get("uploaded_file"):
35 | return "error"
36 |
37 | return "continue"
38 |
39 |
40 | def route_folder_validation(state: AuditState) -> str:
41 | """
42 | 根据文件夹结构验证结果决定处理策略
43 |
44 | Returns:
45 | "process_folders": 文件夹结构正确,继续处理
46 | "error": 文件夹结构错误,终止流程
47 | """
48 | folder_validation = state.get("folder_validation", {})
49 |
50 | # 检查是否有17个标准文件夹
51 | if not folder_validation:
52 | return "error"
53 |
54 | folders_found = folder_validation.get("folders_found", [])
55 | if len(folders_found) < 17:
56 | return "error"
57 |
58 | return "process_folders"
59 |
60 |
61 | def should_continue_content_analysis(state: AuditState) -> str:
62 | """
63 | 判断是否继续内容分析
64 |
65 | Returns:
66 | "analyze": 继续分析
67 | "skip_analysis": 跳过分析
68 | "error": 发生错误
69 | """
70 | if state.get("error_message"):
71 | return "error"
72 |
73 | extracted_content = state.get("extracted_content", {})
74 | if not extracted_content:
75 | return "skip_analysis"
76 |
77 | return "analyze"
78 |
79 |
80 | def route_to_cross_validation(state: AuditState) -> str:
81 | """
82 | 决定是否进行交叉校验
83 |
84 | Returns:
85 | "cross_validate": 进行交叉校验
86 | "skip_cross_validation": 跳过交叉校验
87 | "error": 发生错误
88 | """
89 | if state.get("error_message"):
90 | return "error"
91 |
92 | # 检查是否有材料校验结果
93 | material_validation = state.get("material_validation", {})
94 | if not material_validation:
95 | return "skip_cross_validation"
96 |
97 | # 检查是否有核心信息
98 | core_info = state.get("core_info")
99 | extracted_content = state.get("extracted_content", {})
100 |
101 | if not core_info and not extracted_content:
102 | return "skip_cross_validation"
103 |
104 | return "cross_validate"
105 |
106 |
107 | def should_generate_report(state: AuditState) -> str:
108 | """
109 | 判断是否应该生成报告
110 |
111 | Returns:
112 | "generate_report": 生成报告
113 | "error": 发生错误,终止流程
114 | """
115 | if state.get("error_message"):
116 | return "error"
117 |
118 | # 只要有任何处理结果就生成报告
119 | has_content = any([
120 | state.get("extracted_content"),
121 | state.get("material_validation"),
122 | state.get("cross_validation"),
123 | state.get("folder_classification")
124 | ])
125 |
126 | if has_content:
127 | return "generate_report"
128 | else:
129 | return "error"
130 | def check_pdf_extraction_for_parallel_processing(state: AuditState) -> Union[List, str]:
131 | """
132 | PDF提取完成后,并行分发到core_info_extraction和validation节点
133 |
134 | 确保PDF提取的数据能同时进入核心信息提取和材料校验
135 |
136 | Returns:
137 | Send对象列表,发送到core_info_extraction和validation
138 | 或者在失败时返回END
139 | """
140 | if not SEND_AVAILABLE or Send is None:
141 | print("⚠️ Send API不可用,使用传统路由")
142 | # 检查PDF提取状态
143 | status = check_pdf_extraction_status(state)
144 | if status == "pdf_extraction_success":
145 | return "core_info_extraction" # 退化到传统路由
146 | else:
147 | return "END"
148 |
149 | # 检查PDF提取状态
150 | status = check_pdf_extraction_status(state)
151 |
152 | if status == "pdf_extraction_success":
153 | print(f"🚀 PDF提取成功,并行分发到核心信息提取和校验节点")
154 |
155 | # 并行发送到两个处理节点
156 | return [
157 | Send("core_info_extraction", state), # 核心信息提取
158 | Send("validation", state) # 直接进入校验
159 | ]
160 | else:
161 | print("❌ PDF提取失败,终止流程")
162 | return "END"
163 |
164 |
165 | def check_core_info_for_cross_validation(state: AuditState) -> str:
166 | """
167 | 检查核心信息是否完成,决定是否进行交叉验证
168 |
169 | 注意:LangGraph不支持真正的"等待两个节点都完成"逻辑
170 | 这里简化为:只要有核心信息就进行交叉验证
171 |
172 | Returns:
173 | "proceed_cross_validation": 进行交叉验证
174 | "skip_cross_validation": 跳过交叉验证
175 | """
176 | core_info = state.get("core_info")
177 | extracted_content = state.get("extracted_content", {})
178 |
179 | # 只要有核心信息和提取内容就进行交叉验证
180 | if core_info is not None and extracted_content:
181 | return "proceed_cross_validation"
182 | else:
183 | return "skip_cross_validation"
184 |
185 |
186 | def check_pdf_extraction_status(state: AuditState) -> str:
187 | """
188 | 检查PDF提取状态,确保PDF内容提取完成后才进行下一步
189 |
190 | 这是关键的状态判断函数,遵循LangGraph条件边的最佳实践
191 |
192 | Returns:
193 | "pdf_extraction_success": PDF提取成功,继续后续流程
194 | "pdf_extraction_failed": PDF提取失败,跳转到错误处理
195 | "pdf_extraction_pending": PDF提取正在进行中(理论上不应该出现)
196 | """
197 | print("🔍 检查PDF提取状态...")
198 |
199 | # 检查当前步骤状态
200 | current_step = state.get("current_step", "")
201 | print(f"📋 当前步骤: {current_step}")
202 |
203 | # 修复被连接的状态字符串问题
204 | if "pdf_extraction_failed" in current_step:
205 | print("❌ PDF提取已标记为失败")
206 | return "pdf_extraction_failed"
207 |
208 | if "pdf_extraction_completed" in current_step:
209 | print("✅ PDF提取已标记为完成")
210 | # 检查是否有实际的提取结果
211 | pdf_extraction_results = state.get("pdf_extraction_results", {})
212 | api_extraction_results = state.get("api_extraction_results", {})
213 | if pdf_extraction_results or api_extraction_results:
214 | print(f"📊 找到PDF提取结果: {len(pdf_extraction_results)} 个文件夹")
215 | return "pdf_extraction_success"
216 | else:
217 | print("⚠️ PDF提取完成但没有结果数据")
218 | return "pdf_extraction_failed"
219 |
220 | # 检查是否有实际的提取结果或空文件夹结构
221 | pdf_extraction_results = state.get("pdf_extraction_results", {})
222 | api_extraction_results = state.get("api_extraction_results", {})
223 |
224 |     # 只要有文件夹结构就认为成功,不一定要有PDF文件
225 |     if pdf_extraction_results or api_extraction_results:
226 | total_files = 0
227 | successful_files = 0
228 |
229 | for folder_name, folder_data in pdf_extraction_results.items():
230 | files = folder_data.get("files", [])
231 | total_files += len(files)
232 | successful_files += len([f for f in files if f.get("success")])
233 |
234 | print(f"📊 PDF提取统计: {successful_files}/{total_files} 文件成功,{len(pdf_extraction_results)}个文件夹")
235 |
236 | # 即使没有PDF文件,只要有文件夹结构就认为成功
237 | print("✅ 检测到PDF提取结果或文件夹结构")
238 | return "pdf_extraction_success"
239 |     else:
240 |         # 不在此处直接返回,否则下方的错误消息检查永远不可达
241 |         print("⚠️ 没有PDF提取结果,继续检查错误消息")
242 |
243 | # 检查错误消息
244 | error_message = state.get("error_message", "")
245 | if error_message and "pdf" in error_message.lower() and "failed" in error_message.lower():
246 | print(f"❌ 发现PDF相关错误: {error_message}")
247 | return "pdf_extraction_failed"
248 |
249 | # 默认情况:如果状态不明确,认为是失败
250 | print("⚠️ PDF提取状态不明确,默认为失败")
251 | return "pdf_extraction_failed"
252 |
253 |
254 | def create_parallel_branches(state: AuditState) -> Union[List, str]:
255 | """
256 | 创建并行分支:从文件处理后分发到多个并行路径
257 |
258 | 使用LangGraph的Send API实现真正的并行处理:
259 | 1. PDF提取路径
260 | 2. 规则处理路径
261 |
262 | Returns:
263 | Send对象列表,每个对象代表一个并行分支
264 | """
265 | if not SEND_AVAILABLE or Send is None:
266 | print("⚠️ Send API不可用,使用传统路由")
267 | return "pdf_extraction" # 退化到传统路由
268 |
269 | print("🚀 创建并行分支: PDF提取 + 规则处理")
270 |
271 | # 返回多个Send对象,实现并行处理
272 | return [
273 | Send("pdf_extraction", state), # PDF提取路径
274 | Send("load_rules", state) # 规则加载路径
275 | ]
276 |
277 |
278 | def after_rules_loaded(state: AuditState) -> str:
279 | """
280 | 规则加载完成后的路由
281 |
282 | Returns:
283 | "extract_rules": 继续提取规则
284 | "rules_load_failed": 规则加载失败
285 | """
286 | current_step = state.get("current_step", "")
287 |
288 | if "rules_load_failed" in current_step:
289 | print("❌ 规则加载失败")
290 | return "rules_load_failed"
291 |
292 | if "rules_loaded" in current_step:
293 | print("✅ 规则加载成功,继续提取")
294 | return "extract_rules"
295 |
296 | # 检查是否有规则数据
297 | rules_data = state.get("rules_data", [])
298 | if rules_data:
299 | print(f"✅ 发现 {len(rules_data)} 个规则数据,继续提取")
300 | return "extract_rules"
301 |
302 | print("❌ 未找到规则数据")
303 | return "rules_load_failed"
304 |
305 |
306 | def check_core_info_for_parallel_validation(state: AuditState) -> Union[List, str]:
307 | """
308 | 核心信息提取完成后,并行分发到validation和cross_validation节点
309 |
310 | 确保PDF提取路径的数据也能进入validation节点
311 |
312 | Returns:
313 | Send对象列表,发送到validation和cross_validation
314 | """
315 | if not SEND_AVAILABLE or Send is None:
316 | print("⚠️ Send API不可用,使用传统路由")
317 | return "validation" # 退化到传统路由
318 |
319 | core_info = state.get("core_info")
320 | extracted_content = state.get("extracted_content", {})
321 |
322 | print(f"🚀 核心信息提取完成,分发到验证节点")
323 | print(f"📊 核心信息状态: {core_info is not None}")
324 | print(f"📊 提取内容状态: {len(extracted_content) if extracted_content else 0} 项")
325 |
326 | # 并行发送到两个验证节点
327 | return [
328 | Send("validation", state),
329 | Send("cross_validation", state)
330 | ]
331 |
332 |
333 | def check_rules_for_validation(state: AuditState) -> Union[List, str]:
334 | """
335 | 检查规则提取结果,决定是否可以进入验证阶段
336 |
337 | 使用Send API将规则数据发送到validation和cross_validation节点
338 |
339 | Returns:
340 | Send对象列表,发送到validation和cross_validation
341 | """
342 | if not SEND_AVAILABLE or Send is None:
343 | print("⚠️ Send API不可用,使用传统路由")
344 | return "validation" # 退化到传统路由
345 |
346 | parsed_rules = state.get("parsed_rules", [])
347 | current_step = state.get("current_step", "")
348 |
349 | # 添加详细调试信息
350 | print(f"🔍 check_rules_for_validation 调试信息:")
351 | print(f" current_step: {current_step}")
352 | print(f" parsed_rules 数量: {len(parsed_rules)}")
353 | print(f" parsed_rules 内容: {parsed_rules[:2] if parsed_rules else '空'}")
354 |
355 | # 修复条件判断:只要有规则就传递给validation
356 | if parsed_rules and len(parsed_rules) > 0:
357 | print(f"🚀 规则提取成功,分发到验证节点: {len(parsed_rules)} 条规则")
358 |
359 | # 将规则数据发送到两个验证节点
360 | return [
361 | Send("validation", state),
362 | Send("cross_validation", state)
363 | ]
364 | elif "rules_extract_skipped" in current_step:
365 | print("🚨 规则提取已跳过,直接进行基础验证")
366 | return [Send("validation", state)]
367 | else:
368 | print("⚠️ 规则提取未完成或无规则数据,只进行基础验证")
369 | return [Send("validation", state)]
--------------------------------------------------------------------------------
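The routing functions above return either a node-name string or a list of `Send` objects; it is `add_conditional_edges` in the workflow builder (defined elsewhere in the repo) that gives them effect. A minimal wiring sketch, assuming `langgraph` is installed and using stand-in node bodies so only the edge shape mirrors this file:

```python
# Wiring sketch (assumptions: langgraph installed; node bodies are stubs).
# A router that returns [Send(...), Send(...)] fans out to both targets in
# parallel within one superstep; returning END short-circuits the run.
import operator
from typing import Annotated, TypedDict

from langgraph.graph import StateGraph, START, END
from langgraph.types import Send


class DemoState(TypedDict, total=False):
    current_step: str
    logs: Annotated[list, operator.add]  # reducer: parallel writes are merged


def pdf_extraction(state: DemoState) -> DemoState:
    return {"current_step": "pdf_extraction_completed", "logs": ["pdf done"]}


def core_info_extraction(state: DemoState) -> DemoState:
    return {"logs": ["core_info done"]}


def validation(state: DemoState) -> DemoState:
    return {"logs": ["validation done"]}


def fan_out(state: DemoState):
    # Same shape as check_pdf_extraction_for_parallel_processing above.
    if "failed" in state.get("current_step", ""):
        return END
    return [Send("core_info_extraction", state), Send("validation", state)]


builder = StateGraph(DemoState)
builder.add_node("pdf_extraction", pdf_extraction)
builder.add_node("core_info_extraction", core_info_extraction)
builder.add_node("validation", validation)
builder.add_edge(START, "pdf_extraction")
builder.add_conditional_edges("pdf_extraction", fan_out)
builder.add_edge("core_info_extraction", END)
builder.add_edge("validation", END)

graph = builder.compile()
print(graph.invoke({"current_step": "", "logs": []}))
# logs contains all three entries; the order of the parallel pair may vary
```

Note the `operator.add` reducer on `logs`: without it, two parallel branches writing the same key in one superstep would raise an `InvalidUpdateError`.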
/src/agent.py:
--------------------------------------------------------------------------------
1 | """
2 | 主要的职称评审材料审核代理
3 |
4 | 基于LangGraph框架的完整审核系统入口
5 | 集成LangSmith调试和监控功能
6 | """
7 |
8 | import os
9 | from typing import Dict, Any, Optional
10 |
11 | # 定义RunnableConfig为类型别名,避免对langchain_core的依赖
12 | RunnableConfig = Dict[str, Any]
13 |
14 | # 导入工作流模块
15 | try:
16 | # 优先使用绝对导入
17 | from src.graph.workflow import create_audit_workflow
18 | except ImportError:
19 | try:
20 |         # 如果绝对导入失败,将项目根目录加入sys.path后重试绝对导入
21 | import sys
22 | import os
23 | # 添加项目根目录到Python路径
24 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
25 | if project_root not in sys.path:
26 | sys.path.insert(0, project_root)
27 | from src.graph.workflow import create_audit_workflow
28 | except ImportError:
29 | try:
30 | # 最后尝试从当前目录导入
31 | from graph.workflow import create_audit_workflow
32 | except ImportError:
33 | raise ImportError("无法导入工作流模块,请检查项目结构")
34 |
35 | # 导入状态模块
36 | try:
37 | from src.graph.state import (
38 | AuditState,
39 | create_initial_state
40 | )
41 | except ImportError:
42 | try:
43 | from graph.state import (
44 | AuditState,
45 | create_initial_state
46 | )
47 | except ImportError:
48 | # 如果都失败,尝试使用系统路径
49 | import sys
50 | import os
51 | project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
52 | if project_root not in sys.path:
53 | sys.path.insert(0, project_root)
54 | try:
55 | from src.graph.state import (
56 | AuditState,
57 | create_initial_state
58 | )
59 | except ImportError:
60 | raise ImportError("无法导入状态模块,请检查项目结构")
61 |
62 | try:
63 | from src.config.api_config import configure_pdf_api
64 | except ImportError:
65 | try:
66 | from config.api_config import configure_pdf_api
67 | except ImportError:
68 | def configure_pdf_api(*args, **kwargs):
69 | print("⚠️ API配置模块未加载,使用默认配置")
70 |
71 | # 删除未使用的 configure_pdf_api_endpoint 导入
72 |
73 | try:
74 | from src.tools.langsmith_utils import setup_langsmith_environment
75 | except ImportError:
76 | try:
77 | from tools.langsmith_utils import setup_langsmith_environment
78 | except ImportError:
79 | def setup_langsmith_environment():
80 | print("⚠️ LangSmith工具未加载")
81 |
82 | # 初始化主工作流
83 | if os.getenv("LANGSMITH_API_KEY"):
84 | setup_langsmith_environment()
85 | print("✅ LangSmith追踪已启用")
86 |
87 | # 使用统一的主工作流
88 | graph = create_audit_workflow()
89 | print("✅ 主审核工作流已就绪")
90 |
91 | # 导出主要接口
92 | __all__ = [
93 | "graph",
94 | "run_audit",
95 | "run_audit_with_tracing",
96 | "debug_audit",
97 | "configure_pdf_api",
98 | "run_pdf_audit",
99 | "AuditState",
100 | "create_initial_state"
101 | ]
102 |
103 |
104 | async def run_audit(uploaded_file: str, session_id: Optional[str] = None) -> dict:
105 | """
106 | 运行审核工作流的便捷函数(异步版本)
107 |
108 | Args:
109 | uploaded_file: 上传的文件路径
110 | session_id: 会话ID(可选)
111 |
112 | Returns:
113 | 审核结果
114 | """
115 | # 创建初始状态
116 | initial_state = create_initial_state(uploaded_file, session_id)
117 |
118 | # 确保PDF API端点配置(修复:在ZIP文件审核中也设置)
119 | if not initial_state.get("pdf_api_endpoint"):
120 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain"
121 | initial_state["pdf_api_endpoint"] = api_endpoint
122 | print(f"🔧 为ZIP文件审核设置PDF API端点: {api_endpoint}")
123 |
124 | # 为基础审核模式创建配置
125 | config = None
126 | if session_id:
127 | config = {"configurable": {"thread_id": session_id}}
128 |
129 | try:
130 | # 执行工作流(使用异步API)
131 | print(f"🚀 开始审核流程: {uploaded_file}")
132 | if config:
133 | result = await graph.ainvoke(initial_state, config) # type: ignore
134 | else:
135 | result = await graph.ainvoke(initial_state)
136 |
137 | print(f"✅ 审核完成! 最终状态: {result.get('current_step', '未知')}")
138 | return result
139 |
140 | except Exception as e:
141 | print(f"❌ 审核失败: {str(e)}")
142 | return {
143 | "error": str(e),
144 | "current_step": "failed",
145 | "error_message": str(e)
146 | }
147 |
148 |
149 | async def run_audit_with_tracing(
150 | uploaded_file: str,
151 | session_id: Optional[str] = None,
152 | run_name: Optional[str] = None,
153 | tags: Optional[list] = None
154 | ) -> dict:
155 | """
156 | 运行带LangSmith追踪的审核工作流(异步版本)
157 |
158 | Args:
159 | uploaded_file: 上传的文件路径
160 | session_id: 会话ID(可选)
161 | run_name: 运行名称
162 | tags: 标签列表
163 |
164 | Returns:
165 | 审核结果
166 | """
167 | try:
168 | from src.tools.langsmith_utils import create_run_config, with_langsmith_tracing
169 |
170 | # 创建初始状态
171 | initial_state = create_initial_state(uploaded_file, session_id)
172 |
173 | # 确保PDF API端点配置(修复:在带追踪审核中也设置)
174 | if not initial_state.get("pdf_api_endpoint"):
175 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain"
176 | initial_state["pdf_api_endpoint"] = api_endpoint
177 | print(f"🔧 为带追踪审核设置PDF API端点: {api_endpoint}")
178 |
179 | # 创建带追踪的配置
180 | config = create_run_config(
181 | run_name=run_name or f"audit_with_tracing_{session_id or 'default'}",
182 | tags=tags or ["web", "tracing", "production"],
183 | thread_id=session_id
184 | )
185 |
186 | print(f"🔍 开始带追踪的审核流程: {uploaded_file}")
187 | print(f"📊 运行名称: {config.get('run_name')}")
188 | print(f"🏷️ 标签: {config.get('tags', [])}")
189 |
190 | # 使用带追踪的图执行(异步版本)
191 | @with_langsmith_tracing
192 | async def traced_audit():
193 | return await graph.ainvoke(initial_state, config) # type: ignore
194 |
195 | result = await traced_audit()
196 |
197 | print(f"✅ 带追踪审核完成! 最终状态: {result.get('current_step', '未知')}")
198 | return result
199 |
200 | except Exception as e:
201 | print(f"❌ 带追踪审核失败: {str(e)}")
202 | return {
203 | "error": str(e),
204 | "current_step": "failed",
205 | "error_message": str(e)
206 | }
207 |
208 |
209 | async def debug_audit(
210 | uploaded_file: str,
211 | session_id: Optional[str] = None,
212 | breakpoints: Optional[list] = None
213 | ) -> dict:
214 | """
215 | 运行调试模式的审核工作流(异步版本)
216 |
217 | Args:
218 | uploaded_file: 上传的文件路径
219 | session_id: 会话ID(可选)
220 | breakpoints: 断点列表
221 |
222 | Returns:
223 | 审核结果
224 | """
225 | try:
226 | from src.tools.langsmith_utils import create_debug_config, event_logger
227 |
228 | # 创建初始状态
229 | initial_state = create_initial_state(uploaded_file, session_id)
230 |
231 | # 确保PDF API端点配置(修复:在调试模式中也设置)
232 | if not initial_state.get("pdf_api_endpoint"):
233 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain"
234 | initial_state["pdf_api_endpoint"] = api_endpoint
235 | print(f"🔧 为调试模式设置PDF API端点: {api_endpoint}")
236 |
237 | # 创建调试配置
238 | config = create_debug_config(breakpoints=breakpoints)
239 | if session_id:
240 | config["configurable"] = {"thread_id": session_id}
241 |
242 | print(f"🐛 开始调试模式审核流程: {uploaded_file}")
243 | print(f"🔧 断点: {breakpoints or ['无']}")
244 |
245 | # 清空事件日志
246 | event_logger.clear_events()
247 |
248 | # 执行工作流(异步版本)
249 | result = await graph.ainvoke(initial_state, config) # type: ignore
250 |
251 | # 收集调试信息
252 | debug_events = event_logger.get_events()
253 |
254 | print(f"✅ 调试模式审核完成! 最终状态: {result.get('current_step', '未知')}")
255 | print(f"📝 记录了 {len(debug_events)} 个调试事件")
256 |
257 | # 在结果中包含调试信息
258 | result["debug_events"] = debug_events
259 | return result
260 |
261 | except Exception as e:
262 | print(f"❌ 调试模式审核失败: {str(e)}")
263 | return {
264 | "error": str(e),
265 | "current_step": "failed",
266 | "error_message": str(e)
267 | }
268 |
269 |
270 | async def run_pdf_audit(
271 | uploaded_file: str,
272 | api_endpoint: str,
273 | session_id: Optional[str] = None,
274 | with_tracing: bool = False
275 | ) -> dict:
276 | """
277 | 运行PDF审核工作流(异步版本)
278 |
279 | Args:
280 | uploaded_file: 上传的ZIP文件路径
281 | api_endpoint: PDF提取API端点
282 | session_id: 会话ID(可选)
283 | with_tracing: 是否启用LangSmith追踪
284 |
285 | Returns:
286 | 审核结果
287 | """
288 | try:
289 | # 配置PDF API端点
290 | configure_pdf_api(api_endpoint)
291 | print(f"🔧 已配置PDF提取API: {api_endpoint}")
292 |
293 | # 创建初始状态
294 | initial_state = create_initial_state(uploaded_file, session_id)
295 |
296 | # 直接设置API端点(现在AuditState已经支持这个字段)
297 | initial_state["pdf_api_endpoint"] = api_endpoint
298 |
299 | # 选择执行模式
300 | if with_tracing:
301 | print(f"🔍 开始PDF审核流程(启用追踪): {uploaded_file}")
302 | return await run_audit_with_tracing(
303 | uploaded_file,
304 | session_id,
305 | run_name=f"pdf_audit_{session_id or 'default'}",
306 | tags=["pdf", "api_extraction", "production"]
307 | )
308 | else:
309 | print(f"🚀 开始PDF审核流程: {uploaded_file}")
310 |
311 | # 为基础审核模式创建配置
312 | config = None
313 | if session_id:
314 | config = {"configurable": {"thread_id": session_id}}
315 |
316 | # 执行工作流(异步版本)
317 | if config:
318 | result = await graph.ainvoke(initial_state, config) # type: ignore
319 | else:
320 | result = await graph.ainvoke(initial_state)
321 |
322 | print(f"✅ PDF审核完成! 最终状态: {result.get('current_step', '未知')}")
323 | return result
324 |
325 | except Exception as e:
326 | print(f"❌ PDF审核失败: {str(e)}")
327 | return {
328 | "error": str(e),
329 | "current_step": "failed",
330 | "error_message": str(e),
331 | "pdf_api_endpoint": api_endpoint
332 | }
333 |
334 |
335 |
336 |
337 |
338 | async def main_async():
339 | """命令行入口点(异步版本)"""
340 |     # 解析命令行参数
341 |     import argparse
342 |
343 | parser = argparse.ArgumentParser(description='LangGraph 职称材料审核系统')
344 | parser.add_argument('file_path', help='要审核的ZIP文件路径')
345 | parser.add_argument('--session-id', help='会话ID(可选)')
346 |
347 |
348 | args = parser.parse_args()
349 |
350 | # 统一使用主审核函数(异步版本)
351 | result = await run_audit(args.file_path, args.session_id)
352 | print(f"✅ 审核结果: {result}")
353 |
354 | return result
355 |
356 | def main():
357 | """命令行入口点(用于pyproject.toml脚本配置)"""
358 | import asyncio
359 | return asyncio.run(main_async())
360 |
361 |
362 | if __name__ == "__main__":
363 | # 示例用法
364 | import os
365 | import asyncio
366 |
367 | async def example_usage():
368 | # 检查测试数据
369 | test_file = "test_data/sample.zip"
370 |
371 | if os.path.exists(test_file):
372 | print("🧪 运行测试审核...")
373 | result = await run_audit(test_file)
374 | print(f"📊 审核结果: {result}")
375 | else:
376 | print("📋 主代理已就绪,可以通过以下方式使用:")
377 | print(" from src.agent import run_audit")
378 | print(" import asyncio")
379 | print(" result = asyncio.run(run_audit('path/to/your/file.zip'))")
380 | print("\n🔧 或者直接使用图对象:")
381 | print(" from src.agent import graph")
382 | print(" result = await graph.ainvoke(initial_state)")
383 |
384 | asyncio.run(example_usage())
--------------------------------------------------------------------------------
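All entry points in agent.py are coroutines. A usage sketch for synchronous callers (hypothetical file path; assumes the project root is on `sys.path`):

```python
# Usage sketch: drive the async run_audit from plain synchronous code.
# Failures come back as a result dict (error_message set), not an exception.
import asyncio

from src.agent import run_audit


def audit_zip(path: str) -> None:
    result = asyncio.run(run_audit(path, session_id="demo-session"))
    if result.get("error_message"):
        print("audit failed:", result["error_message"])
    else:
        print("final step:", result.get("current_step"))
        for line in result.get("processing_logs", []):
            print(" ", line)


if __name__ == "__main__":
    audit_zip("test_data/sample.zip")  # hypothetical sample archive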
/static/index.html:
--------------------------------------------------------------------------------
[HTML markup was lost in this export; only the page's visible text survives.]
- Page title: LangGraph 职称评审材料审核系统
- Tagline: 基于人工智能的智能职称材料审核,支持实时流式处理和进度追踪
- Feature cards: 智能处理(AI驱动的内容提取和规则验证); 实时更新(流式API支持实时进度追踪); 详细报告(生成完整的HTML审核报告)
- Task info fields: 任务ID, 文件名, 文件大小, 会话ID, 开始时间, 状态
- Progress panel: 审核进度 0%, 准备开始...
--------------------------------------------------------------------------------
/src/nodes/validation.py:
--------------------------------------------------------------------------------
1 | """
2 | AI驱动的规则校验节点 - 基于rules文件夹中的Excel规则
3 | """
4 |
5 | from typing import Dict, List, Any
6 | from src.graph.state import AuditState
7 |
8 | # 导入AI工具
9 | try:
10 | from src.tools.ai_utils import validate_material_with_ai
11 | _ai_utils_available = True
12 | except ImportError:
13 | _ai_utils_available = False
14 | validate_material_with_ai = None
15 |
16 |
17 | def validation_node(state: AuditState) -> Dict[str, Any]:
18 | """
19 | 完全无缓存的AI智能校验节点 - 每次都处理全新数据
20 |
21 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的
22 | """
23 | try:
24 | print(f"⚡ 开始无缓存AI智能校验...")
25 |
26 | # 直接获取当前状态的材料内容和规则数据 - 不使用任何缓存
27 | extracted_content = state.get("api_extraction_results", {}) or state.get("extracted_content", {})
28 | parsed_rules = state.get("parsed_rules", [])
29 | rules_by_category = state.get("rules_by_category", {})
30 |
31 | print(f"🔍 当前状态数据:")
32 | print(f" 材料数量: {len(extracted_content)}")
33 | print(f" 规则数量: {len(parsed_rules)}")
34 | print(f" 规则分类: {list(rules_by_category.keys())}")
35 |
36 | if not extracted_content:
37 | print("⚠️ 未找到可校验的材料内容")
38 | return {
39 | "current_step": "validation_completed",
40 | "processing_logs": ["未找到可校验的材料内容"]
41 | }
42 |
43 | # 直接处理所有材料 - 不使用队列缓存机制
44 | validation_results = []
45 | material_validation = {}
46 | total_materials = len(extracted_content)
47 | processed_count = 0
48 |
49 | print(f"📋 开始校验{total_materials}个材料类型")
50 |
51 | # 直接遍历处理每个材料 - 完全无缓存
52 | for material_type, material_data in extracted_content.items():
53 | processed_count += 1
54 | print(f"🔍 正在校验: {material_type} ({processed_count}/{total_materials})")
55 |
56 | try:
57 | # 数据预处理:确保是单个材料的数据
58 | if isinstance(material_data, list) and len(material_data) > 0:
59 | actual_data = material_data[0] if material_data else {}
60 | elif isinstance(material_data, dict):
61 | actual_data = material_data
62 | else:
63 | actual_data = {"content": material_data, "material_type": material_type}
64 |
65 | # 提取材料内容
66 | material_content = _extract_material_content(actual_data)
67 |
68 | # 🎯 智能规则匹配:教育经历材料只与教育经历规则集匹配
69 | matched_rules = _get_matched_rules_for_material(material_type, rules_by_category, parsed_rules)
70 | print(f"🎯 {material_type} 匹配到 {len(matched_rules)} 条相关规则")
71 |
72 | # 使用AI工具进行校验,将规则作为prompt的一部分
73 | material_results = None
74 |
75 | if _ai_utils_available and validate_material_with_ai and material_content.strip():
76 | print(f"✅ 使用AI校验: {material_type}")
77 |
78 | try:
79 | # 使用匹配的规则进行AI校验,而不是所有规则
80 | if matched_rules and len(matched_rules) > 0:
81 | print(f"📤 向AI传递{len(matched_rules)}条匹配的{material_type}规则")
82 |
83 | ai_results = validate_material_with_ai(
84 | material_type,
85 | material_content,
86 | rules_context=matched_rules
87 | )
88 | else:
89 | print(f"⚠️ {material_type}未找到匹配的规则,跳过AI校验")
90 | ai_results = []
91 |
92 | if ai_results and len(ai_results) > 0:
93 | print(f"✅ AI校验成功,生成{len(ai_results)}个结果")
94 | # 转换AI结果格式
95 | converted_results = []
96 | for ai_result in ai_results:
97 | converted_result = {
98 | "rule_name": ai_result.get("rule_name", f"{material_type}规则校验"),
99 | "result": _convert_ai_status_to_result(ai_result.get("status", "WARNING")),
100 | "details": ai_result.get("message", "校验完成"),
101 | "priority": _convert_ai_status_to_priority(ai_result.get("status", "WARNING")),
102 | "material_type": material_type,
103 | "rule_content": ai_result.get("rule_content", ""),
104 | "ai_powered": True,
105 | "timestamp": _get_current_timestamp()
106 | }
107 | converted_results.append(converted_result)
108 | validation_results.append(converted_result)
109 |
110 | material_results = converted_results
111 | else:
112 | print(f"⚠️ AI校验返回空结果")
113 |
114 | except Exception as ai_error:
115 | print(f"⚠️ AI校验失败: {ai_error}")
116 | else:
117 | print(f"⚠️ AI工具不可用或无内容")
118 |
119 | # 如果AI校验失败,创建基础结果
120 | if not material_results:
121 | print(f"🔧 为{material_type}创建基础校验结果")
122 | basic_result = {
123 | "rule_name": f"{material_type}基础校验",
124 | "result": "⚠️警告",
125 | "details": "未能进行AI校验,仅进行了基础检查",
126 | "priority": "中",
127 | "material_type": material_type,
128 | "rule_content": "",
129 | "ai_powered": False,
130 | "timestamp": _get_current_timestamp()
131 | }
132 | material_results = [basic_result]
133 | validation_results.append(basic_result)
134 |
135 | # 存储到material_validation中以兼容现有系统
136 | material_validation[material_type] = material_results
137 |
138 | print(f"✅ {material_type}校验完成,生成{len(material_results)}个结果")
139 |
140 | except Exception as material_error:
141 | print(f"❌ 校验{material_type}时发生错误: {str(material_error)}")
142 | # 为失败的材料创建错误记录
143 | error_result = {
144 | "rule_name": f"{material_type}校验错误",
145 | "result": "❌不通过",
146 | "details": f"校验过程发生错误: {str(material_error)}",
147 | "priority": "高",
148 | "material_type": material_type,
149 | "rule_content": "",
150 | "timestamp": _get_current_timestamp()
151 | }
152 | validation_results.append(error_result)
153 | material_validation[material_type] = [error_result]
154 |
155 | # 直接返回结果,不使用任何缓存机制
156 | print(f"✅ 无缓存规则校验完成:处理{processed_count}个材料类型,生成{len(validation_results)}项结果")
157 |
158 | # 构建详细结果与摘要(供报告使用)
159 | try:
160 | from src.models.state import ValidationResult, ValidationSummary
161 | detailed_results = []
162 | for rd in validation_results:
163 | try:
164 | detailed_results.append(ValidationResult.from_validation_output(rd))
165 | except Exception as conv_err:
166 | print(f"⚠️ 转换验证结果失败: {conv_err}")
167 | summary = ValidationSummary.from_validation_results(detailed_results) if detailed_results else None
168 | except Exception as model_err:
169 | print(f"⚠️ 生成验证模型失败: {model_err}")
170 | detailed_results = []
171 | summary = None
172 |
173 | return {
174 | "material_validation": material_validation,
175 | "validation_cache": validation_results,
176 | "validation_results_detailed": [r.dict() for r in detailed_results],
177 | "validation_summary": summary.dict() if summary else None,
178 | "current_step": "validation_completed",
179 | "processing_logs": [
180 | f"处理了{processed_count}个材料类型",
181 | f"生成了{len(validation_results)}项校验结果",
182 | "已完全取消缓存机制,确保数据全新"
183 | ]
184 | }
185 |
186 | except Exception as e:
187 | print(f"❌ 规则校验失败: {str(e)}")
188 | return {
189 | "current_step": "validation_failed",
190 | "error_message": f"规则校验失败: {str(e)}"
191 | }
192 |
193 |
194 | def _process_validation_results(material_type: str, validation_results: List,
195 | validation_cache_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
196 | """
197 | 处理AI校验结果并存入缓存
198 | """
199 | processed_results = []
200 |
201 | if isinstance(validation_results, list) and len(validation_results) > 0:
202 | for result in validation_results:
203 | if isinstance(result, dict):
204 | result['timestamp'] = _get_current_timestamp()
205 | processed_results.append(result)
206 | validation_cache_results.append(result)
207 | else:
208 | # 其他类型,转换为字典
209 | result_dict = {
210 | "rule_name": f"{material_type}校验",
211 | "result": "⚠️警告",
212 | "details": str(result),
213 | "priority": "中",
214 | "material_type": material_type,
215 | "rule_content": "",
216 | "timestamp": _get_current_timestamp()
217 | }
218 | processed_results.append(result_dict)
219 | validation_cache_results.append(result_dict)
220 | else:
221 | # 空结果
222 | result_dict = {
223 | "rule_name": f"{material_type}校验",
224 | "result": "⚠️警告",
225 | "details": "未能生成有效的校验结果",
226 | "priority": "中",
227 | "material_type": material_type,
228 | "rule_content": "",
229 | "timestamp": _get_current_timestamp()
230 | }
231 | processed_results.append(result_dict)
232 | validation_cache_results.append(result_dict)
233 |
234 | return processed_results
235 |
236 |
237 | def _get_current_timestamp() -> str:
238 | """获取当前时间戳"""
239 | from datetime import datetime
240 | return datetime.now().isoformat()
241 |
242 |
243 | def _convert_ai_status_to_result(status: str) -> str:
244 | """将AI状态转换为结果格式"""
245 | status_upper = status.upper()
246 | if status_upper == "PASS":
247 | return "✅通过"
248 | elif status_upper == "WARNING":
249 | return "⚠️警告"
250 | elif status_upper == "ERROR":
251 | return "❌不通过"
252 | else:
253 | return "⚠️警告" # 默认
254 |
255 |
256 | def _convert_ai_status_to_priority(status: str) -> str:
257 | """将AI状态转换为优先级"""
258 | status_upper = status.upper()
259 | if status_upper == "ERROR":
260 | return "高"
261 | elif status_upper == "WARNING":
262 | return "中"
263 | elif status_upper == "PASS":
264 | return "低"
265 | else:
266 | return "中" # 默认
267 |
268 |
269 | def _get_matched_rules_for_material(material_type: str, rules_by_category: Dict[str, List[Any]], all_rules: List[Any]) -> List[Any]:
270 | """
271 | 🎯 智能规则匹配:教育经历材料只与教育经历规则集匹配
272 |
273 | Args:
274 | material_type: 材料类型(如"教育经历")
275 | rules_by_category: 按分类组织的规则
276 | all_rules: 所有规则列表(备用)
277 |
278 | Returns:
279 | 匹配的规则列表
280 | """
281 | try:
282 | print(f"🔍 正在为{material_type}匹配规则...")
283 |
284 | # 1-17项材料分类映射表
285 | material_to_category = {
286 | # 直接匹配数字编号
287 | "1.教育经历": "1",
288 | "2.工作经历": "2",
289 | "3.继续教育": "3",
290 | "4.学术技术兼职情况": "4",
291 | "5.获奖情况": "5",
292 | "6.获得荣誉称号情况": "6",
293 | "7.主持参与科研项目": "7",
294 | "8.主持参与工程技术项目情况": "8",
295 | "9.论文": "9",
296 | "10.著(译)作(教材)": "10",
297 | "11.专利(著作权)情况": "11",
298 | "12.主持参与指定标准情况": "12",
299 | "13.成果被批示、采纳、运用和推广情况": "13",
300 | "14.资质证书": "14",
301 | "15.奖惩情况": "15",
302 | "16.考核情况": "16",
303 | "17.申报材料附件信息": "17",
304 |
305 | # 关键词匹配
306 | "教育经历": "1",
307 | "工作经历": "2",
308 | "继续教育": "3",
309 | "培训情况": "3",
310 | "学术技术兼职": "4",
311 | "获奖": "5",
312 | "荣誉称号": "6",
313 | "科研项目": "7",
314 | "工程项目": "8",
315 | "项目经历": "8",
316 | "论文": "9",
317 | "著作": "10",
318 | "教材": "10",
319 | "专利": "11",
320 | "著作权": "11",
321 | "标准": "12",
322 | "成果": "13",
323 | "证书": "14",
324 | "资质": "14",
325 | "奖惩": "15",
326 | "考核": "16",
327 | "附件": "17"
328 | }
329 |
330 | # 首先尝试直接匹配
331 | category_id = material_to_category.get(material_type)
332 |
333 | # 如果直接匹配失败,尝试关键词匹配
334 | if not category_id:
335 | for keyword, cat_id in material_to_category.items():
336 | if keyword in material_type and len(keyword) > 2: # 避免过短的关键词
337 | category_id = cat_id
338 | print(f"🎯 通过关键词'{keyword}'匹配到分类 {cat_id}")
339 | break
340 |
341 | # 获取匹配的规则
342 | matched_rules = []
343 |
344 | if category_id and category_id in rules_by_category:
345 | matched_rules = rules_by_category[category_id]
346 | print(f"✅ {material_type} 匹配到分类{category_id},找到 {len(matched_rules)} 条专用规则")
347 |
348 | # 如果没有找到专用规则,查找通用规则
349 | if not matched_rules:
350 | # 查找通用规则(如交叉检验规则、通用规则等)
351 | general_rules = []
352 | for rule in all_rules:
353 |                 # 规则可能是RuleInfo对象或普通字典,统一读取来源文件名
354 |                 source_file = rule.get('source_file', '') if isinstance(rule, dict) else getattr(rule, 'source_file', '')
355 |
356 |                 if '通用' in source_file or '交叉' in source_file or '基础' in source_file:
357 | general_rules.append(rule)
358 |
359 | if general_rules:
360 | matched_rules = general_rules
361 | print(f"⚠️ {material_type} 未找到专用规则,使用 {len(general_rules)} 条通用规则")
362 |
363 | # 最后的备用方案:返回空列表(不使用所有规则)
364 | if not matched_rules:
365 | print(f"⚠️ {material_type} 未找到任何匹配的规则,将跳过校验")
366 |
367 | return matched_rules
368 |
369 | except Exception as e:
370 | print(f"⚠️ 规则匹配失败: {e}")
371 | return []
372 |
373 |
374 | def _extract_material_content(actual_data: Dict[str, Any]) -> str:
375 | """从材料数据中提取文本内容"""
376 | material_content = ""
377 | if isinstance(actual_data, dict):
378 | if "content" in actual_data:
379 | content_data = actual_data["content"]
380 | if isinstance(content_data, dict):
381 | # 尝试多种可能的内容字段
382 | for key in ["md_content", "raw_markdown", "text", "content"]:
383 | if key in content_data:
384 | material_content = str(content_data[key])
385 | break
386 | if not material_content:
387 | material_content = str(content_data)
388 | else:
389 | material_content = str(content_data)
390 | else:
391 | material_content = str(actual_data)
392 | else:
393 | material_content = str(actual_data)
394 |
395 | return material_content
--------------------------------------------------------------------------------
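The helpers in this module are pure functions, which makes the matching and status-mapping behaviour easy to verify in isolation. A quick check, assuming the module is importable as `src.nodes.validation` (plain dict rules are used here; the matcher accepts them alongside `RuleInfo` objects):

```python
# Sanity-check sketch for the pure helpers above.
from src.nodes.validation import (
    _convert_ai_status_to_priority,
    _convert_ai_status_to_result,
    _get_matched_rules_for_material,
)

rules_by_category = {
    "1": [{"content": "学历证书必须清晰可读", "source_file": "1.教育经历规则集.xlsx"}],
    "9": [{"content": "论文需提供检索页", "source_file": "9.论文规则集.xlsx"}],
}

# Direct match: "1.教育经历" maps straight to category "1".
matched = _get_matched_rules_for_material("1.教育经历", rules_by_category, [])
assert len(matched) == 1

# AI status strings normalize to the emoji results and priorities used in reports.
assert _convert_ai_status_to_result("pass") == "✅通过"
assert _convert_ai_status_to_priority("ERROR") == "高"
```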
/src/nodes/pdf_extraction.py:
--------------------------------------------------------------------------------
1 | """
2 | PDF内容提取节点
3 |
4 | 通过FastAPI接口处理PDF文件内容提取并转换为JSON格式
5 | """
6 |
7 | import json
8 | import asyncio
9 | from typing import Dict, Any, List, Optional
10 | from pathlib import Path
11 | import logging
12 |
13 | try:
14 | import aiohttp # type: ignore[import]
15 | from aiohttp import ClientTimeout # type: ignore[import]
16 | except ImportError:
17 | print("Warning: aiohttp not installed. Please install with: pip install aiohttp")
18 | aiohttp = None # type: ignore
19 | ClientTimeout = None # type: ignore
20 |
21 | try:
22 | from ..graph.state import AuditState
23 | except ImportError:
24 | from src.graph.state import AuditState
25 |
26 | logger = logging.getLogger(__name__)
27 |
28 |
29 | async def extract_pdf_via_api(pdf_file_path: str, api_endpoint: str) -> Dict[str, Any]:
30 | """
31 | 通过FastAPI提取PDF内容为JSON
32 |
33 | 基于用户提供的工作案例,使用aiohttp实现类似requests的参数传递方式:
34 | - 基础URL和查询参数分开处理
35 | - 逐个上传PDF文件(不是压缩包)
36 | - 使用multipart/form-data格式
37 |
38 | Args:
39 | pdf_file_path: PDF文件路径
40 | api_endpoint: API端点URL(不包含查询参数)
41 |
42 | Returns:
43 | 提取的JSON内容
44 | """
45 | if aiohttp is None:
46 | return {
47 | "success": False,
48 | "error": "aiohttp库未安装,请使用 pip install aiohttp 安装",
49 | "file_path": pdf_file_path
50 | }
51 |
52 | try:
53 | # 按照用户案例的方式设置参数
54 | params = {
55 | 'parse_method': 'auto',
56 | 'is_json_md_dump': 'false',
57 | 'output_dir': 'output',
58 | 'return_layout': 'false',
59 | 'return_info': 'false',
60 | 'return_content_list': 'false',
61 | 'return_images': 'false'
62 | }
63 |
64 | # 创建请求头
65 | headers = {
66 | "accept": "application/json",
67 | "User-Agent": "LangGraph-PDF-Extractor/1.0"
68 | }
69 |
70 | print(f"📤 正在上传PDF文件: {Path(pdf_file_path).name} 到 {api_endpoint}")
71 |
72 | async with aiohttp.ClientSession() as session:
73 | # 异步读取文件内容
74 | try:
75 |                 file_content = await asyncio.to_thread(Path(pdf_file_path).read_bytes)  # read_bytes确保文件句柄被关闭
76 | except Exception as file_error:
77 | error_msg = f"读取PDF文件失败: {str(file_error)}"
78 | print(f"❌ {error_msg}")
79 | return {
80 | "success": False,
81 | "error": error_msg,
82 | "file_path": pdf_file_path,
83 | "api_endpoint": api_endpoint
84 | }
85 |
86 | # 按照用户案例创建文件数据
87 | data = aiohttp.FormData()
88 | data.add_field(
89 | 'pdf_file', # 与用户案例中的字段名一致
90 | file_content,
91 | filename=Path(pdf_file_path).name,
92 | content_type='application/pdf'
93 | )
94 |
95 | # 使用params参数传递查询参数,类似requests.post(url, params=params, files=files)
96 |             # 创建超时设置(执行到此处aiohttp必然可用,ClientTimeout同理)
97 |             timeout = ClientTimeout(total=120)
98 |
99 | async with session.post(
100 | api_endpoint,
101 | params=params, # 查询参数单独传递
102 | data=data, # 文件数据
103 | headers=headers,
104 | timeout=timeout
105 | ) as response:
106 | print(f"📊 API响应状态码: {response.status}")
107 |
108 | if response.status == 200:
109 | try:
110 | result = await response.json()
111 | print(f"✅ 成功提取PDF内容: {Path(pdf_file_path).name}")
112 | print(f"📋 API返回结构: {list(result.keys()) if isinstance(result, dict) else type(result)}")
113 | return {
114 | "success": True,
115 | "content": result,
116 | "file_path": pdf_file_path,
117 | "api_endpoint": str(response.url),
118 | "extraction_timestamp": None
119 | }
120 | except Exception as json_error:
121 | error_text = await response.text()
122 | print(f"⚠️ API返回非JSON格式: {json_error}")
123 | return {
124 | "success": False,
125 | "error": f"API返回非JSON格式: {json_error}",
126 | "error_details": error_text[:500],
127 | "file_path": pdf_file_path,
128 | "api_endpoint": str(response.url)
129 | }
130 | else:
131 | error_text = await response.text()
132 | print(f"❌ API返回错误状态码 {response.status}: {error_text[:200]}...")
133 | return {
134 | "success": False,
135 | "error": f"API返回错误状态码: {response.status}",
136 | "error_details": error_text,
137 | "file_path": pdf_file_path,
138 | "api_endpoint": str(response.url)
139 | }
140 |
141 | except FileNotFoundError:
142 | error_msg = f"找不到PDF文件: {pdf_file_path}"
143 | print(f"❌ {error_msg}")
144 | return {
145 | "success": False,
146 | "error": error_msg,
147 | "file_path": pdf_file_path,
148 | "api_endpoint": api_endpoint
149 | }
150 | except Exception as e:
151 | error_msg = f"API调用失败: {str(e)}"
152 | print(f"❌ {error_msg}")
153 | return {
154 | "success": False,
155 | "error": error_msg,
156 | "file_path": pdf_file_path,
157 | "api_endpoint": api_endpoint
158 | }
159 |
160 |
161 | async def pdf_extraction_node(state: AuditState) -> Dict[str, Any]:
162 | """
163 | 完全无缓存的PDF内容提取节点 - 每次都处理全新数据
164 |
165 | 🚨 已完全取消缓存机制,确保每次传输的信息都是全新的、一次性的
166 | """
167 | try:
168 | print(f"📄 开始无缓存PDF内容提取...")
169 |
170 | # 直接获取当前状态的文件夹数据 - 不使用任何缓存
171 | folder_validation = state.get("folder_validation", {})
172 |
173 | print(f"🔍 当前状态数据:")
174 | print(f" 文件夹验证结果: {len(folder_validation.get('folders_found', []))} 个文件夹")
175 |
176 | # 验证数据有效性
177 | if not folder_validation or not folder_validation.get("folders_found"):
178 | print("⚠️ 未找到有效的文件夹结构数据")
179 | return {
180 | "current_step": "pdf_extraction_failed",
181 | "error_message": "没有找到有效的文件夹结构",
182 | "processing_logs": ["没有找到有效的文件夹结构"]
183 | }
184 |
185 | # 获取PDF API端点配置
186 | api_endpoint = state.get("pdf_api_endpoint")
187 | if not api_endpoint:
188 | # 尝试使用默认配置
189 | api_endpoint = "http://183.203.184.233:8888/pdf_parse_supplychain"
190 | print(f"⚠️ 状态中未配置PDF API端点,使用默认端点: {api_endpoint}")
191 |
192 | # 检查是否有配置文件
193 | try:
194 | from src.config.api_config import get_pdf_api_config
195 | api_config = get_pdf_api_config()
196 | configured_endpoint = api_config.get("pdf_extraction_endpoint")
197 | if configured_endpoint:
198 | api_endpoint = configured_endpoint
199 | print(f"✅ 从配置文件获取到API端点: {api_endpoint}")
200 | except ImportError:
201 | print("⚠️ 无法导入API配置模块,使用硬编码默认端点")
202 | except Exception as e:
203 | print(f"⚠️ 读取API配置失败: {e},使用硬编码默认端点")
204 |
205 | # 如果仍然没有API端点,返回错误
206 | if not api_endpoint:
207 | return {
208 | "current_step": "pdf_extraction_failed",
209 | "error_message": "未配置PDF提取API端点,请检查配置文件或环境变量"
210 | }
211 |
212 | folders_found = folder_validation["folders_found"]
213 | pdf_extraction_results = {}
214 | api_extraction_results = {}
215 | total_pdf_files = 0
216 | successful_extractions = 0
217 |
218 | # 处理每个标准文件夹中的PDF文件
219 | for folder_info in folders_found:
220 | folder_name = folder_info["name"]
221 | folder_path = folder_info["path"]
222 |
223 | print(f"📁 处理文件夹: {folder_name}")
224 |
225 | # 查找文件夹中的PDF文件(异步方式)
226 | folder_path_obj = Path(folder_path)
227 |
228 | # 使用asyncio.to_thread来异步执行文件系统操作
229 | try:
230 | pdf_files = await asyncio.to_thread(lambda: list(folder_path_obj.glob("*.pdf")))
231 | except Exception as glob_error:
232 | print(f"❌ 扫描文件夹 {folder_name} 时发生错误: {str(glob_error)}")
233 | pdf_extraction_results[folder_name] = {
234 | "files": [],
235 | "folder_path": folder_path,
236 | "material_type": folder_name,
237 | "pdf_files_count": 0,
238 | "status": "error",
239 | "error": str(glob_error)
240 | }
241 | continue
242 |
243 | if not pdf_files:
244 | print(f"⚠️ 文件夹 {folder_name} 中没有找到PDF文件")
245 | pdf_extraction_results[folder_name] = {
246 | "files": [],
247 | "folder_path": folder_path,
248 | "material_type": folder_name,
249 | "pdf_files_count": 0,
250 | "status": "empty"
251 | }
252 | continue
253 |
254 | total_pdf_files += len(pdf_files)
255 | folder_results = []
256 |
257 | # 使用asyncio并发处理PDF文件提取
258 | tasks = []
259 | for pdf_file in pdf_files:
260 | task = extract_pdf_via_api(str(pdf_file), api_endpoint)
261 | tasks.append(task)
262 |
263 | # 并发执行API调用
264 | results = await asyncio.gather(*tasks, return_exceptions=True)
265 |
266 | for pdf_file, result in zip(pdf_files, results):
267 | if isinstance(result, Exception):
268 | print(f"❌ 处理文件 {pdf_file.name} 时发生异常: {str(result)}")
269 | folder_results.append({
270 | "file_name": pdf_file.name,
271 | "file_path": str(pdf_file),
272 | "success": False,
273 | "error": str(result),
274 | "material_type": folder_name
275 | })
276 | elif isinstance(result, dict) and result.get("success"):
277 | print(f"✅ 成功提取 {pdf_file.name}")
278 | successful_extractions += 1
279 |
280 | # 异步获取文件大小
281 | try:
282 | file_size = await asyncio.to_thread(lambda: pdf_file.stat().st_size)
283 | except Exception as stat_error:
284 | print(f"⚠️ 获取文件大小失败: {stat_error}")
285 | file_size = 0
286 |
287 | # 构建标准化JSON格式
288 | standardized_json = {
289 | "metadata": {
290 | "file_name": pdf_file.name,
291 | "file_path": str(pdf_file),
292 | "size_bytes": file_size,
293 | "material_type": folder_name,
294 | "extraction_method": "api"
295 | },
296 | "content": result.get("content", {}),
297 | "validation": {
298 | "is_valid": True,
299 | "api_endpoint": api_endpoint,
300 | "extraction_timestamp": result.get("extraction_timestamp")
301 | }
302 | }
303 |
304 | folder_results.append({
305 | "file_name": pdf_file.name,
306 | "file_path": str(pdf_file),
307 | "success": True,
308 | "json_data": standardized_json,
309 | "json_string": json.dumps(standardized_json, ensure_ascii=False, indent=2),
310 | "format": "strict_json",
311 | "size": len(json.dumps(standardized_json)),
312 | "material_type": folder_name
313 | })
314 |
315 | # 存储API提取结果
316 | if folder_name not in api_extraction_results:
317 | api_extraction_results[folder_name] = []
318 | api_extraction_results[folder_name].append(standardized_json)
319 |
320 | else:
321 | # 处理失败的情况
322 | error_msg = "未知错误"
323 | if isinstance(result, dict):
324 | error_msg = result.get("error", "未知错误")
325 | print(f"❌ 提取失败 {pdf_file.name}: {error_msg}")
326 | folder_results.append({
327 | "file_name": pdf_file.name,
328 | "file_path": str(pdf_file),
329 | "success": False,
330 | "error": error_msg,
331 | "material_type": folder_name
332 | })
333 |
334 | pdf_extraction_results[folder_name] = {
335 | "files": folder_results,
336 | "folder_path": folder_path,
337 | "material_type": folder_name,
338 | "pdf_files_count": len(pdf_files),
339 | "successful_count": len([r for r in folder_results if r.get("success")]),
340 | "status": "success" if folder_results else "empty"
341 | }
342 |
343 | success_folders = sum(1 for item in pdf_extraction_results.values()
344 | if item.get("status") in ["success", "empty"]) # 包括空文件夹
345 | total_folders = len(pdf_extraction_results)
346 |
347 | print(f"✅ PDF内容提取完成: {success_folders}/{total_folders}个文件夹,{successful_extractions}/{total_pdf_files}个PDF文件提取成功")
348 |
349 | # 即使没有PDF文件,只要有文件夹结构就认为成功
350 | if total_folders > 0:
351 | return {
352 | "pdf_extraction_results": pdf_extraction_results,
353 | "api_extraction_results": api_extraction_results,
354 | "extracted_content": api_extraction_results, # 保持兼容性
355 | "current_step": "pdf_extraction_completed",
356 | "processing_stats": {
357 | "total_folders": total_folders,
358 | "successful_folders": success_folders,
359 | "total_pdf_files": total_pdf_files,
360 | "successful_extractions": successful_extractions,
361 | "extraction_rate": successful_extractions / total_pdf_files if total_pdf_files > 0 else 0
362 | }
363 | }
364 | else:
365 | return {
366 | "current_step": "pdf_extraction_failed",
367 | "error_message": "未找到可处理的文件夹"
368 | }
369 |
370 | except Exception as e:
371 | logger.error(f"PDF内容提取失败: {str(e)}")
372 | print(f"❌ PDF内容提取失败: {str(e)}")
373 | return {
374 | "current_step": "pdf_extraction_failed",
375 | "error_message": f"PDF内容提取失败: {str(e)}"
376 | }
377 |
378 |
379 | def configure_pdf_api_endpoint(state: AuditState, api_endpoint: str) -> Dict[str, Any]:
380 | """
381 | 配置PDF提取API端点
382 |
383 | Args:
384 | state: 当前状态
385 | api_endpoint: API端点URL
386 |
387 | Returns:
388 | 更新的状态
389 | """
390 | return {
391 | "pdf_api_endpoint": api_endpoint,
392 | "processing_logs": [f"已配置PDF提取API端点: {api_endpoint}"]
393 | }
--------------------------------------------------------------------------------
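`pdf_extraction_node` launches one `extract_pdf_via_api` task per file and awaits them with `asyncio.gather(..., return_exceptions=True)`, so one bad file cannot sink the batch. If the parsing service ever needs protecting, a semaphore caps in-flight uploads; a sketch with a hypothetical endpoint and file names:

```python
# Bounded-concurrency sketch (hypothetical endpoint and file names).
import asyncio

from src.nodes.pdf_extraction import extract_pdf_via_api


async def extract_all(paths, endpoint, limit=4):
    sem = asyncio.Semaphore(limit)

    async def one(path):
        async with sem:  # at most `limit` uploads in flight at once
            return await extract_pdf_via_api(path, endpoint)

    # return_exceptions=True mirrors the node: isolate per-file failures
    return await asyncio.gather(*(one(p) for p in paths), return_exceptions=True)


if __name__ == "__main__":
    results = asyncio.run(
        extract_all(["a.pdf", "b.pdf"], "http://example.com/pdf_parse")
    )
    for r in results:
        print(repr(r) if isinstance(r, Exception) else r.get("success"))
```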
/src/models/state.py:
--------------------------------------------------------------------------------
1 | """
2 | 数据模型定义
3 |
4 | 定义审核流程中使用的数据模型(不包括LangGraph状态)
5 |
6 | 模型使用状态:
7 | - CoreInfo: ✅ 高度活跃 - 在多个节点中实际使用
8 | - RuleInfo: ✅ 高度活跃 - 规则处理核心模型
9 | - ValidationResult: ⚠️ 部分使用 - 主要用作类型注解
10 | - CrossValidationResult: ⚠️ 部分使用 - 主要用作类型注解
11 | - MaterialProcessingStats: ✅ 有效使用 - 在报告生成中实际使用
12 | - AuditReport: ⚠️ 部分功能未启用 - 完善但使用有限
13 |
14 | 已移除未使用模型:
15 | - FileInfo: ✖️ 已移除 - 几乎未使用
16 | - MaterialInfo: ✖️ 已移除 - 使用场景有限,可用Dict替代
17 | - ReportSummary: ✖️ 已移除 - 完全未使用
18 | """
19 |
20 | from typing import List, Dict, Any, Optional, Union
21 | from pydantic import BaseModel, Field
22 |
23 |
24 | # ============================================================================
25 | # 核心业务模型(高度活跃)
26 | # ============================================================================
27 | class CoreInfo(BaseModel):
28 | """核心信息模型(简化版) - ✅ 高度活跃模型"""
29 | name: str = Field(description="姓名,统一格式,去除空格", default="")
30 | gender: str = Field(description="性别,男/女", default="")
31 | id_number: str = Field(description="身份证号,18位标准格式", default="")
32 | extracted_from: List[str] = Field(description="信息来源材料", default_factory=list)
33 |
34 |
35 | # ============================================================================
36 | # 校验结果模型(部分使用)
37 | # ============================================================================
38 | class ValidationResult(BaseModel):
39 | """校验结果模型 - 增强版,完整存储validation节点的所有输出信息"""
40 | rule_id: str
41 | rule_name: str
42 | status: str # PASS, WARNING, ERROR
43 | result: str # "✅通过", "⚠️警告", "❌不通过"
44 | message: str
45 | details: str = Field(description="详细描述信息")
46 | priority: str = Field(description="优先级:高/中/低")
47 | material_type: str = Field(description="材料类型")
48 | rule_content: str = Field(description="应用的规则内容", default="")
49 | ai_powered: bool = Field(description="是否AI驱动的校验", default=False)
50 | rules_applied: int = Field(description="应用的规则数量", default=0)
51 | timestamp: str = Field(description="校验时间戳")
52 |
53 | @classmethod
54 | def from_validation_output(cls, validation_dict: Dict[str, Any]) -> "ValidationResult":
55 | """从validation节点输出的字典创建ValidationResult对象"""
56 | return cls(
57 | rule_id=validation_dict.get('rule_name', '').replace(' ', '_'),
58 | rule_name=validation_dict.get('rule_name', ''),
59 | status=cls._convert_result_to_status(validation_dict.get('result', '')),
60 | result=validation_dict.get('result', ''),
61 | message=validation_dict.get('details', ''),
62 | details=validation_dict.get('details', ''),
63 | priority=validation_dict.get('priority', '中'),
64 | material_type=validation_dict.get('material_type', ''),
65 | rule_content=validation_dict.get('rule_content', ''),
66 | ai_powered=validation_dict.get('ai_powered', False),
67 | rules_applied=validation_dict.get('rules_applied', 0),
68 | timestamp=validation_dict.get('timestamp', '')
69 | )
70 |
71 | @staticmethod
72 | def _convert_result_to_status(result: str) -> str:
73 | """将结果转换为状态"""
74 | if result.startswith('✅'):
75 | return 'PASS'
76 | elif result.startswith('⚠️'):
77 | return 'WARNING'
78 | elif result.startswith('❌'):
79 | return 'ERROR'
80 | else:
81 | return 'WARNING'
82 |
83 |
84 | class ValidationSummary(BaseModel):
85 | """验证结果摘要模型 - 存储validation节点的完整统计信息"""
86 | total_materials_processed: int = Field(description="处理的材料数量")
87 | total_validations: int = Field(description="总校验数量")
88 | successful_materials: int = Field(description="成功校验的材料数量")
89 | error_count: int = Field(description="错误数量")
90 | warning_count: int = Field(description="警告数量")
91 | pass_count: int = Field(description="通过数量")
92 | ai_powered_validations: int = Field(description="AI驱动的校验数量")
93 | total_rules_applied: int = Field(description="应用的规则总数")
94 | materials_by_type: Dict[str, int] = Field(description="按材料类型统计", default_factory=dict)
95 | validation_start_time: Optional[str] = Field(description="校验开始时间", default=None)
96 | validation_end_time: Optional[str] = Field(description="校验结束时间", default=None)
97 |
98 | @classmethod
99 | def from_validation_results(cls, validation_results: List[ValidationResult]) -> "ValidationSummary":
100 | """从验证结果列表创建摘要"""
101 | error_count = sum(1 for r in validation_results if r.status == 'ERROR')
102 | warning_count = sum(1 for r in validation_results if r.status == 'WARNING')
103 | pass_count = sum(1 for r in validation_results if r.status == 'PASS')
104 | ai_powered_count = sum(1 for r in validation_results if r.ai_powered)
105 | total_rules = sum(r.rules_applied for r in validation_results)
106 |
107 | materials_by_type = {}
108 | for result in validation_results:
109 | mat_type = result.material_type
110 | materials_by_type[mat_type] = materials_by_type.get(mat_type, 0) + 1
111 |
112 | return cls(
113 | total_materials_processed=len(set(r.material_type for r in validation_results)),
114 | total_validations=len(validation_results),
115 | successful_materials=len(set(r.material_type for r in validation_results if r.status != 'ERROR')),
116 | error_count=error_count,
117 | warning_count=warning_count,
118 | pass_count=pass_count,
119 | ai_powered_validations=ai_powered_count,
120 | total_rules_applied=total_rules,
121 | materials_by_type=materials_by_type
122 | )
123 |
124 |
125 | class CrossValidationResult(BaseModel):
126 | """交叉校验结果模型 - ⚠️ 主要用作类型注解,实际多使用Dict"""
127 | validation_type: str # name_consistency, id_consistency, time_logic, data_rationality
128 | status: str # PASS, WARNING, ERROR
129 | message: str
130 | conflicts: List[str] = []
131 |
132 |
133 | # ============================================================================
134 | # 规则相关模型(高度活跃)
135 | # ============================================================================
136 | class RuleInfo(BaseModel):
137 | """规则信息模型 - ✅ 高度活跃模型,在rules_processing和validation中大量使用"""
138 | rule_id: str = Field(description="规则唯一标识")
139 | content: str = Field(description="规则内容")
140 | source_file: str = Field(description="来源文件名")
141 | category: str = Field(description="1-17中的分类编号", default="17")
142 | priority: str = Field(description="优先级", default="normal")
143 |
144 |
145 | class RuleFileInfo(BaseModel):
146 | """规则文件信息模型 - ✅ 在rules_processing中使用"""
147 | file_name: str = Field(description="规则文件名")
148 | file_path: str = Field(description="文件完整路径")
149 | file_type: str = Field(description="文件类型 (.xlsx 或 .md)")
150 | size: int = Field(description="文件大小或规则数量")
151 | content: Optional[str] = Field(description="文件原始内容(仅Markdown文件)", default=None)
152 | extracted_rules: Optional[List[RuleInfo]] = Field(description="提取的规则列表(仅Excel文件)", default=None)
153 |
154 |
155 | # ============================================================================
156 | # 状态管理模型
157 | # ============================================================================
158 | class AuditState(BaseModel):
159 | """审核工作流状态定义(业务数据模型)"""
160 |
161 | # 输入文件信息
162 | uploaded_file: Optional[str] = None # 上传的文件路径
163 | file_type: str = "" # 文件类型 (zip/pdf/doc等)
164 |
165 | # 文件处理结果
166 | extracted_files: List[str] = Field(default_factory=list) # 解压后的文件列表
167 | file_classification: Dict[str, str] = Field(default_factory=dict) # 文件分类结果
168 |
169 | # PDF处理
170 | pdf_analysis: Dict[str, Any] = Field(default_factory=dict) # PDF页数分析结果
171 | pdf_chunks: Dict[str, List[str]] = Field(default_factory=dict) # PDF分片结果
172 |
173 | # 内容提取
174 | extracted_content: Dict[str, Any] = Field(default_factory=dict) # 提取的内容信息
175 | core_info: Optional[Dict[str, Any]] = None # 核心信息(姓名、身份证号)
176 |
177 | # 规则处理
178 | rules_data: List[RuleFileInfo] = Field(default_factory=list) # 加载的规则文件数据
179 | parsed_rules: List[RuleInfo] = Field(default_factory=list) # 解析后的规则列表
180 | rules_by_category: Dict[str, List[RuleInfo]] = Field(default_factory=dict) # 按1-17项分类的规则
181 |
182 | # 验证结果(完整存储)
183 | validation_results_detailed: List[ValidationResult] = Field(description="详细的验证结果列表", default_factory=list)
184 | validation_summary: Optional[ValidationSummary] = Field(description="验证结果摘要", default=None)
185 | material_validation: Dict[str, List[Any]] = Field(default_factory=dict) # 材料校验结果(兼容)
186 | cross_validation: List[Any] = Field(default_factory=list) # 交叉校验结果(并发安全)
187 | validation_results: List[Dict[str, Any]] = Field(default_factory=list) # 所有校验结果(兼容)
188 |
189 | # 报告生成
190 | audit_report: Optional["AuditReport"] = None # 生成的审核报告对象
191 | report_path: Optional[str] = None # 报告文件路径
192 |
193 | # 流程控制
194 | current_step: str = "file_processing" # 当前步骤
195 | error_message: Optional[str] = None # 错误信息
196 | warnings: List[str] = Field(default_factory=list) # 警告信息
197 | processing_logs: List[str] = Field(default_factory=list) # 处理日志
198 | is_complete: bool = False # 是否完成
199 |
200 | # Redis缓存相关
201 | session_id: Optional[str] = None # 会话ID
202 |
203 |
204 | # ============================================================================
205 | # 报告相关模型(部分功能未启用)
206 | # ============================================================================
207 |
208 |
class AuditReport(BaseModel):
    """Audit report model (enhanced) - ⚠️ fully fleshed out but lightly used, mainly as a type annotation"""

    # Basic report info
    report_id: str = Field(description="Unique report identifier")
    generated_at: str = Field(description="Generation time")
    report_version: str = Field(description="Report version", default="v2.0")

    # Applicant info
    applicant_info: CoreInfo = Field(description="Applicant core info")

    # Audit summary
    summary: Dict[str, Any] = Field(description="Audit result summary", default_factory=dict)

    # Material processing statistics
    processing_stats: Dict[str, Any] = Field(description="Processing statistics", default_factory=dict)

    # Validation results grouped by severity
    severe_issues: List[ValidationResult] = Field(description="Severe issues", default_factory=list)
    warnings: List[ValidationResult] = Field(description="Warnings", default_factory=list)
    suggestions: List[ValidationResult] = Field(description="Suggested improvements", default_factory=list)
    passed_validations: List[ValidationResult] = Field(description="Passed validations", default_factory=list)

    # Cross-validation results
    cross_validation_results: List[CrossValidationResult] = Field(description="Cross-validation results", default_factory=list)

    # Results grouped by material type
    material_results: Dict[str, List[ValidationResult]] = Field(description="Results grouped by material type", default_factory=dict)

    # Rule application statistics
    rules_applied: Dict[str, Any] = Field(description="Statistics on applied rules", default_factory=dict)

    # HTML report content
    html_content: Optional[str] = Field(description="Generated HTML report content", default=None)

    # Report file path
    file_path: Optional[str] = Field(description="Path where the report file is saved", default=None)

    # Quality score
    quality_score: Optional[float] = Field(description="Material quality score (0-100)", default=None)

    # Compliance assessment
    compliance_status: str = Field(description="Compliance status", default="PENDING")  # PASS/WARNING/FAIL/PENDING

    # Recommended actions
    recommendations: List[str] = Field(description="Improvement recommendations", default_factory=list)

    # Audit logs
    audit_logs: List[str] = Field(description="Audit process logs", default_factory=list)

    @classmethod
    def create_from_state(cls, state: Any, report_id: str) -> "AuditReport":
        """Create a report from the audit state."""
        from datetime import datetime

        # Handle both dict-style and attribute-style state access
        def get_state_value(key: str, default=None):
            if hasattr(state, 'get'):  # dict-like state
                return state.get(key, default)
            else:  # object-like state
                return getattr(state, key, default)

        # Fetch core info, falling back to an empty dict when it is missing
        core_info = get_state_value('core_info') or {}

        applicant_info = CoreInfo(
            name=core_info.get('name', '') if isinstance(core_info, dict) else '',
            gender=core_info.get('gender', '') if isinstance(core_info, dict) else '',
            id_number=core_info.get('id_number', '') if isinstance(core_info, dict) else '',
            extracted_from=core_info.get('extracted_from', []) if isinstance(core_info, dict) else []
        )

        # Build the report instance
        audit_logs = get_state_value('processing_logs', [])
        if not isinstance(audit_logs, list):
            audit_logs = []

        return cls(
            report_id=report_id,
            generated_at=datetime.now().isoformat(),
            applicant_info=applicant_info,
            processing_stats=MaterialProcessingStats.from_state(state).dict(),
            audit_logs=audit_logs
        )

    def calculate_quality_score(self) -> float:
        """Compute a quality score from the validation counts."""
        total_validations = len(self.severe_issues) + len(self.warnings) + len(self.passed_validations)
        if total_validations == 0:
            return 100.0

        # Scoring: each severe issue costs 10 points, each warning 3
        error_penalty = len(self.severe_issues) * 10
        warning_penalty = len(self.warnings) * 3
        total_penalty = error_penalty + warning_penalty

        # Clamp at zero and return a float to match the annotation
        return float(max(0, 100 - total_penalty))
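
    # Worked example (illustrative): 2 severe issues and 3 warnings give a
    # penalty of 2*10 + 3*3 = 29, so the score is 100 - 29 = 71.0; ten or
    # more severe issues alone already clamp the score to 0.0.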

    def determine_compliance_status(self) -> str:
        """Determine the compliance status."""
        if len(self.severe_issues) > 0:
            return "FAIL"
        elif len(self.warnings) > 0:
            return "WARNING"
        else:
            return "PASS"

    def get_summary_dict(self) -> Dict[str, Any]:
        """Return a summary dict for the report."""
        return {
            "total_validations": len(self.severe_issues) + len(self.warnings) + len(self.passed_validations),
            "error_count": len(self.severe_issues),
            "warning_count": len(self.warnings),
            "passed_count": len(self.passed_validations),
            "cross_validation_count": len(self.cross_validation_results),
            # Explicit None check so a legitimate score of 0.0 is not recomputed
            "quality_score": self.quality_score if self.quality_score is not None else self.calculate_quality_score(),
            "compliance_status": self.compliance_status
        }

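# Typical lifecycle sketch (the report is mostly a type annotation today, so
# this wiring is an assumption rather than the shipped flow):
#   report = AuditReport.create_from_state(state, report_id="...")
#   report.quality_score = report.calculate_quality_score()
#   report.compliance_status = report.determine_compliance_status()
#   summary = report.get_summary_dict()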

# ============================================================================
# Statistics models (actively used)
# ============================================================================


class MaterialProcessingStats(BaseModel):
    """Material processing statistics model - ✅ actually used by AuditReport"""
    files_extracted: int = Field(description="Number of extracted files", default=0)
    pdfs_processed: int = Field(description="Number of PDFs processed", default=0)
    content_extracted: bool = Field(description="Content extraction succeeded", default=False)
    core_info_extracted: bool = Field(description="Core info extraction succeeded", default=False)
    categories_classified: List[str] = Field(description="Material categories classified", default_factory=list)

    @classmethod
    def from_state(cls, state: Any) -> "MaterialProcessingStats":
        """Create processing statistics from the audit state."""
        # Handle both dict-style and attribute-style state access
        def get_state_value(key: str, default=None):
            if hasattr(state, 'get'):  # dict-like state
                return state.get(key, default)
            else:  # object-like state
                return getattr(state, key, default)

        extracted_files = get_state_value('extracted_files', []) or []
        extracted_content = get_state_value('extracted_content', {}) or {}
        core_info = get_state_value('core_info')

        return cls(
            files_extracted=len(extracted_files),
            pdfs_processed=len([f for f in extracted_files if f.lower().endswith('.pdf')]),
            content_extracted=len(extracted_content) > 0,
            core_info_extracted=bool(core_info and (
                core_info.get('name') or core_info.get('id_number')
                if isinstance(core_info, dict) else False
            )),
            categories_classified=list(extracted_content.keys()) if extracted_content else []
        )
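
# Minimal self-check sketch (assumption: CoreInfo accepts the same keyword
# fields used in AuditReport.create_from_state above; all values below are
# purely illustrative).
if __name__ == "__main__":
    demo_state = {
        "extracted_files": ["resume.pdf", "degree.pdf", "notes.txt"],
        "extracted_content": {"education": "...", "papers": "..."},
        "core_info": {"name": "Demo User", "id_number": "000000"},
        "processing_logs": ["unzipped 3 files"],
    }
    report = AuditReport.create_from_state(demo_state, report_id="demo-001")
    report.quality_score = report.calculate_quality_score()          # no issues -> 100.0
    report.compliance_status = report.determine_compliance_status()  # -> "PASS"
    print(report.get_summary_dict())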
--------------------------------------------------------------------------------