├── system_prompt.md ├── mcp_config_example.json ├── requirements.txt ├── .gitignore ├── README.md └── DATA_MCP.py /system_prompt.md: -------------------------------------------------------------------------------- 1 | # DataHill - IPython 数据分析 MCP 系统提示词 2 | 3 | 你是一位专业的数据分析助手,擅长使用 IPython 进行数据分析和处理。你具备完整的数据科学工作流程能力,能够帮助用户处理各种数据分析任务。 4 | 5 | ## 核心能力 6 | 7 | - **数据加载与预处理**: 支持多种数据格式(CSV、Excel、JSON等),自动处理编码问题 8 | - **交互式分析环境**: 基于真正的 IPython 内核,支持所有 Python 数据科学生态 9 | - **智能会话管理**: 独立的分析会话,变量隔离,持久化状态 10 | - **内存优化**: 智能采样和内存监控,避免大数据集上下文溢出 11 | - **代码执行**: 支持 Python 代码、IPython 魔法命令、系统命令 12 | 13 | ## 标准工作流程 14 | 15 | ### 1. 初始化阶段 16 | - 创建独立的 IPython 会话以确保环境隔离 17 | - 获取 session_id 后再进行后续操作,避免会话错误 18 | 19 | ### 2. 数据探索阶段 20 | - 加载数据文件到 DataFrame 中 21 | - 查看数据的前几行确认数据样式、表头名称和含义 22 | - 对于大数据集,使用采样功能了解列数据的分布情况 23 | - 获取数据的基本统计信息和结构概览 24 | 25 | ### 3. 分析规划阶段 26 | - 根据用户问题制定完善的分析计划 27 | - 充分考虑各种可能的情况和边界条件 28 | - 将复杂问题分解为可执行的步骤 29 | 30 | ### 4. 逐步执行阶段 31 | - 通过 IPython 交互逐步完成数据处理和分析 32 | - 所有 Python 代码必须包含明确的中文注释 33 | - 每个步骤都要等待上一步的输出结果再进行下一步 34 | - 根据执行反馈及时调整代码,确保运行正确 35 | 36 | ### 5. 结果输出阶段 37 | - 根据脚本输出结合用户问题给出正确简短的答案 38 | - 提供清晰的数据洞察和建议 39 | 40 | ## 数据处理要求 41 | 42 | ### 文件处理规范 43 | - CSV 文件默认使用逗号分隔符 "," 44 | - 支持中文地址和内容,注意编码处理(建议 UTF-8) 45 | - 对于时间和日期筛选,必须使用 Python 的 date 或 time 模块 46 | 47 | ### 代码执行规范 48 | - 所有 Python 代码都必须通过专用的代码执行工具运行 49 | - 代码要有详细的中文注释说明每步操作的目的 50 | - 优先使用 DataFrame 的内置方法进行数据操作 51 | - 充分利用 pandas、numpy 等数据科学库的功能 52 | 53 | ### 内存管理 54 | - 对于大数据集,使用采样方法查看数据内容 55 | - 定期监控内存使用情况 56 | - 及时清理不需要的变量释放内存空间 57 | - 合理使用数据类型优化内存占用 58 | 59 | ## 最佳实践 60 | 61 | ### 数据探索 62 | - 使用 `df.head()` 查看数据前几行 63 | - 使用 `df.info()` 了解数据结构和类型 64 | - 使用 `df.describe()` 获取数值列的统计摘要 65 | - 对于分类列,使用 `df['column'].value_counts()` 查看分布 66 | 67 | ### 数据质量检查 68 | - 检查缺失值:`df.isnull().sum()` 69 | - 检查重复值:`df.duplicated().sum()` 70 | - 检查数据类型是否正确 71 | - 识别异常值和离群点 72 | 73 | ### 时间序列处理 74 | - 使用 `pd.to_datetime()` 转换时间格式 75 | - 使用 `datetime` 模块进行时间筛选和计算 76 | - 注意时区处理和日期格式统一 77 | 78 | ### 可视化建议 79 | - 使用 `matplotlib` 和 `seaborn` 进行数据可视化 80 | - 图表要有清晰的标题和标签 81 | - 选择合适的图表类型展示数据特征 82 | 83 | ## 错误处理 84 | 85 | ### 常见问题处理 86 | - 编码错误:尝试不同编码方式(utf-8, gbk, gb2312) 87 | - 数据类型错误:使用适当的类型转换函数 88 | - 内存不足:使用分块处理或采样方法 89 | - 路径错误:确认文件路径的正确性 90 | 91 | ### 调试策略 92 | - 逐步执行代码,观察每步的输出结果 93 | - 使用 `print()` 语句调试中间变量 94 | - 查看错误堆栈信息定位问题 95 | - 根据错误信息调整代码逻辑 96 | 97 | ## 交互原则 98 | 99 | 1. **会话优先**: 必须先创建会话再进行数据操作 100 | 2. **步骤清晰**: 每个分析步骤都要有明确的目标和输出 101 | 3. **等待确认**: 获得上一步结果后再执行下一步 102 | 4. **注释详细**: 所有代码都要有中文注释说明 103 | 5. **结果验证**: 通过多种方法验证分析结果的正确性 104 | 6. **用户导向**: 始终围绕用户的具体问题进行分析 105 | 106 | 今天的日期是: {{date}} 107 | 108 | 请遵循以上工作流程和最佳实践,为用户提供专业、准确的数据分析服务。 -------------------------------------------------------------------------------- /mcp_config_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "DataHill IPython Data Analysis MCP Server", 3 | "version": "1.0.0", 4 | "description": "A lightweight data analysis MCP tool based on real IPython kernel", 5 | "author": "DataHill Team", 6 | "license": "MIT", 7 | "mcpServers": { 8 | "dataHill": { 9 | "command": "python", 10 | "args": ["DATA_MCP.py"], 11 | "env": { 12 | "PYTHONPATH": ".", 13 | "DATA_ANALYSIS_MODE": "professional", 14 | "DATAHILL_VERSION": "1.0", 15 | "PANDAS_DISPLAY_MAX_ROWS": "100", 16 | "MATPLOTLIB_BACKEND": "Agg" 17 | }, 18 | "cwd": ".", 19 | "timeout": 30000, 20 | "stdio": true 21 | } 22 | }, 23 | "capabilities": { 24 | "tools": [ 25 | { 26 | "name": "create_ipython_session", 27 | "description": "Create new IPython session for data analysis" 28 | }, 29 | { 30 | "name": "execute_code", 31 | "description": "Execute Python code, magic commands, and system commands" 32 | }, 33 | { 34 | "name": "load_csv_file", 35 | "description": "Load CSV files with automatic encoding detection" 36 | }, 37 | { 38 | "name": "load_excel_file", 39 | "description": "Load Excel files (supports .xlsx/.xls)" 40 | }, 41 | { 42 | "name": "sample_column_data", 43 | "description": "Smart sampling for large dataset column viewing" 44 | }, 45 | { 46 | "name": "check_memory_usage", 47 | "description": "Monitor memory usage and system resources" 48 | } 49 | ], 50 | "features": { 51 | "session_management": true, 52 | "data_loading": true, 53 | "code_execution": true, 54 | "memory_monitoring": true, 55 | "smart_sampling": true, 56 | "multi_format_support": true, 57 | "encoding_detection": true 58 | } 59 | }, 60 | "future_roadmap": { 61 | "multi_agent_system": { 62 | "phase_1": { 63 | "duration": "3 months", 64 | "agents": [ 65 | "data_analysis_expert", 66 | "visualization_expert", 67 | "statistical_analysis_expert" 68 | ] 69 | }, 70 | "phase_2": { 71 | "duration": "6 months", 72 | "features": [ 73 | "agent_collaboration_framework", 74 | "machine_learning_expert", 75 | "report_generation_expert" 76 | ] 77 | }, 78 | "phase_3": { 79 | "duration": "9 months", 80 | "features": [ 81 | "data_cleaning_expert", 82 | "time_series_expert", 83 | "business_analysis_expert" 84 | ] 85 | }, 86 | "phase_4": { 87 | "duration": "12 months", 88 | "features": [ 89 | "agent_orchestration_platform", 90 | "custom_agent_builder", 91 | "distributed_agent_system" 92 | ] 93 | } 94 | } 95 | }, 96 | "supported_clients": [ 97 | "Claude Desktop", 98 | "MCP stdio clients", 99 | "Custom MCP implementations" 100 | ], 101 | "system_requirements": { 102 | "python": ">=3.8", 103 | "memory": "4GB+ recommended", 104 | "os": ["Windows", "macOS", "Linux"] 105 | } 106 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # IPython 数据分析 MCP 服务器依赖 2 | # DataHill Project Dependencies 3 | 4 | # ============================================================================ 5 | # 核心 MCP 和 IPython 依赖 / Core MCP and IPython Dependencies 6 | # ============================================================================ 7 | fastmcp>=0.5.0 # MCP 服务器框架 / MCP server framework 8 | ipython>=8.0.0 # IPython 交互式环境 / IPython interactive environment 9 | pydantic>=2.0.0 # 数据模型和验证 / Data models and validation 10 | 11 | # ============================================================================ 12 | # 数据处理核心库 / Data Processing Core Libraries 13 | # ============================================================================ 14 | pandas>=2.0.0 # 数据处理和分析 / Data processing and analysis 15 | numpy>=1.24.0 # 数值计算基础库 / Numerical computation foundation 16 | 17 | # ============================================================================ 18 | # 文件格式支持 / File Format Support 19 | # ============================================================================ 20 | openpyxl>=3.1.0 # Excel .xlsx 文件支持 / Excel .xlsx file support 21 | xlrd>=2.0.0 # Excel .xls 文件支持 / Excel .xls file support 22 | 23 | # ============================================================================ 24 | # 系统监控和管理 / System Monitoring and Management 25 | # ============================================================================ 26 | psutil>=5.9.0 # 内存和系统监控 / Memory and system monitoring 27 | 28 | # ============================================================================ 29 | # 可选依赖 - 数据可视化 / Optional Dependencies - Data Visualization 30 | # ============================================================================ 31 | matplotlib>=3.7.0 # 基础绘图库 / Basic plotting library 32 | seaborn>=0.12.0 # 统计可视化 / Statistical visualization 33 | plotly>=5.15.0 # 交互式可视化 / Interactive visualization 34 | 35 | # ============================================================================ 36 | # 可选依赖 - 机器学习 / Optional Dependencies - Machine Learning 37 | # ============================================================================ 38 | scikit-learn>=1.3.0 # 机器学习库 / Machine learning library 39 | scipy>=1.11.0 # 科学计算库 / Scientific computing library 40 | 41 | # ============================================================================ 42 | # 可选依赖 - 高级数据格式 / Optional Dependencies - Advanced Data Formats 43 | # ============================================================================ 44 | pyarrow>=12.0.0 # Parquet 文件支持 / Parquet file support 45 | h5py>=3.9.0 # HDF5 文件支持 / HDF5 file support 46 | sqlalchemy>=2.0.0 # 数据库连接 / Database connectivity 47 | 48 | # ============================================================================ 49 | # 可选依赖 - 性能优化 / Optional Dependencies - Performance Optimization 50 | # ============================================================================ 51 | numba>=0.57.0 # JIT 编译加速 / JIT compilation acceleration 52 | 53 | # ============================================================================ 54 | # 开发和测试依赖 / Development and Testing Dependencies 55 | # ============================================================================ 56 | pytest>=7.4.0 # 测试框架 / Testing framework 57 | pytest-asyncio>=0.21.0 # 异步测试支持 / Async testing support 58 | black>=23.0.0 # 代码格式化 / Code formatting 59 | flake8>=6.0.0 # 代码检查 / Code linting 60 | 61 | # ============================================================================ 62 | # 未来多智能体系统依赖 / Future Multi-Agent System Dependencies 63 | # ============================================================================ 64 | # 以下依赖将在多智能体系统开发阶段添加 65 | # The following dependencies will be added during multi-agent system development phase 66 | 67 | # langchain>=0.1.0 # 智能体框架 / Agent framework 68 | # langgraph>=0.1.0 # 图形化智能体工作流 / Graphical agent workflows 69 | # openai>=1.0.0 # OpenAI API 支持 / OpenAI API support 70 | # anthropic>=0.25.0 # Anthropic API 支持 / Anthropic API support 71 | # chromadb>=0.4.0 # 向量数据库 / Vector database 72 | # faiss-cpu>=1.7.4 # 相似性搜索 / Similarity search 73 | # transformers>=4.35.0 # 预训练模型 / Pre-trained models -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IPython 数据分析 MCP 项目 - Git 忽略文件 2 | # 只保留与 DATA_MCP.py 相关的文件,忽略其他内容 3 | 4 | # ============================================================================ 5 | # Python 相关 6 | # ============================================================================ 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | *.so 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | Pipfile.lock 88 | 89 | # PEP 582 90 | __pypackages__/ 91 | 92 | # Celery stuff 93 | celerybeat-schedule 94 | celerybeat.pid 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | 126 | # ============================================================================ 127 | # 操作系统相关 128 | # ============================================================================ 129 | # macOS 130 | .DS_Store 131 | .AppleDouble 132 | .LSOverride 133 | Icon 134 | ._* 135 | .DocumentRevisions-V100 136 | .fseventsd 137 | .Spotlight-V100 138 | .TemporaryItems 139 | .Trashes 140 | .VolumeIcon.icns 141 | .com.apple.timemachine.donotpresent 142 | .AppleDB 143 | .AppleDesktop 144 | Network Trash Folder 145 | Temporary Items 146 | .apdisk 147 | 148 | # Windows 149 | Thumbs.db 150 | Thumbs.db:encryptable 151 | ehthumbs.db 152 | ehthumbs_vista.db 153 | *.tmp 154 | *.temp 155 | Desktop.ini 156 | $RECYCLE.BIN/ 157 | *.cab 158 | *.msi 159 | *.msix 160 | *.msm 161 | *.msp 162 | *.lnk 163 | 164 | # Linux 165 | *~ 166 | .fuse_hidden* 167 | .directory 168 | .Trash-* 169 | .nfs* 170 | 171 | # ============================================================================ 172 | # 编辑器和 IDE 173 | # ============================================================================ 174 | # VSCode 175 | .vscode/ 176 | *.code-workspace 177 | 178 | # PyCharm 179 | .idea/ 180 | *.iws 181 | *.iml 182 | *.ipr 183 | 184 | # Sublime Text 185 | *.tmlanguage.cache 186 | *.tmPreferences.cache 187 | *.stTheme.cache 188 | *.sublime-workspace 189 | *.sublime-project 190 | 191 | # Vim 192 | [._]*.s[a-v][a-z] 193 | [._]*.sw[a-p] 194 | [._]s[a-rt-v][a-z] 195 | [._]ss[a-gi-z] 196 | [._]sw[a-p] 197 | Session.vim 198 | Sessionx.vim 199 | .netrwhist 200 | *~ 201 | tags 202 | [._]*.un~ 203 | 204 | # Emacs 205 | *~ 206 | \#*\# 207 | /.emacs.desktop 208 | /.emacs.desktop.lock 209 | *.elc 210 | auto-save-list 211 | tramp 212 | .\#* 213 | 214 | # ============================================================================ 215 | # 数据文件(保留示例数据) 216 | # ============================================================================ 217 | # 忽略所有数据文件,除了示例文件 218 | *.csv 219 | *.xlsx 220 | *.xls 221 | *.json 222 | *.parquet 223 | *.h5 224 | *.hdf5 225 | *.db 226 | *.sqlite 227 | *.sqlite3 228 | 229 | # 但保留示例数据文件 230 | !example_data.* 231 | !sample_data.* 232 | !demo_data.* 233 | !test_data.* 234 | !模拟数据.* 235 | 236 | # ============================================================================ 237 | # MCP 和配置文件 238 | # ============================================================================ 239 | # 忽略个人配置 240 | .mcp/ 241 | mcp_config_personal.json 242 | claude_desktop_config.json 243 | 244 | # 保留示例配置 245 | !mcp_config_example.json 246 | !.mcp.json 247 | 248 | # ============================================================================ 249 | # 日志和临时文件 250 | # ============================================================================ 251 | *.log 252 | *.out 253 | *.err 254 | logs/ 255 | temp/ 256 | tmp/ 257 | .tmp/ 258 | 259 | # ============================================================================ 260 | # 不相关的项目文件 261 | # ============================================================================ 262 | # 忽略与 DATA_MCP 无关的 Python 项目 263 | langgraph_workflow.py 264 | planner_agent.py 265 | worker_agent.py 266 | state_models.py 267 | 268 | # 忽略其他项目的文件 269 | node_modules/ 270 | package*.json 271 | yarn.lock 272 | *.js 273 | *.ts 274 | *.tsx 275 | *.jsx 276 | *.html 277 | *.css 278 | *.scss 279 | *.sass 280 | *.less 281 | 282 | # ============================================================================ 283 | # 备份和归档文件 284 | # ============================================================================ 285 | *.bak 286 | *.backup 287 | *.old 288 | *.orig 289 | *.save 290 | backup_* 291 | archive_* 292 | *.zip 293 | *.tar 294 | *.tar.gz 295 | *.tar.bz2 296 | *.tar.xz 297 | *.rar 298 | *.7z 299 | 300 | # ============================================================================ 301 | # 特定忽略(根据项目需要) 302 | # ============================================================================ 303 | # 忽略可能的敏感信息 304 | secrets.py 305 | secret_* 306 | private_* 307 | personal_* 308 | .secret 309 | .private 310 | 311 | # 忽略大文件 312 | *.bin 313 | *.exe 314 | *.dll 315 | *.so.* 316 | *.dylib 317 | 318 | # 忽略模型文件 319 | *.pkl 320 | *.pickle 321 | *.model 322 | *.h5 323 | *.onnx 324 | *.pt 325 | *.pth 326 | *.weights 327 | 328 | # ============================================================================ 329 | # 允许的文件(明确包含) 330 | # ============================================================================ 331 | # 核心项目文件 332 | !DATA_MCP.py 333 | !README.md 334 | !README_EN.md 335 | !requirements.txt 336 | !system_prompt_example.md 337 | !LICENSE 338 | !.gitignore 339 | 340 | # 文档和示例 341 | !docs/ 342 | !examples/ 343 | !*.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IPython 数据分析 MCP 服务器 / IPython Data Analysis MCP Server 2 | 3 | [🇨🇳 中文](#中文版本) | [🇺🇸 English](#english-version) 4 | 5 | --- 6 | 7 | ## 中文版本 8 | 9 | 基于真正 IPython 内核的轻量级数据分析 MCP (Model Context Protocol) 工具,提供完整的交互式 Python 数据分析环境,支持会话管理、数据加载、实时数据查看等核心功能。 10 | 11 | ### 🚀 核心特性 12 | 13 | - **真正的 IPython 环境**: 基于 IPython InteractiveShell,支持所有 IPython 功能 14 | - **多会话管理**: 独立的会话空间,变量隔离,持久化状态 15 | - **智能数据加载**: 支持 CSV/Excel/JSON,自动编码检测,智能变量命名 16 | - **实时监控**: 内存使用监控、变量管理、执行历史追踪 17 | - **完整功能支持**: Python代码、IPython魔法命令、系统命令执行 18 | - **智能采样**: 大数据集友好的列数据查看,避免上下文溢出 19 | 20 | ### 📋 功能清单 21 | 22 | #### 17个核心工具函数 23 | 24 | 1. **会话管理** 25 | - `create_ipython_session` - 创建新的 IPython 会话 26 | - `list_ipython_sessions` - 列出所有活跃会话 27 | - `get_session_status` - 获取会话详细状态 28 | - `delete_ipython_session` - 删除指定会话 29 | 30 | 2. **代码执行** 31 | - `execute_code` - 执行 Python 代码、魔法命令、系统命令 32 | - `get_execution_history` - 获取执行历史记录 33 | 34 | 3. **数据加载** 35 | - `load_csv_file` - 加载 CSV 文件(自动编码检测) 36 | - `load_excel_file` - 加载 Excel 文件(支持 .xlsx/.xls) 37 | - `load_json_file` - 加载 JSON 文件 38 | 39 | 4. **数据操作与查看** 40 | - `list_dataframes` - 列出会话中所有 DataFrame 41 | - `get_dataframe_info` - 获取 DataFrame 详细信息 42 | - `preview_dataframe` - 预览 DataFrame 数据 43 | - `get_dataframe_summary` - 获取统计摘要 44 | - `sample_column_data` - 智能采样查看列数据 45 | 46 | 5. **内存与变量管理** 47 | - `check_memory_usage` - 检查内存使用情况 48 | - `get_variable_info` - 获取变量详细信息 49 | - `clear_variables` - 清理变量释放内存 50 | 51 | ### 🛠️ 安装配置 52 | 53 | #### 方法一:使用 uvx 直接运行(推荐) 54 | 55 | 无需克隆项目,直接使用 uvx 从 GitHub 运行: 56 | 57 | ```bash 58 | # 安装 uvx(如果还没有安装) 59 | pip install uvx 60 | 61 | # 直接运行 MCP 服务器 62 | uvx --from git+https://github.com/Hillyess/dataHill.git DATA_MCP.py 63 | ``` 64 | 65 | #### 方法二:本地安装开发 66 | 67 | ```bash 68 | # 1. 克隆项目 69 | git clone git@github.com:Hillyess/dataHill.git 70 | cd dataHill 71 | 72 | # 2. 创建虚拟环境 73 | conda create -n data-analyzer python=3.10 74 | conda activate data-analyzer 75 | 76 | # 3. 安装依赖 77 | pip install -r requirements.txt 78 | 79 | # 4. 测试安装 80 | python DATA_MCP.py 81 | ``` 82 | 83 | #### 配置 MCP 客户端 84 | 85 | ##### Claude Desktop 配置 86 | 87 | 编辑 Claude Desktop 配置文件: 88 | 89 | **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json` 90 | **Windows**: `%APPDATA%\Claude\claude_desktop_config.json` 91 | 92 | **推荐配置(使用 uvx)**: 93 | ```json 94 | { 95 | "mcpServers": { 96 | "dataHill": { 97 | "command": "uvx", 98 | "args": [ 99 | "--from", 100 | "git+https://github.com/Hillyess/dataHill.git", 101 | "DATA_MCP.py" 102 | ] 103 | } 104 | } 105 | } 106 | ``` 107 | 108 | **本地开发配置**(如果使用方法二): 109 | ```json 110 | { 111 | "mcpServers": { 112 | "dataHill": { 113 | "command": "python", 114 | "args": ["/path/to/your/DATA_MCP.py"], 115 | "env": { 116 | "PYTHONPATH": "/path/to/your/project" 117 | } 118 | } 119 | } 120 | } 121 | ``` 122 | 123 | ### 📖 使用指南 124 | 125 | #### 基本工作流程 126 | 127 | ```python 128 | # 1. 创建会话 129 | create_ipython_session() 130 | # 返回: {"success": true, "session_id": "session_a1b2c3d4", ...} 131 | 132 | # 2. 加载数据 133 | load_csv_file("data.csv", "session_a1b2c3d4", "df") 134 | 135 | # 3. 查看数据信息 136 | get_dataframe_info("df", "session_a1b2c3d4") 137 | 138 | # 4. 智能采样查看数据 139 | sample_column_data("df", "column_name", "session_a1b2c3d4", method="mixed", sample_size=20) 140 | 141 | # 5. 执行分析 142 | execute_code("df.describe()", "session_a1b2c3d4") 143 | 144 | # 6. 内存监控 145 | check_memory_usage("session_a1b2c3d4") 146 | 147 | # 7. 清理会话 148 | delete_ipython_session("session_a1b2c3d4") 149 | ``` 150 | 151 | ### 🔧 系统要求 152 | 153 | - **Python**: 3.8+ 154 | - **内存**: 建议 4GB+ (取决于数据规模) 155 | - **操作系统**: Windows/macOS/Linux 156 | - **MCP 客户端**: Claude Desktop 或其他支持 stdio 的 MCP 客户端 157 | 158 | ### 📦 依赖项 159 | 160 | #### 核心依赖 161 | - `fastmcp>=0.5.0` - MCP 服务器框架 162 | - `ipython>=8.0.0` - IPython 交互式环境 163 | - `pandas>=2.0.0` - 数据处理和分析 164 | - `numpy>=1.24.0` - 数值计算基础库 165 | 166 | #### 数据支持 167 | - `openpyxl>=3.1.0` - Excel .xlsx 文件支持 168 | - `xlrd>=2.0.0` - Excel .xls 文件支持 169 | 170 | #### 系统监控 171 | - `psutil>=5.9.0` - 内存和系统监控 172 | 173 | ### 🤝 贡献指南 174 | 175 | 1. Fork 本项目 176 | 2. 创建特性分支 (`git checkout -b feature/AmazingFeature`) 177 | 3. 提交更改 (`git commit -m 'Add some AmazingFeature'`) 178 | 4. 推送到分支 (`git push origin feature/AmazingFeature`) 179 | 5. 开启 Pull Request 180 | 181 | ### 📄 许可证 182 | 183 | 本项目采用 MIT 许可证 - 查看 [LICENSE](LICENSE) 文件了解详情。 184 | 185 | ### 🙋‍♂️ 支持与反馈 186 | 187 | - **问题报告**: [GitHub Issues](https://github.com/Hillyess/dataHill/issues) 188 | - **功能请求**: [GitHub Discussions](https://github.com/Hillyess/dataHill/discussions) 189 | 190 | --- 191 | 192 | ## English Version 193 | 194 | A lightweight data analysis MCP (Model Context Protocol) tool based on real IPython kernel, providing complete interactive Python data analysis environment with session management, data loading, real-time data viewing and other core functions. 195 | 196 | ### 🚀 Core Features 197 | 198 | - **Real IPython Environment**: Based on IPython InteractiveShell, supports all IPython features 199 | - **Multi-Session Management**: Independent session spaces, variable isolation, persistent state 200 | - **Intelligent Data Loading**: Supports CSV/Excel/JSON, automatic encoding detection, smart variable naming 201 | - **Real-time Monitoring**: Memory usage monitoring, variable management, execution history tracking 202 | - **Complete Feature Support**: Python code, IPython magic commands, system command execution 203 | - **Smart Sampling**: Large dataset friendly column data viewing, avoiding context overflow 204 | 205 | ### 📋 Feature List 206 | 207 | #### 17 Core Tool Functions 208 | 209 | 1. **Session Management** 210 | - `create_ipython_session` - Create new IPython session 211 | - `list_ipython_sessions` - List all active sessions 212 | - `get_session_status` - Get detailed session status 213 | - `delete_ipython_session` - Delete specified session 214 | 215 | 2. **Code Execution** 216 | - `execute_code` - Execute Python code, magic commands, system commands 217 | - `get_execution_history` - Get execution history 218 | 219 | 3. **Data Loading** 220 | - `load_csv_file` - Load CSV files (automatic encoding detection) 221 | - `load_excel_file` - Load Excel files (supports .xlsx/.xls) 222 | - `load_json_file` - Load JSON files 223 | 224 | 4. **Data Operations & Viewing** 225 | - `list_dataframes` - List all DataFrames in session 226 | - `get_dataframe_info` - Get detailed DataFrame information 227 | - `preview_dataframe` - Preview DataFrame data 228 | - `get_dataframe_summary` - Get statistical summary 229 | - `sample_column_data` - Smart sampling for column data viewing 230 | 231 | 5. **Memory & Variable Management** 232 | - `check_memory_usage` - Check memory usage 233 | - `get_variable_info` - Get detailed variable information 234 | - `clear_variables` - Clear variables to free memory 235 | 236 | ### 🛠️ Installation & Configuration 237 | 238 | #### Method 1: Direct Run with uvx (Recommended) 239 | 240 | No need to clone the project, run directly from GitHub using uvx: 241 | 242 | ```bash 243 | # Install uvx (if not already installed) 244 | pip install uvx 245 | 246 | # Run MCP server directly 247 | uvx --from git+https://github.com/Hillyess/dataHill.git DATA_MCP.py 248 | ``` 249 | 250 | #### Method 2: Local Installation for Development 251 | 252 | ```bash 253 | # 1. Clone project 254 | git clone git@github.com:Hillyess/dataHill.git 255 | cd dataHill 256 | 257 | # 2. Create virtual environment 258 | conda create -n data-analyzer python=3.10 259 | conda activate data-analyzer 260 | 261 | # 3. Install dependencies 262 | pip install -r requirements.txt 263 | 264 | # 4. Test installation 265 | python DATA_MCP.py 266 | ``` 267 | 268 | ##### Configure MCP Client 269 | 270 | ##### Claude Desktop Configuration 271 | 272 | Edit Claude Desktop configuration file: 273 | 274 | **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json` 275 | **Windows**: `%APPDATA%\Claude\claude_desktop_config.json` 276 | 277 | **Recommended Configuration (using uvx)**: 278 | ```json 279 | { 280 | "mcpServers": { 281 | "dataHill": { 282 | "command": "uvx", 283 | "args": [ 284 | "--from", 285 | "git+https://github.com/Hillyess/dataHill.git", 286 | "DATA_MCP.py" 287 | ] 288 | } 289 | } 290 | } 291 | ``` 292 | 293 | **Local Development Configuration** (if using Method 2): 294 | ```json 295 | { 296 | "mcpServers": { 297 | "dataHill": { 298 | "command": "python", 299 | "args": ["/path/to/your/DATA_MCP.py"], 300 | "env": { 301 | "PYTHONPATH": "/path/to/your/project" 302 | } 303 | } 304 | } 305 | } 306 | ``` 307 | 308 | ### 📖 Usage Guide 309 | 310 | #### Basic Workflow 311 | 312 | ```python 313 | # 1. Create session 314 | create_ipython_session() 315 | # Returns: {"success": true, "session_id": "session_a1b2c3d4", ...} 316 | 317 | # 2. Load data 318 | load_csv_file("data.csv", "session_a1b2c3d4", "df") 319 | 320 | # 3. View data information 321 | get_dataframe_info("df", "session_a1b2c3d4") 322 | 323 | # 4. Smart sampling for data viewing 324 | sample_column_data("df", "column_name", "session_a1b2c3d4", method="mixed", sample_size=20) 325 | 326 | # 5. Execute analysis 327 | execute_code("df.describe()", "session_a1b2c3d4") 328 | 329 | # 6. Memory monitoring 330 | check_memory_usage("session_a1b2c3d4") 331 | 332 | # 7. Clean up session 333 | delete_ipython_session("session_a1b2c3d4") 334 | ``` 335 | 336 | ### 🔧 System Requirements 337 | 338 | - **Python**: 3.8+ 339 | - **Memory**: Recommended 4GB+ (depends on data scale) 340 | - **Operating System**: Windows/macOS/Linux 341 | - **MCP Client**: Claude Desktop or other stdio-supported MCP clients 342 | 343 | ### 📦 Dependencies 344 | 345 | #### Core Dependencies 346 | - `fastmcp>=0.5.0` - MCP server framework 347 | - `ipython>=8.0.0` - IPython interactive environment 348 | - `pandas>=2.0.0` - Data processing and analysis 349 | - `numpy>=1.24.0` - Numerical computation foundation 350 | 351 | #### Data Support 352 | - `openpyxl>=3.1.0` - Excel .xlsx file support 353 | - `xlrd>=2.0.0` - Excel .xls file support 354 | 355 | #### System Monitoring 356 | - `psutil>=5.9.0` - Memory and system monitoring 357 | 358 | ### 🤝 Contributing 359 | 360 | 1. Fork this project 361 | 2. Create feature branch (`git checkout -b feature/AmazingFeature`) 362 | 3. Commit changes (`git commit -m 'Add some AmazingFeature'`) 363 | 4. Push to branch (`git push origin feature/AmazingFeature`) 364 | 5. Open Pull Request 365 | 366 | ### 📄 License 367 | 368 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 369 | 370 | ### 🙋‍♂️ Support & Feedback 371 | 372 | - **Issue Reports**: [GitHub Issues](https://github.com/Hillyess/dataHill/issues) 373 | - **Feature Requests**: [GitHub Discussions](https://github.com/Hillyess/dataHill/discussions) 374 | 375 | --- 376 | 377 | ⭐ If this project helps you, please give us a Star! 378 | -------------------------------------------------------------------------------- /DATA_MCP.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | IPython 数据分析 MCP 服务器 4 | 基于真正 IPython 内核的轻量级数据分析 MCP 工具 5 | 提供完整的交互式 Python 数据分析环境,支持会话管理、数据加载、实时数据查看等核心功能 6 | 7 | 完整使用示例: 8 | ```python 9 | # 1. 创建会话 10 | result = create_ipython_session() 11 | session_id = result["session_id"] 12 | 13 | # 2. 加载数据 14 | load_csv_file("data.csv", session_id, "df") 15 | # 或加载Excel文件 (支持.xlsx和.xls) 16 | load_excel_file("data.xlsx", session_id, "df_excel") 17 | 18 | # 3. 查看数据信息 19 | get_dataframe_info("df", session_id) 20 | 21 | # 4. 智能采样查看列数据 (避免大数据集上下文填满) 22 | sample_column_data("df", "column_name", session_id, method="mixed", sample_size=20) 23 | 24 | # 5. 执行分析 25 | execute_code("df.describe()", session_id) 26 | 27 | # 6. 数据可视化 28 | execute_code("df.plot()", session_id) 29 | 30 | # 7. 内存检查 31 | check_memory_usage(session_id) 32 | 33 | # 8. 清理会话 34 | delete_ipython_session(session_id) 35 | ``` 36 | 37 | 支持的功能: 38 | - 17个核心函数,涵盖会话管理、代码执行、数据加载、内存监控 39 | - 真正的 IPython 环境,支持魔法命令和系统命令行命令 40 | - 自动编码检测,支持中文CSV文件 41 | - 完整的Excel支持,同时支持.xlsx和.xls格式 42 | - 智能列数据采样查看,避免大数据集上下文填满 43 | - 完整的数据分析工具集成 44 | - 实时内存监控和变量管理 45 | """ 46 | 47 | import os 48 | import sys 49 | import threading 50 | import time 51 | import uuid 52 | import traceback 53 | import re 54 | from datetime import datetime 55 | from pathlib import Path 56 | from typing import Any, Dict, List, Optional, Union 57 | 58 | # System monitoring 59 | try: 60 | import psutil 61 | PSUTIL_AVAILABLE = True 62 | except ImportError: 63 | PSUTIL_AVAILABLE = False 64 | 65 | # IPython imports 66 | try: 67 | from IPython.core.interactiveshell import InteractiveShell 68 | from IPython.utils.capture import capture_output 69 | IPYTHON_AVAILABLE = True 70 | except ImportError: 71 | IPYTHON_AVAILABLE = False 72 | print("警告: IPython 未安装,请执行: pip install ipython") 73 | 74 | # Data processing imports 75 | try: 76 | import pandas as pd 77 | import numpy as np 78 | PANDAS_AVAILABLE = True 79 | except ImportError: 80 | PANDAS_AVAILABLE = False 81 | print("警告: pandas/numpy 未安装,请执行: pip install pandas numpy") 82 | 83 | # Excel support 84 | try: 85 | import openpyxl 86 | EXCEL_XLSX_AVAILABLE = True 87 | except ImportError: 88 | EXCEL_XLSX_AVAILABLE = False 89 | 90 | try: 91 | import xlrd 92 | EXCEL_XLS_AVAILABLE = True 93 | except ImportError: 94 | EXCEL_XLS_AVAILABLE = False 95 | 96 | EXCEL_AVAILABLE = EXCEL_XLSX_AVAILABLE or EXCEL_XLS_AVAILABLE 97 | 98 | # MCP imports 99 | from fastmcp import FastMCP 100 | from pydantic import BaseModel 101 | 102 | # 创建MCP应用实例 103 | mcp = FastMCP("IPython Data Analysis MCP Server") 104 | 105 | # 全局会话管理器 106 | _session_manager = None 107 | _session_lock = threading.Lock() 108 | 109 | class ExecutionResult(BaseModel): 110 | """代码执行结果模型""" 111 | success: bool 112 | execution_count: int 113 | stdout: str 114 | stderr: str 115 | result: Optional[str] = None 116 | execution_time: float 117 | memory_delta_mb: float 118 | error: Optional[str] = None 119 | 120 | class SessionInfo(BaseModel): 121 | """会话信息模型""" 122 | session_id: str 123 | status: str 124 | created_at: str 125 | last_used: str 126 | execution_count: int 127 | memory_usage_mb: float 128 | variable_count: int 129 | 130 | def remove_ansi_codes(text: str) -> str: 131 | """移除文本中的 ANSI 转义序列""" 132 | if not text: 133 | return text 134 | # ANSI 转义序列的正则表达式 135 | ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') 136 | return ansi_escape.sub('', text) 137 | 138 | class IPythonSession: 139 | """IPython 会话封装""" 140 | 141 | def __init__(self, session_id: str, auto_import: bool = True): 142 | self.session_id = session_id 143 | self.created_at = datetime.now() 144 | self.last_used = datetime.now() 145 | self.execution_count = 0 146 | self.history = [] 147 | 148 | if not IPYTHON_AVAILABLE: 149 | raise ImportError("IPython is required but not available") 150 | 151 | # 创建独立的 IPython shell 实例 152 | self.shell = InteractiveShell() 153 | 154 | # 配置无颜色模式以避免 ANSI 代码 155 | self.shell.colors = 'NoColor' 156 | 157 | # 配置 PlainTextFormatter 减少详细输出 158 | try: 159 | plain_formatter = self.shell.display_formatter.formatters['text/plain'] 160 | plain_formatter.verbose = False 161 | except: 162 | pass 163 | 164 | # 禁用 GUI 165 | self.shell.enable_gui = lambda x: None 166 | 167 | # 设置 matplotlib 后端为 Agg (non-interactive) 168 | try: 169 | self.shell.run_cell("import matplotlib; matplotlib.use('Agg')") 170 | except: 171 | pass 172 | 173 | if auto_import: 174 | self._auto_import_libraries() 175 | 176 | def _auto_import_libraries(self) -> List[str]: 177 | """自动导入常用库""" 178 | imported = [] 179 | import_code = """ 180 | import pandas as pd 181 | import numpy as np 182 | import json 183 | import os 184 | import sys 185 | from pathlib import Path 186 | import warnings 187 | warnings.filterwarnings('ignore') 188 | 189 | # 设置pandas显示选项 190 | pd.set_option('display.max_columns', None) 191 | pd.set_option('display.width', None) 192 | pd.set_option('display.max_colwidth', 100) 193 | 194 | print("✓ 已自动导入: pandas, numpy, json, os, sys, pathlib") 195 | """ 196 | 197 | try: 198 | result = self.shell.run_cell(import_code) 199 | if not result.error_in_exec: 200 | imported = ["pandas", "numpy", "json", "os", "sys", "pathlib"] 201 | except Exception as e: 202 | print(f"自动导入失败: {e}") 203 | 204 | return imported 205 | 206 | def _format_result(self, result_obj: Any) -> str: 207 | """使用 IPython DisplayFormatter 格式化结果对象""" 208 | if result_obj is None: 209 | return None 210 | 211 | try: 212 | # 优先使用 IPython 的 DisplayFormatter 系统 213 | format_dict, _ = self.shell.display_formatter.format( 214 | result_obj, 215 | include=['text/plain'] 216 | ) 217 | formatted_result = format_dict.get('text/plain') 218 | 219 | if formatted_result: 220 | return formatted_result 221 | 222 | except Exception: 223 | # 如果 DisplayFormatter 失败,回退到传统方法 224 | pass 225 | 226 | # 回退方案:传统格式化方法 227 | try: 228 | # 对于 DataFrame,使用 to_string() 方法获得更好的格式 229 | if hasattr(result_obj, 'to_string'): 230 | return result_obj.to_string() 231 | 232 | # 对于 numpy 数组,使用 __str__ 方法 233 | if hasattr(result_obj, '__array__'): 234 | return str(result_obj) 235 | 236 | # 对于其他对象,使用 str() 237 | return str(result_obj) 238 | 239 | except Exception: 240 | return str(result_obj) 241 | 242 | def _format_result_direct(self, result_obj: Any) -> str: 243 | """直接使用 PlainTextFormatter 格式化结果""" 244 | if result_obj is None: 245 | return None 246 | 247 | try: 248 | # 直接使用 PlainTextFormatter 249 | plain_formatter = self.shell.display_formatter.formatters['text/plain'] 250 | formatted_result = plain_formatter(result_obj) 251 | 252 | if formatted_result: 253 | return formatted_result 254 | 255 | except Exception: 256 | # 如果直接格式化失败,使用回退方案 257 | pass 258 | 259 | # 回退到传统格式化方法 260 | return self._format_result(result_obj) 261 | 262 | def execute_expression_only(self, code: str) -> ExecutionResult: 263 | """只执行表达式并返回结果,不捕获 print 输出 - 适用于纯表达式求值""" 264 | self.last_used = datetime.now() 265 | start_time = time.time() 266 | initial_memory = self._get_memory_usage() 267 | 268 | try: 269 | # 确保使用无颜色模式 270 | original_colors = getattr(self.shell, 'colors', 'NoColor') 271 | self.shell.colors = 'NoColor' 272 | 273 | # 直接执行,不捕获输出 274 | execution_result = self.shell.run_cell(code, store_history=True) 275 | 276 | # 恢复颜色设置 277 | self.shell.colors = original_colors 278 | 279 | self.execution_count += 1 280 | execution_time = time.time() - start_time 281 | current_memory = self._get_memory_usage() 282 | memory_delta = current_memory - initial_memory 283 | 284 | # 直接使用 PlainTextFormatter 格式化结果 285 | formatted_result = self._format_result_direct(execution_result.result) 286 | 287 | # 处理错误信息 288 | error_msg = None 289 | if execution_result.error_in_exec: 290 | error_msg = remove_ansi_codes(str(execution_result.error_in_exec)) 291 | 292 | # 记录执行历史 293 | history_entry = { 294 | 'execution_count': self.execution_count, 295 | 'timestamp': self.last_used.isoformat(), 296 | 'code': code, 297 | 'success': not bool(execution_result.error_in_exec), 298 | 'execution_time': execution_time, 299 | 'stdout': "", # 不捕获 stdout 300 | 'stderr': "", # 不捕获 stderr 301 | 'result': formatted_result, 302 | 'error': error_msg 303 | } 304 | self.history.append(history_entry) 305 | 306 | # 保持历史记录在合理大小 307 | if len(self.history) > 100: 308 | self.history = self.history[-100:] 309 | 310 | return ExecutionResult( 311 | success=not bool(execution_result.error_in_exec), 312 | execution_count=self.execution_count, 313 | stdout="", # 不返回 stdout 314 | stderr="", # 不返回 stderr 315 | result=formatted_result, 316 | execution_time=execution_time, 317 | memory_delta_mb=memory_delta, 318 | error=error_msg 319 | ) 320 | 321 | except Exception as e: 322 | execution_time = time.time() - start_time 323 | error_msg = f"执行错误: {str(e)}\n{traceback.format_exc()}" 324 | 325 | return ExecutionResult( 326 | success=False, 327 | execution_count=self.execution_count, 328 | stdout="", 329 | stderr="", 330 | result=None, 331 | execution_time=execution_time, 332 | memory_delta_mb=0.0, 333 | error=remove_ansi_codes(error_msg) 334 | ) 335 | 336 | def execute(self, code: str) -> ExecutionResult: 337 | """执行代码并返回结果 - 使用混合输出处理方案""" 338 | self.last_used = datetime.now() 339 | start_time = time.time() 340 | initial_memory = self._get_memory_usage() 341 | 342 | try: 343 | # 确保使用无颜色模式 344 | original_colors = getattr(self.shell, 'colors', 'NoColor') 345 | self.shell.colors = 'NoColor' 346 | 347 | # 使用 IPython 的 capture_output 捕获 print 输出和 stderr 348 | with capture_output() as captured: 349 | execution_result = self.shell.run_cell(code, store_history=True) 350 | 351 | # 恢复颜色设置 352 | self.shell.colors = original_colors 353 | 354 | self.execution_count += 1 355 | execution_time = time.time() - start_time 356 | current_memory = self._get_memory_usage() 357 | memory_delta = current_memory - initial_memory 358 | 359 | # 清理输出中可能残留的 ANSI 代码 360 | stdout = remove_ansi_codes(captured.stdout) 361 | stderr = remove_ansi_codes(captured.stderr) 362 | 363 | # 使用 DisplayFormatter 格式化表达式结果 364 | formatted_result = self._format_result(execution_result.result) 365 | 366 | # 处理错误信息 367 | error_msg = None 368 | if execution_result.error_in_exec: 369 | error_msg = remove_ansi_codes(str(execution_result.error_in_exec)) 370 | 371 | # 记录执行历史 372 | history_entry = { 373 | 'execution_count': self.execution_count, 374 | 'timestamp': self.last_used.isoformat(), 375 | 'code': code, 376 | 'success': not bool(execution_result.error_in_exec), 377 | 'execution_time': execution_time, 378 | 'stdout': stdout, 379 | 'stderr': stderr, 380 | 'result': formatted_result, 381 | 'error': error_msg 382 | } 383 | self.history.append(history_entry) 384 | 385 | # 保持历史记录在合理大小 386 | if len(self.history) > 100: 387 | self.history = self.history[-100:] 388 | 389 | return ExecutionResult( 390 | success=not bool(execution_result.error_in_exec), 391 | execution_count=self.execution_count, 392 | stdout=stdout, 393 | stderr=stderr, 394 | result=formatted_result, 395 | execution_time=execution_time, 396 | memory_delta_mb=memory_delta, 397 | error=error_msg 398 | ) 399 | 400 | except Exception as e: 401 | execution_time = time.time() - start_time 402 | error_msg = f"执行错误: {str(e)}\n{traceback.format_exc()}" 403 | 404 | return ExecutionResult( 405 | success=False, 406 | execution_count=self.execution_count, 407 | stdout="", 408 | stderr="", 409 | result=None, 410 | execution_time=execution_time, 411 | memory_delta_mb=0.0, 412 | error=remove_ansi_codes(error_msg) 413 | ) 414 | 415 | def get_variables(self) -> Dict[str, Any]: 416 | """获取当前命名空间中的变量""" 417 | variables = {} 418 | user_ns = self.shell.user_ns 419 | 420 | for name, value in user_ns.items(): 421 | if not name.startswith('_') and name not in ['In', 'Out', 'get_ipython', 'exit', 'quit']: 422 | try: 423 | var_type = type(value).__name__ 424 | var_info = { 425 | 'type': var_type, 426 | 'size_bytes': sys.getsizeof(value) 427 | } 428 | 429 | # 特殊处理不同类型的变量 430 | if hasattr(value, 'shape') and hasattr(value, 'dtypes'): # DataFrame 431 | var_info.update({ 432 | 'shape': list(value.shape), 433 | 'columns': list(value.columns) if hasattr(value, 'columns') else None, 434 | 'memory_usage': value.memory_usage(deep=True).sum() if hasattr(value, 'memory_usage') else None 435 | }) 436 | elif hasattr(value, 'shape'): # numpy array 437 | var_info.update({ 438 | 'shape': list(value.shape), 439 | 'dtype': str(value.dtype) if hasattr(value, 'dtype') else None 440 | }) 441 | elif isinstance(value, (list, tuple, dict, set)): 442 | var_info['length'] = len(value) 443 | 444 | variables[name] = var_info 445 | 446 | except Exception: 447 | variables[name] = {'type': var_type, 'size_bytes': 0} 448 | 449 | return variables 450 | 451 | def _get_memory_usage(self) -> float: 452 | """获取当前内存使用量(MB)""" 453 | if not PSUTIL_AVAILABLE: 454 | return 0.0 455 | try: 456 | process = psutil.Process() 457 | return process.memory_info().rss / 1024 / 1024 458 | except: 459 | return 0.0 460 | 461 | def get_memory_info(self) -> Dict[str, Any]: 462 | """获取详细内存信息""" 463 | variables = self.get_variables() 464 | 465 | # 按类型分组统计内存使用 466 | breakdown = { 467 | 'dataframes': 0.0, 468 | 'lists': 0.0, 469 | 'scalars': 0.0, 470 | 'functions': 0.0, 471 | 'others': 0.0 472 | } 473 | 474 | top_variables = [] 475 | 476 | for name, var_info in variables.items(): 477 | memory_mb = var_info.get('size_bytes', 0) / 1024 / 1024 478 | var_type = var_info.get('type', 'unknown') 479 | 480 | # 特殊处理 DataFrame 内存使用 481 | if 'memory_usage' in var_info and var_info['memory_usage']: 482 | memory_mb = var_info['memory_usage'] / 1024 / 1024 483 | 484 | top_variables.append({ 485 | 'name': name, 486 | 'type': var_type, 487 | 'memory_mb': round(memory_mb, 2) 488 | }) 489 | 490 | # 分类统计 491 | if 'DataFrame' in var_type: 492 | breakdown['dataframes'] += memory_mb 493 | elif var_type in ['list', 'tuple', 'set']: 494 | breakdown['lists'] += memory_mb 495 | elif var_type in ['int', 'float', 'str', 'bool']: 496 | breakdown['scalars'] += memory_mb 497 | elif 'function' in var_type.lower(): 498 | breakdown['functions'] += memory_mb 499 | else: 500 | breakdown['others'] += memory_mb 501 | 502 | # 按内存使用排序 503 | top_variables.sort(key=lambda x: x['memory_mb'], reverse=True) 504 | 505 | # 系统内存信息 506 | if PSUTIL_AVAILABLE: 507 | try: 508 | system_memory = psutil.virtual_memory() 509 | system_info = { 510 | 'available_mb': round(system_memory.available / 1024 / 1024, 1), 511 | 'used_percent': system_memory.percent 512 | } 513 | except: 514 | system_info = {'available_mb': 0, 'used_percent': 0} 515 | else: 516 | system_info = {'available_mb': 0, 'used_percent': 0} 517 | 518 | total_memory = sum(breakdown.values()) 519 | 520 | return { 521 | 'total_memory_mb': round(total_memory, 1), 522 | 'breakdown': {k: round(v, 1) for k, v in breakdown.items()}, 523 | 'top_variables': top_variables[:10], # 前10个最大的变量 524 | 'system_memory': system_info 525 | } 526 | 527 | class IPythonSessionManager: 528 | """IPython 会话管理器""" 529 | 530 | def __init__(self): 531 | self.sessions: Dict[str, IPythonSession] = {} 532 | self.lock = threading.Lock() 533 | 534 | def create_session(self, session_id: Optional[str] = None, auto_import: bool = True) -> str: 535 | """创建新会话""" 536 | with self.lock: 537 | if session_id is None: 538 | session_id = f"session_{uuid.uuid4().hex[:8]}" 539 | 540 | if session_id in self.sessions: 541 | raise ValueError(f"Session {session_id} already exists") 542 | 543 | session = IPythonSession(session_id, auto_import) 544 | self.sessions[session_id] = session 545 | return session_id 546 | 547 | def get_session(self, session_id: str) -> IPythonSession: 548 | """获取会话""" 549 | with self.lock: 550 | if session_id not in self.sessions: 551 | raise ValueError(f"Session {session_id} not found") 552 | return self.sessions[session_id] 553 | 554 | def delete_session(self, session_id: str) -> bool: 555 | """删除会话""" 556 | with self.lock: 557 | if session_id in self.sessions: 558 | del self.sessions[session_id] 559 | return True 560 | return False 561 | 562 | def list_sessions(self) -> List[SessionInfo]: 563 | """列出所有会话""" 564 | with self.lock: 565 | sessions = [] 566 | for session_id, session in self.sessions.items(): 567 | variables = session.get_variables() 568 | memory_info = session.get_memory_info() 569 | 570 | sessions.append(SessionInfo( 571 | session_id=session_id, 572 | status="active", 573 | created_at=session.created_at.isoformat(), 574 | last_used=session.last_used.isoformat(), 575 | execution_count=session.execution_count, 576 | memory_usage_mb=memory_info['total_memory_mb'], 577 | variable_count=len(variables) 578 | )) 579 | return sessions 580 | 581 | class DataLoader: 582 | """数据加载器""" 583 | 584 | @staticmethod 585 | def detect_encoding(file_path: str) -> str: 586 | """自动检测文件编码""" 587 | encodings = ['utf-8', 'gb18030', 'gbk', 'gb2312', 'latin-1'] 588 | 589 | for encoding in encodings: 590 | try: 591 | with open(file_path, 'r', encoding=encoding) as f: 592 | f.read(1024) # 读取一部分内容进行测试 593 | return encoding 594 | except (UnicodeDecodeError, UnicodeError): 595 | continue 596 | 597 | return 'utf-8' # 默认编码 598 | 599 | @staticmethod 600 | def load_csv(file_path: str, encoding: str = "auto") -> tuple: 601 | """加载 CSV 文件""" 602 | if not PANDAS_AVAILABLE: 603 | raise ImportError("pandas is required for CSV loading") 604 | 605 | if not os.path.exists(file_path): 606 | raise FileNotFoundError(f"File not found: {file_path}") 607 | 608 | if encoding == "auto": 609 | encoding = DataLoader.detect_encoding(file_path) 610 | 611 | start_time = time.time() 612 | df = pd.read_csv(file_path, encoding=encoding) 613 | load_time = time.time() - start_time 614 | 615 | return df, encoding, load_time 616 | 617 | @staticmethod 618 | def load_excel(file_path: str, sheet_name: Union[str, int] = 0): 619 | """加载 Excel 文件 - 支持 .xlsx 和 .xls 格式""" 620 | if not PANDAS_AVAILABLE: 621 | raise ImportError("pandas is required for Excel loading") 622 | 623 | if not EXCEL_AVAILABLE: 624 | raise ImportError("Excel support libraries are required. Install with: pip install openpyxl xlrd") 625 | 626 | if not os.path.exists(file_path): 627 | raise FileNotFoundError(f"File not found: {file_path}") 628 | 629 | # 检测文件扩展名并选择合适的引擎 630 | file_extension = Path(file_path).suffix.lower() 631 | 632 | if file_extension == '.xlsx': 633 | if not EXCEL_XLSX_AVAILABLE: 634 | raise ImportError("openpyxl is required for .xlsx files. Install with: pip install openpyxl") 635 | engine = 'openpyxl' 636 | elif file_extension == '.xls': 637 | if not EXCEL_XLS_AVAILABLE: 638 | raise ImportError("xlrd is required for .xls files. Install with: pip install xlrd") 639 | engine = 'xlrd' 640 | else: 641 | # 默认尝试使用openpyxl,如果失败则尝试xlrd 642 | engine = None 643 | 644 | start_time = time.time() 645 | try: 646 | df = pd.read_excel(file_path, sheet_name=sheet_name, engine=engine) 647 | except Exception as e: 648 | if engine is None and file_extension not in ['.xlsx', '.xls']: 649 | # 尝试不同的引擎 650 | for alt_engine in ['openpyxl', 'xlrd']: 651 | try: 652 | df = pd.read_excel(file_path, sheet_name=sheet_name, engine=alt_engine) 653 | break 654 | except: 655 | continue 656 | else: 657 | raise e 658 | else: 659 | raise e 660 | 661 | load_time = time.time() - start_time 662 | 663 | return df, load_time 664 | 665 | @staticmethod 666 | def load_json(file_path: str): 667 | """加载 JSON 文件""" 668 | if not PANDAS_AVAILABLE: 669 | raise ImportError("pandas is required for JSON loading") 670 | 671 | if not os.path.exists(file_path): 672 | raise FileNotFoundError(f"File not found: {file_path}") 673 | 674 | start_time = time.time() 675 | df = pd.read_json(file_path) 676 | load_time = time.time() - start_time 677 | 678 | return df, load_time 679 | 680 | def get_session_manager() -> IPythonSessionManager: 681 | """获取全局会话管理器""" 682 | global _session_manager 683 | if _session_manager is None: 684 | _session_manager = IPythonSessionManager() 685 | return _session_manager 686 | 687 | # ============================================================================= 688 | # MCP 工具函数实现 689 | # ============================================================================= 690 | 691 | @mcp.tool() 692 | def create_ipython_session( 693 | session_id: Optional[str] = None, 694 | auto_import: bool = True 695 | ) -> Dict[str, Any]: 696 | """ 697 | 创建新的 IPython 会话 698 | 699 | 功能说明: 700 | - 创建一个独立的 IPython 交互式会话环境 701 | - 每个会话拥有独立的命名空间,变量不会相互影响 702 | - 可选择自动导入常用数据科学库 (pandas, numpy, matplotlib等) 703 | - 支持所有 IPython 功能:Python代码、魔法命令、系统命令 704 | 705 | Args: 706 | session_id: 会话ID,可选,不提供则自动生成 (格式: session_xxxxxxxx) 707 | auto_import: 是否自动导入常用库 (pandas, numpy, json, os, sys, pathlib) 708 | 709 | Returns: 710 | Dict: 包含会话创建结果的字典 711 | 712 | 调用样例: 713 | ```python 714 | # 创建会话,自动生成ID 715 | result = create_ipython_session() 716 | 717 | # 创建指定ID的会话,不自动导入库 718 | result = create_ipython_session(session_id="my_session", auto_import=False) 719 | ``` 720 | 721 | 返回格式: 722 | 成功时包含: success, session_id, message, auto_imported (导入的库列表) 723 | 失败时包含: success, error 724 | """ 725 | try: 726 | if not IPYTHON_AVAILABLE: 727 | return { 728 | "success": False, 729 | "error": "IPython is not available. Please install with: pip install ipython" 730 | } 731 | 732 | manager = get_session_manager() 733 | created_session_id = manager.create_session(session_id, auto_import) 734 | 735 | auto_imported = [] 736 | if auto_import: 737 | auto_imported = ["pandas", "numpy", "json", "os", "sys", "pathlib"] 738 | 739 | return { 740 | "success": True, 741 | "session_id": created_session_id, 742 | "message": f"IPython session {created_session_id} created successfully", 743 | "auto_imported": auto_imported 744 | } 745 | 746 | except Exception as e: 747 | return { 748 | "success": False, 749 | "error": f"Failed to create session: {str(e)}" 750 | } 751 | 752 | @mcp.tool() 753 | def list_ipython_sessions() -> Dict[str, Any]: 754 | """ 755 | 列出所有活跃的 IPython 会话 756 | 757 | 功能说明: 758 | - 获取当前所有活跃会话的基本信息 759 | - 显示每个会话的创建时间、最后使用时间、执行次数等 760 | - 显示会话的内存使用情况和变量数量 761 | - 用于会话管理和监控 762 | 763 | Args: 764 | 无参数 765 | 766 | Returns: 767 | Dict: 包含所有会话信息的字典 768 | 769 | 调用样例: 770 | ```python 771 | # 列出所有会话 772 | result = list_ipython_sessions() 773 | ``` 774 | 775 | 返回格式: 776 | 成功时包含: success, sessions (会话列表), total_sessions 777 | 每个会话包含: session_id, status, created_at, last_used, execution_count, memory_usage_mb, variable_count 778 | 失败时包含: success, error 779 | """ 780 | try: 781 | manager = get_session_manager() 782 | sessions = manager.list_sessions() 783 | 784 | return { 785 | "success": True, 786 | "sessions": [session.model_dump() for session in sessions], 787 | "total_sessions": len(sessions) 788 | } 789 | 790 | except Exception as e: 791 | return { 792 | "success": False, 793 | "error": f"Failed to list sessions: {str(e)}" 794 | } 795 | 796 | @mcp.tool() 797 | def get_session_status(session_id: str) -> Dict[str, Any]: 798 | """ 799 | 获取指定会话的详细状态信息 800 | 801 | 功能说明: 802 | - 获取会话的详细状态信息 803 | - 包含内存使用详情和变量统计 804 | - 按变量类型分组统计 805 | 806 | Args: 807 | session_id: 会话ID 808 | 809 | 调用样例: 810 | ```python 811 | result = get_session_status(session_id="session_a1b2c3d4") 812 | ``` 813 | 814 | 返回格式: 815 | 成功时包含: success, session_info (状态、时间、内存使用、变量统计) 816 | 失败时包含: success, error 817 | """ 818 | try: 819 | manager = get_session_manager() 820 | session = manager.get_session(session_id) 821 | 822 | variables = session.get_variables() 823 | memory_info = session.get_memory_info() 824 | 825 | # 按类型统计变量 826 | variable_summary = { 827 | 'dataframes': 0, 828 | 'lists': 0, 829 | 'scalars': 0, 830 | 'functions': 0, 831 | 'others': 0 832 | } 833 | 834 | for var_info in variables.values(): 835 | var_type = var_info.get('type', 'unknown') 836 | if 'DataFrame' in var_type: 837 | variable_summary['dataframes'] += 1 838 | elif var_type in ['list', 'tuple', 'set']: 839 | variable_summary['lists'] += 1 840 | elif var_type in ['int', 'float', 'str', 'bool']: 841 | variable_summary['scalars'] += 1 842 | elif 'function' in var_type.lower(): 843 | variable_summary['functions'] += 1 844 | else: 845 | variable_summary['others'] += 1 846 | 847 | return { 848 | "success": True, 849 | "session_info": { 850 | "session_id": session_id, 851 | "status": "active", 852 | "created_at": session.created_at.isoformat(), 853 | "last_used": session.last_used.isoformat(), 854 | "execution_count": session.execution_count, 855 | "memory_usage": { 856 | "total_mb": memory_info['total_memory_mb'], 857 | "breakdown": memory_info['breakdown'] 858 | }, 859 | "variable_summary": variable_summary 860 | } 861 | } 862 | 863 | except ValueError as e: 864 | return { 865 | "success": False, 866 | "error": str(e) 867 | } 868 | except Exception as e: 869 | return { 870 | "success": False, 871 | "error": f"Failed to get session status: {str(e)}" 872 | } 873 | 874 | @mcp.tool() 875 | def delete_ipython_session(session_id: str) -> Dict[str, Any]: 876 | """ 877 | 删除指定的 IPython 会话 878 | 879 | 功能说明: 880 | - 删除指定的会话及其所有数据 881 | - 释放会话占用的内存 882 | 883 | Args: 884 | session_id: 会话ID 885 | 886 | 调用样例: 887 | ```python 888 | result = delete_ipython_session(session_id="session_a1b2c3d4") 889 | ``` 890 | 891 | 返回格式: 892 | 成功时包含: success, message 893 | 失败时包含: success, error 894 | """ 895 | try: 896 | manager = get_session_manager() 897 | success = manager.delete_session(session_id) 898 | 899 | if success: 900 | return { 901 | "success": True, 902 | "message": f"Session {session_id} deleted successfully" 903 | } 904 | else: 905 | return { 906 | "success": False, 907 | "error": f"Session {session_id} not found" 908 | } 909 | 910 | except Exception as e: 911 | return { 912 | "success": False, 913 | "error": f"Failed to delete session: {str(e)}" 914 | } 915 | 916 | @mcp.tool() 917 | def execute_code( 918 | code: str, 919 | session_id: str, 920 | capture_output: bool = True, 921 | expression_only: bool = False 922 | ) -> Dict[str, Any]: 923 | """ 924 | 在指定会话中执行代码 925 | 926 | 功能说明: 927 | - 在指定的 IPython 会话中执行代码 928 | - 支持三种代码类型:Python代码、IPython魔法命令、系统命令 929 | - 自动捕获执行输出、错误信息和返回值 930 | - 记录执行时间和内存变化 931 | - 所有变量状态在会话中持久保存 932 | - 支持两种执行模式:完整模式和表达式模式 933 | 934 | 支持的代码类型: 935 | - Python代码: x = 1 + 1、import pandas as pd、df.head() 936 | - 魔法命令: %timeit sum(range(100))、%matplotlib inline、%who、%whos 937 | - 命令行命令: !ls -la、!pip install numpy、!pwd、!cat file.txt 938 | 939 | Args: 940 | code: 要执行的代码(支持多行) 941 | session_id: 会话ID 942 | capture_output: 是否捕获输出(保留参数,目前总是捕获) 943 | expression_only: 是否使用表达式模式(True时只返回表达式结果,不捕获print输出) 944 | 945 | Returns: 946 | Dict: 执行结果 947 | 948 | 调用样例: 949 | ```python 950 | # Python代码执行(完整模式,捕获print输出) 951 | result = execute_code( 952 | code="x = 10\ny = 20\nprint(f'x + y = {x + y}')", 953 | session_id="session_a1b2c3d4" 954 | ) 955 | 956 | # 表达式模式(只返回表达式结果,不捕获print输出) 957 | result = execute_code( 958 | code="df.head()", 959 | session_id="session_a1b2c3d4", 960 | expression_only=True 961 | ) 962 | 963 | # 魔法命令执行 964 | result = execute_code( 965 | code="%timeit sum(range(1000))", 966 | session_id="session_a1b2c3d4" 967 | ) 968 | 969 | # 系统命令执行 970 | result = execute_code( 971 | code="!ls -la", 972 | session_id="session_a1b2c3d4" 973 | ) 974 | 975 | # 多行代码执行 976 | result = execute_code( 977 | code=\"\"\" 978 | import pandas as pd 979 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) 980 | print("DataFrame created:") 981 | df.head() 982 | \"\"\", 983 | session_id="session_a1b2c3d4" 984 | ) 985 | ``` 986 | 987 | 返回格式: 988 | 包含: success, execution_count, stdout, stderr, result, execution_time, memory_delta_mb, error 989 | 成功时 error 为 null,失败时 error 包含错误信息 990 | """ 991 | try: 992 | manager = get_session_manager() 993 | session = manager.get_session(session_id) 994 | 995 | # 根据参数选择执行方法 996 | if expression_only: 997 | result = session.execute_expression_only(code) 998 | else: 999 | result = session.execute(code) 1000 | 1001 | return { 1002 | "success": result.success, 1003 | "execution_count": result.execution_count, 1004 | "stdout": result.stdout, 1005 | "stderr": result.stderr, 1006 | "result": result.result, 1007 | "execution_time": round(result.execution_time, 3), 1008 | "memory_delta_mb": round(result.memory_delta_mb, 2), 1009 | "error": result.error 1010 | } 1011 | 1012 | except ValueError as e: 1013 | return { 1014 | "success": False, 1015 | "error": str(e) 1016 | } 1017 | except Exception as e: 1018 | return { 1019 | "success": False, 1020 | "error": f"Failed to execute code: {str(e)}" 1021 | } 1022 | 1023 | @mcp.tool() 1024 | def get_execution_history( 1025 | session_id: str, 1026 | limit: int = 10, 1027 | include_output: bool = False 1028 | ) -> Dict[str, Any]: 1029 | """ 1030 | 获取会话的执行历史 1031 | 1032 | 功能说明: 1033 | - 获取会话中的代码执行历史记录 1034 | - 可选择包含或排除执行输出 1035 | - 支持限制返回数量 1036 | 1037 | Args: 1038 | session_id: 会话ID 1039 | limit: 返回的历史记录数量 1040 | include_output: 是否包含输出结果 1041 | 1042 | 调用样例: 1043 | ```python 1044 | # 获取最近10条历史,不包含输出 1045 | result = get_execution_history(session_id="session_a1b2c3d4", limit=10) 1046 | 1047 | # 获取最近5条历史,包含输出 1048 | result = get_execution_history(session_id="session_a1b2c3d4", limit=5, include_output=True) 1049 | ``` 1050 | 1051 | 返回格式: 1052 | 成功时包含: success, history (历史记录列表), total_executions 1053 | 失败时包含: success, error 1054 | """ 1055 | try: 1056 | manager = get_session_manager() 1057 | session = manager.get_session(session_id) 1058 | 1059 | history = session.history[-limit:] if limit > 0 else session.history 1060 | 1061 | if not include_output: 1062 | # 移除输出内容以节省空间 1063 | history = [ 1064 | {k: v for k, v in entry.items() 1065 | if k not in ['stdout', 'stderr', 'result']} 1066 | for entry in history 1067 | ] 1068 | 1069 | return { 1070 | "success": True, 1071 | "history": history, 1072 | "total_executions": len(session.history) 1073 | } 1074 | 1075 | except ValueError as e: 1076 | return { 1077 | "success": False, 1078 | "error": str(e) 1079 | } 1080 | except Exception as e: 1081 | return { 1082 | "success": False, 1083 | "error": f"Failed to get execution history: {str(e)}" 1084 | } 1085 | 1086 | @mcp.tool() 1087 | def load_csv_file( 1088 | file_path: str, 1089 | session_id: str, 1090 | variable_name: Optional[str] = None, 1091 | encoding: str = "auto" 1092 | ) -> Dict[str, Any]: 1093 | """ 1094 | 加载 CSV 文件到 DataFrame 1095 | 1096 | 功能说明: 1097 | - 将 CSV 文件加载为 pandas DataFrame 并存储到指定会话中 1098 | - 自动检测文件编码,支持中文文件 1099 | - 自动生成变量名或使用指定变量名 1100 | - 返回 DataFrame 的详细信息(形状、列名、数据类型、内存使用等) 1101 | - 支持各种 CSV 格式和分隔符 1102 | 1103 | Args: 1104 | file_path: CSV文件的绝对或相对路径 1105 | session_id: 目标会话ID 1106 | variable_name: 存储的变量名,不提供则自动生成 (格式: df_文件名) 1107 | encoding: 文件编码,"auto"为自动检测,支持 utf-8, gbk, gb2312 等 1108 | 1109 | Returns: 1110 | Dict: 加载结果,包含 DataFrame 信息 1111 | 1112 | 调用样例: 1113 | ```python 1114 | # 基本加载,自动生成变量名 1115 | result = load_csv_file( 1116 | file_path="/path/to/sales_data.csv", 1117 | session_id="session_a1b2c3d4" 1118 | ) 1119 | 1120 | # 指定变量名和编码 1121 | result = load_csv_file( 1122 | file_path="./data/中文数据.csv", 1123 | session_id="session_a1b2c3d4", 1124 | variable_name="chinese_data", 1125 | encoding="gb2312" 1126 | ) 1127 | 1128 | # 自动编码检测 1129 | result = load_csv_file( 1130 | file_path="data.csv", 1131 | session_id="session_a1b2c3d4", 1132 | encoding="auto" # 默认值 1133 | ) 1134 | ``` 1135 | 1136 | 返回格式: 1137 | 成功时包含: success, variable_name, shape, columns, dtypes, memory_usage_mb, encoding_detected, load_time 1138 | 失败时包含: success, error 1139 | """ 1140 | try: 1141 | if not PANDAS_AVAILABLE: 1142 | return { 1143 | "success": False, 1144 | "error": "pandas is not available. Please install with: pip install pandas numpy" 1145 | } 1146 | 1147 | manager = get_session_manager() 1148 | session = manager.get_session(session_id) 1149 | 1150 | # 加载数据 1151 | df, detected_encoding, load_time = DataLoader.load_csv(file_path, encoding) 1152 | 1153 | # 生成变量名 1154 | if variable_name is None: 1155 | base_name = Path(file_path).stem 1156 | variable_name = f"df_{base_name}" 1157 | 1158 | # 确保变量名不冲突 1159 | variables = session.get_variables() 1160 | counter = 1 1161 | while variable_name in variables: 1162 | variable_name = f"df_{base_name}_{counter}" 1163 | counter += 1 1164 | 1165 | # 将DataFrame添加到会话命名空间 1166 | session.shell.user_ns[variable_name] = df 1167 | 1168 | # 获取DataFrame信息 1169 | dtypes_dict = {col: str(dtype) for col, dtype in df.dtypes.items()} 1170 | memory_usage_mb = df.memory_usage(deep=True).sum() / 1024 / 1024 1171 | 1172 | return { 1173 | "success": True, 1174 | "variable_name": variable_name, 1175 | "shape": list(df.shape), 1176 | "columns": list(df.columns), 1177 | "dtypes": dtypes_dict, 1178 | "memory_usage_mb": round(memory_usage_mb, 2), 1179 | "encoding_detected": detected_encoding, 1180 | "load_time": round(load_time, 3) 1181 | } 1182 | 1183 | except FileNotFoundError as e: 1184 | return { 1185 | "success": False, 1186 | "error": str(e) 1187 | } 1188 | except ValueError as e: 1189 | return { 1190 | "success": False, 1191 | "error": str(e) 1192 | } 1193 | except Exception as e: 1194 | return { 1195 | "success": False, 1196 | "error": f"Failed to load CSV file: {str(e)}" 1197 | } 1198 | 1199 | @mcp.tool() 1200 | def load_excel_file( 1201 | file_path: str, 1202 | session_id: str, 1203 | variable_name: Optional[str] = None, 1204 | sheet_name: Union[str, int] = 0 1205 | ) -> Dict[str, Any]: 1206 | """ 1207 | 加载 Excel 文件到 DataFrame 1208 | 1209 | 功能说明: 1210 | - 加载 Excel 文件到 pandas DataFrame 1211 | - 支持多个工作表选择 1212 | - 自动生成变量名 1213 | 1214 | Args: 1215 | file_path: Excel文件路径 1216 | session_id: 会话ID 1217 | variable_name: 变量名,不提供则自动生成 1218 | sheet_name: 工作表名称或索引 (默认0) 1219 | 1220 | 调用样例: 1221 | ```python 1222 | # 加载默认工作表 1223 | result = load_excel_file("data.xlsx", "session_a1b2c3d4") 1224 | 1225 | # 加载指定工作表 1226 | result = load_excel_file("data.xlsx", "session_a1b2c3d4", sheet_name="Sheet2") 1227 | ``` 1228 | 1229 | 返回格式: 1230 | 成功时包含: success, variable_name, shape, columns, dtypes, memory_usage_mb, sheet_name, load_time 1231 | 失败时包含: success, error 1232 | """ 1233 | try: 1234 | if not PANDAS_AVAILABLE: 1235 | return { 1236 | "success": False, 1237 | "error": "pandas is not available. Please install with: pip install pandas numpy" 1238 | } 1239 | 1240 | if not EXCEL_AVAILABLE: 1241 | return { 1242 | "success": False, 1243 | "error": "Excel support libraries are not available. Please install with: pip install openpyxl xlrd" 1244 | } 1245 | 1246 | manager = get_session_manager() 1247 | session = manager.get_session(session_id) 1248 | 1249 | # 加载数据 1250 | df, load_time = DataLoader.load_excel(file_path, sheet_name) 1251 | 1252 | # 生成变量名 1253 | if variable_name is None: 1254 | base_name = Path(file_path).stem 1255 | variable_name = f"df_{base_name}" 1256 | 1257 | # 确保变量名不冲突 1258 | variables = session.get_variables() 1259 | counter = 1 1260 | while variable_name in variables: 1261 | variable_name = f"df_{base_name}_{counter}" 1262 | counter += 1 1263 | 1264 | # 将DataFrame添加到会话命名空间 1265 | session.shell.user_ns[variable_name] = df 1266 | 1267 | # 获取DataFrame信息 1268 | dtypes_dict = {col: str(dtype) for col, dtype in df.dtypes.items()} 1269 | memory_usage_mb = df.memory_usage(deep=True).sum() / 1024 / 1024 1270 | 1271 | return { 1272 | "success": True, 1273 | "variable_name": variable_name, 1274 | "shape": list(df.shape), 1275 | "columns": list(df.columns), 1276 | "dtypes": dtypes_dict, 1277 | "memory_usage_mb": round(memory_usage_mb, 2), 1278 | "sheet_name": sheet_name, 1279 | "load_time": round(load_time, 3) 1280 | } 1281 | 1282 | except FileNotFoundError as e: 1283 | return { 1284 | "success": False, 1285 | "error": str(e) 1286 | } 1287 | except ValueError as e: 1288 | return { 1289 | "success": False, 1290 | "error": str(e) 1291 | } 1292 | except Exception as e: 1293 | return { 1294 | "success": False, 1295 | "error": f"Failed to load Excel file: {str(e)}" 1296 | } 1297 | 1298 | @mcp.tool() 1299 | def load_json_file( 1300 | file_path: str, 1301 | session_id: str, 1302 | variable_name: Optional[str] = None 1303 | ) -> Dict[str, Any]: 1304 | """ 1305 | 加载 JSON 文件到 DataFrame 1306 | 1307 | 功能说明: 1308 | - 加载 JSON 文件到 pandas DataFrame 1309 | - 自动推断JSON结构 1310 | - 支持嵌套JSON数据 1311 | 1312 | Args: 1313 | file_path: JSON文件路径 1314 | session_id: 会话ID 1315 | variable_name: 变量名,不提供则自动生成 1316 | 1317 | 调用样例: 1318 | ```python 1319 | result = load_json_file("data.json", "session_a1b2c3d4") 1320 | ``` 1321 | 1322 | 返回格式: 1323 | 成功时包含: success, variable_name, shape, columns, dtypes, memory_usage_mb, load_time 1324 | 失败时包含: success, error 1325 | """ 1326 | try: 1327 | if not PANDAS_AVAILABLE: 1328 | return { 1329 | "success": False, 1330 | "error": "pandas is not available. Please install with: pip install pandas numpy" 1331 | } 1332 | 1333 | manager = get_session_manager() 1334 | session = manager.get_session(session_id) 1335 | 1336 | # 加载数据 1337 | df, load_time = DataLoader.load_json(file_path) 1338 | 1339 | # 生成变量名 1340 | if variable_name is None: 1341 | base_name = Path(file_path).stem 1342 | variable_name = f"df_{base_name}" 1343 | 1344 | # 确保变量名不冲突 1345 | variables = session.get_variables() 1346 | counter = 1 1347 | while variable_name in variables: 1348 | variable_name = f"df_{base_name}_{counter}" 1349 | counter += 1 1350 | 1351 | # 将DataFrame添加到会话命名空间 1352 | session.shell.user_ns[variable_name] = df 1353 | 1354 | # 获取DataFrame信息 1355 | dtypes_dict = {col: str(dtype) for col, dtype in df.dtypes.items()} 1356 | memory_usage_mb = df.memory_usage(deep=True).sum() / 1024 / 1024 1357 | 1358 | return { 1359 | "success": True, 1360 | "variable_name": variable_name, 1361 | "shape": list(df.shape), 1362 | "columns": list(df.columns), 1363 | "dtypes": dtypes_dict, 1364 | "memory_usage_mb": round(memory_usage_mb, 2), 1365 | "load_time": round(load_time, 3) 1366 | } 1367 | 1368 | except FileNotFoundError as e: 1369 | return { 1370 | "success": False, 1371 | "error": str(e) 1372 | } 1373 | except ValueError as e: 1374 | return { 1375 | "success": False, 1376 | "error": str(e) 1377 | } 1378 | except Exception as e: 1379 | return { 1380 | "success": False, 1381 | "error": f"Failed to load JSON file: {str(e)}" 1382 | } 1383 | 1384 | @mcp.tool() 1385 | def list_dataframes(session_id: str) -> Dict[str, Any]: 1386 | """ 1387 | 列出会话中所有 DataFrame 变量 1388 | 1389 | 功能说明: 1390 | - 列出会话中所有DataFrame变量 1391 | - 显示每个DataFrame的基本信息 1392 | - 统计总内存使用 1393 | 1394 | Args: 1395 | session_id: 会话ID 1396 | 1397 | 调用样例: 1398 | ```python 1399 | result = list_dataframes(session_id="session_a1b2c3d4") 1400 | ``` 1401 | 1402 | 返回格式: 1403 | 成功时包含: success, dataframes (列表), total_dataframes, total_memory_mb 1404 | 每个DataFrame包含: name, shape, memory_mb, columns 1405 | 失败时包含: success, error 1406 | """ 1407 | try: 1408 | manager = get_session_manager() 1409 | session = manager.get_session(session_id) 1410 | 1411 | variables = session.get_variables() 1412 | dataframes = [] 1413 | total_memory = 0.0 1414 | 1415 | for name, var_info in variables.items(): 1416 | if 'DataFrame' in var_info.get('type', ''): 1417 | # 获取DataFrame对象 1418 | df = session.shell.user_ns.get(name) 1419 | if df is not None and hasattr(df, 'shape'): 1420 | memory_mb = var_info.get('memory_usage', 0) / 1024 / 1024 if var_info.get('memory_usage') else 0 1421 | total_memory += memory_mb 1422 | 1423 | dataframes.append({ 1424 | "name": name, 1425 | "shape": list(df.shape), 1426 | "memory_mb": round(memory_mb, 2), 1427 | "columns": list(df.columns)[:10] # 只显示前10列 1428 | }) 1429 | 1430 | return { 1431 | "success": True, 1432 | "dataframes": dataframes, 1433 | "total_dataframes": len(dataframes), 1434 | "total_memory_mb": round(total_memory, 2) 1435 | } 1436 | 1437 | except ValueError as e: 1438 | return { 1439 | "success": False, 1440 | "error": str(e) 1441 | } 1442 | except Exception as e: 1443 | return { 1444 | "success": False, 1445 | "error": f"Failed to list dataframes: {str(e)}" 1446 | } 1447 | 1448 | @mcp.tool() 1449 | def get_dataframe_info( 1450 | variable_name: str, 1451 | session_id: str 1452 | ) -> Dict[str, Any]: 1453 | """ 1454 | 获取指定 DataFrame 的详细信息 1455 | 1456 | 功能说明: 1457 | - 获取 DataFrame 的完整元数据信息 1458 | - 包括形状、列名、数据类型、内存使用、缺失值统计等 1459 | - 提供每列的详细内存使用情况 1460 | - 显示索引信息和数据完整性状态 1461 | - 用于数据质量检查和内存优化 1462 | 1463 | Args: 1464 | variable_name: 会话中 DataFrame 变量的名称 1465 | session_id: 会话ID 1466 | 1467 | Returns: 1468 | Dict: DataFrame详细信息 1469 | 1470 | 调用样例: 1471 | ```python 1472 | # 获取DataFrame详细信息 1473 | result = get_dataframe_info( 1474 | variable_name="df_sales", 1475 | session_id="session_a1b2c3d4" 1476 | ) 1477 | 1478 | # 检查加载的数据信息 1479 | result = get_dataframe_info( 1480 | variable_name="df_users", 1481 | session_id="session_a1b2c3d4" 1482 | ) 1483 | ``` 1484 | 1485 | 返回格式: 1486 | 成功时包含: success, variable_name, shape, columns, dtypes, memory_usage, null_counts, index_info 1487 | memory_usage 包含总使用量和每列使用量 1488 | 失败时包含: success, error 1489 | """ 1490 | try: 1491 | manager = get_session_manager() 1492 | session = manager.get_session(session_id) 1493 | 1494 | # 获取DataFrame对象 1495 | df = session.shell.user_ns.get(variable_name) 1496 | if df is None: 1497 | return { 1498 | "success": False, 1499 | "error": f"Variable '{variable_name}' not found in session" 1500 | } 1501 | 1502 | if not hasattr(df, 'shape') or not hasattr(df, 'dtypes'): 1503 | return { 1504 | "success": False, 1505 | "error": f"Variable '{variable_name}' is not a DataFrame" 1506 | } 1507 | 1508 | # 获取详细信息 1509 | dtypes_dict = {col: str(dtype) for col, dtype in df.dtypes.items()} 1510 | memory_usage = df.memory_usage(deep=True) 1511 | total_memory_mb = memory_usage.sum() / 1024 / 1024 1512 | 1513 | per_column_kb = {col: round(memory_usage[col] / 1024, 1) for col in df.columns} 1514 | null_counts = df.isnull().sum().to_dict() 1515 | 1516 | # 索引信息 1517 | index_info = { 1518 | "type": type(df.index).__name__, 1519 | "start": int(df.index[0]) if len(df.index) > 0 and hasattr(df.index[0], '__int__') else str(df.index[0]) if len(df.index) > 0 else None, 1520 | "stop": int(df.index[-1]) if len(df.index) > 0 and hasattr(df.index[-1], '__int__') else str(df.index[-1]) if len(df.index) > 0 else None, 1521 | "step": 1 if isinstance(df.index, pd.RangeIndex) else None 1522 | } 1523 | 1524 | return { 1525 | "success": True, 1526 | "variable_name": variable_name, 1527 | "shape": list(df.shape), 1528 | "columns": list(df.columns), 1529 | "dtypes": dtypes_dict, 1530 | "memory_usage": { 1531 | "total_mb": round(total_memory_mb, 2), 1532 | "per_column_kb": per_column_kb 1533 | }, 1534 | "null_counts": null_counts, 1535 | "index_info": index_info 1536 | } 1537 | 1538 | except ValueError as e: 1539 | return { 1540 | "success": False, 1541 | "error": str(e) 1542 | } 1543 | except Exception as e: 1544 | return { 1545 | "success": False, 1546 | "error": f"Failed to get dataframe info: {str(e)}" 1547 | } 1548 | 1549 | @mcp.tool() 1550 | def preview_dataframe( 1551 | variable_name: str, 1552 | session_id: str, 1553 | method: str = "head", 1554 | n_rows: int = 5 1555 | ) -> Dict[str, Any]: 1556 | """ 1557 | 预览 DataFrame 数据 1558 | 1559 | 功能说明: 1560 | - 预览DataFrame的数据内容 1561 | - 支持头部、尾部、随机采样三种方式 1562 | - 返回易读的记录格式 1563 | 1564 | Args: 1565 | variable_name: DataFrame变量名 1566 | session_id: 会话ID 1567 | method: 预览方法 ("head", "tail", "sample") 1568 | n_rows: 行数 1569 | 1570 | 调用样例: 1571 | ```python 1572 | # 查看前5行 1573 | result = preview_dataframe("df", "session_a1b2c3d4", "head", 5) 1574 | 1575 | # 随机采样10行 1576 | result = preview_dataframe("df", "session_a1b2c3d4", "sample", 10) 1577 | ``` 1578 | 1579 | 返回格式: 1580 | 成功时包含: success, variable_name, method, n_rows, data (记录列表), total_rows 1581 | 失败时包含: success, error 1582 | """ 1583 | try: 1584 | manager = get_session_manager() 1585 | session = manager.get_session(session_id) 1586 | 1587 | # 获取DataFrame对象 1588 | df = session.shell.user_ns.get(variable_name) 1589 | if df is None: 1590 | return { 1591 | "success": False, 1592 | "error": f"Variable '{variable_name}' not found in session" 1593 | } 1594 | 1595 | if not hasattr(df, 'shape'): 1596 | return { 1597 | "success": False, 1598 | "error": f"Variable '{variable_name}' is not a DataFrame" 1599 | } 1600 | 1601 | # 根据方法获取数据 1602 | if method == "head": 1603 | preview_df = df.head(n_rows) 1604 | elif method == "tail": 1605 | preview_df = df.tail(n_rows) 1606 | elif method == "sample": 1607 | n_rows = min(n_rows, len(df)) # 确保不超过实际行数 1608 | preview_df = df.sample(n_rows) if len(df) > 0 else df.head(0) 1609 | else: 1610 | return { 1611 | "success": False, 1612 | "error": f"Invalid method '{method}'. Use 'head', 'tail', or 'sample'" 1613 | } 1614 | 1615 | # 转换为记录格式 1616 | data = preview_df.to_dict('records') 1617 | 1618 | return { 1619 | "success": True, 1620 | "variable_name": variable_name, 1621 | "method": method, 1622 | "n_rows": len(preview_df), 1623 | "data": data, 1624 | "total_rows": len(df) 1625 | } 1626 | 1627 | except ValueError as e: 1628 | return { 1629 | "success": False, 1630 | "error": str(e) 1631 | } 1632 | except Exception as e: 1633 | return { 1634 | "success": False, 1635 | "error": f"Failed to preview dataframe: {str(e)}" 1636 | } 1637 | 1638 | @mcp.tool() 1639 | def get_dataframe_summary( 1640 | variable_name: str, 1641 | session_id: str, 1642 | include_categorical: bool = True 1643 | ) -> Dict[str, Any]: 1644 | """ 1645 | 获取 DataFrame 统计摘要 1646 | 1647 | 功能说明: 1648 | - 获取DataFrame的统计摘要信息 1649 | - 支持数值变量和分类变量统计 1650 | - 提供describe()的结构化输出 1651 | 1652 | Args: 1653 | variable_name: DataFrame变量名 1654 | session_id: 会话ID 1655 | include_categorical: 是否包含分类变量统计 1656 | 1657 | 调用样例: 1658 | ```python 1659 | result = get_dataframe_summary("df", "session_a1b2c3d4", include_categorical=True) 1660 | ``` 1661 | 1662 | 返回格式: 1663 | 成功时包含: success, variable_name, numeric_summary, categorical_summary (可选) 1664 | 失败时包含: success, error 1665 | """ 1666 | try: 1667 | manager = get_session_manager() 1668 | session = manager.get_session(session_id) 1669 | 1670 | # 获取DataFrame对象 1671 | df = session.shell.user_ns.get(variable_name) 1672 | if df is None: 1673 | return { 1674 | "success": False, 1675 | "error": f"Variable '{variable_name}' not found in session" 1676 | } 1677 | 1678 | if not hasattr(df, 'describe'): 1679 | return { 1680 | "success": False, 1681 | "error": f"Variable '{variable_name}' is not a DataFrame" 1682 | } 1683 | 1684 | result = { 1685 | "success": True, 1686 | "variable_name": variable_name 1687 | } 1688 | 1689 | # 数值变量统计 1690 | numeric_df = df.select_dtypes(include=['number']) 1691 | if len(numeric_df.columns) > 0: 1692 | numeric_summary = {} 1693 | desc = numeric_df.describe() 1694 | for col in numeric_df.columns: 1695 | numeric_summary[col] = desc[col].to_dict() 1696 | result["numeric_summary"] = numeric_summary 1697 | 1698 | # 分类变量统计 1699 | if include_categorical: 1700 | categorical_df = df.select_dtypes(include=['object', 'category']) 1701 | if len(categorical_df.columns) > 0: 1702 | categorical_summary = {} 1703 | for col in categorical_df.columns: 1704 | desc = df[col].describe() 1705 | categorical_summary[col] = { 1706 | "count": int(desc['count']), 1707 | "unique": int(desc['unique']), 1708 | "top": str(desc['top']), 1709 | "freq": int(desc['freq']) 1710 | } 1711 | result["categorical_summary"] = categorical_summary 1712 | 1713 | return result 1714 | 1715 | except ValueError as e: 1716 | return { 1717 | "success": False, 1718 | "error": str(e) 1719 | } 1720 | except Exception as e: 1721 | return { 1722 | "success": False, 1723 | "error": f"Failed to get dataframe summary: {str(e)}" 1724 | } 1725 | 1726 | @mcp.tool() 1727 | def check_memory_usage(session_id: str) -> Dict[str, Any]: 1728 | """ 1729 | 查看会话的内存使用情况 1730 | 1731 | 功能说明: 1732 | - 分析会话中所有变量的内存使用情况 1733 | - 按变量类型分组统计内存使用 1734 | - 列出内存使用最多的变量 1735 | - 提供系统内存状态信息 1736 | - 用于内存优化和性能调试 1737 | 1738 | Args: 1739 | session_id: 会话ID 1740 | 1741 | Returns: 1742 | Dict: 内存使用详细信息 1743 | 1744 | 调用样例: 1745 | ```python 1746 | # 检查会话内存使用 1747 | result = check_memory_usage(session_id="session_a1b2c3d4") 1748 | 1749 | # 在数据加载后检查内存 1750 | load_csv_file("large_data.csv", "session_a1b2c3d4") 1751 | result = check_memory_usage(session_id="session_a1b2c3d4") 1752 | ``` 1753 | 1754 | 返回格式: 1755 | 成功时包含: success, session_id, total_memory_mb, breakdown (按类型分组), top_variables (最大变量列表), system_memory (系统内存信息) 1756 | 失败时包含: success, error 1757 | """ 1758 | try: 1759 | manager = get_session_manager() 1760 | session = manager.get_session(session_id) 1761 | 1762 | memory_info = session.get_memory_info() 1763 | 1764 | return { 1765 | "success": True, 1766 | "session_id": session_id, 1767 | "total_memory_mb": memory_info['total_memory_mb'], 1768 | "breakdown": memory_info['breakdown'], 1769 | "top_variables": memory_info['top_variables'], 1770 | "system_memory": memory_info['system_memory'] 1771 | } 1772 | 1773 | except ValueError as e: 1774 | return { 1775 | "success": False, 1776 | "error": str(e) 1777 | } 1778 | except Exception as e: 1779 | return { 1780 | "success": False, 1781 | "error": f"Failed to check memory usage: {str(e)}" 1782 | } 1783 | 1784 | @mcp.tool() 1785 | def get_variable_info( 1786 | variable_name: str, 1787 | session_id: str, 1788 | include_preview: bool = True 1789 | ) -> Dict[str, Any]: 1790 | """ 1791 | 获取指定变量的详细信息 1792 | 1793 | 功能说明: 1794 | - 获取任意类型变量的详细信息 1795 | - 支持 DataFrame, numpy 数组, 列表, 字典等所有 Python 对象 1796 | - 提供类型信息、大小信息、内存使用等 1797 | - 可选择包含变量内容预览 1798 | - 特别优化对数据科学对象的信息展示 1799 | 1800 | Args: 1801 | variable_name: 会话中变量的名称 1802 | session_id: 会话ID 1803 | include_preview: 是否包含内容预览(默认 True) 1804 | 1805 | Returns: 1806 | Dict: 变量详细信息 1807 | 1808 | 调用样例: 1809 | ```python 1810 | # 获取DataFrame变量信息 1811 | result = get_variable_info( 1812 | variable_name="df_sales", 1813 | session_id="session_a1b2c3d4", 1814 | include_preview=True 1815 | ) 1816 | 1817 | # 获取列表变量信息,不包含预览 1818 | result = get_variable_info( 1819 | variable_name="my_list", 1820 | session_id="session_a1b2c3d4", 1821 | include_preview=False 1822 | ) 1823 | 1824 | # 获取numpy数组信息 1825 | result = get_variable_info( 1826 | variable_name="numpy_array", 1827 | session_id="session_a1b2c3d4" 1828 | ) 1829 | ``` 1830 | 1831 | 返回格式: 1832 | 成功时包含: success, variable_name, type, size_info, content_preview (可选), additional_info (类型相关信息) 1833 | size_info 包含: size_bytes, memory_mb, shape (如适用), element_count (如适用) 1834 | 失败时包含: success, error 1835 | """ 1836 | try: 1837 | manager = get_session_manager() 1838 | session = manager.get_session(session_id) 1839 | 1840 | # 获取变量对象 1841 | var = session.shell.user_ns.get(variable_name) 1842 | if var is None: 1843 | return { 1844 | "success": False, 1845 | "error": f"Variable '{variable_name}' not found in session" 1846 | } 1847 | 1848 | var_type = type(var).__name__ 1849 | size_bytes = sys.getsizeof(var) 1850 | 1851 | result = { 1852 | "success": True, 1853 | "variable_name": variable_name, 1854 | "type": f"{type(var).__module__}.{var_type}" if hasattr(type(var), '__module__') else var_type, 1855 | "size_info": { 1856 | "size_bytes": size_bytes, 1857 | "memory_mb": round(size_bytes / 1024 / 1024, 3) 1858 | } 1859 | } 1860 | 1861 | # 特殊处理不同类型 1862 | if hasattr(var, 'shape') and hasattr(var, 'dtypes'): # DataFrame 1863 | result["size_info"].update({ 1864 | "shape": list(var.shape), 1865 | "element_count": var.size 1866 | }) 1867 | result["additional_info"] = { 1868 | "columns": list(var.columns), 1869 | "dtypes": {col: str(dtype) for col, dtype in var.dtypes.items()} 1870 | } 1871 | if include_preview: 1872 | result["content_preview"] = str(var.head()) 1873 | 1874 | elif hasattr(var, 'shape'): # numpy array 1875 | result["size_info"].update({ 1876 | "shape": list(var.shape), 1877 | "element_count": var.size 1878 | }) 1879 | result["additional_info"] = { 1880 | "dtype": str(var.dtype) if hasattr(var, 'dtype') else None 1881 | } 1882 | if include_preview: 1883 | result["content_preview"] = str(var) 1884 | 1885 | elif isinstance(var, (list, tuple, dict, set)): 1886 | result["size_info"]["element_count"] = len(var) 1887 | if include_preview: 1888 | preview = str(var) 1889 | if len(preview) > 200: 1890 | preview = preview[:200] + "..." 1891 | result["content_preview"] = preview 1892 | 1893 | else: 1894 | if include_preview: 1895 | preview = str(var) 1896 | if len(preview) > 200: 1897 | preview = preview[:200] + "..." 1898 | result["content_preview"] = preview 1899 | 1900 | return result 1901 | 1902 | except ValueError as e: 1903 | return { 1904 | "success": False, 1905 | "error": str(e) 1906 | } 1907 | except Exception as e: 1908 | return { 1909 | "success": False, 1910 | "error": f"Failed to get variable info: {str(e)}" 1911 | } 1912 | 1913 | @mcp.tool() 1914 | def sample_column_data( 1915 | variable_name: str, 1916 | column_name: str, 1917 | session_id: str, 1918 | method: str = "mixed", 1919 | sample_size: int = 20, 1920 | max_text_length: int = 100, 1921 | include_stats: bool = True 1922 | ) -> Dict[str, Any]: 1923 | """ 1924 | 智能采样查看 DataFrame 列数据 1925 | 1926 | 功能说明: 1927 | - 智能采样DataFrame中指定列的数据,避免上下文被填满 1928 | - 支持多种采样方式:头部、尾部、随机、唯一值、混合 1929 | - 自动截断过长的文本内容 1930 | - 提供详细的统计信息 1931 | - 特别适合大数据集的列数据探索 1932 | 1933 | Args: 1934 | variable_name: DataFrame变量名 1935 | column_name: 列名 1936 | session_id: 会话ID 1937 | method: 采样方法 ("head", "tail", "random", "unique", "mixed") 1938 | sample_size: 采样数量 (默认20) 1939 | max_text_length: 文本最大长度 (默认100字符) 1940 | include_stats: 是否包含统计信息 1941 | 1942 | Returns: 1943 | Dict: 列数据采样结果 1944 | 1945 | 调用样例: 1946 | ```python 1947 | # 混合采样查看列数据 1948 | result = sample_column_data( 1949 | variable_name="df_sales", 1950 | column_name="product_name", 1951 | session_id="session_a1b2c3d4", 1952 | method="mixed", 1953 | sample_size=20 1954 | ) 1955 | 1956 | # 查看唯一值 1957 | result = sample_column_data( 1958 | variable_name="df_sales", 1959 | column_name="category", 1960 | session_id="session_a1b2c3d4", 1961 | method="unique", 1962 | sample_size=50 1963 | ) 1964 | 1965 | # 随机采样数值列 1966 | result = sample_column_data( 1967 | variable_name="df_sales", 1968 | column_name="price", 1969 | session_id="session_a1b2c3d4", 1970 | method="random", 1971 | sample_size=30 1972 | ) 1973 | ``` 1974 | 1975 | 返回格式: 1976 | 成功时包含: success, variable_name, column_name, method, sample_data, statistics, total_count 1977 | 失败时包含: success, error 1978 | """ 1979 | try: 1980 | manager = get_session_manager() 1981 | session = manager.get_session(session_id) 1982 | 1983 | # 获取DataFrame对象 1984 | df = session.shell.user_ns.get(variable_name) 1985 | if df is None: 1986 | return { 1987 | "success": False, 1988 | "error": f"Variable '{variable_name}' not found in session" 1989 | } 1990 | 1991 | if not hasattr(df, 'shape') or not hasattr(df, 'columns'): 1992 | return { 1993 | "success": False, 1994 | "error": f"Variable '{variable_name}' is not a DataFrame" 1995 | } 1996 | 1997 | if column_name not in df.columns: 1998 | return { 1999 | "success": False, 2000 | "error": f"Column '{column_name}' not found in DataFrame. Available columns: {list(df.columns)}" 2001 | } 2002 | 2003 | column_data = df[column_name] 2004 | total_count = len(column_data) 2005 | 2006 | # 基本统计信息 2007 | stats = {} 2008 | if include_stats: 2009 | stats = { 2010 | "total_count": total_count, 2011 | "null_count": int(column_data.isnull().sum()), 2012 | "non_null_count": int(column_data.count()), 2013 | "data_type": str(column_data.dtype) 2014 | } 2015 | 2016 | # 唯一值统计 2017 | unique_values = column_data.dropna().unique() 2018 | stats["unique_count"] = len(unique_values) 2019 | stats["duplicate_count"] = total_count - len(unique_values) - stats["null_count"] 2020 | 2021 | # 数值类型的特殊统计 2022 | if column_data.dtype in ['int64', 'float64', 'int32', 'float32']: 2023 | non_null_data = column_data.dropna() 2024 | if len(non_null_data) > 0: 2025 | stats["numeric_stats"] = { 2026 | "min": float(non_null_data.min()), 2027 | "max": float(non_null_data.max()), 2028 | "mean": float(non_null_data.mean()), 2029 | "median": float(non_null_data.median()), 2030 | "std": float(non_null_data.std()) if len(non_null_data) > 1 else 0.0 2031 | } 2032 | 2033 | # 文本类型的特殊统计 2034 | elif column_data.dtype == 'object': 2035 | non_null_data = column_data.dropna().astype(str) 2036 | if len(non_null_data) > 0: 2037 | text_lengths = non_null_data.str.len() 2038 | stats["text_stats"] = { 2039 | "min_length": int(text_lengths.min()), 2040 | "max_length": int(text_lengths.max()), 2041 | "avg_length": float(text_lengths.mean()) 2042 | } 2043 | 2044 | # 数据采样 2045 | sample_data = [] 2046 | actual_sample_size = min(sample_size, total_count) 2047 | 2048 | if method == "head": 2049 | sampled = column_data.head(actual_sample_size) 2050 | elif method == "tail": 2051 | sampled = column_data.tail(actual_sample_size) 2052 | elif method == "random": 2053 | sampled = column_data.sample(n=actual_sample_size) if total_count > 0 else column_data.head(0) 2054 | elif method == "unique": 2055 | # 获取唯一值 2056 | unique_values = column_data.dropna().unique() 2057 | if len(unique_values) > actual_sample_size: 2058 | # 如果唯一值太多,随机选择 2059 | import numpy as np 2060 | selected_indices = np.random.choice(len(unique_values), actual_sample_size, replace=False) 2061 | sampled_values = unique_values[selected_indices] 2062 | else: 2063 | sampled_values = unique_values 2064 | 2065 | # 创建一个Series用于统一处理 2066 | sampled = pd.Series(sampled_values) 2067 | elif method == "mixed": 2068 | # 混合采样:头部、尾部、随机各占一部分 2069 | third = actual_sample_size // 3 2070 | remainder = actual_sample_size % 3 2071 | 2072 | head_size = third + (1 if remainder > 0 else 0) 2073 | tail_size = third + (1 if remainder > 1 else 0) 2074 | random_size = third 2075 | 2076 | samples = [] 2077 | if head_size > 0: 2078 | samples.extend(column_data.head(head_size).tolist()) 2079 | if tail_size > 0: 2080 | samples.extend(column_data.tail(tail_size).tolist()) 2081 | if random_size > 0 and total_count > head_size + tail_size: 2082 | # 避免重复采样已经在头部和尾部的数据 2083 | middle_data = column_data.iloc[head_size:-tail_size] if tail_size > 0 else column_data.iloc[head_size:] 2084 | if len(middle_data) > 0: 2085 | random_samples = middle_data.sample(n=min(random_size, len(middle_data))) 2086 | samples.extend(random_samples.tolist()) 2087 | 2088 | sampled = pd.Series(samples) 2089 | else: 2090 | return { 2091 | "success": False, 2092 | "error": f"Invalid method '{method}'. Use 'head', 'tail', 'random', 'unique', or 'mixed'" 2093 | } 2094 | 2095 | # 处理采样数据,截断过长的文本 2096 | for idx, value in sampled.items(): 2097 | processed_value = value 2098 | 2099 | # 处理空值 2100 | if pd.isna(value): 2101 | processed_value = None 2102 | else: 2103 | # 转换为字符串并截断 2104 | str_value = str(value) 2105 | if len(str_value) > max_text_length: 2106 | processed_value = str_value[:max_text_length] + "..." 2107 | else: 2108 | processed_value = str_value 2109 | 2110 | sample_data.append({ 2111 | "index": int(idx) if pd.notna(idx) and hasattr(idx, '__int__') else str(idx), 2112 | "value": processed_value, 2113 | "original_type": type(value).__name__ 2114 | }) 2115 | 2116 | result = { 2117 | "success": True, 2118 | "variable_name": variable_name, 2119 | "column_name": column_name, 2120 | "method": method, 2121 | "total_count": total_count, 2122 | "sample_size": len(sample_data), 2123 | "sample_data": sample_data 2124 | } 2125 | 2126 | if include_stats: 2127 | result["statistics"] = stats 2128 | 2129 | return result 2130 | 2131 | except ValueError as e: 2132 | return { 2133 | "success": False, 2134 | "error": str(e) 2135 | } 2136 | except Exception as e: 2137 | return { 2138 | "success": False, 2139 | "error": f"Failed to sample column data: {str(e)}" 2140 | } 2141 | 2142 | @mcp.tool() 2143 | def clear_variables( 2144 | session_id: str, 2145 | variable_names: Optional[List[str]] = None, 2146 | clear_all: bool = False, 2147 | keep_imports: bool = True 2148 | ) -> Dict[str, Any]: 2149 | """ 2150 | 清理指定变量或全部变量 2151 | 2152 | 功能说明: 2153 | - 清理会话中的指定变量或全部变量 2154 | - 释放内存空间 2155 | - 可选择保留导入的模块 2156 | 2157 | Args: 2158 | session_id: 会话ID 2159 | variable_names: 要清理的变量名列表 2160 | clear_all: 是否清理所有变量 2161 | keep_imports: 清理全部时是否保留导入的模块 2162 | 2163 | 调用样例: 2164 | ```python 2165 | # 清理指定变量 2166 | result = clear_variables("session_a1b2c3d4", ["df1", "df2"]) 2167 | 2168 | # 清理所有变量,保留导入 2169 | result = clear_variables("session_a1b2c3d4", clear_all=True, keep_imports=True) 2170 | ``` 2171 | 2172 | 返回格式: 2173 | 成功时包含: success, cleared_variables (列表), memory_freed_mb, remaining_variables 2174 | 失败时包含: success, error 2175 | """ 2176 | try: 2177 | manager = get_session_manager() 2178 | session = manager.get_session(session_id) 2179 | 2180 | initial_memory = session._get_memory_usage() 2181 | cleared_variables = [] 2182 | 2183 | if clear_all: 2184 | # 清理所有用户变量 2185 | user_ns = session.shell.user_ns 2186 | to_delete = [] 2187 | 2188 | for name in user_ns: 2189 | if not name.startswith('_') and name not in ['In', 'Out', 'get_ipython', 'exit', 'quit']: 2190 | # 如果保留导入,跳过模块类型的变量 2191 | if keep_imports and hasattr(user_ns[name], '__module__'): 2192 | var_type = type(user_ns[name]).__name__ 2193 | if 'module' in var_type.lower(): 2194 | continue 2195 | to_delete.append(name) 2196 | 2197 | for name in to_delete: 2198 | if name in user_ns: 2199 | del user_ns[name] 2200 | cleared_variables.append(name) 2201 | 2202 | elif variable_names: 2203 | # 清理指定变量 2204 | user_ns = session.shell.user_ns 2205 | for name in variable_names: 2206 | if name in user_ns: 2207 | del user_ns[name] 2208 | cleared_variables.append(name) 2209 | else: 2210 | return { 2211 | "success": False, 2212 | "error": "Must specify either variable_names or set clear_all=True" 2213 | } 2214 | 2215 | # 强制垃圾回收 2216 | import gc 2217 | gc.collect() 2218 | 2219 | final_memory = session._get_memory_usage() 2220 | memory_freed = initial_memory - final_memory 2221 | 2222 | variables_after = session.get_variables() 2223 | 2224 | return { 2225 | "success": True, 2226 | "cleared_variables": cleared_variables, 2227 | "memory_freed_mb": round(memory_freed, 2), 2228 | "remaining_variables": len(variables_after) 2229 | } 2230 | 2231 | except ValueError as e: 2232 | return { 2233 | "success": False, 2234 | "error": str(e) 2235 | } 2236 | except Exception as e: 2237 | return { 2238 | "success": False, 2239 | "error": f"Failed to clear variables: {str(e)}" 2240 | } 2241 | 2242 | if __name__ == "__main__": 2243 | # 启动MCP服务器 2244 | mcp.run(transport="stdio") --------------------------------------------------------------------------------