├── .claude
│   └── commands
│       ├── exe-parallel.md
│       ├── init-parallel.md
│       └── prime.md
├── .gitignore
├── README.md
├── ai_docs
│   ├── claude_opus_mcp_breakdown.md
│   └── uv_running_python.md
├── images
│   └── mcp-server-prompts.png
├── quick-data-mcp
│   ├── .gitignore
│   ├── .mcp.json.sample
│   ├── .python-version
│   ├── README.md
│   ├── data
│   │   ├── README.md
│   │   ├── ecommerce_orders.json
│   │   ├── employee_survey.csv
│   │   └── product_performance.csv
│   ├── main.py
│   ├── pyproject.toml
│   ├── src
│   │   └── mcp_server
│   │       ├── __init__.py
│   │       ├── config
│   │       │   ├── __init__.py
│   │       │   └── settings.py
│   │       ├── models
│   │       │   ├── __init__.py
│   │       │   └── schemas.py
│   │       ├── prompts
│   │       │   ├── __init__.py
│   │       │   ├── correlation_investigation_prompt.py
│   │       │   ├── dashboard_design_consultation_prompt.py
│   │       │   ├── data_quality_assessment_prompt.py
│   │       │   ├── dataset_first_look_prompt.py
│   │       │   ├── find_datasources_prompt.py
│   │       │   ├── insight_generation_workshop_prompt.py
│   │       │   ├── list_mcp_assets_prompt.py
│   │       │   ├── pattern_discovery_session_prompt.py
│   │       │   └── segmentation_workshop_prompt.py
│   │       ├── resources
│   │       │   ├── __init__.py
│   │       │   ├── data_resources.py
│   │       │   ├── get_analysis_suggestions_resource.py
│   │       │   ├── get_available_analyses_resource.py
│   │       │   ├── get_column_types_resource.py
│   │       │   ├── get_current_dataset_resource.py
│   │       │   ├── get_dataset_sample_resource.py
│   │       │   ├── get_dataset_schema_resource.py
│   │       │   ├── get_dataset_summary_resource.py
│   │       │   ├── get_loaded_datasets_resource.py
│   │       │   ├── get_memory_usage_resource.py
│   │       │   ├── get_server_config_resource.py
│   │       │   ├── get_system_status_resource.py
│   │       │   └── get_user_profile_resource.py
│   │       ├── server.py
│   │       └── tools
│   │           ├── __init__.py
│   │           ├── analyze_distributions_tool.py
│   │           ├── calculate_feature_importance_tool.py
│   │           ├── compare_datasets_tool.py
│   │           ├── create_chart_tool.py
│   │           ├── detect_outliers_tool.py
│   │           ├── execute_custom_analytics_code_tool.py
│   │           ├── export_insights_tool.py
│   │           ├── find_correlations_tool.py
│   │           ├── generate_dashboard_tool.py
│   │           ├── list_loaded_datasets_tool.py
│   │           ├── load_dataset_tool.py
│   │           ├── memory_optimization_report_tool.py
│   │           ├── merge_datasets_tool.py
│   │           ├── pandas_tools.py
│   │           ├── segment_by_column_tool.py
│   │           ├── suggest_analysis_tool.py
│   │           ├── time_series_analysis_tool.py
│   │           └── validate_data_quality_tool.py
│   └── tests
│       ├── __init__.py
│       ├── conftest.py
│       ├── prompts
│       │   ├── __init__.py
│       │   ├── test_dataset_first_look_prompt.py
│       │   ├── test_find_datasources_prompt.py
│       │   └── test_list_mcp_assets_prompt.py
│       ├── resources
│       │   ├── __init__.py
│       │   └── test_get_server_config_resource.py
│       ├── test_analytics_prompts.py
│       ├── test_analytics_tools.py
│       ├── test_custom_analytics_code.py
│       ├── test_data_resources.py
│       ├── test_pandas_tools.py
│       ├── test_resource_mirror_tools.py
│       └── tools
│           ├── __init__.py
│           └── test_load_dataset_tool.py
└── specs
    ├── custom_analytic_code.md
    ├── poc_init_generic.md
    └── resource_workaround.md

/.claude/commands/exe-parallel.md:
--------------------------------------------------------------------------------
1 | # Parallel Task Version Execution
2 | 
3 | ## Variables
4 | PLAN_TO_EXECUTE: $ARGUMENTS
5 | NUMBER_OF_PARALLEL_WORKTREES: $ARGUMENTS
6 | 
7 | ## Run these commands top to bottom
8 | RUN `eza . --tree`
9 | READ: PLAN_TO_EXECUTE
10 | 
11 | ## Instructions
12 | 
13 | We're going to create NUMBER_OF_PARALLEL_WORKTREES new subagents that use the Task tool to create N versions of the same feature in parallel.
14 | 
15 | This enables us to build the same feature concurrently, so we can test and validate each subagent's changes in isolation and then pick the best implementation.
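For example (illustrative names, assuming `init-parallel` was run with FEATURE_NAME `dark_mode` and NUMBER_OF_PARALLEL_WORKTREES `3`), the resulting workspace layout looks like:

```
trees/
├── dark_mode-1/   # subagent 1 builds the feature here
├── dark_mode-2/   # subagent 2 builds the feature here
└── dark_mode-3/   # subagent 3 builds the feature here
```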
16 | 
17 | The first agent will run in trees/<feature_name>-1/
18 | The second agent will run in trees/<feature_name>-2/
19 | ...
20 | The last agent will run in trees/<feature_name>-N/
21 | 
22 | The code in trees/<feature_name>-i/ will be identical to the code in the current branch. It will be set up and ready for you to build the feature end to end.
23 | 
24 | Each agent will independently implement the engineering plan detailed in PLAN_TO_EXECUTE in their respective workspace.
25 | 
26 | When a subagent completes its work, have it report the final changes made in a comprehensive `RESULTS.md` file at the root of its respective workspace.
27 | 
28 | Each subagent should validate its changes with proper tests: `uv run pytest tests/` in its respective workspace.
--------------------------------------------------------------------------------
/.claude/commands/init-parallel.md:
--------------------------------------------------------------------------------
1 | # Initialize parallel git worktree directories
2 | 
3 | ## Variables
4 | FEATURE_NAME: $ARGUMENTS
5 | NUMBER_OF_PARALLEL_WORKTREES: $ARGUMENTS
6 | 
7 | ## Execute these commands
8 | > Execute the loop in parallel with the Batch and Task tool
9 | 
10 | - create a new dir `trees/`
11 | - for i in NUMBER_OF_PARALLEL_WORKTREES
12 |   - RUN `git worktree add -b FEATURE_NAME-i ./trees/FEATURE_NAME-i`
13 |   - RUN `cd ./trees/FEATURE_NAME-i/`, `uv sync`
14 |   - RUN `cd trees/FEATURE_NAME-i`, `git ls-files` to validate
15 | - RUN `git worktree list` to verify all trees were created properly
--------------------------------------------------------------------------------
/.claude/commands/prime.md:
--------------------------------------------------------------------------------
1 | ## RUN
2 | eza . --tree --level 5 --git-ignore
3 | 
4 | ## READ
5 | @README.md
6 | 
7 | ## Remember
8 | `specs/` - is where we plan new engineering work
9 | `ai_docs/` - is where useful reference material exists to guide our work
10 | `quick-data-mcp/` - production-ready modular MCP server architecture
11 | `quick-data-mcp/src/mcp_server/` - core server implementation with tools, resources, prompts
12 | `quick-data-mcp/tests/` - comprehensive test suite
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | pnpm-debug.log*
8 | lerna-debug.log*
9 | 
10 | node_modules
11 | dist
12 | dist-ssr
13 | *.local
14 | 
15 | # Editor directories and files
16 | .vscode/*
17 | !.vscode/extensions.json
18 | .idea
19 | .DS_Store
20 | *.suo
21 | *.ntvs*
22 | *.njsproj
23 | *.sln
24 | *.sw?
25 | 
26 | .aider*
27 | adw
28 | bun.*
29 | 
30 | 
31 | *.pyc
32 | *.pyo
33 | *.pyd
34 | 
35 | 
36 | output/
37 | input/
38 | 
39 | # Based on https://raw.githubusercontent.com/github/gitignore/main/Node.gitignore
40 | 
41 | # Logs
42 | 
43 | logs
44 | *.log
45 | npm-debug.log*
46 | yarn-debug.log*
47 | yarn-error.log*
48 | lerna-debug.log*
49 | .pnpm-debug.log*
50 | 
51 | # Caches
52 | 
53 | .cache
54 | 
55 | # Diagnostic reports (https://nodejs.org/api/report.html)
56 | 
57 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
58 | 
59 | # Runtime data
60 | 
61 | pids
62 | *.pid
63 | *.seed
64 | *.pid.lock
65 | 
66 | # Directory for instrumented libs generated by jscoverage/JSCover
67 | 
68 | lib-cov
69 | 
70 | # Coverage directory used by tools like istanbul
71 | 
72 | coverage
73 | *.lcov
74 | 
75 | # nyc test coverage
76 | 
77 | .nyc_output
78 | 
79 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
80 | 
81 | .grunt
82 | 
83 | # Bower dependency directory (https://bower.io/)
84 | 
85 | bower_components
86 | 
87 | # node-waf configuration
88 | 
89 | .lock-wscript
90 | 
91 | # Compiled binary addons (https://nodejs.org/api/addons.html)
92 | 
93 | build/Release
94 | 
95 | # Dependency directories
96 | 
97 | node_modules/
98 | jspm_packages/
99 | 
100 | # Snowpack dependency directory (https://snowpack.dev/)
101 | 
102 | web_modules/
103 | 
104 | # TypeScript cache
105 | 
106 | *.tsbuildinfo
107 | 
108 | # Optional npm cache directory
109 | 
110 | .npm
111 | 
112 | # Optional eslint cache
113 | 
114 | .eslintcache
115 | 
116 | # Optional stylelint cache
117 | 
118 | .stylelintcache
119 | 
120 | # Microbundle cache
121 | 
122 | .rpt2_cache/
123 | .rts2_cache_cjs/
124 | .rts2_cache_es/
125 | .rts2_cache_umd/
126 | 
127 | # Optional REPL history
128 | 
129 | .node_repl_history
130 | 
131 | # Output of 'npm pack'
132 | 
133 | *.tgz
134 | 
135 | # Yarn Integrity file
136 | 
137 | .yarn-integrity
138 | 
139 | # dotenv environment variable files
140 | 
141 | .env
142 | .env.development.local
143 | .env.test.local
144 | .env.production.local
145 | .env.local
146 | 
147 | # parcel-bundler cache (https://parceljs.org/)
148 | 
149 | .parcel-cache
150 | 
151 | # Next.js build output
152 | 
153 | .next
154 | out
155 | 
156 | # Nuxt.js build / generate output
157 | 
158 | .nuxt
159 | dist
160 | 
161 | # Gatsby files
162 | 
163 | # Comment in the public line in if your project uses Gatsby and not Next.js
164 | 
165 | # https://nextjs.org/blog/next-9-1#public-directory-support
166 | 
167 | # public
168 | 
169 | # vuepress build output
170 | 
171 | .vuepress/dist
172 | 
173 | # vuepress v2.x temp and cache directory
174 | 
175 | .temp
176 | 
177 | # Docusaurus cache and generated files
178 | 
179 | .docusaurus
180 | 
181 | # Serverless directories
182 | 
183 | .serverless/
184 | 
185 | # FuseBox cache
186 | 
187 | .fusebox/
188 | 
189 | # DynamoDB Local files
190 | 
191 | .dynamodb/
192 | 
193 | # TernJS port file
194 | 
195 | .tern-port
196 | 
197 | # Stores VSCode versions used for testing VSCode extensions
198 | 
199 | .vscode-test
200 | 
201 | # yarn v2
202 | 
203 | .yarn/cache
204 | .yarn/unplugged
205 | .yarn/build-state.yml
206 | .yarn/install-state.gz
207 | .pnp.*
208 | 
209 | # IntelliJ based IDEs
210 | .idea
211 | 
212 | # Finder (MacOS) folder config
213 | .DS_Store
214 | .aider*
215 | !.aider.conf.yml.example
216 | 
217 | __pycache__/
218 | 
219 | .venv/
220 | 
221 | *.after*
222 | *.begin.*
223 | .env
224 | 
225 | extras
226 | 
227 | session_dir
228 | 
229 | transcript_analysis.*
230 | 
231 | GHOSTME.md
232 | 
233 | director_log.txt
234 | 
235 | uv.lock
236 | 
237 | specs/*.yml
238 | 
239 | server/reports_legacy/
240 | 
241 | **.ignore**
242 | 
243 | trees/
244 | 
245 | .mcp.json
246 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MCP From Zero: Quick Data
2 | > Purpose: Learn to build powerful Model Context Protocol (MCP) servers by scaling tools into reusable agentic workflows (ADWs, aka prompts w/ tools).
3 | 
4 | ## Quick-Data
5 | > Quick-Data is an MCP server that gives your agent arbitrary data analysis over .json and .csv files.
6 | >
7 | > We use Quick-Data as a concrete use case to experiment with the MCP server elements, specifically: Prompts > Tools > Resources.
8 | >
9 | > See [quick-data-mcp](quick-data-mcp/README.md) for details on the MCP server
10 | 
11 | ![MCP Server Prompts](images/mcp-server-prompts.png)
12 | 
13 | ## Leading Questions
14 | 
15 | We experiment with three leading questions:
16 | 
17 | 1. How can we MAXIMIZE the value of custom built MCP servers by using tools, resources, and prompts TOGETHER?
18 | 2. What's the BEST codebase architecture for building MCP servers?
19 | 3. Can we build an agentic workflow (prompt w/tools) that can be used to rapidly build MCP servers?
20 | 
21 | ## Understanding MCP Components
22 | 
23 | MCP servers have three main building blocks that extend what AI models can do:
24 | 
25 | ### Tools
26 | **What**: Functions that AI models can call to perform actions.
27 | 
28 | **When to use**: When you want the AI to DO something at a low to mid atomic level based on your domain-specific use cases.
29 | 
30 | **Example**:
31 | ```python
32 | @mcp.tool()
33 | async def create_task(title: str, description: str) -> dict:
34 |     """Create a new task."""
35 |     # AI can call this to actually create tasks
36 |     return {"id": "123", "title": title, "status": "created"}
37 | ```
38 | 
39 | ### Resources
40 | **What**: Data that AI models can read and access.
41 | 
42 | **When to use**: When you want the AI to READ information - user profiles, configuration, status, or any data source.
43 | 
44 | **Example**:
45 | ```python
46 | @mcp.resource("users://{user_id}/profile")
47 | async def get_user_profile(user_id: str) -> dict:
48 |     """Get user profile by ID."""
49 |     # AI can read this data to understand users
50 |     return {"id": user_id, "name": "John", "role": "developer"}
51 | ```
52 | 
53 | ### Prompts
54 | **What**: Pre-built conversation templates that start specific types of discussions.
55 | 
56 | **When to use**: When you want to give the AI structured starting points for common, repeatable workflows for your domain-specific use cases.
57 | 
58 | **Example**:
59 | ```python
60 | @mcp.prompt()
61 | async def code_review(code: str) -> str:
62 |     """Start a code review conversation."""
63 |     # AI gets a structured template for code reviews
64 |     return f"Review this code for security and performance:\n{code}"
65 | ```
66 | 
67 | ## Quick Decision Guide
68 | 
69 | - **Need AI to take action?** → Use **Tools**
70 | - **Need AI to read data?** → Use **Resources**
71 | - **Need Reusable Agentic Workflows (ADWs)?** → Use **Prompts**
72 | 
73 | ## Quick Setup
74 | 
75 | To use the Quick Data MCP server:
76 | 
77 | 1. **Navigate to the MCP server directory**:
78 | ```bash
79 | cd quick-data-mcp/
80 | ```
81 | 
82 | 2. **Configure for your MCP client**:
83 | ```bash
84 | # Copy the sample configuration
85 | cp .mcp.json.sample .mcp.json
86 | 
87 | # Edit .mcp.json and update the --directory path to your absolute path
88 | # Example: "/Users/yourusername/path/to/quick-data-mcp"
89 | ```
90 | 
91 | 3. **Test the server**:
92 | ```bash
93 | uv run python main.py
94 | ```
95 | 
96 | See [quick-data-mcp/README.md](quick-data-mcp/README.md) for complete setup and usage documentation.
97 | 
98 | ## Resources
99 | - MCP Clients: https://modelcontextprotocol.io/clients
100 | - Claude Code Resource Support GitHub Issue: https://github.com/anthropics/claude-code/issues/545
101 | 
102 | ## Master AI Coding
103 | Learn to code with AI with the foundational [Principles of AI Coding](https://agenticengineer.com/principled-ai-coding?y=jprompt)
104 | 
105 | Follow the [IndyDevDan YouTube channel](https://www.youtube.com/@indydevdan) for more AI coding tips and tricks.
106 | Use the best Agentic Coding Tool: [Claude Code](https://docs.anthropic.com/en/docs/claude-code/overview)
--------------------------------------------------------------------------------
/ai_docs/uv_running_python.md:
--------------------------------------------------------------------------------
1 | # UV Running Python Scripts
2 | 
3 | Based on the official UV documentation for script execution, here's how `uv run python` works and why it's recommended for MCP servers.
4 | 
5 | ## Key Benefits of `uv run python`
6 | 
7 | ### Automatic Environment Management
8 | - **No manual activation**: UV automatically creates and manages virtual environments
9 | - **Dependency isolation**: Each script runs in its own isolated environment
10 | - **Cross-platform**: Works consistently across macOS, Linux, and Windows
11 | 
12 | ### Declarative Dependencies
13 | Scripts can declare their dependencies inline using script metadata:
14 | 
15 | ```python
16 | # /// script
17 | # dependencies = ["mcp[cli]>=1.9.2", "pydantic>=2.0.0"]
18 | # ///
19 | 
20 | import mcp
21 | # Script content here...
22 | ```
23 | 
24 | ### Flexible Execution Options
25 | 
26 | ```bash
27 | # Basic script execution
28 | uv run python script.py
29 | 
30 | # Add dependencies on-the-fly
31 | uv run --with mcp[cli] python script.py
32 | 
33 | # Specify Python version
34 | uv run --python 3.12 python script.py
35 | 
36 | # Use alternative package indexes
37 | uv run --index-url https://custom.pypi.org/simple/ python script.py
38 | ```
39 | 
40 | ## Why Use `uv run python` for MCP Servers?
41 | 
42 | ### 1. **Dependency Isolation**
43 | Each MCP server runs with its exact dependencies without conflicts:
44 | 
45 | ```bash
46 | # Each server gets its own environment
47 | uv run python /path/to/server1/main.py  # Uses server1's dependencies
48 | uv run python /path/to/server2/main.py  # Uses server2's dependencies
49 | ```
50 | 
51 | ### 2. **Reproducible Environments**
52 | UV ensures consistent dependency versions across different machines:
53 | 
54 | ```bash
55 | # With pyproject.toml, dependencies are locked
56 | uv run python main.py  # Always uses same versions
57 | ```
58 | 
59 | ### 3. **No Pre-activation Required**
60 | Unlike traditional virtual environments, no need to activate:
61 | 
62 | ```bash
63 | # Traditional approach (error-prone)
64 | source .venv/bin/activate
65 | python main.py
66 | 
67 | # UV approach (automatic)
68 | uv run python main.py
69 | ```
70 | 
71 | ### 4. **Better Error Handling**
72 | UV provides clear error messages when dependencies are missing or incompatible.
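To make the isolation and error surfacing concrete, here is a minimal sketch (this assumes `requests` is not installed in the interpreter UV selects; exact error wording varies by UV and Python version):

```bash
# A one-line script that needs a third-party package
echo 'import requests; print("ok")' > /tmp/needs_requests.py

# Without the dependency declared, the import fails fast with a
# ModuleNotFoundError instead of silently using some global install
uv run --no-project python /tmp/needs_requests.py

# Declaring it inline makes the same script succeed in an isolated environment
uv run --with requests python /tmp/needs_requests.py
```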
73 | 74 | ## MCP Configuration Best Practices 75 | 76 | ### For Local Development 77 | ```json 78 | { 79 | "mcpServers": { 80 | "my-server": { 81 | "command": "uv", 82 | "args": ["run", "python", "/absolute/path/to/main.py"], 83 | "env": { 84 | "LOG_LEVEL": "DEBUG" 85 | } 86 | } 87 | } 88 | } 89 | ``` 90 | 91 | ### For Production Deployment 92 | ```json 93 | { 94 | "mcpServers": { 95 | "production-server": { 96 | "command": "uv", 97 | "args": ["run", "--python", "3.12", "python", "/path/to/main.py"], 98 | "env": { 99 | "LOG_LEVEL": "INFO", 100 | "PYTHONPATH": "/path/to/server" 101 | } 102 | } 103 | } 104 | } 105 | ``` 106 | 107 | ## Script Metadata Support 108 | 109 | UV supports PEP 723 script metadata for dependency declaration: 110 | 111 | ```python 112 | #!/usr/bin/env python3 113 | # /// script 114 | # requires-python = ">=3.10" 115 | # dependencies = [ 116 | # "mcp[cli]>=1.9.2", 117 | # "pydantic>=2.0.0", 118 | # "httpx>=0.25.0" 119 | # ] 120 | # /// 121 | 122 | """MCP Server with inline dependencies.""" 123 | 124 | from mcp.server import FastMCP 125 | # Rest of server implementation... 126 | ``` 127 | 128 | ## Performance Considerations 129 | 130 | ### Environment Caching 131 | UV caches environments for faster subsequent runs: 132 | 133 | ```bash 134 | # First run: Creates environment 135 | uv run python main.py # ~2-3 seconds 136 | 137 | # Subsequent runs: Uses cached environment 138 | uv run python main.py # ~0.1 seconds 139 | ``` 140 | 141 | ### Lock Files 142 | Use `uv.lock` for reproducible builds: 143 | 144 | ```bash 145 | # Generate lock file 146 | uv lock 147 | 148 | # Run with exact locked versions 149 | uv run python main.py 150 | ``` 151 | 152 | ## Troubleshooting 153 | 154 | ### Common Issues 155 | 156 | 1. **Permission Errors**: Ensure UV has write access to cache directory 157 | 2. **Path Issues**: Always use absolute paths in MCP configurations 158 | 3. **Python Version**: Specify Python version if system default differs 159 | 160 | ### Debug Commands 161 | 162 | ```bash 163 | # Check UV environment 164 | uv run python -c "import sys; print(sys.executable)" 165 | 166 | # List installed packages 167 | uv run python -m pip list 168 | 169 | # Verbose execution 170 | uv run --verbose python main.py 171 | ``` 172 | 173 | ## Inline Dependencies with `--with` Flag 174 | 175 | UV's `--with` flag allows you to add dependencies on-the-fly without modifying project files. This is particularly useful for one-off scripts, debugging, and quick prototyping. 
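The flag is not limited to inline `-c` snippets; it layers extra packages onto any `uv run` invocation (the script name below is illustrative):

```bash
# One-off debugging aid: borrow rich for nicer output without
# adding it to pyproject.toml
uv run --with rich python my_analysis_script.py
```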
176 | 
177 | ### Basic Usage
178 | 
179 | ```bash
180 | # Run a script with additional dependencies
181 | uv run --with requests python -c "import requests; print(requests.__version__)"
182 | 
183 | # Multiple dependencies
184 | uv run --with "pandas>=2.0.0" --with "plotly>=5.0.0" python -c "
185 | import pandas as pd
186 | import plotly.express as px
187 | df = pd.DataFrame({'x': [1,2,3], 'y': [4,5,6]})
188 | print('Data loaded successfully!')
189 | "
190 | ```
191 | 
192 | ### Data Analysis Examples
193 | 
194 | ```bash
195 | # Quick data analysis without a dedicated project
196 | uv run --with pandas --with matplotlib python -c "
197 | import pandas as pd
198 | import matplotlib.pyplot as plt
199 | import numpy as np
200 | 
201 | # Generate sample data
202 | data = pd.DataFrame({
203 |     'date': pd.date_range('2024-01-01', periods=30),
204 |     'sales': np.random.randint(100, 1000, 30)
205 | })
206 | 
207 | # Quick analysis
208 | print('Sales Summary:')
209 | print(data['sales'].describe())
210 | print(f'Total Sales: \${data[\"sales\"].sum():,}')
211 | "
212 | 
213 | # Web scraping with requests and beautifulsoup
214 | uv run --with requests --with beautifulsoup4 python -c "
215 | import requests
216 | from bs4 import BeautifulSoup
217 | 
218 | response = requests.get('https://httpbin.org/json')
219 | print(f'Status: {response.status_code}')
220 | print(f'Data: {response.json()}')
221 | "
222 | ```
223 | 
224 | ### MCP Server Prototyping
225 | 
226 | ```bash
227 | # Prototype an MCP server with specific versions
228 | uv run --with "mcp[cli]>=1.9.2" --with "pydantic>=2.11.0" python -c "
229 | from mcp.server import FastMCP
230 | from pydantic import BaseModel
231 | 
232 | class TestModel(BaseModel):
233 |     name: str
234 |     value: int
235 | 
236 | mcp = FastMCP('test-server')
237 | 
238 | @mcp.tool()
239 | async def test_tool(name: str, value: int) -> dict:
240 |     model = TestModel(name=name, value=value)
241 |     return {'message': f'Created {model.name} with value {model.value}'}
242 | 
243 | print('MCP server prototype created successfully!')
244 | print(f'Server name: {mcp.name}')
245 | "
246 | 
247 | # Test database connections
248 | uv run --with psycopg2-binary --with sqlalchemy python -c "
249 | import sqlalchemy
250 | from sqlalchemy import create_engine, text
251 | 
252 | print(f'SQLAlchemy version: {sqlalchemy.__version__}')
253 | print('Database drivers available')
254 | "
255 | ```
256 | 
257 | ### Development Workflow Examples
258 | 
259 | ```bash
260 | # Test package compatibility
261 | uv run --with "fastapi>=0.104.0" --with "uvicorn>=0.24.0" python -c "
262 | import fastapi
263 | import uvicorn
264 | print(f'FastAPI: {fastapi.__version__}')
265 | print(f'Uvicorn: {uvicorn.__version__}')
266 | print('Compatibility check passed!')
267 | "
268 | 
269 | # Data format conversion
270 | uv run --with "openpyxl>=3.1.0" --with "pandas>=2.2.0" python -c "
271 | import pandas as pd
272 | import json
273 | 
274 | # Simulate reading Excel and converting to JSON
275 | data = {'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']}
276 | df = pd.DataFrame(data)
277 | 
278 | json_output = df.to_json(orient='records', indent=2)
279 | print('Excel to JSON conversion:')
280 | print(json_output)
281 | "
282 | 
283 | # API testing
284 | uv run --with httpx --with pydantic python -c "
285 | import httpx
286 | import asyncio
287 | from pydantic import BaseModel
288 | 
289 | class ApiResponse(BaseModel):
290 |     status: str
291 |     data: dict
292 | 
293 | async def test_api():
294 |     async with httpx.AsyncClient() as client:
295 |         response = await
client.get('https://httpbin.org/json') 296 | print(f'Status: {response.status_code}') 297 | print(f'Headers: {dict(response.headers)}') 298 | return response.json() 299 | 300 | result = asyncio.run(test_api()) 301 | print(f'API test result: {result}') 302 | " 303 | ``` 304 | 305 | ### Machine Learning Prototyping 306 | 307 | ```bash 308 | # Quick ML experiment 309 | uv run --with scikit-learn --with numpy python -c " 310 | import numpy as np 311 | from sklearn.datasets import make_classification 312 | from sklearn.model_selection import train_test_split 313 | from sklearn.ensemble import RandomForestClassifier 314 | from sklearn.metrics import accuracy_score 315 | 316 | # Generate sample data 317 | X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42) 318 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 319 | 320 | # Train model 321 | model = RandomForestClassifier(n_estimators=100, random_state=42) 322 | model.fit(X_train, y_train) 323 | 324 | # Evaluate 325 | predictions = model.predict(X_test) 326 | accuracy = accuracy_score(y_test, predictions) 327 | 328 | print(f'Model Accuracy: {accuracy:.3f}') 329 | print(f'Feature Importance (top 5):') 330 | for i, importance in enumerate(model.feature_importances_[:5]): 331 | print(f' Feature {i}: {importance:.3f}') 332 | " 333 | ``` 334 | 335 | ### Advanced Usage Patterns 336 | 337 | ```bash 338 | # Combine with version constraints 339 | uv run --with "numpy>=1.24.0,<2.0.0" --with "scipy>=1.11.0" python -c " 340 | import numpy as np 341 | import scipy as sp 342 | print(f'NumPy: {np.__version__} (meets constraint)') 343 | print(f'SciPy: {sp.__version__} (compatible)') 344 | " 345 | 346 | # Use pre-release versions for testing 347 | uv run --with "django>=5.0.0rc1" --with "channels>=4.0.0" python -c " 348 | import django 349 | print(f'Testing with Django pre-release: {django.__version__}') 350 | " 351 | 352 | # Development dependencies for testing 353 | uv run --with pytest --with pytest-asyncio --with hypothesis python -c " 354 | import pytest 355 | import hypothesis 356 | print('Testing framework ready!') 357 | print(f'Pytest: {pytest.__version__}') 358 | print(f'Hypothesis: {hypothesis.__version__}') 359 | " 360 | ``` 361 | 362 | ### Benefits of `--with` Flag 363 | 364 | - ✅ **No project modification**: Test dependencies without changing `pyproject.toml` 365 | - ✅ **Rapid prototyping**: Quickly test ideas with different package combinations 366 | - ✅ **Debugging**: Add debugging tools without permanent installation 367 | - ✅ **CI/CD testing**: Test different dependency versions in pipelines 368 | - ✅ **Documentation examples**: Run examples without environment setup 369 | 370 | ### When to Use `--with` vs Project Dependencies 371 | 372 | | Use `--with` for: | Use project dependencies for: | 373 | |-------------------|-------------------------------| 374 | | One-off scripts | Production applications | 375 | | Quick prototypes | Long-term projects | 376 | | Testing compatibility | Reproducible builds | 377 | | Debug sessions | Team collaboration | 378 | | Documentation examples | Deployment scenarios | 379 | 380 | ## Comparison with Traditional Approaches 381 | 382 | | Method | Pros | Cons | 383 | |--------|------|------| 384 | | `python main.py` | Simple | No isolation, manual env management | 385 | | `source .venv/bin/activate && python main.py` | Explicit control | Manual activation, platform-specific | 386 | | `uv run python main.py` | Automatic isolation, 
cross-platform | Requires UV installation | 387 | 388 | ## Conclusion 389 | 390 | Using `uv run python` for MCP servers provides: 391 | - ✅ **Automatic dependency management** 392 | - ✅ **Environment isolation** 393 | - ✅ **Cross-platform compatibility** 394 | - ✅ **Reproducible deployments** 395 | - ✅ **No manual environment activation** 396 | 397 | This makes it the recommended approach for running MCP servers in both development and production environments. -------------------------------------------------------------------------------- /images/mcp-server-prompts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/disler/quick-data-mcp/02142fc23b8c7f417decf08e897c8c0c079565a8/images/mcp-server-prompts.png -------------------------------------------------------------------------------- /quick-data-mcp/.gitignore: -------------------------------------------------------------------------------- 1 | # Generated outputs 2 | outputs/ 3 | *.html 4 | 5 | # MCP Configuration (user-specific paths) 6 | .mcp.json 7 | 8 | # Python 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | *.so 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # Testing 32 | .pytest_cache/ 33 | .coverage 34 | htmlcov/ 35 | 36 | # Virtual environments 37 | .env 38 | .venv 39 | env/ 40 | venv/ 41 | ENV/ 42 | env.bak/ 43 | venv.bak/ 44 | 45 | # IDE 46 | .vscode/ 47 | .idea/ 48 | *.swp 49 | *.swo 50 | 51 | # OS 52 | .DS_Store 53 | .DS_Store? 54 | ._* 55 | .Spotlight-V100 56 | .Trashes 57 | ehthumbs.db 58 | Thumbs.db -------------------------------------------------------------------------------- /quick-data-mcp/.mcp.json.sample: -------------------------------------------------------------------------------- 1 | { 2 | "mcpServers": { 3 | "quick-data": { 4 | "command": "/path/to/uv", 5 | "args": [ 6 | "--directory", 7 | "/path/to/your/quick-data-mcp", 8 | "run", 9 | "python", 10 | "main.py" 11 | ], 12 | "env": { 13 | "LOG_LEVEL": "INFO" 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /quick-data-mcp/.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /quick-data-mcp/data/README.md: -------------------------------------------------------------------------------- 1 | # Sample Datasets for Analytics MCP Server 2 | 3 | This directory contains sample datasets for testing and demonstrating the generic data analytics MCP server capabilities. 4 | 5 | ## Datasets 6 | 7 | ### 1. E-commerce Orders (`ecommerce_orders.json`) 8 | - **Description**: Sample e-commerce transaction data 9 | - **Format**: JSON 10 | - **Rows**: 15 orders 11 | - **Use Cases**: Sales analysis, customer segmentation, regional performance 12 | - **Key Columns**: 13 | - `order_value` (numerical): Order amount in USD 14 | - `product_category` (categorical): Product type 15 | - `region` (categorical): Geographic region 16 | - `customer_segment` (categorical): Customer tier 17 | - `order_date` (temporal): Transaction date 18 | 19 | ### 2. 
Employee Survey (`employee_survey.csv`) 20 | - **Description**: Employee satisfaction and workforce data 21 | - **Format**: CSV 22 | - **Rows**: 25 employees 23 | - **Use Cases**: HR analytics, satisfaction analysis, departmental comparisons 24 | - **Key Columns**: 25 | - `satisfaction_score` (numerical): Employee satisfaction (1-10) 26 | - `tenure_years` (numerical): Years with company 27 | - `department` (categorical): Work department 28 | - `remote_work` (categorical): Work arrangement 29 | - `salary_band` (categorical): Compensation level 30 | 31 | ### 3. Product Performance (`product_performance.csv`) 32 | - **Description**: Product sales and inventory metrics 33 | - **Format**: CSV 34 | - **Rows**: 20 products 35 | - **Use Cases**: Product analysis, inventory optimization, supplier evaluation 36 | - **Key Columns**: 37 | - `monthly_sales` (numerical): Units sold per month 38 | - `inventory_level` (numerical): Current stock 39 | - `rating` (numerical): Customer rating (1-5) 40 | - `category` (categorical): Product category 41 | - `supplier` (categorical): Supplier name 42 | - `launch_date` (temporal): Product launch date 43 | 44 | ## Usage Examples 45 | 46 | ### Loading Datasets 47 | ```python 48 | # Load e-commerce data 49 | load_dataset('data/ecommerce_orders.json', 'ecommerce') 50 | 51 | # Load employee survey 52 | load_dataset('data/employee_survey.csv', 'employees') 53 | 54 | # Load product data 55 | load_dataset('data/product_performance.csv', 'products') 56 | ``` 57 | 58 | ### Analysis Examples 59 | 60 | #### Segmentation Analysis 61 | ```python 62 | # Analyze orders by region 63 | segment_by_column('ecommerce', 'region') 64 | 65 | # Compare employees by department 66 | segment_by_column('employees', 'department') 67 | 68 | # Group products by category 69 | segment_by_column('products', 'category') 70 | ``` 71 | 72 | #### Correlation Analysis 73 | ```python 74 | # Find relationships in employee data 75 | find_correlations('employees') 76 | 77 | # Analyze product metrics 78 | find_correlations('products', ['monthly_sales', 'rating', 'inventory_level']) 79 | ``` 80 | 81 | #### Visualization 82 | ```python 83 | # Order value distribution 84 | create_chart('ecommerce', 'histogram', 'order_value') 85 | 86 | # Sales by product category 87 | create_chart('products', 'bar', 'category', 'monthly_sales') 88 | 89 | # Satisfaction vs tenure 90 | create_chart('employees', 'scatter', 'tenure_years', 'satisfaction_score') 91 | ``` 92 | 93 | #### Time Series Analysis 94 | ```python 95 | # Order trends over time 96 | time_series_analysis('ecommerce', 'order_date', 'order_value') 97 | 98 | # Product launch timeline 99 | time_series_analysis('products', 'launch_date', 'monthly_sales') 100 | ``` 101 | 102 | #### Data Quality Assessment 103 | ```python 104 | # Check data quality 105 | validate_data_quality('ecommerce') 106 | validate_data_quality('employees') 107 | validate_data_quality('products') 108 | ``` 109 | 110 | ## Dataset Characteristics 111 | 112 | | Dataset | Numerical Cols | Categorical Cols | Temporal Cols | Suggested Analyses | 113 | |---------|----------------|------------------|---------------|-------------------| 114 | | E-commerce | 1 | 5 | 1 | Segmentation, Time Series | 115 | | Employees | 2 | 3 | 0 | Correlation, Segmentation | 116 | | Products | 3 | 3 | 1 | Correlation, Time Series | 117 | 118 | ## Testing Scenarios 119 | 120 | These datasets are designed to test various analytics capabilities: 121 | 122 | 1. **Schema Discovery**: Different data types and formats 123 | 2. 
**Segmentation**: Multiple categorical variables for grouping 124 | 3. **Correlation**: Numerical relationships to explore 125 | 4. **Time Series**: Date columns for temporal analysis 126 | 5. **Data Quality**: Clean data with good coverage 127 | 6. **Visualization**: Various chart types and combinations 128 | 7. **Cross-Dataset**: Potential for merging and comparison 129 | 130 | ## Extending the Datasets 131 | 132 | You can modify these datasets or add new ones by: 133 | 1. Adding more rows for larger-scale testing 134 | 2. Introducing missing values to test data quality features 135 | 3. Creating related datasets for merge testing 136 | 4. Adding more numerical columns for advanced correlation analysis 137 | 5. Including text columns for natural language processing features -------------------------------------------------------------------------------- /quick-data-mcp/data/ecommerce_orders.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "order_id": "ord_001", 4 | "customer_id": "cust_123", 5 | "product_category": "electronics", 6 | "order_value": 299.99, 7 | "order_date": "2024-11-15", 8 | "region": "west_coast", 9 | "payment_method": "credit_card", 10 | "customer_segment": "premium" 11 | }, 12 | { 13 | "order_id": "ord_002", 14 | "customer_id": "cust_124", 15 | "product_category": "books", 16 | "order_value": 29.99, 17 | "order_date": "2024-11-14", 18 | "region": "midwest", 19 | "payment_method": "paypal", 20 | "customer_segment": "standard" 21 | }, 22 | { 23 | "order_id": "ord_003", 24 | "customer_id": "cust_125", 25 | "product_category": "clothing", 26 | "order_value": 89.5, 27 | "order_date": "2024-11-13", 28 | "region": "east_coast", 29 | "payment_method": "credit_card", 30 | "customer_segment": "premium" 31 | }, 32 | { 33 | "order_id": "ord_004", 34 | "customer_id": "cust_126", 35 | "product_category": "electronics", 36 | "order_value": 599.99, 37 | "order_date": "2024-11-12", 38 | "region": "west_coast", 39 | "payment_method": "debit_card", 40 | "customer_segment": "premium" 41 | }, 42 | { 43 | "order_id": "ord_005", 44 | "customer_id": "cust_127", 45 | "product_category": "home_garden", 46 | "order_value": 149.99, 47 | "order_date": "2024-11-11", 48 | "region": "south", 49 | "payment_method": "credit_card", 50 | "customer_segment": "standard" 51 | }, 52 | { 53 | "order_id": "ord_006", 54 | "customer_id": "cust_128", 55 | "product_category": "books", 56 | "order_value": 19.99, 57 | "order_date": "2024-11-10", 58 | "region": "midwest", 59 | "payment_method": "paypal", 60 | "customer_segment": "basic" 61 | }, 62 | { 63 | "order_id": "ord_007", 64 | "customer_id": "cust_129", 65 | "product_category": "electronics", 66 | "order_value": 1299.99, 67 | "order_date": "2024-11-09", 68 | "region": "east_coast", 69 | "payment_method": "credit_card", 70 | "customer_segment": "premium" 71 | }, 72 | { 73 | "order_id": "ord_008", 74 | "customer_id": "cust_130", 75 | "product_category": "clothing", 76 | "order_value": 45.0, 77 | "order_date": "2024-11-08", 78 | "region": "west_coast", 79 | "payment_method": "debit_card", 80 | "customer_segment": "standard" 81 | }, 82 | { 83 | "order_id": "ord_009", 84 | "customer_id": "cust_131", 85 | "product_category": "sports", 86 | "order_value": 79.99, 87 | "order_date": "2024-11-07", 88 | "region": "south", 89 | "payment_method": "credit_card", 90 | "customer_segment": "standard" 91 | }, 92 | { 93 | "order_id": "ord_010", 94 | "customer_id": "cust_132", 95 | "product_category": "home_garden", 96 | 
"order_value": 225.5, 97 | "order_date": "2024-11-06", 98 | "region": "midwest", 99 | "payment_method": "paypal", 100 | "customer_segment": "premium" 101 | }, 102 | { 103 | "order_id": "ord_011", 104 | "customer_id": "cust_133", 105 | "product_category": "books", 106 | "order_value": 39.99, 107 | "order_date": "2024-11-05", 108 | "region": "east_coast", 109 | "payment_method": "credit_card", 110 | "customer_segment": "basic" 111 | }, 112 | { 113 | "order_id": "ord_012", 114 | "customer_id": "cust_134", 115 | "product_category": "electronics", 116 | "order_value": 449.99, 117 | "order_date": "2024-11-04", 118 | "region": "west_coast", 119 | "payment_method": "credit_card", 120 | "customer_segment": "premium" 121 | }, 122 | { 123 | "order_id": "ord_013", 124 | "customer_id": "cust_135", 125 | "product_category": "clothing", 126 | "order_value": 120.0, 127 | "order_date": "2024-11-03", 128 | "region": "south", 129 | "payment_method": "debit_card", 130 | "customer_segment": "standard" 131 | }, 132 | { 133 | "order_id": "ord_014", 134 | "customer_id": "cust_136", 135 | "product_category": "sports", 136 | "order_value": 159.99, 137 | "order_date": "2024-11-02", 138 | "region": "midwest", 139 | "payment_method": "paypal", 140 | "customer_segment": "premium" 141 | }, 142 | { 143 | "order_id": "ord_015", 144 | "customer_id": "cust_137", 145 | "product_category": "home_garden", 146 | "order_value": 89.99, 147 | "order_date": "2024-11-01", 148 | "region": "east_coast", 149 | "payment_method": "credit_card", 150 | "customer_segment": "standard" 151 | } 152 | ] 153 | -------------------------------------------------------------------------------- /quick-data-mcp/data/employee_survey.csv: -------------------------------------------------------------------------------- 1 | employee_id,department,satisfaction_score,tenure_years,remote_work,salary_band 2 | emp_001,engineering,8.5,3.2,yes,senior 3 | emp_002,sales,6.2,1.8,no,mid 4 | emp_003,marketing,9.1,5.5,hybrid,senior 5 | emp_004,engineering,7.8,2.1,yes,mid 6 | emp_005,sales,5.9,0.8,no,junior 7 | emp_006,hr,8.2,4.3,hybrid,senior 8 | emp_007,engineering,9.3,6.2,yes,senior 9 | emp_008,marketing,6.7,2.5,hybrid,mid 10 | emp_009,sales,7.1,3.1,no,mid 11 | emp_010,finance,8.9,4.7,hybrid,senior 12 | emp_011,engineering,7.5,1.9,yes,mid 13 | emp_012,sales,6.8,2.8,no,mid 14 | emp_013,marketing,8.1,3.6,hybrid,senior 15 | emp_014,hr,7.3,1.5,hybrid,junior 16 | emp_015,engineering,9.0,5.1,yes,senior 17 | emp_016,finance,7.9,3.8,hybrid,mid 18 | emp_017,sales,5.5,0.6,no,junior 19 | emp_018,marketing,8.6,4.2,hybrid,senior 20 | emp_019,engineering,8.8,2.9,yes,mid 21 | emp_020,hr,6.9,2.3,hybrid,mid 22 | emp_021,sales,7.6,4.1,no,senior 23 | emp_022,finance,8.3,3.4,hybrid,mid 24 | emp_023,engineering,9.2,7.1,yes,senior 25 | emp_024,marketing,7.0,1.7,hybrid,junior 26 | emp_025,sales,6.4,1.2,no,junior -------------------------------------------------------------------------------- /quick-data-mcp/data/product_performance.csv: -------------------------------------------------------------------------------- 1 | product_id,category,monthly_sales,inventory_level,supplier,launch_date,rating 2 | prod_001,widgets,1250,45,supplier_a,2024-01-15,4.2 3 | prod_002,gadgets,890,12,supplier_b,2023-08-22,3.8 4 | prod_003,tools,2100,78,supplier_a,2023-12-03,4.5 5 | prod_004,widgets,950,23,supplier_c,2024-02-10,4.1 6 | prod_005,gadgets,1650,56,supplier_b,2023-09-15,4.3 7 | prod_006,accessories,750,89,supplier_d,2024-03-20,3.9 8 | prod_007,tools,1890,34,supplier_a,2023-11-08,4.6 9 | 
prod_008,widgets,1100,67,supplier_c,2024-01-28,4.0
10 | prod_009,gadgets,1340,15,supplier_b,2023-10-12,4.2
11 | prod_010,accessories,620,92,supplier_d,2024-04-05,3.7
12 | prod_011,tools,2350,41,supplier_a,2023-10-30,4.7
13 | prod_012,widgets,780,28,supplier_c,2024-02-18,3.8
14 | prod_013,gadgets,1520,63,supplier_b,2023-07-25,4.4
15 | prod_014,accessories,840,76,supplier_d,2024-03-12,4.0
16 | prod_015,tools,1980,52,supplier_a,2023-12-20,4.5
17 | prod_016,widgets,1050,39,supplier_c,2024-01-08,4.1
18 | prod_017,gadgets,1120,18,supplier_b,2023-09-03,3.9
19 | prod_018,accessories,690,81,supplier_d,2024-04-15,3.8
20 | prod_019,tools,2240,46,supplier_a,2023-11-15,4.6
21 | prod_020,widgets,920,55,supplier_c,2024-02-25,4.0
--------------------------------------------------------------------------------
/quick-data-mcp/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Entry point for the Quick Data MCP server."""
3 | 
4 | import sys
5 | import os
6 | 
7 | # Add src to Python path so we can import our server
8 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
9 | 
10 | from mcp_server.server import mcp
11 | 
12 | 
13 | def main():
14 |     """Run the MCP server."""
15 |     mcp.run()
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     main()
20 | 
--------------------------------------------------------------------------------
/quick-data-mcp/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "quick-data-mcp"
3 | version = "0.1.0"
4 | description = "MCP server for quick, generic data analysis of .json and .csv files"
5 | readme = "README.md"
6 | requires-python = ">=3.12"
7 | dependencies = [
8 |     "mcp[cli]>=1.9.2",
9 |     "pandas>=2.2.3",
10 |     "plotly>=6.1.2",
11 |     "pytest>=8.3.5",
12 |     "pytest-asyncio>=1.0.0",
13 | ]
--------------------------------------------------------------------------------
/quick-data-mcp/src/mcp_server/__init__.py:
--------------------------------------------------------------------------------
1 | """MCP Server package for the recommended architecture."""
2 | 
3 | __version__ = "0.1.0"
--------------------------------------------------------------------------------
/quick-data-mcp/src/mcp_server/config/__init__.py:
--------------------------------------------------------------------------------
1 | """Configuration package."""
--------------------------------------------------------------------------------
/quick-data-mcp/src/mcp_server/config/settings.py:
--------------------------------------------------------------------------------
1 | """Server configuration settings."""
2 | 
3 | import os
4 | from typing import Optional
5 | 
6 | 
7 | class Settings:
8 |     """Application settings."""
9 | 
10 |     def __init__(self):
11 |         self.server_name = "Modular MCP Server"
12 |         self.version = "0.1.0"
13 |         self.log_level = os.getenv("LOG_LEVEL", "INFO")
14 |         self.api_key: Optional[str] = os.getenv("API_KEY")
15 |         self.database_url: Optional[str] = os.getenv("DATABASE_URL")
16 | 
17 |     @property
18 |     def server_info(self) -> dict:
19 |         """Get server information."""
20 |         return {
21 |             "name": self.server_name,
22 |             "version": self.version,
23 |             "log_level": self.log_level
24 |         }
25 | 
26 | 
27 | settings = Settings()
--------------------------------------------------------------------------------
/quick-data-mcp/src/mcp_server/models/__init__.py:
--------------------------------------------------------------------------------
1 | """Models package."""
-------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/models/schemas.py: -------------------------------------------------------------------------------- 1 | """Data models and schemas for analytics platform.""" 2 | 3 | from pydantic import BaseModel, Field 4 | from typing import Optional, List, Dict, Any, Union 5 | from datetime import datetime 6 | from enum import Enum 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | class ColumnInfo(BaseModel): 12 | """Column metadata and characteristics.""" 13 | name: str 14 | dtype: str 15 | unique_values: int 16 | null_percentage: float 17 | sample_values: List[Any] 18 | suggested_role: str # 'categorical', 'numerical', 'temporal', 'identifier' 19 | 20 | @classmethod 21 | def from_series(cls, series: pd.Series, name: str) -> 'ColumnInfo': 22 | """Auto-discover column characteristics from pandas Series.""" 23 | 24 | # Determine suggested role 25 | if pd.api.types.is_numeric_dtype(series): 26 | role = 'numerical' 27 | elif pd.api.types.is_datetime64_any_dtype(series): 28 | role = 'temporal' 29 | elif series.nunique() / len(series) < 0.5: # Low cardinality = categorical 30 | role = 'categorical' 31 | elif series.nunique() == len(series): # Unique values = identifier 32 | role = 'identifier' 33 | else: 34 | role = 'categorical' 35 | 36 | return cls( 37 | name=name, 38 | dtype=str(series.dtype), 39 | unique_values=series.nunique(), 40 | null_percentage=series.isnull().mean() * 100, 41 | sample_values=series.dropna().head(3).tolist(), 42 | suggested_role=role 43 | ) 44 | 45 | 46 | class DatasetSchema(BaseModel): 47 | """Dynamically discovered dataset schema.""" 48 | name: str 49 | columns: Dict[str, ColumnInfo] 50 | row_count: int 51 | suggested_analyses: List[str] 52 | 53 | @classmethod 54 | def from_dataframe(cls, df: pd.DataFrame, name: str) -> 'DatasetSchema': 55 | """Auto-discover schema from pandas DataFrame.""" 56 | columns = {} 57 | for col in df.columns: 58 | columns[col] = ColumnInfo.from_series(df[col], col) 59 | 60 | # Generate analysis suggestions based on column types 61 | suggestions = [] 62 | numerical_cols = [col for col, info in columns.items() if info.suggested_role == 'numerical'] 63 | categorical_cols = [col for col, info in columns.items() if info.suggested_role == 'categorical'] 64 | temporal_cols = [col for col, info in columns.items() if info.suggested_role == 'temporal'] 65 | 66 | if len(numerical_cols) >= 2: 67 | suggestions.append("correlation_analysis") 68 | if categorical_cols: 69 | suggestions.append("segmentation_analysis") 70 | if temporal_cols: 71 | suggestions.append("time_series_analysis") 72 | 73 | return cls( 74 | name=name, 75 | columns=columns, 76 | row_count=len(df), 77 | suggested_analyses=suggestions 78 | ) 79 | 80 | 81 | # Global in-memory storage for datasets 82 | loaded_datasets: Dict[str, pd.DataFrame] = {} 83 | dataset_schemas: Dict[str, DatasetSchema] = {} 84 | 85 | 86 | class DatasetManager: 87 | """Simple in-memory dataset management.""" 88 | 89 | @staticmethod 90 | def load_dataset(file_path: str, dataset_name: str) -> dict: 91 | """Load dataset into memory with automatic schema discovery.""" 92 | 93 | # Determine format from file extension 94 | if file_path.endswith('.json'): 95 | df = pd.read_json(file_path) 96 | file_format = 'json' 97 | elif file_path.endswith('.csv'): 98 | df = pd.read_csv(file_path) 99 | file_format = 'csv' 100 | else: 101 | raise ValueError(f"Unsupported file format: {file_path}") 102 | 103 | # Store in global 
memory 104 | loaded_datasets[dataset_name] = df 105 | 106 | # Discover and cache schema 107 | schema = DatasetSchema.from_dataframe(df, dataset_name) 108 | dataset_schemas[dataset_name] = schema 109 | 110 | return { 111 | "status": "loaded", 112 | "dataset_name": dataset_name, 113 | "rows": len(df), 114 | "columns": list(df.columns), 115 | "format": file_format, 116 | "memory_usage": f"{df.memory_usage(deep=True).sum() / 1024**2:.1f} MB" 117 | } 118 | 119 | @staticmethod 120 | def get_dataset(dataset_name: str) -> pd.DataFrame: 121 | """Retrieve dataset from memory.""" 122 | if dataset_name not in loaded_datasets: 123 | raise ValueError(f"Dataset '{dataset_name}' not loaded. Use load_dataset() first.") 124 | return loaded_datasets[dataset_name] 125 | 126 | @staticmethod 127 | def list_datasets() -> List[str]: 128 | """Get names of all loaded datasets.""" 129 | return list(loaded_datasets.keys()) 130 | 131 | @staticmethod 132 | def get_dataset_info(dataset_name: str) -> dict: 133 | """Get basic info about loaded dataset.""" 134 | if dataset_name not in loaded_datasets: 135 | raise ValueError(f"Dataset '{dataset_name}' not loaded") 136 | 137 | df = loaded_datasets[dataset_name] 138 | schema = dataset_schemas[dataset_name] 139 | 140 | return { 141 | "name": dataset_name, 142 | "shape": df.shape, 143 | "columns": list(df.columns), 144 | "memory_usage_mb": df.memory_usage(deep=True).sum() / 1024**2, 145 | "schema": schema.model_dump() 146 | } 147 | 148 | @staticmethod 149 | def clear_dataset(dataset_name: str) -> dict: 150 | """Remove dataset from memory.""" 151 | if dataset_name not in loaded_datasets: 152 | return {"error": f"Dataset '{dataset_name}' not found"} 153 | 154 | del loaded_datasets[dataset_name] 155 | del dataset_schemas[dataset_name] 156 | 157 | return {"status": "success", "message": f"Dataset '{dataset_name}' cleared from memory"} 158 | 159 | @staticmethod 160 | def clear_all_datasets() -> dict: 161 | """Clear all datasets from memory.""" 162 | count = len(loaded_datasets) 163 | loaded_datasets.clear() 164 | dataset_schemas.clear() 165 | 166 | return {"status": "success", "message": f"Cleared {count} datasets from memory"} 167 | 168 | 169 | class ChartConfig(BaseModel): 170 | """Configuration for chart generation.""" 171 | dataset_name: str 172 | chart_type: str # 'bar', 'histogram', 'scatter', 'line', 'box' 173 | x_column: str 174 | y_column: Optional[str] = None 175 | groupby_column: Optional[str] = None 176 | title: Optional[str] = None 177 | 178 | 179 | class AnalysisResult(BaseModel): 180 | """Generic analysis result.""" 181 | dataset_name: str 182 | analysis_type: str 183 | timestamp: datetime = Field(default_factory=datetime.now) 184 | results: Dict[str, Any] 185 | metadata: Dict[str, Any] = Field(default_factory=dict) 186 | 187 | 188 | class DataQualityReport(BaseModel): 189 | """Data quality assessment report.""" 190 | dataset_name: str 191 | total_rows: int 192 | total_columns: int 193 | missing_data: Dict[str, float] # column -> percentage missing 194 | duplicate_rows: int 195 | potential_issues: List[str] 196 | quality_score: float # 0-100 197 | recommendations: List[str] 198 | 199 | 200 | # Legacy models - kept for minimal backward compatibility if needed 201 | class Status(str, Enum): 202 | """Status enum.""" 203 | PENDING = "pending" 204 | IN_PROGRESS = "in_progress" 205 | COMPLETED = "completed" 206 | FAILED = "failed" 207 | 208 | 209 | class UserProfile(BaseModel): 210 | """User profile model.""" 211 | id: str 212 | name: str 213 | email: str 214 | status: str 
= "active" 215 | preferences: Dict[str, Any] = Field(default_factory=dict) -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | """Prompts package.""" 2 | 3 | from .dataset_first_look_prompt import dataset_first_look 4 | from .segmentation_workshop_prompt import segmentation_workshop 5 | from .data_quality_assessment_prompt import data_quality_assessment 6 | from .correlation_investigation_prompt import correlation_investigation 7 | from .pattern_discovery_session_prompt import pattern_discovery_session 8 | from .insight_generation_workshop_prompt import insight_generation_workshop 9 | from .dashboard_design_consultation_prompt import dashboard_design_consultation 10 | from .find_datasources_prompt import find_datasources 11 | from .list_mcp_assets_prompt import list_mcp_assets 12 | 13 | __all__ = [ 14 | "dataset_first_look", 15 | "segmentation_workshop", 16 | "data_quality_assessment", 17 | "correlation_investigation", 18 | "pattern_discovery_session", 19 | "insight_generation_workshop", 20 | "dashboard_design_consultation", 21 | "find_datasources", 22 | "list_mcp_assets" 23 | ] -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/correlation_investigation_prompt.py: -------------------------------------------------------------------------------- 1 | """Correlation investigation prompt implementation.""" 2 | 3 | from typing import List, Optional 4 | from ..models.schemas import DatasetManager, dataset_schemas 5 | 6 | 7 | async def correlation_investigation(dataset_name: str) -> str: 8 | """Guide correlation analysis workflow.""" 9 | try: 10 | if dataset_name not in dataset_schemas: 11 | return f"Dataset '{dataset_name}' not loaded. Use load_dataset() tool first." 12 | 13 | schema = dataset_schemas[dataset_name] 14 | 15 | # Find numerical columns 16 | numerical_cols = [name for name, info in schema.columns.items() 17 | if info.suggested_role == 'numerical'] 18 | 19 | if len(numerical_cols) < 2: 20 | return f"""**Correlation Analysis: Insufficient Numerical Data** 21 | 22 | Your **{dataset_name}** dataset has {len(numerical_cols)} numerical column(s): {', '.join(numerical_cols) if numerical_cols else 'none'} 23 | 24 | **To perform correlation analysis, you need:** 25 | • At least 2 numerical columns 26 | • Sufficient data variation (not all identical values) 27 | 28 | **Suggestions:** 29 | 1. Check if any categorical columns contain numerical data stored as text 30 | 2. Convert date columns to numerical formats (days since epoch, etc.) 31 | 3. Create numerical features from categorical data (count encodings, etc.) 32 | 4. Load additional datasets with more numerical variables 33 | 34 | **Alternative analyses you can perform:** 35 | • Data quality assessment: `validate_data_quality('{dataset_name}')` 36 | • Distribution analysis: `analyze_distributions('{dataset_name}', 'column_name')` 37 | • Segmentation: `segment_by_column('{dataset_name}', 'categorical_column')` 38 | """ 39 | 40 | prompt = f"""Let's explore **correlations** in your **{dataset_name}** dataset! 
41 | 42 | **📊 Available numerical columns** ({len(numerical_cols)}): 43 | """ 44 | 45 | for col in numerical_cols: 46 | col_info = schema.columns[col] 47 | prompt += f"• **{col}**: {col_info.unique_values} unique values, {col_info.null_percentage:.1f}% missing\n" 48 | prompt += f" Sample values: {', '.join(map(str, col_info.sample_values))}\n" 49 | 50 | prompt += f""" 51 | **🎯 Correlation analysis strategy:** 52 | 53 | 1. **Start broad**: Find all significant correlations 54 | → `find_correlations('{dataset_name}')` 55 | 56 | 2. **Focus on strong relationships**: Investigate correlations > 0.7 57 | → Look for business logic behind statistical relationships 58 | 59 | 3. **Create visualizations**: Plot the strongest correlations 60 | → `create_chart('{dataset_name}', 'scatter', 'column1', 'column2')` 61 | 62 | 4. **Segment analysis**: Check if correlations hold across different groups 63 | → Combine with categorical segmentation 64 | 65 | **🔍 What to look for:** 66 | • **Strong positive correlations** (0.7+): Variables that increase together 67 | • **Strong negative correlations** (-0.7+): Variables that move oppositely 68 | • **Moderate correlations** (0.3-0.7): Interesting but not overwhelming relationships 69 | • **No correlation** (~0): Independent variables 70 | 71 | **⚠️ Correlation insights:** 72 | • Correlation ≠ Causation (remember this!) 73 | • High correlation might indicate redundant features 74 | • Unexpected correlations often reveal interesting patterns 75 | 76 | **Quick commands to start:** 77 | • `find_correlations('{dataset_name}')` - Find all correlations 78 | • `find_correlations('{dataset_name}', ['{numerical_cols[0]}', '{numerical_cols[1]}'])` - Focus on specific columns""" 79 | 80 | if len(numerical_cols) >= 2: 81 | prompt += f""" 82 | • `create_chart('{dataset_name}', 'scatter', '{numerical_cols[0]}', '{numerical_cols[1]}')` - Visualize relationship""" 83 | 84 | prompt += f""" 85 | 86 | **💡 Advanced correlation techniques:** 87 | • Partial correlations (controlling for other variables) 88 | • Correlation matrices with hierarchical clustering 89 | • Rolling correlations for time series data 90 | 91 | Ready to discover hidden relationships in your data? What correlation analysis would you like to start with?""" 92 | 93 | return prompt 94 | 95 | except Exception as e: 96 | return f"Error generating correlation investigation prompt: {str(e)}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/dashboard_design_consultation_prompt.py: -------------------------------------------------------------------------------- 1 | """Dashboard design consultation prompt implementation.""" 2 | 3 | from typing import List, Optional 4 | from ..models.schemas import DatasetManager, dataset_schemas 5 | 6 | 7 | async def dashboard_design_consultation(dataset_name: str, audience: str = "general") -> str: 8 | """Plan dashboards for specific audiences.""" 9 | try: 10 | if dataset_name not in dataset_schemas: 11 | return f"Dataset '{dataset_name}' not loaded. Use load_dataset() tool first." 
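        # Pull the cached schema, then bucket columns by their auto-discovered
        # roles (numerical / categorical / temporal) to tailor the advice below.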
12 | 13 | schema = dataset_schemas[dataset_name] 14 | 15 | # Analyze available data types 16 | numerical_cols = [name for name, info in schema.columns.items() 17 | if info.suggested_role == 'numerical'] 18 | categorical_cols = [name for name, info in schema.columns.items() 19 | if info.suggested_role == 'categorical'] 20 | temporal_cols = [name for name, info in schema.columns.items() 21 | if info.suggested_role == 'temporal'] 22 | 23 | prompt = f"""📊 **Dashboard Design Consultation: {dataset_name}** 24 | 25 | **Target Audience**: {audience} 26 | 27 | Let's design a compelling dashboard from your **{schema.row_count:,} records** that tells a clear story! 28 | 29 | **📋 Available data for dashboards:** 30 | • **{len(numerical_cols)} numerical metrics**: Perfect for KPIs, trends, and comparisons 31 | • **{len(categorical_cols)} categorical dimensions**: Great for filtering and segmentation 32 | • **{len(temporal_cols)} time dimensions**: Ideal for time series and trend analysis 33 | 34 | **🎯 Dashboard design principles:** 35 | 36 | **For Executive/Leadership Audience:** 37 | • High-level KPIs and trend indicators 38 | • Exception-based reporting (what needs attention) 39 | • Comparative analysis (vs targets, previous periods) 40 | • Clean, simple visualizations with clear takeaways 41 | 42 | **For Operational/Management Audience:** 43 | • Detailed performance metrics 44 | • Drill-down capabilities by segment/category 45 | • Operational efficiency indicators 46 | • Actionable insights for daily decisions 47 | 48 | **For Analytical/Technical Audience:** 49 | • Comprehensive data exploration capabilities 50 | • Statistical analysis and correlation views 51 | • Raw data access and filtering options 52 | • Advanced visualization types 53 | 54 | **📊 Dashboard component recommendations:** 55 | 56 | **1. Key Performance Indicators (KPIs)**""" 57 | 58 | if numerical_cols: 59 | prompt += f""" 60 | • Primary metrics from: {', '.join(numerical_cols[:3])} 61 | • Trend indicators and period-over-period changes 62 | • Target vs actual comparisons""" 63 | 64 | prompt += f""" 65 | 66 | **2. Trend Analysis**""" 67 | 68 | if temporal_cols and numerical_cols: 69 | prompt += f""" 70 | • Time series charts showing {numerical_cols[0]} over {temporal_cols[0]} 71 | • Seasonal patterns and growth trends 72 | • Anomaly detection and highlighting""" 73 | 74 | prompt += f""" 75 | 76 | **3. Segmentation Views**""" 77 | 78 | if categorical_cols and numerical_cols: 79 | prompt += f""" 80 | • Performance by {categorical_cols[0]} (bar charts, tables) 81 | • Comparative analysis across segments 82 | • Top/bottom performer identification""" 83 | 84 | prompt += f""" 85 | 86 | **4. Distribution Analysis** 87 | • Data quality indicators and completeness 88 | • Outlier detection and unusual patterns 89 | • Statistical summaries and ranges 90 | 91 | **🛠️ Dashboard creation workflow:** 92 | 93 | 1. **Define dashboard objectives** 94 | → What decisions should this dashboard support? 95 | → What questions should it answer? 96 | 97 | 2. **Create individual visualizations** 98 | → `create_chart('{dataset_name}', 'chart_type', 'x_column', 'y_column')` 99 | → Test different chart types for each insight 100 | 101 | 3. **Build comprehensive dashboard** 102 | → `generate_dashboard('{dataset_name}', chart_configs)` 103 | → Combine multiple visualizations 104 | 105 | 4. 
**Export for sharing** 106 | → `export_insights('{dataset_name}', 'html')` 107 | → Create shareable dashboard file 108 | 109 | **📊 Recommended chart types by purpose:** 110 | 111 | **KPI Monitoring**: Bar charts, line charts, gauge charts 112 | **Trend Analysis**: Line charts, area charts, sparklines 113 | **Comparison**: Bar charts, grouped charts, heatmaps 114 | **Distribution**: Histograms, box plots, violin plots 115 | **Relationship**: Scatter plots, correlation matrices 116 | 117 | **🎨 Dashboard layout suggestions for {audience}:** 118 | """ 119 | 120 | if audience.lower() in ['executive', 'leadership', 'c-suite']: 121 | prompt += """ 122 | • **Top row**: 3-4 key KPIs with trend indicators 123 | • **Second row**: Main performance chart (trend over time) 124 | • **Bottom rows**: Segmentation breakdown and key insights 125 | • **Colors**: Minimal palette, red/green for performance indicators""" 126 | 127 | elif audience.lower() in ['manager', 'operational', 'team lead']: 128 | prompt += """ 129 | • **Left panel**: Filters and controls for interactivity 130 | • **Main area**: Primary operational metrics and trends 131 | • **Right panel**: Top/bottom performers and alerts 132 | • **Bottom**: Detailed breakdowns and drill-down options""" 133 | 134 | elif audience.lower() in ['analyst', 'technical', 'data team']: 135 | prompt += """ 136 | • **Full data exploration**: Multiple visualization types 137 | • **Statistical summaries**: Correlation matrices, distributions 138 | • **Interactive filters**: Full dataset slicing capabilities 139 | • **Export options**: Data download and analysis tools""" 140 | 141 | else: 142 | prompt += """ 143 | • **Balanced approach**: Mix of high-level and detailed views 144 | • **Clear navigation**: Logical flow from summary to detail 145 | • **Contextual information**: Explanations and data definitions 146 | • **Action orientation**: Clear next steps and recommendations""" 147 | 148 | prompt += f""" 149 | 150 | **🚀 Let's start building your dashboard!** 151 | 152 | **Immediate next steps:** 153 | 1. **Identify your top 3 KPIs** from available numerical columns 154 | 2. **Choose primary segmentation** from categorical columns 155 | 3. **Create initial visualizations** with create_chart() 156 | 4. 
**Iterate and refine** based on feedback 157 | 158 | **Quick start commands:** 159 | """ 160 | 161 | if numerical_cols and categorical_cols: 162 | prompt += f"""• `create_chart('{dataset_name}', 'bar', '{categorical_cols[0]}', '{numerical_cols[0]}')` - Key metric by segment 163 | """ 164 | if len(numerical_cols) >= 2: 165 | prompt += f"""• `create_chart('{dataset_name}', 'scatter', '{numerical_cols[0]}', '{numerical_cols[1]}')` - Relationship analysis 166 | """ 167 | if temporal_cols and numerical_cols: 168 | prompt += f"""• `create_chart('{dataset_name}', 'line', '{temporal_cols[0]}', '{numerical_cols[0]}')` - Trend analysis 169 | """ 170 | 171 | prompt += f""" 172 | 173 | What type of dashboard story do you want to tell with your **{dataset_name}** data?""" 174 | 175 | return prompt 176 | 177 | except Exception as e: 178 | return f"Error generating dashboard consultation prompt: {str(e)}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/data_quality_assessment_prompt.py: -------------------------------------------------------------------------------- 1 | """Data quality assessment prompt implementation.""" 2 | 3 | from typing import List, Optional 4 | from ..models.schemas import DatasetManager, dataset_schemas 5 | 6 | 7 | async def data_quality_assessment(dataset_name: str) -> str: 8 | """Guide systematic data quality review.""" 9 | try: 10 | if dataset_name not in dataset_schemas: 11 | return f"Dataset '{dataset_name}' not loaded. Use load_dataset() tool first." 12 | 13 | schema = dataset_schemas[dataset_name] 14 | df = DatasetManager.get_dataset(dataset_name) 15 | 16 | prompt = f"""Let's systematically review the quality of your **{dataset_name}** dataset. 17 | 18 | **📋 Dataset Overview:** 19 | • **{schema.row_count:,} rows** × **{len(schema.columns)} columns** 20 | • **Memory usage**: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB 21 | 22 | **🔍 Data Quality Indicators:** 23 | """ 24 | 25 | # Missing values analysis 26 | missing_data = [] 27 | for col_name, col_info in schema.columns.items(): 28 | if col_info.null_percentage > 0: 29 | missing_data.append((col_name, col_info.null_percentage)) 30 | 31 | if missing_data: 32 | missing_data.sort(key=lambda x: x[1], reverse=True) 33 | prompt += f"\n**📋 Missing Values** ({len(missing_data)} columns affected):\n" 34 | for col, pct in missing_data[:5]: # Show top 5 35 | status = "🔴" if pct > 50 else "🟡" if pct > 10 else "🟢" 36 | prompt += f"{status} **{col}**: {pct:.1f}% missing\n" 37 | if len(missing_data) > 5: 38 | prompt += f"• ... and {len(missing_data) - 5} more columns with missing data\n" 39 | else: 40 | prompt += f"\n**✅ Missing Values**: No missing values detected! 
Excellent data quality.\n" 41 | 42 | # Data type consistency 43 | object_cols = [name for name, info in schema.columns.items() 44 | if info.dtype == 'object' and info.suggested_role not in ['categorical', 'identifier']] 45 | 46 | if object_cols: 47 | prompt += f"\n**⚠️ Mixed Data Types**: {', '.join(object_cols)} may need type conversion\n" 48 | 49 | # Duplicates check (simple heuristic) 50 | potential_id_cols = [name for name, info in schema.columns.items() 51 | if info.suggested_role == 'identifier'] 52 | 53 | if potential_id_cols: 54 | prompt += f"\n**🔍 Potential Duplicates**: Check uniqueness of {', '.join(potential_id_cols)}\n" 55 | 56 | # Column cardinality insights 57 | high_cardinality = [name for name, info in schema.columns.items() 58 | if info.unique_values > schema.row_count * 0.8] 59 | low_cardinality = [name for name, info in schema.columns.items() 60 | if info.unique_values < 10 and info.suggested_role == 'categorical'] 61 | 62 | if high_cardinality: 63 | prompt += f"\n**📊 High Cardinality Columns**: {', '.join(high_cardinality)} (many unique values)\n" 64 | prompt += "→ Consider if these should be identifiers or need grouping\n" 65 | 66 | if low_cardinality: 67 | prompt += f"\n**🏷️ Low Cardinality Columns**: {', '.join(low_cardinality)} (few unique values)\n" 68 | prompt += "→ Perfect for segmentation and grouping analysis\n" 69 | 70 | prompt += f""" 71 | **🎯 Recommended quality checks:** 72 | 73 | 1. **Comprehensive validation**: `validate_data_quality('{dataset_name}')` 74 | → Get detailed quality report with recommendations 75 | 76 | 2. **Distribution analysis**: Check for outliers and unusual patterns 77 | → `analyze_distributions('{dataset_name}', 'column_name')` 78 | 79 | 3. **Outlier detection**: Find unusual values in numerical columns 80 | → `detect_outliers('{dataset_name}')` 81 | 82 | 4. **Correlation check**: Look for unexpected relationships 83 | → `find_correlations('{dataset_name}')` 84 | 85 | **💡 Quick quality assessment commands:** 86 | • `validate_data_quality('{dataset_name}')` - Full quality report 87 | • `detect_outliers('{dataset_name}')` - Find unusual values""" 88 | 89 | if missing_data: 90 | most_missing_col = missing_data[0][0] 91 | prompt += f""" 92 | • `analyze_distributions('{dataset_name}', '{most_missing_col}')` - Investigate missing data patterns""" 93 | 94 | prompt += f""" 95 | 96 | **🔧 Common data quality improvements:** 97 | • Remove or impute missing values 98 | • Standardize categorical value formats 99 | • Convert data types appropriately 100 | • Remove duplicate records 101 | • Handle outliers appropriately 102 | 103 | What data quality aspect would you like to investigate first?""" 104 | 105 | return prompt 106 | 107 | except Exception as e: 108 | return f"Error generating data quality assessment prompt: {str(e)}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/dataset_first_look_prompt.py: -------------------------------------------------------------------------------- 1 | """Dataset first look prompt implementation.""" 2 | 3 | from mcp.server.fastmcp.prompts import base 4 | from typing import List, Optional 5 | from ..models.schemas import DatasetManager, dataset_schemas 6 | 7 | 8 | async def dataset_first_look(dataset_name: str) -> str: 9 | """Adaptive first-look analysis based on dataset characteristics.""" 10 | try: 11 | if dataset_name not in dataset_schemas: 12 | return f"Dataset '{dataset_name}' not loaded. Use load_dataset() tool first." 
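# Each column's suggested_role ("numerical", "categorical", "temporal",
# "identifier") is assigned by the automatic schema discovery that runs
# inside load_dataset(); DatasetManager and the dataset_schemas cache both
# live in ../models/schemas.py.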
13 | 14 | schema = dataset_schemas[dataset_name] 15 | 16 | # Organize columns by type for display 17 | numerical_cols = [name for name, info in schema.columns.items() 18 | if info.suggested_role == 'numerical'] 19 | categorical_cols = [name for name, info in schema.columns.items() 20 | if info.suggested_role == 'categorical'] 21 | temporal_cols = [name for name, info in schema.columns.items() 22 | if info.suggested_role == 'temporal'] 23 | identifier_cols = [name for name, info in schema.columns.items() 24 | if info.suggested_role == 'identifier'] 25 | 26 | prompt = f"""Let's explore your **{dataset_name}** dataset together! 27 | 28 | I can see you have **{schema.row_count:,} records** with **{len(schema.columns)} columns**: 29 | 30 | """ 31 | 32 | if numerical_cols: 33 | prompt += f"**📊 Numerical columns** ({len(numerical_cols)}): {', '.join(numerical_cols)}\n" 34 | prompt += "→ Perfect for correlation analysis, statistical summaries, and trend analysis\n\n" 35 | 36 | if categorical_cols: 37 | prompt += f"**🏷️ Categorical columns** ({len(categorical_cols)}): {', '.join(categorical_cols)}\n" 38 | prompt += "→ Great for segmentation, group comparisons, and distribution analysis\n\n" 39 | 40 | if temporal_cols: 41 | prompt += f"**📅 Date/Time columns** ({len(temporal_cols)}): {', '.join(temporal_cols)}\n" 42 | prompt += "→ Ideal for time series analysis and trend identification\n\n" 43 | 44 | if identifier_cols: 45 | prompt += f"**🔑 Identifier columns** ({len(identifier_cols)}): {', '.join(identifier_cols)}\n" 46 | prompt += "→ Useful for data validation and uniqueness checks\n\n" 47 | 48 | # Add specific recommendations based on data 49 | prompt += "**🎯 Recommended starting points:**\n" 50 | 51 | if len(numerical_cols) >= 2: 52 | prompt += f"• **Correlation Analysis**: Explore relationships between {numerical_cols[0]} and {numerical_cols[1]}\n" 53 | prompt += f" Command: `find_correlations('{dataset_name}')`\n" 54 | 55 | if categorical_cols and numerical_cols: 56 | prompt += f"• **Segmentation**: Group by {categorical_cols[0]} to analyze {numerical_cols[0]} patterns\n" 57 | prompt += f" Command: `segment_by_column('{dataset_name}', '{categorical_cols[0]}')`\n" 58 | 59 | if temporal_cols and numerical_cols: 60 | prompt += f"• **Time Trends**: Track {numerical_cols[0]} changes over {temporal_cols[0]}\n" 61 | prompt += f" Command: `time_series_analysis('{dataset_name}', '{temporal_cols[0]}', '{numerical_cols[0]}')`\n" 62 | 63 | # Data quality insights 64 | high_null_cols = [name for name, info in schema.columns.items() 65 | if info.null_percentage > 10] 66 | if high_null_cols: 67 | prompt += f"• **Data Quality Review**: {len(high_null_cols)} columns have missing values to investigate\n" 68 | prompt += f" Command: `validate_data_quality('{dataset_name}')`\n" 69 | 70 | prompt += f"\n**Available tools**: `segment_by_column`, `find_correlations`, `create_chart`, `validate_data_quality`, `analyze_distributions`, `detect_outliers`\n" 71 | 72 | # Add visualization suggestions 73 | if numerical_cols: 74 | prompt += f"\n**📈 Visualization ideas:**\n" 75 | prompt += f"• Histogram: `create_chart('{dataset_name}', 'histogram', '{numerical_cols[0]}')`\n" 76 | if len(numerical_cols) >= 2: 77 | prompt += f"• Scatter plot: `create_chart('{dataset_name}', 'scatter', '{numerical_cols[0]}', '{numerical_cols[1]}')`\n" 78 | if categorical_cols: 79 | prompt += f"• Bar chart: `create_chart('{dataset_name}', 'bar', '{categorical_cols[0]}', '{numerical_cols[0]}')`\n" 80 | 81 | prompt += f"\nWhat aspect of your 
**{dataset_name}** data would you like to explore first?" 82 | 83 | return prompt 84 | 85 | except Exception as e: 86 | return f"Error generating first look prompt: {str(e)}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/find_datasources_prompt.py: -------------------------------------------------------------------------------- 1 | """Find data sources prompt implementation.""" 2 | 3 | import os 4 | from pathlib import Path 5 | from typing import List, Optional 6 | 7 | 8 | async def find_datasources(directory_path: str = ".") -> str: 9 | """Discover available data files and present them as load options.""" 10 | try: 11 | # Get the current working directory or specified path 12 | current_dir = Path(directory_path).resolve() 13 | 14 | # Find .csv and .json files 15 | csv_files = list(current_dir.glob("*.csv")) 16 | json_files = list(current_dir.glob("*.json")) 17 | 18 | # Also check common data subdirectories 19 | data_subdirs = ["data", "datasets", "files"] 20 | subdir_files = [] 21 | 22 | for subdir in data_subdirs: 23 | subdir_path = current_dir / subdir 24 | if subdir_path.exists() and subdir_path.is_dir(): 25 | subdir_csv = list(subdir_path.glob("*.csv")) 26 | subdir_json = list(subdir_path.glob("*.json")) 27 | if subdir_csv or subdir_json: 28 | subdir_files.append((subdir, subdir_csv + subdir_json)) 29 | 30 | # Build the prompt response 31 | prompt = f"""📁 **Data Source Discovery: {current_dir.name}** 32 | 33 | Looking for data files in: `{current_dir}` 34 | 35 | """ 36 | 37 | # Current directory files 38 | if csv_files or json_files: 39 | prompt += f"**📊 Data files found in current directory:**\n\n" 40 | 41 | all_current_files = sorted(csv_files + json_files, key=lambda x: x.name.lower()) 42 | for file_path in all_current_files: 43 | file_size = file_path.stat().st_size 44 | size_str = format_file_size(file_size) 45 | file_type = file_path.suffix.upper()[1:] # Remove the dot 46 | 47 | # Generate suggested dataset name (filename without extension) 48 | suggested_name = file_path.stem.lower().replace(" ", "_").replace("-", "_") 49 | 50 | prompt += f"• **{file_path.name}** ({file_type}, {size_str})\n" 51 | prompt += f" → `load_dataset('{file_path}', '{suggested_name}')`\n\n" 52 | 53 | # Subdirectory files 54 | if subdir_files: 55 | prompt += f"**📂 Data files found in subdirectories:**\n\n" 56 | 57 | for subdir_name, files in subdir_files: 58 | prompt += f"**{subdir_name}/ directory:**\n" 59 | 60 | sorted_files = sorted(files, key=lambda x: x.name.lower()) 61 | for file_path in sorted_files: 62 | file_size = file_path.stat().st_size 63 | size_str = format_file_size(file_size) 64 | file_type = file_path.suffix.upper()[1:] 65 | 66 | # Generate suggested dataset name 67 | suggested_name = file_path.stem.lower().replace(" ", "_").replace("-", "_") 68 | 69 | prompt += f" • **{file_path.name}** ({file_type}, {size_str})\n" 70 | prompt += f" → `load_dataset('{file_path}', '{suggested_name}')`\n" 71 | 72 | prompt += "\n" 73 | 74 | # No files found 75 | if not csv_files and not json_files and not subdir_files: 76 | prompt += f"""**❌ No data files found** 77 | 78 | No .csv or .json files were found in: 79 | • Current directory: `{current_dir}` 80 | • Common data subdirectories: {', '.join(data_subdirs)} 81 | 82 | **💡 Suggestions:** 83 | • Check if you're in the correct directory 84 | • Look for data files with different extensions 85 | • Create sample data files for testing 86 | • Download sample datasets from online sources 
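For example, a minimal pandas snippet for generating a small test file (the column names are just placeholders):

```python
import pandas as pd

pd.DataFrame(dict(id=range(5), value=[10, 20, 15, 30, 25])).to_csv('sample_data.csv', index=False)
```

Then load it with `load_dataset('sample_data.csv', 'sample_data')`.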
87 | 88 | **🔍 Manual file search:** 89 | You can also manually specify file paths: 90 | • `load_dataset('path/to/your/file.csv', 'my_dataset')` 91 | • `load_dataset('path/to/your/file.json', 'my_dataset')` 92 | """ 93 | else: 94 | # Add usage instructions 95 | total_files = len(csv_files) + len(json_files) + sum(len(files) for _, files in subdir_files) 96 | prompt += f"""**🚀 Ready to load data!** 97 | 98 | Found **{total_files} data file(s)** ready for analysis. 99 | 100 | **Next steps:** 101 | 1. Copy one of the `load_dataset()` commands above 102 | 2. Run it to load your data into memory 103 | 3. Start exploring with `dataset_first_look('dataset_name')` 104 | 105 | **💡 Pro tips:** 106 | • Choose descriptive dataset names for easier reference 107 | • Larger files may take longer to load 108 | • You can load multiple datasets simultaneously 109 | • Use `list_loaded_datasets()` to see what's currently loaded 110 | 111 | **🔧 Advanced loading options:** 112 | • Sample large datasets: `load_dataset('file.csv', 'name', sample_size=1000)` 113 | • Custom paths: `load_dataset('/full/path/to/file.csv', 'name')` 114 | """ 115 | 116 | return prompt 117 | 118 | except Exception as e: 119 | return f"""**❌ Error discovering data sources** 120 | 121 | Failed to scan directory: {str(e)} 122 | 123 | **💡 Troubleshooting:** 124 | • Check if the directory path exists and is accessible 125 | • Ensure you have read permissions for the directory 126 | • Try specifying a different directory path 127 | • Use absolute paths if relative paths aren't working 128 | 129 | **Manual alternative:** 130 | If automatic discovery isn't working, you can still load data manually: 131 | `load_dataset('your_file.csv', 'dataset_name')` 132 | """ 133 | 134 | 135 | def format_file_size(size_bytes: int) -> str: 136 | """Convert file size in bytes to human readable format.""" 137 | if size_bytes == 0: 138 | return "0 B" 139 | 140 | size_names = ["B", "KB", "MB", "GB"] 141 | i = 0 142 | size = float(size_bytes) 143 | 144 | while size >= 1024.0 and i < len(size_names) - 1: 145 | size /= 1024.0 146 | i += 1 147 | 148 | if i == 0: 149 | return f"{int(size)} {size_names[i]}" 150 | else: 151 | return f"{size:.1f} {size_names[i]}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/insight_generation_workshop_prompt.py: -------------------------------------------------------------------------------- 1 | """Insight generation workshop prompt implementation.""" 2 | 3 | from typing import List, Optional 4 | from ..models.schemas import DatasetManager, dataset_schemas 5 | 6 | 7 | async def insight_generation_workshop(dataset_name: str, business_context: str = "general") -> str: 8 | """Generate business insights from data analysis.""" 9 | try: 10 | if dataset_name not in dataset_schemas: 11 | return f"Dataset '{dataset_name}' not loaded. Use load_dataset() tool first." 12 | 13 | schema = dataset_schemas[dataset_name] 14 | 15 | prompt = f"""💡 **Business Insights Workshop: {dataset_name}** 16 | 17 | Context: **{business_context}** analysis 18 | 19 | Let's transform your **{schema.row_count:,} records** into actionable business insights! 20 | 21 | **🎯 Insight generation framework:** 22 | 23 | **Phase 1: Data Understanding** 24 | • What does each variable represent in your business? 25 | • Which metrics matter most for decision-making? 26 | • What questions are stakeholders asking? 
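As a concrete example of that translation step: a stakeholder question like "Which customer group spends the most?" usually maps to a call such as `segment_by_column('{dataset_name}', 'customer_segment')`, where 'customer_segment' stands in for whichever categorical column your data actually contains.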
27 | 28 | **Phase 2: Pattern Analysis** 29 | • `suggest_analysis('{dataset_name}')` - Get AI-powered analysis recommendations 30 | • Run suggested analyses to uncover patterns 31 | • Focus on business-relevant relationships 32 | 33 | **Phase 3: Insight Synthesis** 34 | • Translate statistical findings into business language 35 | • Identify actionable opportunities 36 | • Quantify potential business impact 37 | 38 | **📊 Business insight categories:** 39 | 40 | **Performance Insights** - How are we doing? 41 | • Identify top/bottom performers 42 | • Measure efficiency and effectiveness 43 | • Track progress against goals 44 | 45 | **Segmentation Insights** - Who are our different groups? 46 | • Customer/product/regional segments 47 | • Behavioral patterns and preferences 48 | • Market opportunities by segment 49 | 50 | **Predictive Insights** - What's likely to happen? 51 | • Trend analysis and forecasting 52 | • Risk identification 53 | • Opportunity prediction 54 | 55 | **Optimization Insights** - How can we improve? 56 | • Resource allocation opportunities 57 | • Process improvement areas 58 | • Strategy refinement suggestions 59 | 60 | **🔍 Context-specific analysis for {business_context}:** 61 | """ 62 | 63 | # Add context-specific suggestions 64 | if business_context.lower() in ['sales', 'revenue', 'ecommerce']: 65 | prompt += """ 66 | • **Sales Performance**: Analyze conversion rates, deal sizes, seasonal patterns 67 | • **Customer Behavior**: Purchase frequency, preferences, lifetime value 68 | • **Channel Effectiveness**: Performance by sales channel or region 69 | • **Product Insights**: Best/worst performers, cross-selling opportunities""" 70 | 71 | elif business_context.lower() in ['marketing', 'campaign', 'advertising']: 72 | prompt += """ 73 | • **Campaign Performance**: ROI, engagement rates, conversion metrics 74 | • **Audience Segmentation**: Demographics, behavior, response patterns 75 | • **Channel Analysis**: Most effective marketing channels and timing 76 | • **Content Insights**: What messaging/content drives best results""" 77 | 78 | elif business_context.lower() in ['operations', 'process', 'efficiency']: 79 | prompt += """ 80 | • **Process Efficiency**: Bottlenecks, cycle times, resource utilization 81 | • **Quality Metrics**: Error rates, compliance, consistency 82 | • **Resource Optimization**: Capacity planning, cost reduction opportunities 83 | • **Performance Trends**: Improving or declining operational metrics""" 84 | 85 | elif business_context.lower() in ['hr', 'employee', 'workforce']: 86 | prompt += """ 87 | • **Workforce Analytics**: Productivity, satisfaction, retention patterns 88 | • **Performance Management**: Top performers, skill gaps, development needs 89 | • **Engagement Insights**: What drives employee satisfaction and retention 90 | • **Organizational Health**: Diversity, growth, cultural indicators""" 91 | 92 | else: 93 | prompt += """ 94 | • **Key Performance Indicators**: Identify and track most important metrics 95 | • **Trend Analysis**: Understanding directional changes over time 96 | • **Comparative Analysis**: Benchmarking against targets or competitors 97 | • **Root Cause Analysis**: Understanding drivers of performance""" 98 | 99 | prompt += f""" 100 | 101 | **🚀 Insight generation workflow:** 102 | 103 | 1. **Explore the data landscape** 104 | → `dataset_first_look('{dataset_name}')` - Understand what you have 105 | 106 | 2. **Run targeted analyses** 107 | → Focus on business-critical variables and relationships 108 | 109 | 3. 
**Create compelling visualizations** 110 | → `create_chart()` with business-relevant comparisons 111 | 112 | 4. **Generate actionable recommendations** 113 | → `export_insights('{dataset_name}', 'html')` - Create business report 114 | 115 | **💼 Questions to drive insight generation:** 116 | • What decisions do you need to make based on this data? 117 | • Which patterns would surprise your stakeholders? 118 | • What actions could you take if you knew X about your data? 119 | • How can these insights drive measurable business value? 120 | 121 | **🎯 Ready to generate insights?** 122 | 123 | Start by telling me: 124 | 1. What specific business questions are you trying to answer? 125 | 2. Which variables in your dataset are most business-critical? 126 | 3. What decisions or actions might result from your analysis? 127 | 128 | Let's turn your data into business intelligence that drives results!""" 129 | 130 | return prompt 131 | 132 | except Exception as e: 133 | return f"Error generating insight workshop prompt: {str(e)}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/list_mcp_assets_prompt.py: -------------------------------------------------------------------------------- 1 | """List MCP assets prompt implementation.""" 2 | 3 | 4 | async def list_mcp_assets() -> str: 5 | """Return a comprehensive list of all MCP server capabilities.""" 6 | 7 | return """# 🚀 Quick-Data MCP Server Assets 8 | 9 | ## 📝 Prompts 10 | Interactive conversation starters and analysis guides: 11 | 12 | • **dataset_first_look** (dataset_name) - Initial exploration guide for any new dataset 13 | • **segmentation_workshop** (dataset_name) - Plan segmentation strategy based on available columns 14 | • **data_quality_assessment** (dataset_name) - Systematic data quality assessment workflow 15 | • **correlation_investigation** (dataset_name) - Guide correlation analysis workflow 16 | • **pattern_discovery_session** (dataset_name) - Open-ended pattern mining conversation 17 | • **insight_generation_workshop** (dataset_name, business_context) - Generate business insights from data 18 | • **dashboard_design_consultation** (dataset_name, audience) - Plan dashboards for specific audiences 19 | • **find_datasources** (directory_path) - Discover available data files and present load options 20 | 21 | ## 🔧 Tools 22 | Data analysis and manipulation functions: 23 | 24 | ### Dataset Management 25 | • **load_dataset** (file_path, dataset_name, sample_size) - Load JSON/CSV datasets with automatic schema discovery 26 | • **list_loaded_datasets** () - Show all datasets currently in memory 27 | • **clear_dataset** (dataset_name) - Remove specific dataset from memory 28 | • **clear_all_datasets** () - Clear all datasets from memory 29 | • **get_dataset_info** (dataset_name) - Get basic info about loaded dataset 30 | 31 | ### Analysis Tools 32 | • **segment_by_column** (dataset_name, column_name, method, top_n) - Generic segmentation on categorical columns 33 | • **find_correlations** (dataset_name, columns, threshold) - Find correlations between numerical columns 34 | • **analyze_distributions** (dataset_name, column_name) - Analyze distribution of any column 35 | • **detect_outliers** (dataset_name, columns, method) - Detect outliers using configurable methods 36 | • **time_series_analysis** (dataset_name, date_column, value_column, frequency) - Temporal analysis for date data 37 | • **suggest_analysis** (dataset_name) - AI recommendations based on data characteristics 
38 | 39 | ### Visualization 40 | • **create_chart** (dataset_name, chart_type, x_column, y_column, groupby_column, title, save_path) - Create charts that adapt to any dataset 41 | • **generate_dashboard** (dataset_name, chart_configs) - Generate multi-chart dashboards 42 | 43 | ### Advanced Analytics 44 | • **validate_data_quality** (dataset_name) - Comprehensive data quality assessment 45 | • **compare_datasets** (dataset_a, dataset_b, common_columns) - Compare multiple datasets 46 | • **merge_datasets** (dataset_configs, join_strategy) - Join datasets on common keys 47 | • **calculate_feature_importance** (dataset_name, target_column, feature_columns) - Feature importance for predictive modeling 48 | • **memory_optimization_report** (dataset_name) - Analyze memory usage and suggest optimizations 49 | • **export_insights** (dataset_name, format, include_charts) - Export analysis in multiple formats 50 | • **execute_custom_analytics_code** (dataset_name, python_code) - Execute custom Python code against loaded datasets 51 | 52 | ### Resource Mirror Tools 53 | Tool versions of resources for tool-only MCP clients: 54 | • **resource_datasets_loaded** () - Tool mirror of datasets://loaded resource 55 | • **resource_datasets_schema** (dataset_name) - Tool mirror of datasets schema resource 56 | • **resource_datasets_summary** (dataset_name) - Tool mirror of datasets summary resource 57 | • **resource_datasets_sample** (dataset_name) - Tool mirror of datasets sample resource 58 | • **resource_analytics_current_dataset** () - Tool mirror of current dataset resource 59 | • **resource_analytics_available_analyses** () - Tool mirror of available analyses resource 60 | • **resource_analytics_column_types** () - Tool mirror of column types resource 61 | • **resource_analytics_suggested_insights** () - Tool mirror of suggested insights resource 62 | • **resource_analytics_memory_usage** () - Tool mirror of memory usage resource 63 | • **resource_config_server** () - Tool mirror of server config resource 64 | • **resource_users_profile** (user_id) - Tool mirror of user profile resource 65 | • **resource_system_status** () - Tool mirror of system status resource 66 | 67 | ## 📊 Resources 68 | Dynamic data context and system information: 69 | 70 | ### Dataset Resources 71 | • **datasets://loaded** - List of all currently loaded datasets with basic info 72 | • **datasets://{dataset_name}/schema** - Dynamic schema for any loaded dataset 73 | • **datasets://{dataset_name}/summary** - Statistical summary (pandas.describe() equivalent) 74 | • **datasets://{dataset_name}/sample** - Sample rows for data preview 75 | 76 | ### Analytics Resources 77 | • **analytics://current_dataset** - Currently active dataset name and basic stats 78 | • **analytics://available_analyses** - List of applicable analysis types for current data 79 | • **analytics://column_types** - Column classification (categorical, numerical, temporal, text) 80 | • **analytics://suggested_insights** - AI-generated analysis recommendations 81 | • **analytics://memory_usage** - Monitor memory usage of loaded datasets 82 | 83 | ### System Resources 84 | • **config://server** - Server configuration information 85 | • **users://{user_id}/profile** - User profile information by ID 86 | • **system://status** - System status and health information 87 | 88 | --- 89 | 90 | **🎯 Quick Start:** 91 | 1. Use `find_datasources()` to discover available data files 92 | 2. Load data with `load_dataset(file_path, dataset_name)` 93 | 3. 
Start exploring with `dataset_first_look(dataset_name)` 94 | 4. Use specific analysis tools or `execute_custom_analytics_code()` for custom analysis 95 | 96 | **💡 Pro Tips:** 97 | • Use prompts for guided workflows and analysis planning 98 | • Tools provide direct functionality and data manipulation 99 | • Resources offer real-time context and metadata about your data 100 | • All functions work generically across any dataset structure 101 | """ -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/pattern_discovery_session_prompt.py: -------------------------------------------------------------------------------- 1 | """Pattern discovery session prompt implementation.""" 2 | 3 | from typing import List, Optional 4 | from ..models.schemas import DatasetManager, dataset_schemas 5 | 6 | 7 | async def pattern_discovery_session(dataset_name: str) -> str: 8 | """Open-ended pattern mining conversation.""" 9 | try: 10 | if dataset_name not in dataset_schemas: 11 | return f"Dataset '{dataset_name}' not loaded. Use load_dataset() tool first." 12 | 13 | schema = dataset_schemas[dataset_name] 14 | 15 | # Categorize columns 16 | numerical_cols = [name for name, info in schema.columns.items() 17 | if info.suggested_role == 'numerical'] 18 | categorical_cols = [name for name, info in schema.columns.items() 19 | if info.suggested_role == 'categorical'] 20 | temporal_cols = [name for name, info in schema.columns.items() 21 | if info.suggested_role == 'temporal'] 22 | 23 | prompt = f"""🔍 **Pattern Discovery Session: {dataset_name}** 24 | 25 | Let's uncover hidden patterns and insights in your data! With **{schema.row_count:,} records** and **{len(schema.columns)} variables**, there are many potential discoveries waiting. 26 | 27 | **📊 Your data landscape:** 28 | • **{len(numerical_cols)} numerical variables**: Perfect for trends, distributions, and correlations 29 | • **{len(categorical_cols)} categorical variables**: Great for segmentation and group patterns 30 | • **{len(temporal_cols)} temporal variables**: Ideal for time-based patterns and seasonality 31 | 32 | **🎯 Pattern discovery toolkit:** 33 | 34 | **1. Distribution Patterns** - Understand your data's shape 35 | • `analyze_distributions('{dataset_name}', 'column_name')` - Detailed distribution analysis 36 | • Look for: skewness, multiple peaks, unusual gaps, outliers 37 | 38 | **2. Relationship Patterns** - Find connections between variables""" 39 | 40 | if len(numerical_cols) >= 2: 41 | prompt += f""" 42 | • `find_correlations('{dataset_name}')` - Statistical relationships 43 | • `create_chart('{dataset_name}', 'scatter', '{numerical_cols[0]}', '{numerical_cols[1]}')` - Visual relationships""" 44 | 45 | if categorical_cols and numerical_cols: 46 | prompt += f""" 47 | 48 | **3. Segmentation Patterns** - Discover group differences 49 | • `segment_by_column('{dataset_name}', '{categorical_cols[0]}')` - Group-based analysis 50 | • Look for: performance differences, size variations, behavioral patterns""" 51 | 52 | if temporal_cols and numerical_cols: 53 | prompt += f""" 54 | 55 | **4. Temporal Patterns** - Time-based insights 56 | • `time_series_analysis('{dataset_name}', '{temporal_cols[0]}', '{numerical_cols[0]}')` - Trend analysis 57 | • Look for: seasonality, cycles, growth trends, anomalies""" 58 | 59 | prompt += f""" 60 | 61 | **5. 
Quality Patterns** - Data integrity insights 62 | • `validate_data_quality('{dataset_name}')` - Systematic quality assessment 63 | • `detect_outliers('{dataset_name}')` - Unusual value detection 64 | 65 | **🔬 Advanced pattern hunting:** 66 | • **Feature importance**: `calculate_feature_importance('{dataset_name}', 'target_column')` 67 | • **Cross-pattern analysis**: Combine multiple discovery techniques 68 | • **Visual pattern exploration**: Create multiple chart types to see different perspectives 69 | 70 | **💡 Pattern discovery questions to explore:** 71 | • Which variables have the most unusual distributions? 72 | • Are there hidden subgroups in your data? 73 | • Do certain combinations of variables create interesting patterns? 74 | • Are there seasonal or cyclical patterns in time-based data? 75 | • Which variables are most predictive of outcomes? 76 | 77 | **🚀 Let's start discovering! Choose your exploration path:** 78 | 1. **"Show me the most interesting distributions"** - Start with distribution analysis 79 | 2. **"Find the strongest relationships"** - Begin with correlation analysis 80 | 3. **"Reveal hidden segments"** - Start with categorical segmentation 81 | 4. **"Uncover time patterns"** - Begin with temporal analysis 82 | 5. **"Assess data quality first"** - Start with quality assessment 83 | 84 | What patterns are you most curious about discovering in your **{dataset_name}** data?""" 85 | 86 | return prompt 87 | 88 | except Exception as e: 89 | return f"Error generating pattern discovery prompt: {str(e)}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/prompts/segmentation_workshop_prompt.py: -------------------------------------------------------------------------------- 1 | """Segmentation workshop prompt implementation.""" 2 | 3 | from typing import List, Optional 4 | from ..models.schemas import DatasetManager, dataset_schemas 5 | 6 | 7 | async def segmentation_workshop(dataset_name: str) -> str: 8 | """Interactive segmentation guidance based on actual dataset.""" 9 | try: 10 | if dataset_name not in dataset_schemas: 11 | return f"Dataset '{dataset_name}' not loaded. Use load_dataset() tool first." 12 | 13 | schema = dataset_schemas[dataset_name] 14 | 15 | # Find categorical columns suitable for segmentation 16 | categorical_cols = [name for name, info in schema.columns.items() 17 | if info.suggested_role == 'categorical'] 18 | numerical_cols = [name for name, info in schema.columns.items() 19 | if info.suggested_role == 'numerical'] 20 | 21 | if not categorical_cols: 22 | return f"""**Segmentation Challenge: No categorical columns found in {dataset_name}** 23 | 24 | Don't worry! 
You can still create meaningful segments: 25 | 26 | **🔢 Numerical Segmentation Options:** 27 | """ + (f""" 28 | • **Quantile-based segments**: Split {numerical_cols[0]} into high/medium/low groups 29 | • **Threshold-based segments**: Above/below average {numerical_cols[0]} 30 | • **Custom ranges**: Define meaningful business ranges for {numerical_cols[0]} 31 | 32 | **💡 Pro tip**: Create categorical columns first using pandas: 33 | ```python 34 | df['value_segment'] = pd.cut(df['{numerical_cols[0]}'], bins=3, labels=['Low', 'Medium', 'High']) 35 | ``` 36 | 37 | Then use: `segment_by_column('{dataset_name}', 'value_segment')` 38 | """ if numerical_cols else """ 39 | • Consider loading additional data with categorical variables 40 | • Check if any text columns could be categorized 41 | • Create categories from existing numerical data using ranges 42 | """) 43 | 44 | prompt = f"""Let's create meaningful segments from your **{dataset_name}** data! 45 | 46 | **Available categorical columns for grouping:** 47 | """ 48 | 49 | for col in categorical_cols: 50 | col_info = schema.columns[col] 51 | prompt += f"• **{col}**: {col_info.unique_values} unique values (examples: {', '.join(map(str, col_info.sample_values))})\n" 52 | 53 | if numerical_cols: 54 | prompt += f"\n**📊 Numerical columns to analyze by segment:**\n" 55 | for col in numerical_cols: 56 | col_info = schema.columns[col] 57 | prompt += f"• **{col}**: {col_info.dtype} (sample values: {', '.join(map(str, col_info.sample_values))})\n" 58 | 59 | prompt += f""" 60 | **🎯 Segmentation strategies:** 61 | 62 | 1. **Simple segmentation**: Group by one categorical column 63 | Example: `segment_by_column('{dataset_name}', '{categorical_cols[0]}')` 64 | 65 | 2. **Cross-segmentation**: Combine multiple categories (manual analysis) 66 | Example: Group by {categorical_cols[0]}, then analyze patterns within each group 67 | 68 | 3. **Value-based segments**: Focus on high/low values of numerical columns""" 69 | 70 | if numerical_cols: 71 | prompt += f""" 72 | Example: Top 20% vs bottom 20% by {numerical_cols[0]}""" 73 | 74 | prompt += f""" 75 | 76 | **📈 Suggested analysis workflow:** 77 | 1. Start with basic segmentation of your most important categorical variable 78 | 2. Look for interesting patterns in the numerical data 79 | 3. Create visualizations to show segment differences 80 | 4. Dive deeper into the most interesting segments 81 | 82 | **Quick commands to try:** 83 | • `segment_by_column('{dataset_name}', '{categorical_cols[0] if categorical_cols else "category_column"}')`""" 84 | 85 | if categorical_cols and numerical_cols: 86 | prompt += f""" 87 | • `create_chart('{dataset_name}', 'bar', '{categorical_cols[0]}', '{numerical_cols[0]}')`""" 88 | 89 | prompt += f""" 90 | 91 | Which segmentation approach interests you most? 
I can guide you through the specific analysis steps!""" 92 | 93 | return prompt 94 | 95 | except Exception as e: 96 | return f"Error generating segmentation workshop prompt: {str(e)}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/__init__.py: -------------------------------------------------------------------------------- 1 | """Resources package.""" 2 | 3 | from .get_server_config_resource import get_server_config 4 | from .get_loaded_datasets_resource import get_loaded_datasets 5 | from .get_dataset_schema_resource import get_dataset_schema 6 | from .get_dataset_summary_resource import get_dataset_summary 7 | from .get_dataset_sample_resource import get_dataset_sample 8 | from .get_current_dataset_resource import get_current_dataset 9 | from .get_available_analyses_resource import get_available_analyses 10 | from .get_column_types_resource import get_column_types 11 | from .get_analysis_suggestions_resource import get_analysis_suggestions 12 | from .get_memory_usage_resource import get_memory_usage 13 | from .get_user_profile_resource import get_user_profile 14 | from .get_system_status_resource import get_system_status 15 | 16 | __all__ = [ 17 | "get_server_config", 18 | "get_loaded_datasets", 19 | "get_dataset_schema", 20 | "get_dataset_summary", 21 | "get_dataset_sample", 22 | "get_current_dataset", 23 | "get_available_analyses", 24 | "get_column_types", 25 | "get_analysis_suggestions", 26 | "get_memory_usage", 27 | "get_user_profile", 28 | "get_system_status" 29 | ] -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_analysis_suggestions_resource.py: -------------------------------------------------------------------------------- 1 | """Analysis suggestions resource implementation.""" 2 | 3 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_analysis_suggestions(dataset_name: Optional[str] = None) -> dict: 8 | """AI-generated analysis recommendations.""" 9 | try: 10 | if dataset_name is None: 11 | datasets = DatasetManager.list_datasets() 12 | if not datasets: 13 | return {"error": "No datasets loaded"} 14 | dataset_name = datasets[-1] 15 | 16 | # Import here to avoid circular imports 17 | from ..tools.pandas_tools import suggest_analysis 18 | return await suggest_analysis(dataset_name) 19 | 20 | except Exception as e: 21 | return {"error": f"Failed to get suggestions: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_available_analyses_resource.py: -------------------------------------------------------------------------------- 1 | """Available analyses resource implementation.""" 2 | 3 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_available_analyses(dataset_name: Optional[str] = None) -> dict: 8 | """List of applicable analysis types for current data.""" 9 | try: 10 | if dataset_name is None: 11 | datasets = DatasetManager.list_datasets() 12 | if not datasets: 13 | return {"error": "No datasets loaded"} 14 | dataset_name = datasets[-1] # Use most recent 15 | 16 | if dataset_name not in dataset_schemas: 17 | return {"error": f"Dataset '{dataset_name}' not loaded"} 18 | 19 | schema = dataset_schemas[dataset_name] 20 | 21 | # Get columns by type 22 | 
numerical_cols = [name for name, info in schema.columns.items() 23 | if info.suggested_role == 'numerical'] 24 | categorical_cols = [name for name, info in schema.columns.items() 25 | if info.suggested_role == 'categorical'] 26 | temporal_cols = [name for name, info in schema.columns.items() 27 | if info.suggested_role == 'temporal'] 28 | 29 | available_analyses = [] 30 | 31 | # Basic analyses always available 32 | available_analyses.extend([ 33 | { 34 | "type": "data_quality_assessment", 35 | "description": "Comprehensive data quality report", 36 | "requirements": "Any dataset", 37 | "tool": "validate_data_quality" 38 | }, 39 | { 40 | "type": "distribution_analysis", 41 | "description": "Analyze column distributions", 42 | "requirements": "Any columns", 43 | "tool": "analyze_distributions" 44 | } 45 | ]) 46 | 47 | # Conditional analyses based on column types 48 | if len(numerical_cols) >= 2: 49 | available_analyses.append({ 50 | "type": "correlation_analysis", 51 | "description": f"Find relationships between {len(numerical_cols)} numerical variables", 52 | "requirements": "2+ numerical columns", 53 | "tool": "find_correlations", 54 | "applicable_columns": numerical_cols 55 | }) 56 | 57 | available_analyses.append({ 58 | "type": "outlier_detection", 59 | "description": "Detect outliers in numerical data", 60 | "requirements": "Numerical columns", 61 | "tool": "detect_outliers", 62 | "applicable_columns": numerical_cols 63 | }) 64 | 65 | if categorical_cols: 66 | available_analyses.append({ 67 | "type": "segmentation", 68 | "description": f"Group data by {len(categorical_cols)} categorical variables", 69 | "requirements": "Categorical columns", 70 | "tool": "segment_by_column", 71 | "applicable_columns": categorical_cols 72 | }) 73 | 74 | if temporal_cols and numerical_cols: 75 | available_analyses.append({ 76 | "type": "time_series", 77 | "description": "Analyze trends over time", 78 | "requirements": "Date + numerical columns", 79 | "tool": "time_series_analysis", 80 | "applicable_columns": {"date_columns": temporal_cols, "value_columns": numerical_cols} 81 | }) 82 | 83 | if schema.row_count > 1: # feature importance needs more than one row 84 | available_analyses.append({ 85 | "type": "feature_importance", 86 | "description": "Calculate feature importance for prediction", 87 | "requirements": "Numerical target + feature columns", 88 | "tool": "calculate_feature_importance" 89 | }) 90 | 91 | return { 92 | "dataset_name": dataset_name, 93 | "available_analyses": available_analyses, 94 | "dataset_summary": { 95 | "numerical_columns": len(numerical_cols), 96 | "categorical_columns": len(categorical_cols), 97 | "temporal_columns": len(temporal_cols), 98 | "total_rows": schema.row_count 99 | } 100 | } 101 | 102 | except Exception as e: 103 | return {"error": f"Failed to get available analyses: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_column_types_resource.py: -------------------------------------------------------------------------------- 1 | """Column types resource implementation.""" 2 | 3 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_column_types(dataset_name: Optional[str] = None) -> dict: 8 | """Column classification (numerical, categorical, temporal, identifier).""" 9 | try: 10 | if dataset_name is None: 11 | datasets = DatasetManager.list_datasets() 12 | if not datasets: 13 | return {"error": "No datasets
loaded"} 14 | dataset_name = datasets[-1] 15 | 16 | if dataset_name not in dataset_schemas: 17 | return {"error": f"Dataset '{dataset_name}' not loaded"} 18 | 19 | schema = dataset_schemas[dataset_name] 20 | 21 | column_classification = {} 22 | type_counts = {"numerical": 0, "categorical": 0, "temporal": 0, "identifier": 0} 23 | 24 | for col_name, col_info in schema.columns.items(): 25 | column_classification[col_name] = { 26 | "suggested_role": col_info.suggested_role, 27 | "dtype": col_info.dtype, 28 | "unique_values": col_info.unique_values, 29 | "null_percentage": col_info.null_percentage, 30 | "sample_values": col_info.sample_values 31 | } 32 | type_counts[col_info.suggested_role] += 1 33 | 34 | return { 35 | "dataset_name": dataset_name, 36 | "column_classification": column_classification, 37 | "type_summary": type_counts, 38 | "total_columns": len(schema.columns) 39 | } 40 | 41 | except Exception as e: 42 | return {"error": f"Failed to get column types: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_current_dataset_resource.py: -------------------------------------------------------------------------------- 1 | """Current dataset resource implementation.""" 2 | 3 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_current_dataset() -> dict: 8 | """Currently active dataset name and basic stats.""" 9 | try: 10 | datasets = DatasetManager.list_datasets() 11 | 12 | if not datasets: 13 | return { 14 | "status": "no_datasets_loaded", 15 | "message": "No datasets currently loaded", 16 | "suggestion": "Use load_dataset() to load a dataset" 17 | } 18 | 19 | # Return info about the most recently loaded dataset 20 | latest_dataset = datasets[-1] # Assuming last is most recent 21 | info = DatasetManager.get_dataset_info(latest_dataset) 22 | 23 | return { 24 | "current_dataset": latest_dataset, 25 | "shape": info["shape"], 26 | "memory_mb": round(info["memory_usage_mb"], 2), 27 | "all_loaded_datasets": datasets, 28 | "total_datasets": len(datasets) 29 | } 30 | 31 | except Exception as e: 32 | return {"error": f"Failed to get current dataset: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_dataset_sample_resource.py: -------------------------------------------------------------------------------- 1 | """Dataset sample resource implementation.""" 2 | 3 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_dataset_sample(dataset_name: str, n_rows: int = 5) -> dict: 8 | """Sample rows for data preview.""" 9 | try: 10 | df = DatasetManager.get_dataset(dataset_name) 11 | 12 | # Get sample rows 13 | sample_df = df.head(n_rows) 14 | 15 | return { 16 | "dataset_name": dataset_name, 17 | "sample_size": len(sample_df), 18 | "total_rows": len(df), 19 | "columns": list(df.columns), 20 | "sample_data": sample_df.to_dict('records') 21 | } 22 | 23 | except Exception as e: 24 | return {"error": f"Failed to get sample: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_dataset_schema_resource.py: -------------------------------------------------------------------------------- 1 | """Dataset schema resource implementation.""" 2 | 3 | from ..models.schemas import 
DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_dataset_schema(dataset_name: str) -> dict: 8 | """Get dynamic schema for any loaded dataset.""" 9 | try: 10 | if dataset_name not in dataset_schemas: 11 | return {"error": f"Dataset '{dataset_name}' not loaded"} 12 | 13 | schema = dataset_schemas[dataset_name] 14 | 15 | # Organize columns by type 16 | columns_by_type = { 17 | "numerical": [], 18 | "categorical": [], 19 | "temporal": [], 20 | "identifier": [] 21 | } 22 | 23 | for col_name, col_info in schema.columns.items(): 24 | columns_by_type[col_info.suggested_role].append({ 25 | "name": col_name, 26 | "dtype": col_info.dtype, 27 | "unique_values": col_info.unique_values, 28 | "null_percentage": round(col_info.null_percentage, 1), 29 | "sample_values": col_info.sample_values 30 | }) 31 | 32 | return { 33 | "dataset_name": dataset_name, 34 | "total_rows": schema.row_count, 35 | "total_columns": len(schema.columns), 36 | "columns_by_type": columns_by_type, 37 | "suggested_analyses": schema.suggested_analyses, 38 | "schema_generated": True 39 | } 40 | 41 | except Exception as e: 42 | return {"error": f"Failed to get schema: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_dataset_summary_resource.py: -------------------------------------------------------------------------------- 1 | """Dataset summary resource implementation.""" 2 | 3 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_dataset_summary(dataset_name: str) -> dict: 8 | """Statistical summary (pandas.describe() equivalent).""" 9 | try: 10 | df = DatasetManager.get_dataset(dataset_name) 11 | 12 | # Get basic info 13 | summary = { 14 | "dataset_name": dataset_name, 15 | "shape": df.shape, 16 | "memory_usage_mb": round(df.memory_usage(deep=True).sum() / 1024**2, 2) 17 | } 18 | 19 | # Numerical summary 20 | numerical_cols = df.select_dtypes(include=['number']).columns 21 | if len(numerical_cols) > 0: 22 | summary["numerical_summary"] = df[numerical_cols].describe().to_dict() 23 | 24 | # Categorical summary 25 | categorical_cols = df.select_dtypes(include=['object', 'category']).columns 26 | if len(categorical_cols) > 0: 27 | summary["categorical_summary"] = {} 28 | for col in categorical_cols: 29 | summary["categorical_summary"][col] = { 30 | "unique_count": df[col].nunique(), 31 | "top_values": df[col].value_counts().head(5).to_dict(), 32 | "null_count": df[col].isnull().sum() 33 | } 34 | 35 | # Missing data summary 36 | missing_data = df.isnull().sum() 37 | summary["missing_data"] = { 38 | "total_missing": int(missing_data.sum()), 39 | "columns_with_missing": missing_data[missing_data > 0].to_dict() 40 | } 41 | 42 | return summary 43 | 44 | except Exception as e: 45 | return {"error": f"Failed to generate summary: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_loaded_datasets_resource.py: -------------------------------------------------------------------------------- 1 | """Loaded datasets resource implementation.""" 2 | 3 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_loaded_datasets() -> dict: 8 | """List all datasets currently in memory.""" 9 | try: 10 | datasets = [] 11 | total_memory = 
0 12 | 13 | for name in DatasetManager.list_datasets(): 14 | info = DatasetManager.get_dataset_info(name) 15 | memory_mb = info["memory_usage_mb"] 16 | total_memory += memory_mb 17 | 18 | datasets.append({ 19 | "name": name, 20 | "rows": info["shape"][0], 21 | "columns": info["shape"][1], 22 | "memory_mb": round(memory_mb, 1), 23 | "column_types": { 24 | role: len([c for c, col_info in info["schema"]["columns"].items() 25 | if col_info["suggested_role"] == role]) 26 | for role in ["numerical", "categorical", "temporal", "identifier"] 27 | } 28 | }) 29 | 30 | return { 31 | "datasets": datasets, 32 | "total_datasets": len(datasets), 33 | "total_memory_mb": round(total_memory, 1), 34 | "status": "loaded" if datasets else "empty" 35 | } 36 | 37 | except Exception as e: 38 | return {"error": f"Failed to list datasets: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_memory_usage_resource.py: -------------------------------------------------------------------------------- 1 | """Memory usage resource implementation.""" 2 | 3 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_memory_usage() -> dict: 8 | """Monitor memory usage of loaded datasets.""" 9 | try: 10 | usage = [] 11 | total_memory = 0 12 | 13 | for name in DatasetManager.list_datasets(): 14 | info = DatasetManager.get_dataset_info(name) 15 | memory_mb = info["memory_usage_mb"] 16 | total_memory += memory_mb 17 | 18 | usage.append({ 19 | "dataset": name, 20 | "memory_mb": round(memory_mb, 1), 21 | "rows": info["shape"][0], 22 | "columns": info["shape"][1], 23 | "memory_per_row_kb": round(memory_mb * 1024 / info["shape"][0], 2) if info["shape"][0] > 0 else 0 24 | }) 25 | 26 | # Sort by memory usage 27 | usage.sort(key=lambda x: x["memory_mb"], reverse=True) 28 | 29 | return { 30 | "datasets": usage, 31 | "total_memory_mb": round(total_memory, 1), 32 | "dataset_count": len(usage), 33 | "memory_recommendations": [ 34 | "Consider sampling large datasets before analysis", 35 | "Clear unused datasets with clear_dataset()", 36 | "Use memory_optimization_report() for optimization tips" 37 | ] if total_memory > 100 else ["Memory usage is optimal"] 38 | } 39 | 40 | except Exception as e: 41 | return {"error": f"Failed to get memory usage: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_server_config_resource.py: -------------------------------------------------------------------------------- 1 | """Server configuration resource implementation.""" 2 | 3 | from ..config.settings import settings 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_server_config() -> dict: 8 | """Get server configuration.""" 9 | config = settings.server_info.copy() 10 | config.update({ 11 | "analytics_features": [ 12 | "dataset_loading", 13 | "schema_discovery", 14 | "correlation_analysis", 15 | "segmentation", 16 | "data_quality_assessment", 17 | "visualization", 18 | "outlier_detection", 19 | "time_series_analysis" 20 | ], 21 | "supported_formats": ["CSV", "JSON"], 22 | "memory_storage": "in_memory_dataframes" 23 | }) 24 | return config -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_system_status_resource.py: -------------------------------------------------------------------------------- 1 | """System 
status resource implementation (legacy).""" 2 | 3 | from ..models.schemas import DatasetManager 4 | from ..config.settings import settings 5 | from typing import Dict, Any, Optional 6 | 7 | 8 | async def get_system_status() -> dict: 9 | """Get system status information.""" 10 | datasets = DatasetManager.list_datasets() 11 | total_memory = sum(DatasetManager.get_dataset_info(name)["memory_usage_mb"] for name in datasets) 12 | 13 | return { 14 | "status": "healthy", 15 | "uptime": "Active session", 16 | "version": settings.version, 17 | "features": [ 18 | "dataset_loading", 19 | "schema_discovery", 20 | "correlation_analysis", 21 | "segmentation", 22 | "data_quality_assessment", 23 | "visualization", 24 | "outlier_detection", 25 | "time_series_analysis" 26 | ], 27 | "datasets_loaded": len(datasets), 28 | "total_memory_mb": round(total_memory, 1), 29 | "dependencies": { 30 | "mcp": "1.9.2", 31 | "pandas": "2.2.3+", 32 | "plotly": "6.1.2+", 33 | "pydantic": "2.11.5" 34 | } 35 | } -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/resources/get_user_profile_resource.py: -------------------------------------------------------------------------------- 1 | """User profile resource implementation (legacy).""" 2 | 3 | from ..models.schemas import UserProfile 4 | from typing import Dict, Any, Optional 5 | 6 | 7 | async def get_user_profile(user_id: str) -> dict: 8 | """Get user profile by ID.""" 9 | # In production, this would fetch from a database 10 | profile = UserProfile( 11 | id=user_id, 12 | name=f"User {user_id}", 13 | email=f"user{user_id}@example.com", 14 | status="active", 15 | preferences={ 16 | "theme": "dark", 17 | "notifications": True, 18 | "language": "en" 19 | } 20 | ) 21 | 22 | return profile.model_dump() -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """Tools package.""" 2 | 3 | # Import modules for grouped access 4 | from . 
import pandas_tools 5 | 6 | # Analytics tools 7 | from .validate_data_quality_tool import validate_data_quality 8 | from .compare_datasets_tool import compare_datasets 9 | from .merge_datasets_tool import merge_datasets 10 | from .generate_dashboard_tool import generate_dashboard 11 | from .export_insights_tool import export_insights 12 | from .calculate_feature_importance_tool import calculate_feature_importance 13 | from .memory_optimization_report_tool import memory_optimization_report 14 | from .execute_custom_analytics_code_tool import execute_custom_analytics_code 15 | 16 | # Pandas tools 17 | from .load_dataset_tool import load_dataset 18 | from .list_loaded_datasets_tool import list_loaded_datasets 19 | from .segment_by_column_tool import segment_by_column 20 | from .find_correlations_tool import find_correlations 21 | from .create_chart_tool import create_chart 22 | from .analyze_distributions_tool import analyze_distributions 23 | from .detect_outliers_tool import detect_outliers 24 | from .time_series_analysis_tool import time_series_analysis 25 | from .suggest_analysis_tool import suggest_analysis 26 | 27 | __all__ = [ 28 | # Modules 29 | "pandas_tools", 30 | 31 | # Analytics tools 32 | "validate_data_quality", 33 | "compare_datasets", 34 | "merge_datasets", 35 | "generate_dashboard", 36 | "export_insights", 37 | "calculate_feature_importance", 38 | "memory_optimization_report", 39 | "execute_custom_analytics_code", 40 | 41 | # Pandas tools 42 | "load_dataset", 43 | "list_loaded_datasets", 44 | "segment_by_column", 45 | "find_correlations", 46 | "create_chart", 47 | "analyze_distributions", 48 | "detect_outliers", 49 | "time_series_analysis", 50 | "suggest_analysis" 51 | ] -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/analyze_distributions_tool.py: -------------------------------------------------------------------------------- 1 | """Distribution analysis tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional, Union 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 7 | 8 | 9 | async def analyze_distributions(dataset_name: str, column_name: str) -> dict: 10 | """Analyze distribution of any column.""" 11 | try: 12 | df = DatasetManager.get_dataset(dataset_name) 13 | 14 | if column_name not in df.columns: 15 | return {"error": f"Column '{column_name}' not found in dataset"} 16 | 17 | series = df[column_name] 18 | 19 | result = { 20 | "dataset": dataset_name, 21 | "column": column_name, 22 | "dtype": str(series.dtype), 23 | "total_values": len(series), 24 | "unique_values": series.nunique(), 25 | "null_values": series.isnull().sum(), 26 | "null_percentage": round(series.isnull().mean() * 100, 2) 27 | } 28 | 29 | if pd.api.types.is_numeric_dtype(series): 30 | # Numerical distribution 31 | result.update({ 32 | "distribution_type": "numerical", 33 | "mean": round(series.mean(), 3), 34 | "median": round(series.median(), 3), 35 | "std": round(series.std(), 3), 36 | "min": series.min(), 37 | "max": series.max(), 38 | "quartiles": { 39 | "q25": round(series.quantile(0.25), 3), 40 | "q50": round(series.quantile(0.50), 3), 41 | "q75": round(series.quantile(0.75), 3) 42 | }, 43 | "skewness": round(series.skew(), 3), 44 | "kurtosis": round(series.kurtosis(), 3) 45 | }) 46 | else: 47 | # Categorical distribution 48 | value_counts = series.value_counts().head(10) 49 | result.update({ 50 | "distribution_type": 
"categorical", 51 | "most_frequent": value_counts.index[0] if len(value_counts) > 0 else None, 52 | "frequency_of_most_common": int(value_counts.iloc[0]) if len(value_counts) > 0 else 0, 53 | "top_10_values": value_counts.to_dict() 54 | }) 55 | 56 | return result 57 | 58 | except Exception as e: 59 | return {"error": f"Distribution analysis failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/calculate_feature_importance_tool.py: -------------------------------------------------------------------------------- 1 | """Feature importance calculation tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 7 | 8 | 9 | async def calculate_feature_importance( 10 | dataset_name: str, 11 | target_column: str, 12 | feature_columns: Optional[List[str]] = None 13 | ) -> dict: 14 | """Calculate feature importance for predictive modeling.""" 15 | try: 16 | df = DatasetManager.get_dataset(dataset_name) 17 | 18 | if target_column not in df.columns: 19 | return {"error": f"Target column '{target_column}' not found"} 20 | 21 | # Auto-select feature columns if not provided 22 | if feature_columns is None: 23 | feature_columns = [col for col in df.columns if col != target_column] 24 | 25 | # Filter to numerical columns only for correlation-based importance 26 | numerical_features = [] 27 | for col in feature_columns: 28 | if col in df.columns and pd.api.types.is_numeric_dtype(df[col]): 29 | numerical_features.append(col) 30 | 31 | if not numerical_features: 32 | return {"error": "No numerical feature columns found"} 33 | 34 | if not pd.api.types.is_numeric_dtype(df[target_column]): 35 | return {"error": "Target column must be numerical for correlation-based feature importance"} 36 | 37 | # Calculate correlations with target 38 | correlations = df[numerical_features + [target_column]].corr()[target_column] 39 | 40 | # Calculate feature importance (absolute correlation) 41 | feature_importance = {} 42 | for feature in numerical_features: 43 | correlation = correlations[feature] 44 | importance = abs(correlation) if not pd.isna(correlation) else 0 45 | feature_importance[feature] = { 46 | "correlation": round(correlation, 4), 47 | "importance": round(importance, 4), 48 | "rank": 0 # Will be set below 49 | } 50 | 51 | # Rank features by importance 52 | sorted_features = sorted(feature_importance.items(), key=lambda x: x[1]["importance"], reverse=True) 53 | for rank, (feature, info) in enumerate(sorted_features, 1): 54 | feature_importance[feature]["rank"] = rank 55 | 56 | return { 57 | "dataset": dataset_name, 58 | "target_column": target_column, 59 | "method": "correlation_based", 60 | "feature_importance": feature_importance, 61 | "top_features": [f[0] for f in sorted_features[:5]], 62 | "features_analyzed": len(numerical_features) 63 | } 64 | 65 | except Exception as e: 66 | return {"error": f"Feature importance calculation failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/compare_datasets_tool.py: -------------------------------------------------------------------------------- 1 | """Dataset comparison tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional 6 | from ..models.schemas import DatasetManager, loaded_datasets, 
dataset_schemas 7 | 8 | 9 | async def compare_datasets(dataset_a: str, dataset_b: str, common_columns: Optional[List[str]] = None) -> dict: 10 | """Compare multiple datasets.""" 11 | try: 12 | df_a = DatasetManager.get_dataset(dataset_a) 13 | df_b = DatasetManager.get_dataset(dataset_b) 14 | 15 | # Find common columns if not specified 16 | if common_columns is None: 17 | common_columns = list(set(df_a.columns) & set(df_b.columns)) 18 | 19 | if not common_columns: 20 | return {"error": "No common columns found between datasets"} 21 | 22 | comparison = { 23 | "dataset_a": dataset_a, 24 | "dataset_b": dataset_b, 25 | "shape_comparison": { 26 | "dataset_a_shape": df_a.shape, 27 | "dataset_b_shape": df_b.shape, 28 | "row_difference": df_a.shape[0] - df_b.shape[0], 29 | "column_difference": df_a.shape[1] - df_b.shape[1] 30 | }, 31 | "common_columns": common_columns, 32 | "column_comparisons": {} 33 | } 34 | 35 | # Compare each common column 36 | for col in common_columns: 37 | col_comparison = { 38 | "column": col, 39 | "dtype_a": str(df_a[col].dtype), 40 | "dtype_b": str(df_b[col].dtype), 41 | "unique_values_a": df_a[col].nunique(), 42 | "unique_values_b": df_b[col].nunique(), 43 | "null_pct_a": round(df_a[col].isnull().mean() * 100, 2), 44 | "null_pct_b": round(df_b[col].isnull().mean() * 100, 2) 45 | } 46 | 47 | # Numerical comparison 48 | if pd.api.types.is_numeric_dtype(df_a[col]) and pd.api.types.is_numeric_dtype(df_b[col]): 49 | col_comparison.update({ 50 | "mean_a": round(df_a[col].mean(), 3), 51 | "mean_b": round(df_b[col].mean(), 3), 52 | "mean_difference": round(df_a[col].mean() - df_b[col].mean(), 3), 53 | "std_a": round(df_a[col].std(), 3), 54 | "std_b": round(df_b[col].std(), 3) 55 | }) 56 | 57 | comparison["column_comparisons"][col] = col_comparison 58 | 59 | return comparison 60 | 61 | except Exception as e: 62 | return {"error": f"Dataset comparison failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/create_chart_tool.py: -------------------------------------------------------------------------------- 1 | """Chart creation tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import plotly.express as px 6 | import plotly.graph_objects as go 7 | from plotly.offline import plot 8 | import json 9 | from pathlib import Path 10 | from typing import List, Dict, Any, Optional, Union 11 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 12 | 13 | 14 | async def create_chart( 15 | dataset_name: str, 16 | chart_type: str, 17 | x_column: str, 18 | y_column: Optional[str] = None, 19 | groupby_column: Optional[str] = None, 20 | title: Optional[str] = None, 21 | save_path: Optional[str] = None 22 | ) -> dict: 23 | """Create generic charts that adapt to any dataset.""" 24 | try: 25 | df = DatasetManager.get_dataset(dataset_name) 26 | 27 | # Validate columns exist 28 | required_cols = [x_column] 29 | if y_column: 30 | required_cols.append(y_column) 31 | if groupby_column: 32 | required_cols.append(groupby_column) 33 | 34 | missing_cols = [col for col in required_cols if col not in df.columns] 35 | if missing_cols: 36 | return {"error": f"Columns not found: {missing_cols}"} 37 | 38 | # Generate title if not provided 39 | if title is None: 40 | title = f"{chart_type.title()} Chart: {x_column}" 41 | if y_column: 42 | title += f" vs {y_column}" 43 | if groupby_column: 44 | title += f" (grouped by {groupby_column})" 45 | 46 | # Create chart based on 
type 47 | fig = None 48 | chart_data = None 49 | 50 | if chart_type == "histogram": 51 | fig = px.histogram(df, x=x_column, color=groupby_column, title=title) 52 | chart_data = df[x_column].value_counts().head(20).to_dict() 53 | 54 | elif chart_type == "bar": 55 | if not y_column: 56 | # Count plot 57 | if groupby_column: 58 | chart_data = df.groupby([x_column, groupby_column]).size().unstack(fill_value=0) 59 | fig = px.bar(chart_data, title=title) 60 | else: 61 | chart_data = df[x_column].value_counts().head(20) 62 | fig = px.bar(x=chart_data.index, y=chart_data.values, title=title) 63 | else: 64 | # Aggregated bar chart 65 | if groupby_column: 66 | agg_data = df.groupby([x_column, groupby_column])[y_column].mean().unstack(fill_value=0) 67 | fig = px.bar(agg_data, title=title) 68 | chart_data = agg_data.to_dict() 69 | else: 70 | agg_data = df.groupby(x_column)[y_column].mean() 71 | fig = px.bar(x=agg_data.index, y=agg_data.values, title=title, 72 | labels={'x': x_column, 'y': f'Mean {y_column}'}) 73 | chart_data = agg_data.to_dict() 74 | 75 | elif chart_type == "scatter": 76 | if not y_column: 77 | return {"error": "Scatter plot requires both x_column and y_column"} 78 | fig = px.scatter(df, x=x_column, y=y_column, color=groupby_column, title=title) 79 | chart_data = {"x_mean": df[x_column].mean(), "y_mean": df[y_column].mean()} 80 | 81 | elif chart_type == "line": 82 | if not y_column: 83 | return {"error": "Line plot requires both x_column and y_column"} 84 | 85 | # Sort by x_column for proper line plotting 86 | df_sorted = df.sort_values(x_column) 87 | 88 | if groupby_column: 89 | fig = px.line(df_sorted, x=x_column, y=y_column, color=groupby_column, title=title) 90 | else: 91 | # Group by x_column and aggregate y_column 92 | line_data = df_sorted.groupby(x_column)[y_column].mean().reset_index() 93 | fig = px.line(line_data, x=x_column, y=y_column, title=title) 94 | 95 | chart_data = {"trend": "line_chart_generated"} 96 | 97 | elif chart_type == "box": 98 | if not y_column: 99 | fig = px.box(df, x=x_column, title=title) 100 | else: 101 | fig = px.box(df, x=x_column, y=y_column, title=title) 102 | chart_data = {"quartiles": "box_plot_generated"} 103 | 104 | else: 105 | return {"error": f"Unsupported chart type: {chart_type}. 
Supported: histogram, bar, scatter, line, box"} 106 | 107 | # Save the chart (defaults to outputs/charts/ when no save_path is given) 108 | chart_file = None 109 | if fig is not None: 110 | if save_path is None: 111 | # Create outputs/charts directory if it doesn't exist 112 | outputs_dir = Path("outputs/charts") 113 | outputs_dir.mkdir(parents=True, exist_ok=True) 114 | save_path = outputs_dir / f"chart_{dataset_name}_{chart_type}_{x_column}.html" 115 | 116 | chart_file = str(Path(save_path).with_suffix('.html')) 117 | fig.write_html(chart_file) 118 | 119 | return { 120 | "dataset": dataset_name, 121 | "chart_type": chart_type, 122 | "chart_config": { 123 | "x_column": x_column, 124 | "y_column": y_column, 125 | "groupby_column": groupby_column, 126 | "title": title 127 | }, 128 | "chart_data_sample": chart_data, 129 | "chart_file": chart_file, 130 | "status": "success" 131 | } 132 | 133 | except Exception as e: 134 | return {"error": f"Chart creation failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/detect_outliers_tool.py: -------------------------------------------------------------------------------- 1 | """Outlier detection tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional, Union 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 7 | 8 | 9 | async def detect_outliers( 10 | dataset_name: str, 11 | columns: Optional[List[str]] = None, 12 | method: str = "iqr" 13 | ) -> dict: 14 | """Detect outliers using configurable methods.""" 15 | try: 16 | df = DatasetManager.get_dataset(dataset_name) 17 | 18 | # Auto-select numerical columns if none specified 19 | if columns is None: 20 | columns = df.select_dtypes(include=[np.number]).columns.tolist() 21 | 22 | if not columns: 23 | return {"error": "No numerical columns found for outlier detection"} 24 | 25 | # Filter to existing columns 26 | existing_columns = [col for col in columns if col in df.columns] 27 | 28 | outliers_info = {} 29 | total_outliers = 0 30 | 31 | for col in existing_columns: 32 | series = df[col].dropna() 33 | 34 | if method == "iqr": 35 | Q1 = series.quantile(0.25) 36 | Q3 = series.quantile(0.75) 37 | IQR = Q3 - Q1 38 | lower_bound = Q1 - 1.5 * IQR 39 | upper_bound = Q3 + 1.5 * IQR 40 | 41 | outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)][col] 42 | 43 | elif method == "zscore": 44 | z_scores = np.abs((series - series.mean()) / series.std()) 45 | outlier_indices = z_scores > 3 46 | outliers = series[outlier_indices] 47 | lower_bound = series.mean() - 3 * series.std() 48 | upper_bound = series.mean() + 3 * series.std() 49 | 50 | else: 51 | return {"error": f"Unsupported method: {method}.
Use 'iqr' or 'zscore'"} 52 | 53 | outlier_count = len(outliers) 54 | total_outliers += outlier_count 55 | 56 | outliers_info[col] = { 57 | "outlier_count": outlier_count, 58 | "outlier_percentage": round(outlier_count / len(series) * 100, 2), 59 | "lower_bound": round(lower_bound, 3), 60 | "upper_bound": round(upper_bound, 3), 61 | "outlier_values": outliers.head(10).tolist(), 62 | "method": method 63 | } 64 | 65 | return { 66 | "dataset": dataset_name, 67 | "method": method, 68 | "columns_analyzed": existing_columns, 69 | "total_outliers": total_outliers, 70 | "outliers_by_column": outliers_info 71 | } 72 | 73 | except Exception as e: 74 | return {"error": f"Outlier detection failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/execute_custom_analytics_code_tool.py: -------------------------------------------------------------------------------- 1 | """Custom analytics code execution tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 7 | 8 | 9 | async def execute_custom_analytics_code(dataset_name: str, python_code: str) -> str: 10 | """ 11 | Execute custom Python code against a loaded dataset. 12 | 13 | Implementation steps: 14 | 1. Get dataset from DatasetManager 15 | 2. Serialize dataset to JSON for subprocess 16 | 3. Wrap user code in execution template 17 | 4. Execute via subprocess with uv run python -c 18 | 5. Capture and return stdout/stderr 19 | """ 20 | import asyncio 21 | import json 22 | 23 | try: 24 | # Step 1: Get dataset 25 | df = DatasetManager.get_dataset(dataset_name) 26 | 27 | # Step 2: Serialize dataset 28 | dataset_json = df.to_json(orient='records') 29 | 30 | # Step 3: Create execution template 31 | # Need to properly indent user code 32 | import textwrap 33 | indented_user_code = textwrap.indent(python_code, ' ') 34 | 35 | execution_code = f''' 36 | import pandas as pd 37 | import numpy as np 38 | import plotly.express as px 39 | import json 40 | 41 | try: 42 | # Load dataset 43 | dataset_data = {dataset_json} 44 | df = pd.DataFrame(dataset_data) 45 | 46 | # Execute user code 47 | {indented_user_code} 48 | 49 | except Exception as e: 50 | print(f"ERROR: {{type(e).__name__}}: {{str(e)}}") 51 | import traceback 52 | print("Traceback:") 53 | print(traceback.format_exc()) 54 | ''' 55 | 56 | # Step 4: Execute subprocess 57 | process = await asyncio.create_subprocess_exec( 58 | 'uv', 'run', '--with', 'pandas', '--with', 'numpy', '--with', 'plotly', 59 | 'python', '-c', execution_code, 60 | stdout=asyncio.subprocess.PIPE, 61 | stderr=asyncio.subprocess.STDOUT 62 | ) 63 | 64 | # Step 5: Get output with timeout 65 | try: 66 | stdout, _ = await asyncio.wait_for(process.communicate(), timeout=30.0) 67 | return stdout.decode('utf-8') 68 | except asyncio.TimeoutError: 69 | process.kill() 70 | await process.wait() 71 | return "TIMEOUT: Code execution exceeded 30 second limit" 72 | 73 | except Exception as e: 74 | return f"EXECUTION ERROR: {type(e).__name__}: {str(e)}" -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/export_insights_tool.py: -------------------------------------------------------------------------------- 1 | """Insights export tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional 6 | from 
datetime import datetime 7 | from pathlib import Path 8 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 9 | 10 | 11 | async def export_insights(dataset_name: str, format: str = "json", include_charts: bool = False) -> dict: 12 | """Export analysis in multiple formats.""" 13 | try: 14 | if dataset_name not in loaded_datasets: 15 | return {"error": f"Dataset '{dataset_name}' not loaded"} 16 | 17 | df = DatasetManager.get_dataset(dataset_name) 18 | schema = dataset_schemas[dataset_name] 19 | 20 | # Generate comprehensive insights 21 | insights = { 22 | "dataset_name": dataset_name, 23 | "export_timestamp": datetime.now().isoformat(), 24 | "dataset_info": { 25 | "shape": df.shape, 26 | "memory_usage_mb": round(df.memory_usage(deep=True).sum() / 1024**2, 2), 27 | "columns": list(df.columns) 28 | }, 29 | "schema_summary": { 30 | "numerical_columns": len([c for c, info in schema.columns.items() if info.suggested_role == 'numerical']), 31 | "categorical_columns": len([c for c, info in schema.columns.items() if info.suggested_role == 'categorical']), 32 | "temporal_columns": len([c for c, info in schema.columns.items() if info.suggested_role == 'temporal']), 33 | "identifier_columns": len([c for c, info in schema.columns.items() if info.suggested_role == 'identifier']) 34 | }, 35 | "data_quality": { 36 | "missing_data_columns": len([c for c in df.columns if df[c].isnull().any()]), 37 | "duplicate_rows": df.duplicated().sum(), 38 | "total_missing_values": df.isnull().sum().sum() 39 | }, 40 | "suggested_analyses": schema.suggested_analyses 41 | } 42 | 43 | # Add statistical summaries for numerical columns 44 | numerical_cols = [c for c, info in schema.columns.items() if info.suggested_role == 'numerical'] 45 | if numerical_cols: 46 | insights["numerical_summary"] = df[numerical_cols].describe().to_dict() 47 | 48 | # Add value counts for categorical columns 49 | categorical_cols = [c for c, info in schema.columns.items() if info.suggested_role == 'categorical'] 50 | if categorical_cols: 51 | insights["categorical_summary"] = {} 52 | for col in categorical_cols[:5]: # Limit to first 5 categorical columns 53 | insights["categorical_summary"][col] = df[col].value_counts().head(10).to_dict() 54 | 55 | # Export in requested format 56 | export_file = None 57 | 58 | if format.lower() == "json": 59 | import json 60 | # Create outputs/reports directory if it doesn't exist 61 | outputs_dir = Path("outputs/reports") 62 | outputs_dir.mkdir(parents=True, exist_ok=True) 63 | export_file = outputs_dir / f"insights_{dataset_name}.json" 64 | with open(export_file, 'w') as f: 65 | json.dump(insights, f, indent=2, default=str) 66 | 67 | elif format.lower() == "csv": 68 | # Create a summary CSV 69 | # Create outputs/reports directory if it doesn't exist 70 | outputs_dir = Path("outputs/reports") 71 | outputs_dir.mkdir(parents=True, exist_ok=True) 72 | export_file = outputs_dir / f"insights_{dataset_name}.csv" 73 | 74 | # Create summary rows 75 | summary_data = [] 76 | summary_data.append(["Dataset Name", dataset_name]) 77 | summary_data.append(["Export Date", insights["export_timestamp"]]) 78 | summary_data.append(["Total Rows", df.shape[0]]) 79 | summary_data.append(["Total Columns", df.shape[1]]) 80 | summary_data.append(["Memory Usage (MB)", insights["dataset_info"]["memory_usage_mb"]]) 81 | summary_data.append(["Numerical Columns", insights["schema_summary"]["numerical_columns"]]) 82 | summary_data.append(["Categorical Columns", insights["schema_summary"]["categorical_columns"]]) 83 | 
summary_data.append(["Missing Values", insights["data_quality"]["total_missing_values"]]) 84 | summary_data.append(["Duplicate Rows", insights["data_quality"]["duplicate_rows"]]) 85 | 86 | summary_df = pd.DataFrame(summary_data, columns=["Metric", "Value"]) 87 | summary_df.to_csv(export_file, index=False) 88 | 89 | elif format.lower() == "html": 90 | # Create HTML report 91 | # Create outputs/reports directory if it doesn't exist 92 | outputs_dir = Path("outputs/reports") 93 | outputs_dir.mkdir(parents=True, exist_ok=True) 94 | export_file = outputs_dir / f"insights_{dataset_name}.html" 95 | 96 | html_content = f""" 97 | 98 | Data Insights: {dataset_name} 99 | 100 |

Data Analysis Report: {dataset_name}

101 |

Dataset Overview

102 |
    103 |
  • Rows: {df.shape[0]:,}
  • 104 |
  • Columns: {df.shape[1]}
  • 105 |
  • Memory Usage: {insights['dataset_info']['memory_usage_mb']} MB
  • 106 |
107 | 108 |

Column Types

109 |
    110 |
  • Numerical: {insights['schema_summary']['numerical_columns']}
  • 111 |
  • Categorical: {insights['schema_summary']['categorical_columns']}
  • 112 |
  • Temporal: {insights['schema_summary']['temporal_columns']}
  • 113 |
  • Identifier: {insights['schema_summary']['identifier_columns']}
  • 114 |
115 | 116 |

Data Quality

117 |
    118 |
  • Missing Values: {insights['data_quality']['total_missing_values']}
  • 119 |
  • Duplicate Rows: {insights['data_quality']['duplicate_rows']}
  • 120 |
121 | 122 |

Suggested Analyses

123 |
    124 | {''.join([f'
  • {analysis}
  • ' for analysis in schema.suggested_analyses])} 125 |
126 | 127 | 128 | """ 129 | 130 | with open(export_file, 'w') as f: 131 | f.write(html_content) 132 | else: 133 | return {"error": f"Unsupported export format: {format}. Use 'json', 'csv', or 'html'"} 134 | 135 | return { 136 | "dataset": dataset_name, 137 | "export_format": format, 138 | "export_file": export_file, 139 | "insights_summary": { 140 | "total_metrics": len(insights), 141 | "has_numerical_summary": "numerical_summary" in insights, 142 | "has_categorical_summary": "categorical_summary" in insights 143 | }, 144 | "status": "success" 145 | } 146 | 147 | except Exception as e: 148 | return {"error": f"Export failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/find_correlations_tool.py: -------------------------------------------------------------------------------- 1 | """Correlation finding tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional, Union 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 7 | 8 | 9 | async def find_correlations( 10 | dataset_name: str, 11 | columns: Optional[List[str]] = None, 12 | threshold: float = 0.3 13 | ) -> dict: 14 | """Find correlations between numerical columns.""" 15 | try: 16 | df = DatasetManager.get_dataset(dataset_name) 17 | 18 | # Auto-select numerical columns if none specified 19 | if columns is None: 20 | columns = df.select_dtypes(include=[np.number]).columns.tolist() 21 | 22 | if len(columns) < 2: 23 | return {"error": "Need at least 2 numerical columns for correlation analysis"} 24 | 25 | # Filter to only existing columns 26 | existing_columns = [col for col in columns if col in df.columns] 27 | if len(existing_columns) < 2: 28 | return {"error": f"Only {len(existing_columns)} of specified columns exist in dataset"} 29 | 30 | # Calculate correlation matrix 31 | corr_matrix = df[existing_columns].corr() 32 | 33 | # Find strongest correlations (excluding self-correlations) 34 | strong_correlations = [] 35 | for i in range(len(existing_columns)): 36 | for j in range(i+1, len(existing_columns)): 37 | corr_value = corr_matrix.iloc[i, j] 38 | if not pd.isna(corr_value) and abs(corr_value) > threshold: 39 | strength = "strong" if abs(corr_value) > 0.7 else "moderate" 40 | direction = "positive" if corr_value > 0 else "negative" 41 | 42 | strong_correlations.append({ 43 | "column_1": existing_columns[i], 44 | "column_2": existing_columns[j], 45 | "correlation": round(corr_value, 3), 46 | "strength": strength, 47 | "direction": direction 48 | }) 49 | 50 | # Sort by absolute correlation value 51 | strong_correlations.sort(key=lambda x: abs(x["correlation"]), reverse=True) 52 | 53 | return { 54 | "dataset": dataset_name, 55 | "correlation_matrix": corr_matrix.to_dict(), 56 | "strong_correlations": strong_correlations, 57 | "columns_analyzed": existing_columns, 58 | "threshold": threshold 59 | } 60 | 61 | except Exception as e: 62 | return {"error": f"Correlation analysis failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/generate_dashboard_tool.py: -------------------------------------------------------------------------------- 1 | """Dashboard generation tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional 6 | from datetime import datetime 7 | from pathlib import Path 8 | from 
..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 9 | 10 | 11 | async def generate_dashboard(dataset_name: str, chart_configs: List[Dict[str, Any]]) -> dict: 12 | """Generate multi-chart dashboards from any data.""" 13 | try: 14 | df = DatasetManager.get_dataset(dataset_name) 15 | 16 | if not chart_configs: 17 | return {"error": "No chart configurations provided"} 18 | 19 | dashboard_results = { 20 | "dataset": dataset_name, 21 | "dashboard_generated": datetime.now().isoformat(), 22 | "charts": [], 23 | "dashboard_file": None 24 | } 25 | 26 | # Import here to avoid circular imports 27 | from .pandas_tools import create_chart 28 | 29 | # Generate each chart 30 | for i, config in enumerate(chart_configs): 31 | try: 32 | chart_result = await create_chart( 33 | dataset_name=dataset_name, 34 | chart_type=config.get("chart_type", "bar"), 35 | x_column=config["x_column"], 36 | y_column=config.get("y_column"), 37 | groupby_column=config.get("groupby_column"), 38 | title=config.get("title"), 39 | save_path=f"outputs/charts/dashboard_{dataset_name}_chart_{i+1}" 40 | ) 41 | 42 | if "error" not in chart_result: 43 | dashboard_results["charts"].append({ 44 | "chart_id": i+1, 45 | "config": config, 46 | "result": chart_result, 47 | "status": "success" 48 | }) 49 | else: 50 | dashboard_results["charts"].append({ 51 | "chart_id": i+1, 52 | "config": config, 53 | "error": chart_result["error"], 54 | "status": "failed" 55 | }) 56 | 57 | except Exception as chart_error: 58 | dashboard_results["charts"].append({ 59 | "chart_id": i+1, 60 | "config": config, 61 | "error": str(chart_error), 62 | "status": "failed" 63 | }) 64 | 65 | # Count successful charts 66 | successful_charts = len([c for c in dashboard_results["charts"] if c["status"] == "success"]) 67 | failed_charts = len([c for c in dashboard_results["charts"] if c["status"] == "failed"]) 68 | 69 | dashboard_results.update({ 70 | "summary": { 71 | "total_charts": len(chart_configs), 72 | "successful_charts": successful_charts, 73 | "failed_charts": failed_charts 74 | } 75 | }) 76 | 77 | return dashboard_results 78 | 79 | except Exception as e: 80 | return {"error": f"Dashboard generation failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/list_loaded_datasets_tool.py: -------------------------------------------------------------------------------- 1 | """List loaded datasets tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional, Union 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 7 | 8 | 9 | async def list_loaded_datasets() -> dict: 10 | """Show all datasets currently in memory.""" 11 | try: 12 | datasets = [] 13 | total_memory = 0 14 | 15 | for name in DatasetManager.list_datasets(): 16 | info = DatasetManager.get_dataset_info(name) 17 | memory_mb = info["memory_usage_mb"] 18 | total_memory += memory_mb 19 | 20 | datasets.append({ 21 | "name": name, 22 | "rows": info["shape"][0], 23 | "columns": info["shape"][1], 24 | "memory_mb": round(memory_mb, 1) 25 | }) 26 | 27 | return { 28 | "loaded_datasets": datasets, 29 | "total_datasets": len(datasets), 30 | "total_memory_mb": round(total_memory, 1) 31 | } 32 | 33 | except Exception as e: 34 | return {"error": f"Failed to list datasets: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/load_dataset_tool.py: 
-------------------------------------------------------------------------------- 1 | """Dataset loading tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional, Union 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 7 | 8 | 9 | async def load_dataset(file_path: str, dataset_name: str, sample_size: Optional[int] = None) -> dict: 10 | """Load any JSON/CSV dataset into memory with automatic schema discovery.""" 11 | try: 12 | result = DatasetManager.load_dataset(file_path, dataset_name) 13 | 14 | # Apply sampling if requested 15 | if sample_size and sample_size < result["rows"]: 16 | df = DatasetManager.get_dataset(dataset_name) 17 | sampled_df = df.sample(n=sample_size, random_state=42) 18 | loaded_datasets[dataset_name] = sampled_df 19 | 20 | # Update schema for sampled data 21 | schema = dataset_schemas[dataset_name] 22 | schema.row_count = len(sampled_df) 23 | 24 | result["rows"] = len(sampled_df) 25 | result["sampled"] = True 26 | result["original_rows"] = len(df) 27 | 28 | return result 29 | 30 | except Exception as e: 31 | return { 32 | "status": "error", 33 | "message": f"Failed to load dataset: {str(e)}" 34 | } -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/memory_optimization_report_tool.py: -------------------------------------------------------------------------------- 1 | """Memory optimization report tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 7 | 8 | 9 | async def memory_optimization_report(dataset_name: str) -> dict: 10 | """Analyze memory usage and suggest optimizations.""" 11 | try: 12 | df = DatasetManager.get_dataset(dataset_name) 13 | 14 | # Current memory usage 15 | memory_usage = df.memory_usage(deep=True) 16 | total_memory = memory_usage.sum() 17 | 18 | # Analyze each column for optimization potential 19 | optimization_suggestions = [] 20 | potential_savings = 0 21 | 22 | for col in df.columns: 23 | col_memory = memory_usage[col] 24 | col_type = str(df[col].dtype) 25 | 26 | suggestion = { 27 | "column": col, 28 | "current_memory_kb": round(col_memory / 1024, 2), 29 | "current_dtype": col_type, 30 | "suggestion": None, 31 | "potential_savings_kb": 0 32 | } 33 | 34 | # String optimization 35 | if col_type == 'object': 36 | if df[col].apply(lambda x: isinstance(x, str)).all(): 37 | # Check if can be categorical 38 | unique_ratio = df[col].nunique() / len(df) 39 | if unique_ratio < 0.5: 40 | suggestion["suggestion"] = "Convert to categorical" 41 | suggestion["potential_savings_kb"] = round(col_memory * 0.6 / 1024, 2) 42 | potential_savings += col_memory * 0.6 43 | 44 | # Integer optimization 45 | elif 'int64' in col_type: 46 | col_min, col_max = df[col].min(), df[col].max() 47 | if col_min >= 0 and col_max <= 255: 48 | suggestion["suggestion"] = "Convert to uint8" 49 | suggestion["potential_savings_kb"] = round(col_memory * 0.875 / 1024, 2) 50 | potential_savings += col_memory * 0.875 51 | elif col_min >= -128 and col_max <= 127: 52 | suggestion["suggestion"] = "Convert to int8" 53 | suggestion["potential_savings_kb"] = round(col_memory * 0.875 / 1024, 2) 54 | potential_savings += col_memory * 0.875 55 | elif col_min >= -32768 and col_max <= 32767: 56 | suggestion["suggestion"] = "Convert to int16" 57 | 
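# int64 stores 8 bytes per value and int16 stores 2, so the downcast frees 6 of every 8 bytes - hence the 0.75 factor below (the uint8/int8 and int32 branches follow the same per-byte arithmetic).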
suggestion["potential_savings_kb"] = round(col_memory * 0.75 / 1024, 2) 58 | potential_savings += col_memory * 0.75 59 | elif col_min >= -2147483648 and col_max <= 2147483647: 60 | suggestion["suggestion"] = "Convert to int32" 61 | suggestion["potential_savings_kb"] = round(col_memory * 0.5 / 1024, 2) 62 | potential_savings += col_memory * 0.5 63 | 64 | # Float optimization 65 | elif 'float64' in col_type: 66 | # Check if values fit in float32 67 | if df[col].between(-3.4e38, 3.4e38).all(): 68 | suggestion["suggestion"] = "Convert to float32" 69 | suggestion["potential_savings_kb"] = round(col_memory * 0.5 / 1024, 2) 70 | potential_savings += col_memory * 0.5 71 | 72 | if suggestion["suggestion"]: 73 | optimization_suggestions.append(suggestion) 74 | 75 | return { 76 | "dataset": dataset_name, 77 | "current_memory_usage": { 78 | "total_mb": round(total_memory / 1024**2, 2), 79 | "per_column_kb": {col: round(mem / 1024, 2) for col, mem in memory_usage.items()} 80 | }, 81 | "optimization_suggestions": optimization_suggestions, 82 | "potential_savings": { 83 | "total_mb": round(potential_savings / 1024**2, 2), 84 | "percentage": round(potential_savings / total_memory * 100, 2) 85 | }, 86 | "recommendations": [ 87 | "Convert low-cardinality strings to categorical", 88 | "Use smaller integer types when possible", 89 | "Consider float32 for decimal numbers", 90 | "Remove unused columns before analysis" 91 | ] 92 | } 93 | 94 | except Exception as e: 95 | return {"error": f"Memory optimization analysis failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/merge_datasets_tool.py: -------------------------------------------------------------------------------- 1 | """Dataset merging tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas 7 | 8 | 9 | async def merge_datasets( 10 | dataset_configs: List[Dict[str, Any]], 11 | join_strategy: str = "inner" 12 | ) -> dict: 13 | """Join datasets on common keys.""" 14 | try: 15 | if len(dataset_configs) < 2: 16 | return {"error": "Need at least 2 datasets to merge"} 17 | 18 | # Start with first dataset 19 | first_config = dataset_configs[0] 20 | merged_df = DatasetManager.get_dataset(first_config["dataset_name"]) 21 | 22 | merge_info = { 23 | "merge_strategy": join_strategy, 24 | "datasets_merged": [first_config["dataset_name"]], 25 | "final_shape": merged_df.shape, 26 | "merge_steps": [] 27 | } 28 | 29 | # Merge with each subsequent dataset 30 | for config in dataset_configs[1:]: 31 | dataset_name = config["dataset_name"] 32 | join_column = config.get("join_column") 33 | 34 | df_to_merge = DatasetManager.get_dataset(dataset_name) 35 | 36 | if join_column: 37 | # Merge on specific column 38 | if join_column not in merged_df.columns: 39 | return {"error": f"Join column '{join_column}' not found in merged dataset"} 40 | if join_column not in df_to_merge.columns: 41 | return {"error": f"Join column '{join_column}' not found in dataset '{dataset_name}'"} 42 | 43 | before_shape = merged_df.shape 44 | merged_df = merged_df.merge(df_to_merge, on=join_column, how=join_strategy, suffixes=('', f'_{dataset_name}')) 45 | after_shape = merged_df.shape 46 | 47 | merge_info["merge_steps"].append({ 48 | "merged_with": dataset_name, 49 | "join_column": join_column, 50 | "before_shape": before_shape, 51 | "after_shape": after_shape, 52 | 
"rows_gained": after_shape[0] - before_shape[0], 53 | "columns_gained": after_shape[1] - before_shape[1] 54 | }) 55 | else: 56 | # Concatenate datasets 57 | before_shape = merged_df.shape 58 | merged_df = pd.concat([merged_df, df_to_merge], ignore_index=True, sort=False) 59 | after_shape = merged_df.shape 60 | 61 | merge_info["merge_steps"].append({ 62 | "concatenated_with": dataset_name, 63 | "before_shape": before_shape, 64 | "after_shape": after_shape, 65 | "rows_added": after_shape[0] - before_shape[0] 66 | }) 67 | 68 | merge_info["datasets_merged"].append(dataset_name) 69 | 70 | # Save merged dataset 71 | merged_name = f"merged_{'_'.join(merge_info['datasets_merged'])}" 72 | loaded_datasets[merged_name] = merged_df 73 | 74 | # Create schema for merged dataset 75 | from ..models.schemas import DatasetSchema 76 | schema = DatasetSchema.from_dataframe(merged_df, merged_name) 77 | dataset_schemas[merged_name] = schema 78 | 79 | merge_info.update({ 80 | "merged_dataset_name": merged_name, 81 | "final_shape": merged_df.shape, 82 | "final_columns": list(merged_df.columns), 83 | "status": "success" 84 | }) 85 | 86 | return merge_info 87 | 88 | except Exception as e: 89 | return {"error": f"Dataset merge failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/pandas_tools.py: -------------------------------------------------------------------------------- 1 | """Pandas-based data analysis tools.""" 2 | 3 | from .load_dataset_tool import load_dataset 4 | from .list_loaded_datasets_tool import list_loaded_datasets 5 | from .segment_by_column_tool import segment_by_column 6 | from .find_correlations_tool import find_correlations 7 | from .create_chart_tool import create_chart 8 | from .analyze_distributions_tool import analyze_distributions 9 | from .detect_outliers_tool import detect_outliers 10 | from .time_series_analysis_tool import time_series_analysis 11 | from .suggest_analysis_tool import suggest_analysis 12 | 13 | __all__ = [ 14 | "load_dataset", 15 | "list_loaded_datasets", 16 | "segment_by_column", 17 | "find_correlations", 18 | "create_chart", 19 | "analyze_distributions", 20 | "detect_outliers", 21 | "time_series_analysis", 22 | "suggest_analysis" 23 | ] -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/segment_by_column_tool.py: -------------------------------------------------------------------------------- 1 | """Segmentation by column tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional, Union 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 7 | 8 | 9 | async def segment_by_column( 10 | dataset_name: str, 11 | column_name: str, 12 | method: str = "auto", 13 | top_n: int = 10 14 | ) -> dict: 15 | """Generic segmentation that works on any categorical column.""" 16 | try: 17 | df = DatasetManager.get_dataset(dataset_name) 18 | 19 | if column_name not in df.columns: 20 | return {"error": f"Column '{column_name}' not found in dataset '{dataset_name}'"} 21 | 22 | # Auto-select aggregation based on available numerical columns 23 | numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist() 24 | 25 | # Remove the groupby column from numerical columns if it's there 26 | if column_name in numerical_cols: 27 | numerical_cols.remove(column_name) 28 | 29 | if not numerical_cols: 30 | # No numerical columns - just count 
31 | segments = df.groupby(column_name).size().to_frame('count') 32 | segments = segments.sort_values('count', ascending=False).head(top_n) 33 | else: 34 | # Aggregate numerical columns 35 | agg_dict = {} 36 | for col in numerical_cols: 37 | agg_dict[col] = ['count', 'mean', 'sum', 'std'] 38 | 39 | segments = df.groupby(column_name).agg(agg_dict) 40 | # Flatten column names 41 | segments.columns = ['_'.join(col).strip() for col in segments.columns] 42 | segments = segments.head(top_n) 43 | 44 | # Calculate percentages 45 | total_rows = len(df) 46 | if 'count' in segments.columns: 47 | segments['percentage'] = (segments['count'] / total_rows * 100).round(2) 48 | else: 49 | # Calculate counts for percentage 50 | counts = df.groupby(column_name).size() 51 | segments['count'] = counts 52 | segments['percentage'] = (counts / total_rows * 100).round(2) 53 | 54 | return { 55 | "dataset": dataset_name, 56 | "segmented_by": column_name, 57 | "segment_count": len(segments), 58 | "segments": segments.to_dict(), 59 | "total_rows": total_rows, 60 | "numerical_columns_analyzed": numerical_cols 61 | } 62 | 63 | except Exception as e: 64 | return {"error": f"Segmentation failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/suggest_analysis_tool.py: -------------------------------------------------------------------------------- 1 | """Analysis suggestion tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional, Union 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 7 | 8 | 9 | async def suggest_analysis(dataset_name: str) -> dict: 10 | """AI recommendations based on data characteristics.""" 11 | try: 12 | if dataset_name not in dataset_schemas: 13 | return {"error": f"Dataset '{dataset_name}' not loaded"} 14 | 15 | schema = dataset_schemas[dataset_name] 16 | 17 | # Get columns by type 18 | numerical_cols = [name for name, info in schema.columns.items() 19 | if info.suggested_role == 'numerical'] 20 | categorical_cols = [name for name, info in schema.columns.items() 21 | if info.suggested_role == 'categorical'] 22 | temporal_cols = [name for name, info in schema.columns.items() 23 | if info.suggested_role == 'temporal'] 24 | 25 | suggestions = [] 26 | 27 | # Numerical columns → correlation analysis 28 | if len(numerical_cols) >= 2: 29 | suggestions.append({ 30 | "type": "correlation_analysis", 31 | "description": f"Find relationships between {len(numerical_cols)} numerical variables", 32 | "columns": numerical_cols, 33 | "tool": "find_correlations", 34 | "priority": "high", 35 | "command": f"find_correlations('{dataset_name}')" 36 | }) 37 | 38 | # Categorical columns → segmentation 39 | if categorical_cols and numerical_cols: 40 | suggestions.append({ 41 | "type": "segmentation", 42 | "description": f"Group data by {len(categorical_cols)} categorical variables", 43 | "columns": categorical_cols, 44 | "tool": "segment_by_column", 45 | "priority": "high", 46 | "command": f"segment_by_column('{dataset_name}', '{categorical_cols[0]}')" 47 | }) 48 | 49 | # Date columns → time series 50 | if temporal_cols and numerical_cols: 51 | suggestions.append({ 52 | "type": "time_series", 53 | "description": f"Analyze trends over time using {len(temporal_cols)} date columns", 54 | "columns": temporal_cols, 55 | "tool": "time_series_analysis", 56 | "priority": "medium", 57 | "command": f"time_series_analysis('{dataset_name}', 
'{temporal_cols[0]}', '{numerical_cols[0]}')" 58 | }) 59 | 60 | # Distribution analysis for interesting columns 61 | high_cardinality_cols = [name for name, info in schema.columns.items() 62 | if info.unique_values > 10 and info.suggested_role in ['numerical', 'categorical']] 63 | if high_cardinality_cols: 64 | suggestions.append({ 65 | "type": "distribution_analysis", 66 | "description": "Analyze distributions of high-variance columns", 67 | "columns": high_cardinality_cols[:3], 68 | "tool": "analyze_distributions", 69 | "priority": "medium", 70 | "command": f"analyze_distributions('{dataset_name}', '{high_cardinality_cols[0]}')" 71 | }) 72 | 73 | # Outlier detection for numerical columns 74 | if numerical_cols: 75 | suggestions.append({ 76 | "type": "outlier_detection", 77 | "description": f"Find outliers in {len(numerical_cols)} numerical columns", 78 | "columns": numerical_cols, 79 | "tool": "detect_outliers", 80 | "priority": "medium", 81 | "command": f"detect_outliers('{dataset_name}')" 82 | }) 83 | 84 | # Data quality checks 85 | high_null_cols = [name for name, info in schema.columns.items() 86 | if info.null_percentage > 10] 87 | if high_null_cols: 88 | suggestions.append({ 89 | "type": "data_quality", 90 | "description": f"Review data quality - {len(high_null_cols)} columns have >10% missing values", 91 | "columns": high_null_cols, 92 | "tool": "validate_data_quality", 93 | "priority": "low", 94 | "command": f"validate_data_quality('{dataset_name}')" 95 | }) 96 | 97 | return { 98 | "dataset_name": dataset_name, 99 | "suggestions": suggestions, 100 | "dataset_summary": { 101 | "numerical_columns": len(numerical_cols), 102 | "categorical_columns": len(categorical_cols), 103 | "temporal_columns": len(temporal_cols), 104 | "total_rows": schema.row_count 105 | } 106 | } 107 | 108 | except Exception as e: 109 | return {"error": f"Analysis suggestion failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/time_series_analysis_tool.py: -------------------------------------------------------------------------------- 1 | """Time series analysis tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional, Union 6 | from ..models.schemas import DatasetManager, loaded_datasets, dataset_schemas, ChartConfig 7 | 8 | 9 | async def time_series_analysis( 10 | dataset_name: str, 11 | date_column: str, 12 | value_column: str, 13 | frequency: str = "auto" 14 | ) -> dict: 15 | """Temporal analysis when dates are detected.""" 16 | try: 17 | df = DatasetManager.get_dataset(dataset_name) 18 | 19 | if date_column not in df.columns: 20 | return {"error": f"Date column '{date_column}' not found"} 21 | if value_column not in df.columns: 22 | return {"error": f"Value column '{value_column}' not found"} 23 | 24 | # Ensure date column is datetime 25 | df_ts = df.copy() 26 | df_ts[date_column] = pd.to_datetime(df_ts[date_column]) 27 | 28 | # Sort by date 29 | df_ts = df_ts.sort_values(date_column) 30 | 31 | # Basic time series statistics 32 | date_range = df_ts[date_column].max() - df_ts[date_column].min() 33 | 34 | # Group by date and aggregate value 35 | if frequency == "auto": 36 | # Determine frequency based on data span 37 | if date_range.days > 365: 38 | freq = "M" # Monthly 39 | elif date_range.days > 31: 40 | freq = "W" # Weekly 41 | else: 42 | freq = "D" # Daily 43 | else: 44 | freq = frequency 45 | 46 | # Resample time series 47 | df_ts.set_index(date_column, 
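                      # resample() only works on a DatetimeIndex (or Period/TimedeltaIndex), so promote the date column first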
inplace=True) 48 | ts_resampled = df_ts[value_column].resample(freq).mean() 49 | 50 | # Calculate trend (simple linear) 51 | x = np.arange(len(ts_resampled)) 52 | y = ts_resampled.values 53 | slope, intercept = np.polyfit(x, y, 1) 54 | 55 | # Calculate basic statistics 56 | result = { 57 | "dataset": dataset_name, 58 | "date_column": date_column, 59 | "value_column": value_column, 60 | "frequency": freq, 61 | "date_range": { 62 | "start": df_ts.index.min().isoformat(), 63 | "end": df_ts.index.max().isoformat(), 64 | "days": date_range.days 65 | }, 66 | "trend": { 67 | "slope": round(slope, 4), 68 | "direction": "increasing" if slope > 0 else "decreasing" if slope < 0 else "stable" 69 | }, 70 | "statistics": { 71 | "mean": round(ts_resampled.mean(), 3), 72 | "std": round(ts_resampled.std(), 3), 73 | "min": round(ts_resampled.min(), 3), 74 | "max": round(ts_resampled.max(), 3) 75 | }, 76 | "data_points": len(ts_resampled), 77 | "sample_values": ts_resampled.head(10).to_dict() 78 | } 79 | 80 | return result 81 | 82 | except Exception as e: 83 | return {"error": f"Time series analysis failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/src/mcp_server/tools/validate_data_quality_tool.py: -------------------------------------------------------------------------------- 1 | """Data quality validation tool implementation.""" 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from typing import List, Dict, Any, Optional 6 | from ..models.schemas import DatasetManager, DataQualityReport, AnalysisResult, loaded_datasets, dataset_schemas 7 | 8 | 9 | async def validate_data_quality(dataset_name: str) -> dict: 10 | """Comprehensive data quality assessment.""" 11 | try: 12 | df = DatasetManager.get_dataset(dataset_name) 13 | schema = dataset_schemas[dataset_name] 14 | 15 | # Missing data analysis 16 | missing_data = {} 17 | for col in df.columns: 18 | null_pct = df[col].isnull().mean() * 100 19 | if null_pct > 0: 20 | missing_data[col] = round(null_pct, 2) 21 | 22 | # Duplicate rows 23 | duplicate_rows = df.duplicated().sum() 24 | 25 | # Potential issues detection 26 | issues = [] 27 | recommendations = [] 28 | 29 | # High missing data 30 | high_missing = [col for col, pct in missing_data.items() if pct > 50] 31 | if high_missing: 32 | issues.append(f"High missing data in columns: {', '.join(high_missing)}") 33 | recommendations.append("Consider dropping columns with >50% missing data or investigate data collection process") 34 | 35 | # Duplicate rows 36 | if duplicate_rows > 0: 37 | issues.append(f"{duplicate_rows} duplicate rows found") 38 | recommendations.append("Remove duplicate rows or investigate if duplicates are intentional") 39 | 40 | # Potential ID columns that aren't unique 41 | for col_name, col_info in schema.columns.items(): 42 | if col_info.suggested_role == 'identifier' and col_info.unique_values < len(df): 43 | issues.append(f"Column '{col_name}' appears to be an ID but has duplicates") 44 | recommendations.append(f"Investigate duplicate values in '{col_name}' column") 45 | 46 | # Mixed data types in object columns 47 | object_cols = df.select_dtypes(include=['object']).columns 48 | for col in object_cols: 49 | sample_types = set(type(x).__name__ for x in df[col].dropna().head(100)) 50 | if len(sample_types) > 1: 51 | issues.append(f"Mixed data types in column '{col}': {sample_types}") 52 | recommendations.append(f"Standardize data types in column '{col}'") 53 | 54 | # Calculate quality score (0-100) 55 | score = 100 56 
| score -= len(missing_data) * 5 # Penalize for missing data 57 | score -= (duplicate_rows / len(df)) * 20 # Penalize for duplicates 58 | score -= len([col for col, pct in missing_data.items() if pct > 10]) * 10 # High missing penalty 59 | score = max(0, score) 60 | 61 | if not issues: 62 | recommendations.append("Data quality looks good! Proceed with analysis.") 63 | 64 | quality_report = DataQualityReport( 65 | dataset_name=dataset_name, 66 | total_rows=len(df), 67 | total_columns=len(df.columns), 68 | missing_data=missing_data, 69 | duplicate_rows=duplicate_rows, 70 | potential_issues=issues, 71 | quality_score=round(score, 1), 72 | recommendations=recommendations 73 | ) 74 | 75 | return quality_report.model_dump() 76 | 77 | except Exception as e: 78 | return {"error": f"Data quality validation failed: {str(e)}"} -------------------------------------------------------------------------------- /quick-data-mcp/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test package.""" -------------------------------------------------------------------------------- /quick-data-mcp/tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration and fixtures.""" 2 | 3 | import pytest 4 | import sys 5 | import os 6 | 7 | # Add src to Python path for tests 8 | sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), 'src')) 9 | 10 | from mcp_server.server import get_server 11 | 12 | 13 | @pytest.fixture 14 | def mcp_server(): 15 | """Get the MCP server instance for testing.""" 16 | return get_server() -------------------------------------------------------------------------------- /quick-data-mcp/tests/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for prompts package.""" -------------------------------------------------------------------------------- /quick-data-mcp/tests/prompts/test_dataset_first_look_prompt.py: -------------------------------------------------------------------------------- 1 | """Tests for dataset first look prompt functionality.""" 2 | 3 | import pytest 4 | import pandas as pd 5 | import tempfile 6 | import os 7 | 8 | from mcp_server.prompts.dataset_first_look_prompt import dataset_first_look 9 | from mcp_server.models.schemas import DatasetManager, loaded_datasets, dataset_schemas 10 | 11 | 12 | @pytest.fixture 13 | def sample_dataset(): 14 | """Create a sample dataset for testing.""" 15 | data = { 16 | 'order_id': ['ord_001', 'ord_002', 'ord_003', 'ord_004', 'ord_005'], 17 | 'customer_id': ['cust_123', 'cust_124', 'cust_125', 'cust_126', 'cust_127'], 18 | 'product_category': ['electronics', 'books', 'clothing', 'electronics', 'home_garden'], 19 | 'order_value': [299.99, 29.99, 89.50, 599.99, 149.99], 20 | 'order_date': ['2024-11-15', '2024-11-14', '2024-11-13', '2024-11-12', '2024-11-11'], 21 | 'region': ['west_coast', 'midwest', 'east_coast', 'west_coast', 'south'], 22 | 'customer_segment': ['premium', 'standard', 'premium', 'premium', 'standard'] 23 | } 24 | df = pd.DataFrame(data) 25 | 26 | with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: 27 | df.to_csv(f.name, index=False) 28 | yield f.name 29 | 30 | # Cleanup 31 | os.unlink(f.name) 32 | 33 | 34 | @pytest.fixture 35 | def simple_dataset(): 36 | """Create a simple dataset with limited columns for testing edge cases.""" 37 | data = { 38 | 'id': [1, 2, 3, 4, 5], 39 | 'name': ['Alice', 'Bob', 
        'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve']
    }
    df = pd.DataFrame(data)

    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
        df.to_csv(f.name, index=False)
        yield f.name

    # Cleanup
    os.unlink(f.name)


@pytest.fixture(autouse=True)
def clear_datasets():
    """Clear datasets before and after each test."""
    loaded_datasets.clear()
    dataset_schemas.clear()
    yield
    loaded_datasets.clear()
    dataset_schemas.clear()


class TestDatasetFirstLook:
    """Test dataset first look prompt functionality."""

    @pytest.mark.asyncio
    async def test_dataset_first_look_comprehensive(self, sample_dataset):
        """Test first look prompt with a comprehensive dataset."""
        DatasetManager.load_dataset(sample_dataset, 'ecommerce')

        result = await dataset_first_look('ecommerce')

        assert isinstance(result, str)
        assert 'ecommerce' in result
        assert '5 records' in result
        assert 'columns' in result
        assert '📊 Numerical columns' in result
        assert '🏷️ Categorical columns' in result
        # Date columns might be detected as identifiers if all dates are unique
        assert ('📅 Date/Time columns' in result or '🔑 Identifier columns' in result)
        assert '🎯 Recommended starting points' in result
        assert 'find_correlations' in result or 'segment_by_column' in result

    @pytest.mark.asyncio
    async def test_dataset_first_look_simple(self, simple_dataset):
        """Test first look prompt with a simple dataset."""
        DatasetManager.load_dataset(simple_dataset, 'simple')

        result = await dataset_first_look('simple')

        assert isinstance(result, str)
        assert 'simple' in result
        assert '5 records' in result
        # Should still provide useful guidance even with limited data
        assert 'What aspect' in result

    @pytest.mark.asyncio
    async def test_dataset_first_look_nonexistent(self):
        """Test error handling for a non-existent dataset."""
        result = await dataset_first_look('nonexistent')

        assert isinstance(result, str)
        assert 'not loaded' in result
        assert 'load_dataset()' in result


if __name__ == '__main__':
    pytest.main([__file__])
--------------------------------------------------------------------------------
/quick-data-mcp/tests/prompts/test_find_datasources_prompt.py:
--------------------------------------------------------------------------------
"""Tests for find_datasources prompt functionality."""

import pytest
import tempfile
import os
import json
import pandas as pd
from pathlib import Path

from mcp_server.prompts.find_datasources_prompt import find_datasources, format_file_size


class TestFindDatasources:
    """Test find_datasources prompt functionality."""

    @pytest.mark.asyncio
    async def test_find_datasources_with_files(self):
        """Test finding data sources in a directory with CSV and JSON files."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create sample CSV file
            csv_data = {'id': [1, 2, 3], 'name': ['A', 'B', 'C']}
            csv_file = temp_path / "sample_data.csv"
            pd.DataFrame(csv_data).to_csv(csv_file, index=False)

            # Create sample JSON file
            json_data = [{'id': 1, 'value': 100}, {'id': 2, 'value': 200}]
            json_file = temp_path / "test_data.json"
            with open(json_file, 'w') as f:
                json.dump(json_data, f)
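            # Both files are closed and on disk at this point, so the
            # discovery prompt below should find the CSV and the JSON.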
            result = await find_datasources(str(temp_path))

            assert isinstance(result, str)
            assert "Data Source Discovery" in result
            assert "sample_data.csv" in result
            assert "test_data.json" in result
            assert "load_dataset" in result
            assert "sample_data" in result  # Suggested dataset name
            assert "test_data" in result  # Suggested dataset name
            assert "CSV" in result
            assert "JSON" in result

    @pytest.mark.asyncio
    async def test_find_datasources_with_subdirectories(self):
        """Test finding data sources in subdirectories."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create data subdirectory
            data_dir = temp_path / "data"
            data_dir.mkdir()

            # Create file in subdirectory
            csv_data = {'x': [1, 2], 'y': [3, 4]}
            csv_file = data_dir / "subdir_data.csv"
            pd.DataFrame(csv_data).to_csv(csv_file, index=False)

            result = await find_datasources(str(temp_path))

            assert isinstance(result, str)
            assert "Data Source Discovery" in result
            assert "data/ directory" in result
            assert "subdir_data.csv" in result
            assert "load_dataset" in result

    @pytest.mark.asyncio
    async def test_find_datasources_no_files(self):
        """Test behavior when no data files are found."""
        with tempfile.TemporaryDirectory() as temp_dir:
            # Create a text file (not CSV/JSON)
            text_file = Path(temp_dir) / "readme.txt"
            text_file.write_text("This is not a data file")

            result = await find_datasources(temp_dir)

            assert isinstance(result, str)
            assert "No data files found" in result
            assert "Suggestions:" in result
            assert "Manual file search:" in result

    @pytest.mark.asyncio
    async def test_find_datasources_current_directory(self):
        """Test finding data sources in the current directory (default behavior)."""
        # Test with default parameter (current directory)
        result = await find_datasources()

        assert isinstance(result, str)
        assert "Data Source Discovery" in result
        # Should not error out, even if no files are found
        assert ("Data files found" in result or "No data files found" in result)

    @pytest.mark.asyncio
    async def test_find_datasources_nonexistent_directory(self):
        """Test handling of a non-existent directory."""
        result = await find_datasources("/nonexistent/directory/path")

        assert isinstance(result, str)
        # A non-existent directory should be handled gracefully
        assert ("No data files found" in result or "Error discovering data sources" in result)
        assert "Manual file search:" in result

    @pytest.mark.asyncio
    async def test_find_datasources_file_size_formatting(self):
        """Test that file sizes are properly formatted."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create a larger CSV file
            large_data = {'col' + str(i): list(range(100)) for i in range(10)}
            csv_file = temp_path / "large_data.csv"
            pd.DataFrame(large_data).to_csv(csv_file, index=False)

            result = await find_datasources(str(temp_path))

            assert isinstance(result, str)
            assert "large_data.csv" in result
            # Should have file size information
            assert ("KB" in result or "MB" in result or "B" in result)

    @pytest.mark.asyncio
    async def test_find_datasources_special_characters_in_filename(self):
        """Test handling of files with special characters in names."""
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create file with spaces and hyphens
            csv_data = {'a': [1, 2], 'b': [3, 4]}
            csv_file = temp_path / "My Data-File.csv"
            pd.DataFrame(csv_data).to_csv(csv_file, index=False)

            result = await find_datasources(str(temp_path))

            assert isinstance(result, str)
            assert "My Data-File.csv" in result
            # Should suggest a cleaned-up dataset name
            assert "my_data_file" in result


class TestFormatFileSize:
    """Test format_file_size utility function."""

    def test_format_file_size_bytes(self):
        """Test formatting file sizes in bytes."""
        assert format_file_size(0) == "0 B"
        assert format_file_size(512) == "512 B"
        assert format_file_size(1023) == "1023 B"

    def test_format_file_size_kilobytes(self):
        """Test formatting file sizes in kilobytes."""
        assert format_file_size(1024) == "1.0 KB"
        assert format_file_size(2048) == "2.0 KB"
        assert format_file_size(1536) == "1.5 KB"

    def test_format_file_size_megabytes(self):
        """Test formatting file sizes in megabytes."""
        assert format_file_size(1024 * 1024) == "1.0 MB"
        assert format_file_size(1024 * 1024 * 2.5) == "2.5 MB"

    def test_format_file_size_gigabytes(self):
        """Test formatting file sizes in gigabytes."""
        assert format_file_size(1024 * 1024 * 1024) == "1.0 GB"
        assert format_file_size(1024 * 1024 * 1024 * 1.5) == "1.5 GB"


if __name__ == '__main__':
    pytest.main([__file__])
--------------------------------------------------------------------------------
/quick-data-mcp/tests/prompts/test_list_mcp_assets_prompt.py:
--------------------------------------------------------------------------------
"""Tests for list_mcp_assets prompt functionality."""

import pytest

from mcp_server.prompts.list_mcp_assets_prompt import list_mcp_assets


class TestListMcpAssets:
    """Test list_mcp_assets prompt functionality."""

    @pytest.mark.asyncio
    async def test_list_mcp_assets_returns_string(self):
        """Test that list_mcp_assets returns a string."""
        result = await list_mcp_assets()

        assert isinstance(result, str)
        assert len(result) > 0

    @pytest.mark.asyncio
    async def test_list_mcp_assets_contains_sections(self):
        """Test that the output contains the expected sections."""
        result = await list_mcp_assets()

        # Check for main sections
        assert "## 📝 Prompts" in result
        assert "## 🔧 Tools" in result
        assert "## 📊 Resources" in result

    @pytest.mark.asyncio
    async def test_list_mcp_assets_contains_key_prompts(self):
        """Test that key prompts are listed."""
        result = await list_mcp_assets()

        # Check for some key prompts
        assert "dataset_first_look" in result
        assert "find_datasources" in result
        assert "segmentation_workshop" in result
        assert "data_quality_assessment" in result

    @pytest.mark.asyncio
    async def test_list_mcp_assets_contains_key_tools(self):
        """Test that key tools are listed."""
        result = await list_mcp_assets()

        # Check for some key tools
        assert "load_dataset" in result
        assert "create_chart" in result
        assert "analyze_distributions" in result
        assert "execute_custom_analytics_code" in result
"validate_data_quality" in result 51 | 52 | @pytest.mark.asyncio 53 | async def test_list_mcp_assets_contains_key_resources(self): 54 | """Test that key resources are listed.""" 55 | result = await list_mcp_assets() 56 | 57 | # Check for some key resources 58 | assert "datasets://loaded" in result 59 | assert "analytics://current_dataset" in result 60 | assert "config://server" in result 61 | assert "system://status" in result 62 | 63 | @pytest.mark.asyncio 64 | async def test_list_mcp_assets_formatting(self): 65 | """Test that the output is properly formatted.""" 66 | result = await list_mcp_assets() 67 | 68 | # Check for markdown formatting 69 | assert result.startswith("# ") # Should start with main heading 70 | assert "🚀" in result # Should have emoji 71 | assert "•" in result # Should have bullet points 72 | assert "**" in result # Should have bold formatting 73 | 74 | # Check for quick start section 75 | assert "🎯 Quick Start:" in result 76 | assert "💡 Pro Tips:" in result 77 | 78 | @pytest.mark.asyncio 79 | async def test_list_mcp_assets_subsections(self): 80 | """Test that tool subsections are present.""" 81 | result = await list_mcp_assets() 82 | 83 | # Check for tool subsections 84 | assert "### Dataset Management" in result 85 | assert "### Analysis Tools" in result 86 | assert "### Visualization" in result 87 | assert "### Advanced Analytics" in result 88 | assert "### Resource Mirror Tools" in result 89 | 90 | # Check for resource subsections 91 | assert "### Dataset Resources" in result 92 | assert "### Analytics Resources" in result 93 | assert "### System Resources" in result 94 | 95 | 96 | if __name__ == '__main__': 97 | pytest.main([__file__]) -------------------------------------------------------------------------------- /quick-data-mcp/tests/resources/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for resources package.""" -------------------------------------------------------------------------------- /quick-data-mcp/tests/resources/test_get_server_config_resource.py: -------------------------------------------------------------------------------- 1 | """Tests for get_server_config resource functionality.""" 2 | 3 | import pytest 4 | from mcp_server.resources.get_server_config_resource import get_server_config 5 | from mcp_server.config.settings import settings 6 | 7 | 8 | class TestGetServerConfig: 9 | """Test get_server_config resource functionality.""" 10 | 11 | @pytest.mark.asyncio 12 | async def test_get_server_config(self): 13 | """Test getting server configuration.""" 14 | config = await get_server_config() 15 | 16 | assert isinstance(config, dict) 17 | assert config["name"] == settings.server_name 18 | assert config["version"] == settings.version 19 | assert config["log_level"] == settings.log_level 20 | assert "analytics_features" in config 21 | assert isinstance(config["analytics_features"], list) 22 | assert "dataset_loading" in config["analytics_features"] 23 | assert "supported_formats" in config 24 | assert "CSV" in config["supported_formats"] 25 | assert "JSON" in config["supported_formats"] 26 | 27 | 28 | if __name__ == '__main__': 29 | pytest.main([__file__]) -------------------------------------------------------------------------------- /quick-data-mcp/tests/test_custom_analytics_code.py: -------------------------------------------------------------------------------- 1 | """Tests for custom analytics code execution tool.""" 2 | 3 | import pytest 4 | import pandas as pd 5 | from mcp_server 
from mcp_server import tools
from mcp_server.server import execute_custom_analytics_code
from mcp_server.models.schemas import DatasetManager, loaded_datasets, dataset_schemas


@pytest.fixture
def sample_test_dataset():
    """Create a test dataset for custom code execution."""
    # Create sample data
    data = {
        'customer_id': ['C001', 'C002', 'C003', 'C001', 'C002'],
        'order_value': [100.0, 250.0, 75.0, 150.0, 200.0],
        'category': ['A', 'B', 'A', 'C', 'B'],
        'date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
        'region': ['North', 'South', 'North', 'East', 'South']
    }
    df = pd.DataFrame(data)

    # Load into DatasetManager
    loaded_datasets['test_custom'] = df

    # Create basic schema
    from mcp_server.models.schemas import DatasetSchema
    schema = DatasetSchema.from_dataframe(df, 'test_custom')
    dataset_schemas['test_custom'] = schema

    yield 'test_custom'

    # Cleanup
    if 'test_custom' in loaded_datasets:
        del loaded_datasets['test_custom']
    if 'test_custom' in dataset_schemas:
        del dataset_schemas['test_custom']


@pytest.mark.asyncio
class TestCustomAnalyticsCode:

    async def test_basic_execution(self, sample_test_dataset):
        """Test simple code execution with valid operations."""
        result = await execute_custom_analytics_code(
            "test_custom",
            "print('Dataset shape:', df.shape)"
        )
        assert "Dataset shape:" in result
        assert "(5, 5)" in result

    async def test_data_analysis(self, sample_test_dataset):
        """Test actual data analysis operations."""
        code = """
print("Columns:", df.columns.tolist())
print("Row count:", len(df))
print("Customer count:", df['customer_id'].nunique())
if 'order_value' in df.columns:
    print("Total sales:", df['order_value'].sum())
"""
        result = await execute_custom_analytics_code("test_custom", code)
        assert "Columns:" in result
        assert "Row count: 5" in result
        assert "Customer count: 3" in result
        assert "Total sales: 775.0" in result

    async def test_error_handling(self, sample_test_dataset):
        """Test error capture and reporting."""
        result = await execute_custom_analytics_code(
            "test_custom",
            "result = df['nonexistent_column'].sum()"
        )
        assert "ERROR:" in result
        assert "KeyError" in result
        assert "nonexistent_column" in result

    async def test_timeout_handling(self, sample_test_dataset):
        """Test timeout behavior with long-running code."""
        result = await execute_custom_analytics_code(
            "test_custom",
            """
import time
time.sleep(35)  # Longer than the 30 second timeout
print("This should not appear")
"""
        )
        assert "TIMEOUT:" in result
        assert "30 second limit" in result

    async def test_invalid_dataset(self):
        """Test behavior with a nonexistent dataset."""
        result = await execute_custom_analytics_code(
            "nonexistent_dataset",
            "print(df.shape)"
        )
        assert "EXECUTION ERROR:" in result
        assert "not loaded" in result or "not found" in result

    async def test_empty_code(self, sample_test_dataset):
        """Test execution with empty code."""
        result = await execute_custom_analytics_code("test_custom", "")
        # Should complete without error (no output)
        assert result is not None
        assert result.strip() == ""
    async def test_multiline_output(self, sample_test_dataset):
        """Test code that produces multiple lines of output."""
        code = """
for i in range(3):
    print(f"Line {i+1}")
print("Final line")
"""
        result = await execute_custom_analytics_code("test_custom", code)
        lines = result.strip().split('\n')
        assert len(lines) == 4
        assert "Line 1" in result
        assert "Line 2" in result
        assert "Line 3" in result
        assert "Final line" in result

    async def test_pandas_operations(self, sample_test_dataset):
        """Test that pandas operations work correctly."""
        code = """
# Test groupby operations
customer_totals = df.groupby('customer_id')['order_value'].sum()
print("Customer totals:")
print(customer_totals.sort_values(ascending=False))

# Test filtering
high_value = df[df['order_value'] > 150]
print("High value orders:", len(high_value))

# Test basic stats
print("Average order:", df['order_value'].mean())
"""
        result = await execute_custom_analytics_code("test_custom", code)
        assert "Customer totals:" in result
        assert "C001" in result  # Should show customer C001
        assert "High value orders: 2" in result
        assert "Average order: 155.0" in result

    async def test_numpy_operations(self, sample_test_dataset):
        """Test that numpy operations work correctly."""
        code = """
import numpy as np
print("NumPy available:", hasattr(np, 'array'))
print("Array operations:")
values = np.array(df['order_value'])
print("Mean:", np.mean(values))
print("Std:", np.std(values))
"""
        result = await execute_custom_analytics_code("test_custom", code)
        assert "NumPy available: True" in result
        assert "Mean:" in result
        assert "Std:" in result

    async def test_plotly_import(self, sample_test_dataset):
        """Test that plotly is available for visualization."""
        code = """
import plotly.express as px
print("Plotly available:", hasattr(px, 'bar'))
print("Can create figure:", hasattr(px, 'Figure') or callable(getattr(px, 'bar', None)))
"""
        result = await execute_custom_analytics_code("test_custom", code)
        assert "Plotly available: True" in result

    async def test_complex_analysis(self, sample_test_dataset):
        """Test complex multi-step analysis."""
        code = """
# Multi-step analysis
print("=== Sales Analysis ===")

# 1. Customer analysis
customer_metrics = df.groupby('customer_id').agg({
    'order_value': ['sum', 'mean', 'count']
}).round(2)
customer_metrics.columns = ['total', 'avg', 'orders']

print("Top customer by total sales:")
top_customer = customer_metrics.sort_values('total', ascending=False).iloc[0]
print(f"Total: ${top_customer['total']}, Avg: ${top_customer['avg']}, Orders: {int(top_customer['orders'])}")

# 2. Category analysis
category_sales = df.groupby('category')['order_value'].sum()
print("Sales by category:")
for cat, sales in category_sales.items():
    print(f"{cat}: ${sales}")
# 3. Regional analysis
region_stats = df.groupby('region').agg({
    'order_value': ['sum', 'count']
}).round(2)
print("Regional performance:")
print(region_stats)
"""
        result = await execute_custom_analytics_code("test_custom", code)
        assert "=== Sales Analysis ===" in result
        assert "Top customer by total sales:" in result
        assert "Sales by category:" in result
        assert "Regional performance:" in result
        assert "$" in result  # Should have dollar amounts

    async def test_syntax_error_handling(self, sample_test_dataset):
        """Test handling of Python syntax errors."""
        result = await execute_custom_analytics_code(
            "test_custom",
            """
print("Starting analysis"
# Missing closing parenthesis - syntax error
for i in range(5)
    print(i)
"""
        )
        # Syntax errors are caught by Python before our try/except block, so they don't carry the "ERROR:" prefix
        assert ("SyntaxError" in result or "invalid syntax" in result or "was never closed" in result)

    async def test_runtime_error_handling(self, sample_test_dataset):
        """Test handling of runtime errors."""
        result = await execute_custom_analytics_code(
            "test_custom",
            """
print("Before error")
result = 10 / 0  # Division by zero
print("After error - should not appear")
"""
        )
        assert "Before error" in result
        assert "ERROR:" in result
        assert "ZeroDivisionError" in result
        assert "After error - should not appear" not in result

    async def test_large_output_handling(self, sample_test_dataset):
        """Test handling of large output."""
        code = """
# Generate substantial output
for i in range(100):
    print(f"Line {i}: Data value {i * 10}")
print("Completed large output test")
"""
        result = await execute_custom_analytics_code("test_custom", code)
        assert "Line 0: Data value 0" in result
        assert "Line 99: Data value 990" in result
        assert "Completed large output test" in result

    async def test_direct_analytics_function(self, sample_test_dataset):
        """Test the underlying analytics function directly."""
        result = await tools.execute_custom_analytics_code(
            "test_custom",
            "print('Direct function call works:', df.shape)"
        )
        assert "Direct function call works:" in result
        assert "(5, 5)" in result
--------------------------------------------------------------------------------
/quick-data-mcp/tests/test_data_resources.py:
--------------------------------------------------------------------------------
"""Tests for data resources."""

import pytest
from mcp_server.resources import data_resources
from mcp_server.config.settings import settings


@pytest.mark.asyncio
async def test_get_server_config():
    """Test getting server configuration."""
    config = await data_resources.get_server_config()

    assert isinstance(config, dict)
    assert config["name"] == settings.server_name
    assert config["version"] == settings.version
    assert config["log_level"] == settings.log_level


@pytest.mark.asyncio
async def test_get_user_profile():
    """Test getting a user profile by ID."""
    user_id = "test123"
    profile = await data_resources.get_user_profile(user_id)

    assert isinstance(profile, dict)
    assert profile["id"] == user_id
    assert profile["name"] == f"User {user_id}"
assert profile["email"] == f"user{user_id}@example.com" 29 | assert profile["status"] == "active" 30 | assert "preferences" in profile 31 | assert isinstance(profile["preferences"], dict) 32 | 33 | # Test preferences structure 34 | prefs = profile["preferences"] 35 | assert "theme" in prefs 36 | assert "notifications" in prefs 37 | assert "language" in prefs 38 | 39 | 40 | @pytest.mark.asyncio 41 | async def test_get_system_status(): 42 | """Test getting system status information.""" 43 | status = await data_resources.get_system_status() 44 | 45 | assert isinstance(status, dict) 46 | assert status["status"] == "healthy" 47 | assert "uptime" in status 48 | assert status["version"] == settings.version 49 | assert "features" in status 50 | assert isinstance(status["features"], list) 51 | assert "dependencies" in status 52 | assert isinstance(status["dependencies"], dict) 53 | 54 | # Check expected features 55 | features = status["features"] 56 | expected_features = [ 57 | "dataset_loading", 58 | "schema_discovery", 59 | "correlation_analysis", 60 | "segmentation", 61 | "data_quality_assessment" 62 | ] 63 | for feature in expected_features: 64 | assert feature in features 65 | 66 | # Check dependencies 67 | deps = status["dependencies"] 68 | assert "mcp" in deps 69 | assert "pandas" in deps 70 | assert "plotly" in deps 71 | assert "pydantic" in deps 72 | 73 | 74 | @pytest.mark.asyncio 75 | async def test_user_profile_different_ids(): 76 | """Test user profiles with different IDs.""" 77 | user_ids = ["user1", "admin", "test_user_123"] 78 | 79 | for user_id in user_ids: 80 | profile = await data_resources.get_user_profile(user_id) 81 | assert profile["id"] == user_id 82 | assert profile["name"] == f"User {user_id}" 83 | assert profile["email"] == f"user{user_id}@example.com" -------------------------------------------------------------------------------- /quick-data-mcp/tests/tools/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for tools package.""" -------------------------------------------------------------------------------- /quick-data-mcp/tests/tools/test_load_dataset_tool.py: -------------------------------------------------------------------------------- 1 | """Tests for load_dataset tool functionality.""" 2 | 3 | import pytest 4 | import pandas as pd 5 | import json 6 | import tempfile 7 | import os 8 | 9 | from mcp_server.tools.load_dataset_tool import load_dataset 10 | from mcp_server.models.schemas import DatasetManager, loaded_datasets, dataset_schemas 11 | 12 | 13 | @pytest.fixture 14 | def sample_csv_file(): 15 | """Create a temporary CSV file for testing.""" 16 | data = { 17 | 'id': [1, 2, 3, 4, 5], 18 | 'category': ['A', 'B', 'A', 'C', 'B'], 19 | 'value': [10.5, 20.0, 15.5, 30.0, 25.5], 20 | 'date': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'] 21 | } 22 | df = pd.DataFrame(data) 23 | 24 | with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: 25 | df.to_csv(f.name, index=False) 26 | yield f.name 27 | 28 | # Cleanup 29 | os.unlink(f.name) 30 | 31 | 32 | @pytest.fixture 33 | def sample_json_file(): 34 | """Create a temporary JSON file for testing.""" 35 | data = [ 36 | {'id': 1, 'name': 'Alice', 'score': 85, 'department': 'engineering'}, 37 | {'id': 2, 'name': 'Bob', 'score': 90, 'department': 'sales'}, 38 | {'id': 3, 'name': 'Charlie', 'score': 78, 'department': 'engineering'}, 39 | {'id': 4, 'name': 'Diana', 'score': 92, 'department': 'marketing'}, 40 | {'id': 5, 'name': 
        {'id': 5, 'name': 'Eve', 'score': 88, 'department': 'sales'}
    ]

    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
        json.dump(data, f)
        f.flush()  # Ensure data is written
        temp_file = f.name

    yield temp_file
    os.unlink(temp_file)


@pytest.fixture(autouse=True)
def clear_datasets():
    """Clear datasets before and after each test."""
    loaded_datasets.clear()
    dataset_schemas.clear()
    yield
    loaded_datasets.clear()
    dataset_schemas.clear()


class TestLoadDataset:
    """Test load_dataset tool functionality."""

    @pytest.mark.asyncio
    async def test_load_csv_dataset(self, sample_csv_file):
        """Test loading a CSV dataset."""
        result = await load_dataset(sample_csv_file, 'test_csv')

        assert result["status"] == "loaded"
        assert result["dataset_name"] == "test_csv"
        assert result["rows"] == 5
        assert len(result["columns"]) == 4
        assert result["format"] == "csv"
        assert "test_csv" in loaded_datasets
        assert "test_csv" in dataset_schemas

    @pytest.mark.asyncio
    async def test_load_json_dataset(self, sample_json_file):
        """Test loading a JSON dataset."""
        result = await load_dataset(sample_json_file, 'test_json')

        assert result["status"] == "loaded"
        assert result["dataset_name"] == "test_json"
        assert result["rows"] == 5
        assert len(result["columns"]) == 4
        assert result["format"] == "json"
        assert "test_json" in loaded_datasets
        assert "test_json" in dataset_schemas

    @pytest.mark.asyncio
    async def test_load_dataset_with_sampling(self, sample_csv_file):
        """Test loading a dataset with sampling."""
        result = await load_dataset(sample_csv_file, 'test_sampled', sample_size=3)

        assert result["status"] == "loaded"
        assert result["dataset_name"] == "test_sampled"
        assert result["rows"] == 3  # Should be sampled to 3 rows
        assert result["sampled"] == True
        assert result["original_rows"] == 5
        assert "test_sampled" in loaded_datasets
        assert len(loaded_datasets["test_sampled"]) == 3

    @pytest.mark.asyncio
    async def test_load_nonexistent_file(self):
        """Test loading a non-existent file."""
        result = await load_dataset('nonexistent.csv', 'test_fail')

        assert result["status"] == "error"
        assert "Failed to load dataset" in result["message"]
        assert "test_fail" not in loaded_datasets


if __name__ == '__main__':
    pytest.main([__file__])
--------------------------------------------------------------------------------
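A minimal end-to-end sketch of the tools exercised by these tests, assuming the `src/` layout is importable as in `tests/conftest.py`. The snippet is illustrative and not one of the repository files:

import asyncio

from mcp_server.tools.load_dataset_tool import load_dataset
from mcp_server.tools.validate_data_quality_tool import validate_data_quality


async def main():
    # Load a bundled sample dataset, then run the quality assessment on it.
    load_result = await load_dataset("data/employee_survey.csv", "survey")
    print(load_result["status"], load_result["rows"])

    report = await validate_data_quality("survey")
    print(report.get("quality_score"), report.get("potential_issues"))


asyncio.run(main())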