├── .dockerignore
├── docs
│   ├── mcp_vscode.png
│   ├── mcp_inspector.png
│   ├── architecture.md
│   └── pyspark_guide.md
├── helpers
│   ├── utils
│   │   ├── __init__.py
│   │   ├── context.py
│   │   ├── validators.py
│   │   ├── authentication.py
│   │   └── table_tools.py
│   ├── logging_config.py
│   ├── clients
│   │   ├── __init__.py
│   │   ├── report_client.py
│   │   ├── workspace_client.py
│   │   ├── warehouse_client.py
│   │   ├── lakehouse_client.py
│   │   ├── notebook_client.py
│   │   ├── table_client.py
│   │   ├── semanticModel_client.py
│   │   ├── sql_client.py
│   │   └── fabric_client.py
│   ├── formatters
│   │   ├── metadata_formatter.py
│   │   └── schema_formatter.py
│   └── pyspark_helpers.py
├── .gitignore
├── Dockerfile
├── pyproject.toml
├── fabric_mcp.py
├── tools
│   ├── __init__.py
│   ├── workspace.py
│   ├── sql_endpoint.py
│   ├── report.py
│   ├── semantic_model.py
│   ├── warehouse.py
│   ├── load_data.py
│   ├── lakehouse.py
│   ├── table.py
│   └── notebook.py
├── test_notebook_creation.py
├── test_security.py
└── README.md
/.dockerignore:
--------------------------------------------------------------------------------
1 | .venv
2 | Inprogress
3 | .ruff_cache/
4 | # Python bytecode files
5 | __pycache__/
--------------------------------------------------------------------------------
/docs/mcp_vscode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datumnova/ms-fabric-mcp/HEAD/docs/mcp_vscode.png
--------------------------------------------------------------------------------
/docs/mcp_inspector.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datumnova/ms-fabric-mcp/HEAD/docs/mcp_inspector.png
--------------------------------------------------------------------------------
/helpers/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.validators import _is_valid_uuid
2 |
3 | __all__ = [
4 | "_is_valid_uuid",
5 | ]
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python-generated files
2 | __pycache__/
3 | *.py[oc]
4 | build/
5 | dist/
6 | wheels/
7 | *.egg-info
8 |
9 | # Virtual environments
10 | .venv
11 |
12 | # Ruff cache
13 | .ruff_cache/
14 | Inprogress
15 |
--------------------------------------------------------------------------------
/helpers/utils/context.py:
--------------------------------------------------------------------------------
1 | from mcp.server.fastmcp import FastMCP
2 | from cachetools import TTLCache
3 |
4 |
5 | # Create MCP instance with context manager
6 | mcp = FastMCP("Fabric MCP Server", json_response=True, stateless_http=True)
7 | mcp.settings.log_level = "debug"
8 |
9 | # Shared cache and context
10 | __ctx_cache = TTLCache(maxsize=100, ttl=300) # Cache for 5 minutes
11 | ctx = mcp.get_context()
12 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.12-slim
2 |
3 | # Install uv.
4 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
5 |
6 | # Copy the application into the container.
7 | COPY . /app
8 |
9 | # Install the application dependencies.
10 | WORKDIR /app
11 | RUN uv sync --frozen --no-cache
12 |
13 | # Run the application.
14 | CMD ["uv", "run", "python", "fabric_mcp.py", "--port", "8081"]
15 | # CMD ["/app/.venv/bin/fastapi", "run", "app/fabric_mcp.py", "--port", "80", "--host", "0.0.0.0"]
--------------------------------------------------------------------------------
/helpers/logging_config.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def get_logger(name: str) -> logging.Logger:
5 | """Set up and return a logger."""
6 | logger = logging.getLogger(name)
7 | handler = logging.StreamHandler()
8 | formatter = logging.Formatter(
9 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
10 | )
11 | handler.setFormatter(formatter)
12 | logger.addHandler(handler)
13 | logger.setLevel(logging.DEBUG)
14 | logger.propagate = True
15 | return logger
16 |
--------------------------------------------------------------------------------
/helpers/utils/validators.py:
--------------------------------------------------------------------------------
1 | from uuid import UUID
2 |
3 |
4 | def _is_valid_uuid(
5 | guid: str,
6 | ):
7 | """
8 | Validates if a string is a valid GUID in version 4
9 |
10 | Parameters
11 | ----------
12 | guid : str
13 | GUID to be validated.
14 |
15 | Returns
16 | -------
17 | bool
18 | Boolean that indicates if the string is a GUID or not.
19 | """
20 |
21 | try:
22 | UUID(str(guid), version=4)
23 | return True
24 | except ValueError:
25 | return False
26 |
--------------------------------------------------------------------------------
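A quick usage sketch of the validator (the GUID below is the same workspace ID that appears in test_notebook_creation.py). Note that because `UUID(..., version=4)` overrides the version bits rather than checking them, the helper effectively validates GUID format rather than strictly version 4:

```python
from helpers.utils.validators import _is_valid_uuid

# Well-formed GUID (same value used in test_notebook_creation.py): accepted.
assert _is_valid_uuid("645f0acc-fd1e-42fe-ae6e-e919b6c63322") is True

# Arbitrary string: UUID() raises ValueError, so the helper returns False.
assert _is_valid_uuid("not-a-guid") is False
```
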
/helpers/utils/authentication.py:
--------------------------------------------------------------------------------
1 | from azure.identity import DefaultAzureCredential
2 | from cachetools import TTLCache
3 |
4 |
5 | def get_azure_credentials(client_id: str, cache: TTLCache) -> DefaultAzureCredential:
6 | """
7 | Get Azure credentials using DefaultAzureCredential.
8 | This function is used to authenticate with Azure services.
9 | """
10 | if f"{client_id}_creds" in cache:
11 | return cache[f"{client_id}_creds"]
12 | # If credentials are not cached, create a new DefaultAzureCredential instance
13 | # and store it in the cache.
14 | else:
15 | cache[f"{client_id}_creds"] = DefaultAzureCredential()
16 | return cache[f"{client_id}_creds"]
17 |
--------------------------------------------------------------------------------
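A minimal usage sketch, assuming an environment where `DefaultAzureCredential` can authenticate (Azure CLI login, managed identity, etc.); the `client_id` value here is illustrative. It shows the caching behavior and how the credential is later used for token acquisition, as in helpers/utils/table_tools.py:

```python
from cachetools import TTLCache

from helpers.utils.authentication import get_azure_credentials

# Mirror the shared cache in helpers/utils/context.py (100 entries, 5-minute TTL).
cache = TTLCache(maxsize=100, ttl=300)

# First call builds a DefaultAzureCredential; subsequent calls with the same
# client_id return the cached instance until the TTL expires.
cred = get_azure_credentials("example-client-id", cache)  # illustrative client_id
assert get_azure_credentials("example-client-id", cache) is cred

# The credential can then issue tokens, e.g. for OneLake/Azure Storage access
# as done in get_delta_schemas().
token = cred.get_token("https://storage.azure.com/.default").token
```
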
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "ms-fabric-mcp"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.12"
7 | dependencies = [
8 | "mcp[cli]",
9 | "azure-identity",
10 | "deltalake",
11 | "requests",
12 | "cachetools",
13 | "semantic-link-labs",
14 | "azure-storage-blob",
15 | "polars",
16 | "sqlalchemy",
17 | "pyodbc",
18 | "tabulate",
19 | "fastapi[standard]",
20 | "python-jose[cryptography]",
21 | "passlib[bcrypt]",
22 | "python-multipart",
23 | "fastapi-mcp",
24 | ]
25 |
26 | [tool.setuptools]
27 | packages = ["helpers", "helpers.clients", "helpers.formatters", "helpers.utils"]
28 |
29 | [project.scripts]
30 | mcp = "mcp.cli:app [cli]"
31 |
--------------------------------------------------------------------------------
/helpers/clients/__init__.py:
--------------------------------------------------------------------------------
1 | from helpers.clients.lakehouse_client import LakehouseClient
2 | from helpers.clients.warehouse_client import WarehouseClient
3 | from helpers.clients.table_client import TableClient
4 | from helpers.clients.workspace_client import WorkspaceClient
5 | from helpers.clients.semanticModel_client import SemanticModelClient
6 | from helpers.clients.report_client import ReportClient
7 | from helpers.clients.fabric_client import FabricApiClient
8 | from helpers.clients.sql_client import SQLClient, get_sql_endpoint
9 | from helpers.clients.notebook_client import NotebookClient
10 |
11 |
12 | __all__ = [
13 | "LakehouseClient",
14 | "WarehouseClient",
15 | "TableClient",
16 | "WorkspaceClient",
17 | "FabricApiClient",
18 | "SemanticModelClient",
19 | "ReportClient",
20 | "NotebookClient",
21 | "SQLClient",
22 | "get_sql_endpoint",
23 | ]
24 |
--------------------------------------------------------------------------------
/helpers/clients/report_client.py:
--------------------------------------------------------------------------------
1 | from helpers.logging_config import get_logger
2 | from helpers.clients.fabric_client import FabricApiClient
3 |
4 | logger = get_logger(__name__)
5 |
6 |
7 | class ReportClient:
8 | def __init__(self, client: FabricApiClient):
9 | self.client = client
10 |
11 | async def list_reports(self, workspace_id: str):
12 | """List all reports in a workspace."""
13 | reports = await self.client.get_reports(workspace_id)
14 |
15 | if not reports:
16 | return f"No reports found in workspace '{workspace_id}'."
17 |
18 | return reports
19 |
20 | async def get_report(self, workspace_id: str, report_id: str) -> dict:
21 | """Get a specific report by ID."""
22 | report = await self.client.get_report(workspace_id, report_id)
23 |
24 | if not report:
25 | return (
26 | f"No report found with ID '{report_id}' in workspace '{workspace_id}'."
27 | )
28 |
29 | return report
30 |
--------------------------------------------------------------------------------
/fabric_mcp.py:
--------------------------------------------------------------------------------
1 | from tools import *
2 | from helpers.logging_config import get_logger
3 | from helpers.utils.context import mcp, __ctx_cache
4 | import uvicorn
5 | import argparse
6 | import logging
7 |
8 |
9 |
10 | logger = get_logger(__name__)
11 | logger.level = logging.INFO
12 |
13 |
14 | @mcp.tool()
15 | async def clear_context() -> str:
16 | """Clear the current session context.
17 |
18 | Returns:
19 | A string confirming the context has been cleared.
20 | """
21 | __ctx_cache.clear()
22 | return "Context cleared."
23 |
24 |
25 | if __name__ == "__main__":
26 | # Initialize and run the server
27 | logger.info("Starting MCP server...")
28 | parser = argparse.ArgumentParser(description="Run MCP Streamable HTTP based server")
29 | parser.add_argument("--port", type=int, default=8081, help="Localhost port to listen on")
30 | args = parser.parse_args()
31 |
32 | # Start the server with Streamable HTTP transport
33 | uvicorn.run(mcp.streamable_http_app, host="0.0.0.0", port=args.port)
34 | # mcp.run(transport="stdio")
35 |
--------------------------------------------------------------------------------
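The `from tools import *` line is what actually registers the tools: each module under tools/ imports the shared `mcp` instance from helpers/utils/context.py and decorates its functions with `@mcp.tool()`, so registration happens as a side effect of the import. A hypothetical new tool module would follow the same pattern (and would also need to be imported and re-exported in tools/__init__.py so the star import picks it up):

```python
# tools/hello.py -- hypothetical example module, not part of the repository
from mcp.server.fastmcp import Context

from helpers.utils.context import mcp


@mcp.tool()
async def hello(name: str, ctx: Context = None) -> str:
    """Return a greeting; the decorator registers the tool at import time."""
    return f"Hello, {name}!"
```
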
/helpers/clients/workspace_client.py:
--------------------------------------------------------------------------------
1 | from helpers.logging_config import get_logger
2 | from helpers.clients.fabric_client import FabricApiClient
3 |
4 | logger = get_logger(__name__)
5 |
6 |
7 | class WorkspaceClient:
8 | def __init__(self, client: FabricApiClient):
9 | self.client = client
10 |
11 | async def list_workspaces(self):
12 | """List all available workspaces."""
13 | workspaces = await self.client.get_workspaces()
14 | if not workspaces:
15 | raise ValueError("No workspaces found.")
16 |
17 | markdown = "# Fabric Workspaces\n\n"
18 | markdown += "| ID | Name | Capacity |\n"
19 | markdown += "|-----|------|----------|\n"
20 |
21 | for ws in workspaces:
22 | markdown += f"| {ws['id']} | {ws['displayName']} | {ws.get('capacityId', 'N/A')} |\n"
23 |
24 | return markdown
25 |
26 | async def resolve_workspace(self, workspace_name: str):
27 | """Resolve workspace name to workspace ID."""
28 | return await self.client.resolve_workspace_name_and_id(workspace=workspace_name)
29 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 | from tools.workspace import set_workspace, list_workspaces
2 | from tools.warehouse import set_warehouse, list_warehouses
3 | from tools.lakehouse import set_lakehouse, list_lakehouses
4 | from tools.table import (
5 | set_table,
6 | list_tables,
7 | get_lakehouse_table_schema,
8 | get_all_lakehouse_schemas,
9 | run_query,
10 | )
11 | from tools.semantic_model import (
12 | list_semantic_models,
13 | get_semantic_model,
14 | )
15 | from tools.report import (
16 | list_reports,
17 | get_report,
18 | )
19 | from tools.load_data import load_data_from_url
20 | from tools.notebook import list_notebooks, create_notebook
21 |
22 | __all__ = [
23 | "set_workspace",
24 | "list_workspaces",
25 | "set_warehouse",
26 | "list_warehouses",
27 | "set_lakehouse",
28 | "list_lakehouses",
29 | "set_table",
30 | "list_tables",
31 | "get_lakehouse_table_schema",
32 | "get_all_lakehouse_schemas",
33 | "list_semantic_models",
34 | "get_semantic_model",
35 | "list_reports",
36 | "get_report",
37 | "load_data_from_url",
38 | "run_query",
39 | "list_notebooks",
40 | "create_notebook",
41 | ]
42 |
--------------------------------------------------------------------------------
/helpers/formatters/metadata_formatter.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | import json
3 |
4 |
5 | def format_metadata_to_markdown(metadata: object) -> str:
6 | """Convert Delta table metadata to a responsive markdown format with HTML."""
7 | md = "#### Metadata\n\n"
8 |     md += "<dl>\n"
9 |     md += f"  <dt>ID:</dt><dd>{metadata.id}</dd>\n"
10 |     if metadata.name:
11 |         md += f"  <dt>Name:</dt><dd>{metadata.name}</dd>\n"
12 |     if metadata.description:
13 |         md += f"  <dt>Description:</dt><dd>{metadata.description}</dd>\n"
14 |     if metadata.partition_columns:
15 |         md += f"  <dt>Partition Columns:</dt><dd>{', '.join(metadata.partition_columns)}</dd>\n"
16 |     if metadata.created_time:
17 |         created_time = datetime.fromtimestamp(metadata.created_time / 1000)
18 |         md += f"  <dt>Created:</dt><dd>{created_time.strftime('%Y-%m-%d %H:%M:%S')}</dd>\n"
19 |     if metadata.configuration:
20 |         md += "  <dt>Configuration:</dt>\n"
21 |         md += "  <dd>\n"
22 |         md += "    <details>\n"
23 |         md += "      <summary>View JSON</summary>\n"
24 |         md += "      <pre>\n"
25 |         md += json.dumps(metadata.configuration, indent=2)
26 |         md += "\n      </pre>\n"
27 |         md += "    </details>\n"
28 |         md += "  </dd>\n"
29 |     md += "</dl>\n"
30 |     return md
31 |
--------------------------------------------------------------------------------
/tools/workspace.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.context import mcp, __ctx_cache
2 | from mcp.server.fastmcp import Context
3 | from helpers.utils.authentication import get_azure_credentials
4 | from helpers.clients import (
5 | FabricApiClient,
6 | WorkspaceClient,
7 | )
8 |
9 |
10 | @mcp.tool()
11 | async def set_workspace(workspace: str, ctx: Context) -> str:
12 | """Set the current workspace for the session.
13 |
14 | Args:
15 | workspace: Name or ID of the workspace
16 | ctx: Context object containing client information
17 | Returns:
18 | A string confirming the workspace has been set.
19 | """
20 | __ctx_cache[f"{ctx.client_id}_workspace"] = workspace
21 | return f"Workspace set to '{workspace}'."
22 |
23 |
24 | @mcp.tool()
25 | async def list_workspaces(ctx: Context) -> str:
26 | """List all available Fabric workspaces.
27 |
28 | Args:
29 | ctx: Context object containing client information
30 |
31 | Returns:
32 | A string containing the list of workspaces or an error message.
33 | """
34 | try:
35 | client = WorkspaceClient(
36 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
37 | )
38 |
39 | workspaces = await client.list_workspaces()
40 |
41 | return workspaces
42 |
43 | except Exception as e:
44 | return f"Error listing workspaces: {str(e)}"
45 |
--------------------------------------------------------------------------------
/helpers/formatters/schema_formatter.py:
--------------------------------------------------------------------------------
1 | from typing import Dict
2 | from helpers.formatters.metadata_formatter import format_metadata_to_markdown
3 |
4 |
5 | def format_schema_to_markdown(
6 | table_info: Dict, schema: object, metadata: object
7 | ) -> str:
8 | """Convert a Delta table schema and metadata to a responsive markdown format with HTML."""
9 |     md = f"<h3>Delta Table: {table_info['name']}</h3>\n"
10 |     md += f"<strong>Type:</strong> {table_info['type']}<br>\n"
11 |     md += f"<strong>Location:</strong> {table_info['location']}<br>\n\n"
12 | 
13 |     # Responsive schema table wrapped in a scrollable div
14 |     md += "<h4>Schema</h4>\n"
15 |     md += '<div style="overflow-x: auto;">\n'
16 |     md += '<table style="width: 100%;">\n'
17 |     md += "  <tr>\n"
18 |     md += "    <th>Column Name</th>\n"
19 |     md += "    <th>Data Type</th>\n"
20 |     md += "    <th>Nullable</th>\n"
21 |     md += "  </tr>\n"
22 | 
23 |     for field in schema.fields:
24 |         md += "  <tr>\n"
25 |         md += f"    <td>{field.name}</td>\n"
26 |         md += f"    <td>{field.type}</td>\n"
27 |         md += f"    <td>{field.nullable}</td>\n"
28 |         md += "  </tr>\n"
29 | 
30 |     md += "</table>\n"
31 |     md += "</div>\n\n"
32 | 
33 |     # Collapsible metadata section for a dynamic feel
34 |     md += "<details>\n"
35 |     md += "  <summary>View Metadata</summary>\n\n"
36 |     md += format_metadata_to_markdown(metadata)
37 |     md += "\n</details>\n"
38 | 
39 |     return md + "\n"
40 |
--------------------------------------------------------------------------------
/helpers/clients/warehouse_client.py:
--------------------------------------------------------------------------------
1 | from helpers.logging_config import get_logger
2 | from helpers.clients.fabric_client import FabricApiClient
3 | from typing import Optional, Dict, Any
4 |
5 | logger = get_logger(__name__)
6 |
7 |
8 | class WarehouseClient:
9 | def __init__(self, client: FabricApiClient):
10 | self.client = client
11 |
12 | async def list_warehouses(self, workspace: str):
13 |         """List all warehouses in a workspace."""
14 | warehouses = await self.client.get_warehouses(workspace)
15 |
16 | if not warehouses:
17 | return f"No warehouses found in workspace '{workspace}'."
18 |
19 | markdown = f"# Warehouses in workspace '{workspace}'\n\n"
20 | markdown += "| ID | Name |\n"
21 | markdown += "|-----|------|\n"
22 |
23 | for wh in warehouses:
24 | markdown += f"| {wh['id']} | {wh['displayName']} |\n"
25 |
26 | return markdown
27 |
28 | async def get_warehouse(
29 | self,
30 | workspace: str,
31 | warehouse: str,
32 | ) -> Optional[Dict[str, Any]]:
33 | """Get details of a specific warehouse."""
34 | if not warehouse:
35 | raise ValueError("Warehouse name cannot be empty.")
36 |
37 | return await self.client.get_item(
38 | workspace_id=workspace, item_id=warehouse, item_type="warehouse"
39 | )
40 |
41 | async def create_warehouse(
42 | self,
43 | name: str,
44 | workspace: str,
45 | description: Optional[str] = None,
46 | ):
47 | """Create a new warehouse."""
48 | if not name:
49 | raise ValueError("Warehouse name cannot be empty.")
50 |
51 | return await self.client.create_item(
52 | name=name, workspace=workspace, description=description, type="Warehouse"
53 | )
54 |
--------------------------------------------------------------------------------
/tools/sql_endpoint.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from helpers.utils.context import mcp, __ctx_cache
3 | from mcp.server.fastmcp import Context
4 | from helpers.clients import get_sql_endpoint as resolve_sql_endpoint  # aliased so the tool below does not shadow it
5 |
6 |
7 | @mcp.tool()
8 | async def get_sql_endpoint(
9 | workspace: Optional[str] = None,
10 | lakehouse: Optional[str] = None,
11 | warehouse: Optional[str] = None,
12 | type: Optional[str] = None,
13 | ctx: Context = None,
14 | ) -> str:
15 | """
16 | Retrieve the SQL endpoint for a specified lakehouse or warehouse.
17 |
18 | Args:
19 | workspace: Name or ID of the workspace (optional).
20 | lakehouse: Name or ID of the lakehouse (optional).
21 | warehouse: Name or ID of the warehouse (optional).
22 | type: Type of resource ('lakehouse' or 'warehouse'). If not provided, it will be inferred.
23 | ctx: Context object containing client information.
24 |
25 | Returns:
26 | A string containing the resource type, name/ID, and its SQL endpoint.
27 | """
28 | try:
29 | if ctx is None:
30 | raise ValueError("Context (ctx) must be provided.")
31 |
32 | if workspace is None:
33 | workspace = __ctx_cache.get(f"{ctx.client_id}_workspace")
34 | if workspace is None:
35 | raise ValueError("Workspace must be specified or set in context.")
36 | if lakehouse is None and warehouse is None:
37 | lakehouse = __ctx_cache.get(f"{ctx.client_id}_lakehouse")
38 | warehouse = __ctx_cache.get(f"{ctx.client_id}_warehouse")
39 | if warehouse is None and lakehouse is None:
40 | raise ValueError(
41 | "Either lakehouse or warehouse must be specified or set in context."
42 | )
43 |
44 |         name, endpoint = await resolve_sql_endpoint(
45 | workspace=workspace,
46 | lakehouse=lakehouse,
47 | warehouse=warehouse, # Add warehouse to the call
48 | type=type,
49 | )
50 |
51 | return (
52 | endpoint
53 | if endpoint
54 | else f"No SQL endpoint found for {type} '{lakehouse or warehouse}' in workspace '{workspace}'."
55 | )
56 | except Exception as e:
57 | return f"Error retrieving SQL endpoint: {str(e)}"
58 |
--------------------------------------------------------------------------------
/helpers/utils/table_tools.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Tuple, Optional
2 | from azure.identity import DefaultAzureCredential
3 | from deltalake import DeltaTable
4 | from helpers.logging_config import get_logger
5 | import asyncio
6 |
7 | logger = get_logger(__name__)
8 |
9 |
10 | async def get_delta_schemas(
11 | tables: List[Dict], credential: DefaultAzureCredential
12 | ) -> List[Tuple[Dict, object, object]]:
13 | """Get schema and metadata for each Delta table"""
14 | delta_tables = []
15 | logger.info(f"Starting schema extraction for {len(tables)} tables")
16 |
17 | # Get token for Azure Storage (not Fabric API)
18 | token = credential.get_token("https://storage.azure.com/.default").token
19 | storage_options = {"bearer_token": token, "use_fabric_endpoint": "true"}
20 |
21 | for table in tables:
22 | task = asyncio.create_task(get_delta_table(table, storage_options))
23 | delta_tables.append(task)
24 | logger.debug(f"Created task for table: {table['name']}")
25 | # Wait for all tasks to complete
26 | delta_tables = await asyncio.gather(*delta_tables)
27 | logger.info(f"Completed schema extraction for {len(delta_tables)} tables")
28 | # Filter out None values
29 | delta_tables = [dt for dt in delta_tables if dt is not None]
30 | return delta_tables
31 |
32 |
33 | async def get_delta_table(
34 | table: Dict, storage_options: Optional[Dict] = None
35 | ) -> Optional[Tuple[Dict, object, object]]:
36 | """Get Delta table schema and metadata"""
37 | logger.debug(f"Processing table: {table['name']}")
38 |
39 | # Check if the table is a Delta table
40 |
41 | if table["format"].lower() == "delta":
42 | try:
43 | table_path = table["location"]
44 | logger.debug(f"Processing Delta table: {table['name']} at {table_path}")
45 |
46 | # Create DeltaTable instance with storage options
47 | delta_table = DeltaTable(table_path, storage_options=storage_options)
48 |
49 | # Get both schema and metadata
50 | result = (table, delta_table.schema(), delta_table.metadata())
51 | logger.info(f"Processed table: {table['name']}")
52 | return result
53 |
54 | except Exception as e:
55 | logger.error(f"Could not process table {table['name']}: {str(e)}")
56 | return None
57 |
--------------------------------------------------------------------------------
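A usage sketch, assuming valid Azure credentials and a reachable OneLake path; the table descriptor below is illustrative and follows the name/format/location shape the client code expects:

```python
import asyncio

from azure.identity import DefaultAzureCredential

from helpers.utils.table_tools import get_delta_schemas

# Illustrative descriptor (name/format/location); replace the location with a
# real OneLake Delta table path for your workspace and lakehouse.
tables = [
    {
        "name": "sales",
        "format": "delta",
        "location": "abfss://<workspace>@onelake.dfs.fabric.microsoft.com/<lakehouse>/Tables/sales",
    }
]


async def main() -> None:
    results = await get_delta_schemas(tables, DefaultAzureCredential())
    for table_info, schema, metadata in results:
        print(table_info["name"], [field.name for field in schema.fields])


asyncio.run(main())
```
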
/helpers/clients/lakehouse_client.py:
--------------------------------------------------------------------------------
1 | from helpers.utils import _is_valid_uuid
2 | from helpers.logging_config import get_logger
3 | from helpers.clients.fabric_client import FabricApiClient
4 | from typing import Optional, Dict, Any
5 |
6 | logger = get_logger(__name__)
7 |
8 |
9 | class LakehouseClient:
10 | def __init__(self, client: FabricApiClient):
11 | self.client = client
12 |
13 | async def list_lakehouses(self, workspace: str):
14 | """List all lakehouses in a workspace."""
15 | if not _is_valid_uuid(workspace):
16 | raise ValueError("Invalid workspace ID.")
17 | lakehouses = await self.client.get_lakehouses(workspace)
18 |
19 | if not lakehouses:
20 | return f"No lakehouses found in workspace '{workspace}'."
21 |
22 | markdown = f"# Lakehouses in workspace '{workspace}'\n\n"
23 | markdown += "| ID | Name |\n"
24 | markdown += "|-----|------|\n"
25 |
26 | for lh in lakehouses:
27 | markdown += f"| {lh['id']} | {lh['displayName']} |\n"
28 |
29 | return markdown
30 |
31 | async def get_lakehouse(
32 | self,
33 | workspace: str,
34 | lakehouse: str,
35 | ) -> Optional[Dict[str, Any]]:
36 | """Get details of a specific lakehouse."""
37 | if not _is_valid_uuid(workspace):
38 | raise ValueError("Invalid workspace ID.")
39 |
40 | if not lakehouse:
41 | raise ValueError("Lakehouse name cannot be empty.")
42 |
43 | response = await self.client.get_item(workspace_id=workspace, item_id=lakehouse)
44 | logger.info(f"Lakehouse details: {response}")
45 | return response
46 |
47 | async def resolve_lakehouse(self, workspace_id: str, lakehouse_name: str):
48 | """Resolve lakehouse name to lakehouse ID."""
49 | return await self.client.resolve_item_name_and_id(
50 | workspace=workspace_id, item=lakehouse_name, type="Lakehouse"
51 | )
52 |
53 | async def create_lakehouse(
54 | self,
55 | name: str,
56 | workspace: str,
57 | description: Optional[str] = None,
58 | ):
59 | """Create a new lakehouse."""
60 | if not _is_valid_uuid(workspace):
61 | raise ValueError("Invalid workspace ID.")
62 |
63 | if not name:
64 | raise ValueError("Lakehouse name cannot be empty.")
65 |
66 | return await self.client.create_item(
67 |             name=name, workspace=workspace, description=description, type="Lakehouse"
68 | )
69 |
--------------------------------------------------------------------------------
/tools/report.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.context import mcp, __ctx_cache
2 | from mcp.server.fastmcp import Context
3 | from helpers.utils.authentication import get_azure_credentials
4 | from helpers.clients import (
5 | FabricApiClient,
6 | ReportClient,
7 | )
8 | from helpers.logging_config import get_logger
9 | from typing import Optional
10 |
11 | logger = get_logger(__name__)
12 |
13 |
14 | @mcp.tool()
15 | async def list_reports(workspace: Optional[str] = None, ctx: Context = None) -> str:
16 | """List all reports in a Fabric workspace.
17 |
18 | Args:
19 | workspace: Name or ID of the workspace (optional)
20 | ctx: Context object containing client information
21 | Returns:
22 | A string containing the list of reports or an error message.
23 | """
24 | try:
25 | client = ReportClient(
26 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
27 | )
28 |
29 | reports = await client.list_reports(
30 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"]
31 | )
32 |
33 | markdown = f"# Reports in workspace '{workspace}'\n\n"
34 | markdown += "| ID | Name | Description |\n"
35 | markdown += "|-----|------|-------------|\n"
36 |
37 | for report in reports:
38 | markdown += f"| {report.get('id', 'N/A')} | {report.get('displayName', 'N/A')} | {report.get('description', 'N/A')} |\n"
39 |
40 | return markdown
41 |
42 | except Exception as e:
43 | return f"Error listing reports: {str(e)}"
44 |
45 |
46 | @mcp.tool()
47 | async def get_report(
48 | workspace: Optional[str] = None,
49 | report_id: Optional[str] = None,
50 | ctx: Context = None,
51 | ) -> str:
52 | """Get a specific report by ID.
53 |
54 | Args:
55 | workspace: Name or ID of the workspace (optional)
56 | report_id: ID of the report (optional)
57 | ctx: Context object containing client information
58 |
59 | Returns:
60 | A string containing the report details or an error message.
61 | """
62 | try:
63 | client = ReportClient(
64 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
65 | )
66 |
67 | report = await client.get_report(
68 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"],
69 | report_id,
70 | )
71 |
72 | if not report:
73 | return f"No report found with ID '{report_id}' in workspace '{workspace}'."
74 |
75 | return f"Report details:\n\n{report}"
76 |
77 | except Exception as e:
78 | return f"Error getting report: {str(e)}"
79 |
--------------------------------------------------------------------------------
/tools/semantic_model.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.context import mcp, __ctx_cache
2 | from mcp.server.fastmcp import Context
3 | from helpers.utils.authentication import get_azure_credentials
4 | from helpers.clients import (
5 | FabricApiClient,
6 | SemanticModelClient,
7 | )
8 | from helpers.logging_config import get_logger
9 |
10 | from typing import Optional
11 |
12 | logger = get_logger(__name__)
13 |
14 |
15 | @mcp.tool()
16 | async def list_semantic_models(
17 | workspace: Optional[str] = None, ctx: Context = None
18 | ) -> str:
19 | """List all semantic models in a Fabric workspace.
20 |
21 | Args:
22 | workspace: Name or ID of the workspace (optional)
23 | ctx: Context object containing client information
24 |
25 | Returns:
26 | A string containing the list of semantic models or an error message.
27 | """
28 | try:
29 | client = SemanticModelClient(
30 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
31 | )
32 |
33 | models = await client.list_semantic_models(
34 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"]
35 | )
36 |
37 | markdown = f"# Semantic Models in workspace '{workspace}'\n\n"
38 | markdown += "| ID | Name | Folder ID | Description |\n"
39 | markdown += "|-----|------|-----------|-------------|\n"
40 |
41 | for model in models:
42 | markdown += f"| {model.get('id', 'N/A')} | {model.get('displayName', 'N/A')} | {model.get('folderId', 'N/A')} | {model.get('description', 'N/A')} |\n"
43 |
44 | return markdown
45 |
46 | except Exception as e:
47 | return f"Error listing semantic models: {str(e)}"
48 |
49 |
50 | @mcp.tool()
51 | async def get_semantic_model(
52 | workspace: Optional[str] = None,
53 | model_id: Optional[str] = None,
54 | ctx: Context = None,
55 | ) -> str:
56 | """Get a specific semantic model by ID.
57 |
58 | Args:
59 | workspace: Name or ID of the workspace (optional)
60 | model_id: ID of the semantic model (optional)
61 | ctx: Context object containing client information
62 |
63 | Returns:
64 | A string containing the details of the semantic model or an error message.
65 | """
66 | try:
67 | client = SemanticModelClient(
68 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
69 | )
70 |
71 | model = await client.get_semantic_model(
72 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"],
73 | model_id if model_id else __ctx_cache[f"{ctx.client_id}_semantic_model"],
74 | )
75 |
76 | return f"Semantic Model '{model['displayName']}' details:\n\n{model}"
77 |
78 | except Exception as e:
79 | return f"Error retrieving semantic model: {str(e)}"
80 |
--------------------------------------------------------------------------------
/tools/warehouse.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.context import mcp, __ctx_cache
2 | from mcp.server.fastmcp import Context
3 | from helpers.utils.authentication import get_azure_credentials
4 | from helpers.clients import (
5 | FabricApiClient,
6 | WarehouseClient,
7 | )
8 |
9 | from typing import Optional
10 |
11 |
12 | @mcp.tool()
13 | async def set_warehouse(warehouse: str, ctx: Context) -> str:
14 | """Set the current warehouse for the session.
15 |
16 | Args:
17 | warehouse: Name or ID of the warehouse
18 | ctx: Context object containing client information
19 |
20 | Returns:
21 | A string confirming the warehouse has been set.
22 | """
23 | __ctx_cache[f"{ctx.client_id}_warehouse"] = warehouse
24 | return f"Warehouse set to '{warehouse}'."
25 |
26 |
27 | @mcp.tool()
28 | async def list_warehouses(workspace: Optional[str] = None, ctx: Context = None) -> str:
29 | """List all warehouses in a Fabric workspace.
30 |
31 | Args:
32 | workspace: Name or ID of the workspace (optional)
33 | ctx: Context object containing client information
34 |
35 | Returns:
36 | A string containing the list of warehouses or an error message.
37 | """
38 | try:
39 | client = WarehouseClient(
40 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
41 | )
42 |
43 | warehouses = await client.list_warehouses(
44 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"]
45 | )
46 |
47 | return warehouses
48 |
49 | except Exception as e:
50 | return f"Error listing warehouses: {str(e)}"
51 |
52 |
53 | @mcp.tool()
54 | async def create_warehouse(
55 | name: str,
56 | workspace: Optional[str] = None,
57 | description: Optional[str] = None,
58 | ctx: Context = None,
59 | ) -> str:
60 | """Create a new warehouse in a Fabric workspace.
61 |
62 | Args:
63 | name: Name of the warehouse
64 | workspace: Name or ID of the workspace (optional)
65 | description: Description of the warehouse (optional)
66 | ctx: Context object containing client information
67 | Returns:
68 | A string confirming the warehouse has been created or an error message.
69 | """
70 | try:
71 | client = WarehouseClient(
72 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
73 | )
74 |
75 | response = await client.create_warehouse(
76 | name=name,
77 | workspace=workspace
78 | if workspace
79 | else __ctx_cache[f"{ctx.client_id}_workspace"],
80 | description=description,
81 | )
82 |
83 | return f"Warehouse '{response['id']}' created successfully."
84 |
85 | except Exception as e:
86 | return f"Error creating warehouse: {str(e)}"
87 |
--------------------------------------------------------------------------------
/helpers/clients/notebook_client.py:
--------------------------------------------------------------------------------
1 | from helpers.utils import _is_valid_uuid
2 | from helpers.logging_config import get_logger
3 | from helpers.clients.fabric_client import FabricApiClient
4 | from typing import Dict, Any
5 |
6 | logger = get_logger(__name__)
7 |
8 |
9 | class NotebookClient:
10 | def __init__(self, client: FabricApiClient):
11 | self.client = client
12 |
13 | async def list_notebooks(self, workspace: str):
14 | """List all notebooks in a workspace."""
15 | if not _is_valid_uuid(workspace):
16 | raise ValueError("Invalid workspace ID.")
17 | notebooks = await self.client.get_notebooks(workspace)
18 |
19 | if not notebooks:
20 | return f"No notebooks found in workspace '{workspace}'."
21 |
22 | markdown = f"# Notebooks in workspace '{workspace}'\n\n"
23 | markdown += "| ID | Name |\n"
24 | markdown += "|-----|------|\n"
25 |
26 | for nb in notebooks:
27 | markdown += f"| {nb['id']} | {nb['displayName']} |\n"
28 |
29 | return markdown
30 |
31 | async def get_notebook(self, workspace: str, notebook_id: str) -> Dict[str, Any]:
32 | """Get a specific notebook by ID."""
33 | if not _is_valid_uuid(workspace):
34 | raise ValueError("Invalid workspace ID.")
35 | if not _is_valid_uuid(notebook_id):
36 | raise ValueError("Invalid notebook ID.")
37 |
38 | notebook = await self.client.get_notebook(workspace, notebook_id)
39 |
40 | if not notebook:
41 | return (
42 | f"No notebook found with ID '{notebook_id}' in workspace '{workspace}'."
43 | )
44 |
45 | return notebook
46 |
47 | async def create_notebook(
48 | self, workspace: str, notebook_name: str, content: str
49 | ) -> Dict[str, Any]:
50 | """Create a new notebook."""
51 | try:
52 | workspace, workspace_id = await self.client.resolve_workspace_name_and_id(
53 | workspace
54 | )
55 | if not workspace_id:
56 | raise ValueError("Invalid workspace ID.")
57 |
58 | logger.info(f"Creating notebook '{notebook_name}' in workspace '{workspace}' (ID: {workspace_id}).")
59 |
60 | try:
61 | response = await self.client.create_notebook(
62 | workspace_id=workspace_id,
63 | notebook_name=notebook_name,
64 | ipynb_name=notebook_name,
65 | content=content,
66 | )
67 | except Exception as e:
68 | error_msg = f"Failed to create notebook '{notebook_name}' in workspace '{workspace}': {str(e)}"
69 | logger.error(error_msg)
70 | return error_msg
71 |
72 |
73 | logger.info(f"Successfully created notebook '{notebook_name}' with ID: {response['id']}")
74 | return response
75 |
76 | except Exception as e:
77 | error_msg = f"Error creating notebook '{notebook_name}': {str(e)}"
78 | logger.error(error_msg)
79 | return error_msg
80 |
--------------------------------------------------------------------------------
/tools/load_data.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.context import mcp, __ctx_cache
2 | from mcp.server.fastmcp import Context
3 | from helpers.utils.authentication import get_azure_credentials
4 | from helpers.clients import (
5 | FabricApiClient,
6 | LakehouseClient,
7 | WarehouseClient,
8 | get_sql_endpoint,
9 | )
10 | from helpers.logging_config import get_logger
11 | import tempfile
12 | import os
13 | import requests
14 | from typing import Optional
15 |
16 | logger = get_logger(__name__)
17 |
18 |
19 | @mcp.tool()
20 | async def load_data_from_url(
21 | url: str,
22 | destination_table: str,
23 | workspace: Optional[str] = None,
24 | lakehouse: Optional[str] = None,
25 | warehouse: Optional[str] = None,
26 | ctx: Context = None,
27 | ) -> str:
28 | """Load data from a URL into a table in a warehouse or lakehouse.
29 |
30 | Args:
31 | url: The URL to download data from (CSV or Parquet supported).
32 | destination_table: The name of the table to load data into.
33 | workspace: Name or ID of the workspace (optional).
34 | lakehouse: Name or ID of the lakehouse (optional).
35 | warehouse: Name or ID of the warehouse (optional).
36 | ctx: Context object containing client information.
37 | Returns:
38 | A string confirming the data load or an error message.
39 | """
40 | try:
41 | # Download the file
42 | response = requests.get(url)
43 | if response.status_code != 200:
44 | return f"Failed to download file from URL: {url}"
45 | file_ext = url.split("?")[0].split(".")[-1].lower()
46 | if file_ext not in ("csv", "parquet"):
47 | return f"Unsupported file type: {file_ext}. Only CSV and Parquet are supported."
48 | with tempfile.NamedTemporaryFile(
49 | delete=False, suffix=f".{file_ext}"
50 | ) as tmp_file:
51 | tmp_file.write(response.content)
52 | tmp_path = tmp_file.name
53 | # Choose destination: lakehouse or warehouse
54 | credential = get_azure_credentials(ctx.client_id, __ctx_cache)
55 | resource_id = None
56 | resource_type = None
57 | if lakehouse:
58 | client = LakehouseClient(FabricApiClient(credential))
59 | resource_id = lakehouse
60 | resource_type = "lakehouse"
61 | elif warehouse:
62 | client = WarehouseClient(FabricApiClient(credential))
63 | resource_id = warehouse
64 | resource_type = "warehouse"
65 | else:
66 | return "Either lakehouse or warehouse must be specified."
67 | # Here you would call the appropriate method to upload/ingest the file into the table.
68 | # This is a placeholder for the actual implementation, which depends on the client API.
69 | # For now, just return a success message with file info.
70 | os.remove(tmp_path)
71 | return f"Data from {url} loaded into table '{destination_table}' in {resource_type} '{resource_id}'. (File type: {file_ext})"
72 | except Exception as e:
73 | return f"Error loading data: {str(e)}"
74 |
75 |
76 | # @mcp.resource(
77 | # uri="tables://{table_name}",
78 | # )
79 |
--------------------------------------------------------------------------------
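The tool currently stops at a placeholder before any data is actually ingested. One possible completion for the warehouse path, sketched under the assumption that `get_sql_endpoint` resolves the endpoint and that `SQLClient`'s pandas/SQLAlchemy round trip is acceptable for the data volume, would read the downloaded file with polars and append it to the destination table (`ingest_file` is a hypothetical helper name, not part of the repository):

```python
# Hypothetical helper showing how the placeholder above could be completed
# for a warehouse destination.
import polars as pl

from helpers.clients import SQLClient, get_sql_endpoint


async def ingest_file(
    tmp_path: str,
    file_ext: str,
    destination_table: str,
    workspace: str,
    warehouse: str,
) -> str:
    # Read the temporary file downloaded by load_data_from_url.
    df = pl.read_csv(tmp_path) if file_ext == "csv" else pl.read_parquet(tmp_path)

    # Resolve the warehouse SQL endpoint, then append via SQLClient.
    database, endpoint = await get_sql_endpoint(
        workspace=workspace, warehouse=warehouse, type="warehouse"
    )
    SQLClient(endpoint, database).load_data(df, destination_table, if_exists="append")
    return f"Loaded {df.height} rows into '{destination_table}'."
```
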
/helpers/clients/table_client.py:
--------------------------------------------------------------------------------
1 | from helpers.logging_config import get_logger
2 | from helpers.clients.fabric_client import FabricApiClient
3 | from helpers.utils.table_tools import get_delta_schemas
4 | from azure.identity import DefaultAzureCredential
5 | from helpers.formatters.schema_formatter import format_schema_to_markdown
6 | from datetime import datetime
7 |
8 | logger = get_logger(__name__)
9 |
10 |
11 | class TableClient:
12 | def __init__(self, client: FabricApiClient):
13 | self.client = client
14 |
15 | async def list_tables(
16 | self, workspace_id: str, rsc_id: str, rsc_type: str = "lakehouse"
17 | ):
18 | """List all tables in a lakehouse."""
19 | tables = await self.client.get_tables(workspace_id, rsc_id, rsc_type)
20 |
21 | if not tables:
22 | return f"No tables found in {rsc_type} '{rsc_id}'."
23 |
24 | return tables
25 |
26 | async def get_table_schema(
27 | self,
28 | workspace: str,
29 | rsc_id: str,
30 | rsc_type: str,
31 | table_name: str,
32 | credential: DefaultAzureCredential,
33 | ):
34 | """Retrieve schema for a specific table."""
35 |
36 | tables = await self.list_tables(workspace, rsc_id, rsc_type)
37 |
38 | # Find the specific table
39 | matching_tables = [t for t in tables if t["name"].lower() == table_name.lower()]
40 |
41 | if not matching_tables:
42 | return f"No table found with name '{table_name}' in {rsc_type} '{rsc_id}'."
43 |
44 | table = matching_tables[0]
45 |
46 | # Check that it is a Delta table
47 | if table["format"].lower() != "delta":
48 | return f"The table '{table_name}' is not a Delta table (format: {table['format']})."
49 |
50 | # Get schema
51 | delta_tables = await get_delta_schemas([table], credential)
52 |
53 | if not delta_tables:
54 | return f"Could not retrieve schema for table '{table['name']}'."
55 |
56 | # Format result as markdown
57 | table_info, schema, metadata = delta_tables[0]
58 | markdown = format_schema_to_markdown(table_info, schema, metadata)
59 |
60 | return markdown
61 |
62 | async def get_all_schemas(
63 | self,
64 | workspace: str,
65 | rsc_id: str,
66 | rsc_type: str,
67 | credential: DefaultAzureCredential,
68 | ):
69 | """Get schemas for all Delta tables in a Fabric lakehouse."""
70 | # Get all tables
71 | tables = await self.list_tables(workspace, rsc_id, rsc_type)
72 |
73 | if isinstance(tables, str):
74 | return tables
75 |
76 | if not tables:
77 | return f"No tables found in {rsc_type} '{rsc_id}'."
78 |
79 | # Filter to only Delta tables
80 | delta_format_tables = [t for t in tables if t["format"].lower() == "delta"]
81 |
82 | if not delta_format_tables:
83 | return f"No Delta tables found in {rsc_type} '{rsc_id}'."
84 |
85 | # Get schema for all tables
86 | delta_tables = await get_delta_schemas(delta_format_tables, credential)
87 |
88 | if not delta_tables:
89 | return "Could not retrieve schemas for any tables."
90 |
91 | # Format the result as markdown
92 | markdown = "# Delta Table Schemas\n\n"
93 | markdown += f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
94 | markdown += f"Workspace: {workspace}\n"
95 | markdown += f"Lakehouse: {rsc_id}\n\n"
96 |
97 | for table_info, schema, metadata in delta_tables:
98 | markdown += format_schema_to_markdown(table_info, schema, metadata)
99 |
100 | return markdown
101 |
--------------------------------------------------------------------------------
/test_notebook_creation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Test script to validate the notebook creation fixes
4 | """
5 | import asyncio
6 | import sys
7 | import json
8 | from helpers.clients.fabric_client import FabricApiClient
9 | from helpers.clients.notebook_client import NotebookClient
10 | from helpers.utils.authentication import get_azure_credentials
11 | from helpers.logging_config import get_logger
12 |
13 | logger = get_logger(__name__)
14 |
15 | async def test_notebook_creation():
16 | """Test notebook creation with improved error handling"""
17 | try:
18 | # Initialize clients
19 | credentials = get_azure_credentials("test-client-id", {})
20 | fabric_client = FabricApiClient(credentials)
21 | notebook_client = NotebookClient(fabric_client)
22 |
23 | # Test workspace - using "My workspace"
24 | workspace_id = "645f0acc-fd1e-42fe-ae6e-e919b6c63322"
25 | notebook_name = "Test Debug Notebook"
26 |
27 | # Create a simple notebook content
28 | notebook_json = {
29 | "nbformat": 4,
30 | "nbformat_minor": 5,
31 | "cells": [
32 | {
33 | "cell_type": "code",
34 | "source": ["print('Hello, Fabric!')\n"],
35 | "execution_count": None,
36 | "outputs": [],
37 | "metadata": {},
38 | }
39 | ],
40 | "metadata": {"language_info": {"name": "python"}},
41 | }
42 | notebook_content = json.dumps(notebook_json)
43 |
44 | print(f"Testing notebook creation in workspace: {workspace_id}")
45 | print(f"Notebook name: {notebook_name}")
46 |
47 | # Test the notebook creation
48 | result = await notebook_client.create_notebook(
49 | workspace=workspace_id,
50 | notebook_name=notebook_name,
51 | content=notebook_content
52 | )
53 |
54 | print(f"Result: {result}")
55 |
56 | if isinstance(result, dict) and result.get("id"):
57 | print(f"✅ SUCCESS: Created notebook with ID: {result['id']}")
58 | return True
59 | else:
60 | print(f"❌ FAILED: {result}")
61 | return False
62 |
63 | except Exception as e:
64 | print(f"❌ ERROR: {str(e)}")
65 | logger.error(f"Test failed: {str(e)}", exc_info=True)
66 | return False
67 |
68 | async def test_workspace_resolution():
69 | """Test workspace name resolution"""
70 | try:
71 | credentials = get_azure_credentials("test-client-id", {})
72 | fabric_client = FabricApiClient(credentials)
73 |
74 | # Test workspace resolution
75 | workspace_name, workspace_id = await fabric_client.resolve_workspace_name_and_id("My workspace")
76 | print(f"✅ Workspace resolution: '{workspace_name}' -> {workspace_id}")
77 | return True
78 |
79 | except Exception as e:
80 | print(f"❌ Workspace resolution failed: {str(e)}")
81 | return False
82 |
83 | if __name__ == "__main__":
84 | print("=" * 50)
85 | print("Testing Fabric MCP Notebook Creation Fixes")
86 | print("=" * 50)
87 |
88 | # Test workspace resolution first
89 | print("\n1. Testing workspace resolution...")
90 | success1 = asyncio.run(test_workspace_resolution())
91 |
92 | # Test notebook creation
93 | print("\n2. Testing notebook creation...")
94 | success2 = asyncio.run(test_notebook_creation())
95 |
96 | print("\n" + "=" * 50)
97 | if success1 and success2:
98 | print("✅ ALL TESTS PASSED")
99 | sys.exit(0)
100 | else:
101 | print("❌ SOME TESTS FAILED")
102 | sys.exit(1)
103 |
--------------------------------------------------------------------------------
/helpers/clients/semanticModel_client.py:
--------------------------------------------------------------------------------
1 | from helpers.logging_config import get_logger
2 | from helpers.clients.fabric_client import FabricApiClient
3 |
4 | logger = get_logger(__name__)
5 |
6 |
7 | class SemanticModelClient:
8 | def __init__(self, client: FabricApiClient):
9 | self.client = client
10 |
11 | async def list_semantic_models(self, workspace_id: str):
12 | """List all semantic models in a workspace."""
13 | models = await self.client.get_semantic_models(workspace_id)
14 |
15 | if not models:
16 | return f"No semantic models found in workspace '{workspace_id}'."
17 |
18 | return models
19 |
20 | async def get_semantic_model(self, workspace_id: str, model_id: str):
21 | """Get a specific semantic model by ID."""
22 | model = await self.client.get_semantic_model(workspace_id, model_id)
23 |
24 | if not model:
25 | return f"No semantic model found with ID '{model_id}' in workspace '{workspace_id}'."
26 |
27 | return model
28 |
29 | # async def get_model_schema(
30 | # self,
31 | # workspace: str,
32 | # rsc_id: str,
33 | # rsc_type: str,
34 | # table_name: str,
35 | # credential: DefaultAzureCredential,
36 | # ):
37 | # """Retrieve schema for a specific model."""
38 |
39 | # models = await self.list_semantic_models(workspace)
40 |
41 | # # Find the specific table
42 | # matching_tables = [t for t in tables if t["name"].lower() == table_name.lower()]
43 |
44 | # if not matching_tables:
45 | # return f"No table found with name '{table_name}' in {rsc_type} '{rsc_id}'."
46 |
47 | # table = matching_tables[0]
48 |
49 | # # Check that it is a Delta table
50 | # if table["format"].lower() != "delta":
51 | # return f"The table '{table_name}' is not a Delta table (format: {table['format']})."
52 |
53 | # # Get schema
54 | # delta_tables = await get_delta_schemas([table], credential)
55 |
56 | # if not delta_tables:
57 | # return f"Could not retrieve schema for table '{table}'."
58 |
59 | # # Format result as markdown
60 | # table_info, schema, metadata = delta_tables[0]
61 | # markdown = format_schema_to_markdown(table_info, schema, metadata)
62 |
63 | # return markdown
64 |
65 | # async def get_all_schemas(
66 | # self,
67 | # workspace: str,
68 | # rsc_id: str,
69 | # rsc_type: str,
70 | # credential: DefaultAzureCredential,
71 | # ):
72 | # """Get schemas for all Delta tables in a Fabric lakehouse."""
73 | # # Get all tables
74 | # tables = await self.list_tables(workspace, rsc_id, rsc_type)
75 |
76 | # if isinstance(tables, str):
77 | # return tables
78 |
79 | # if not tables:
80 | # return f"No tables found in {rsc_type} '{rsc_id}'."
81 |
82 | # # Filter to only Delta tables
83 | # delta_format_tables = [t for t in tables if t["format"].lower() == "delta"]
84 |
85 | # if not delta_format_tables:
86 | # return f"No Delta tables found in {rsc_type} '{rsc_id}'."
87 |
88 | # # Get schema for all tables
89 | # delta_tables = await get_delta_schemas(delta_format_tables, credential)
90 |
91 | # logger.debug(f"Delta Tables response: {tables}")
92 | # if not delta_tables:
93 | # return "Could not retrieve schemas for any tables."
94 |
95 | # # Format the result as markdown
96 | # markdown = "# Delta Table Schemas\n\n"
97 | # markdown += f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
98 | # markdown += f"Workspace: {workspace}\n"
99 | # markdown += f"Lakehouse: {rsc_id}\n\n"
100 |
101 | # for table_info, schema, metadata in delta_tables:
102 | # markdown += format_schema_to_markdown(table_info, schema, metadata)
103 |
104 | # return markdown
105 |
--------------------------------------------------------------------------------
/tools/lakehouse.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.context import mcp, __ctx_cache
2 | from mcp.server.fastmcp import Context
3 | from helpers.utils.authentication import get_azure_credentials
4 | from helpers.clients import (
5 | FabricApiClient,
6 | LakehouseClient,
7 | )
8 | from helpers.logging_config import get_logger
9 |
10 | # import sempy_labs as labs
11 | # import sempy_labs.lakehouse as slh
12 |
13 | from typing import Optional
14 |
15 | logger = get_logger(__name__)
16 |
17 |
18 | @mcp.tool()
19 | async def set_lakehouse(lakehouse: str, ctx: Context) -> str:
20 | """Set the current lakehouse for the session.
21 |
22 | Args:
23 | lakehouse: Name or ID of the lakehouse
24 | ctx: Context object containing client information
25 |
26 | Returns:
27 | A string confirming the lakehouse has been set.
28 | """
29 | __ctx_cache[f"{ctx.client_id}_lakehouse"] = lakehouse
30 | return f"Lakehouse set to '{lakehouse}'."
31 |
32 |
33 | @mcp.tool()
34 | async def list_lakehouses(workspace: Optional[str] = None, ctx: Context = None) -> str:
35 | """List all lakehouses in a Fabric workspace.
36 |
37 | Args:
38 | workspace: Name or ID of the workspace (optional)
39 | ctx: Context object containing client information
40 |
41 | Returns:
42 | A string containing the list of lakehouses or an error message.
43 | """
44 | try:
45 | credential = get_azure_credentials(ctx.client_id, __ctx_cache)
46 | fabric_client = FabricApiClient(credential=credential)
47 | lakehouse_client = LakehouseClient(client=fabric_client)
48 | ws = workspace or __ctx_cache.get(f"{ctx.client_id}_workspace")
49 | if not ws:
50 | return "Workspace not set. Please set a workspace using the 'set_workspace' command."
51 | return await lakehouse_client.list_lakehouses(workspace=ws)
52 | except Exception as e:
53 | logger.error(f"Error listing lakehouses: {e}")
54 | return f"Error listing lakehouses: {e}"
55 |
56 |
57 | # @mcp.tool()
58 | # async def list_lakehouses_semantic_link(workspace: Optional[str] = None, ctx: Context = None) -> str:
59 | # """List all lakehouses in a Fabric workspace using semantic-link-labs."""
60 | # try:
61 | # manager = LakehouseManager()
62 | # lakehouses = manager.list_lakehouses(workspace_id=workspace or __ctx_cache.get(f"{ctx.client_id}_workspace"))
63 | # markdown = f"# Lakehouses (semantic-link-labs) in workspace '{workspace}'\n\n"
64 | # markdown += "| ID | Name |\n"
65 | # markdown += "|-----|------|\n"
66 | # for lh in lakehouses:
67 | # markdown += f"| {lh.get('id', 'N/A')} | {lh.get('displayName', 'N/A')} |\n"
68 | # return markdown
69 | # except Exception as e:
70 | # return f"Error listing lakehouses with semantic-link-labs: {str(e)}"
71 |
72 |
73 | @mcp.tool()
74 | async def create_lakehouse(
75 | name: str,
76 | workspace: Optional[str] = None,
77 | description: Optional[str] = None,
78 | ctx: Context = None,
79 | ) -> str:
80 | """Create a new lakehouse in a Fabric workspace.
81 |
82 | Args:
83 | name: Name of the lakehouse
84 | workspace: Name or ID of the workspace (optional)
85 | description: Description of the lakehouse (optional)
86 | ctx: Context object containing client information
87 | Returns:
88 | A string confirming the lakehouse has been created or an error message.
89 | """
90 | try:
91 | credential = get_azure_credentials(ctx.client_id, __ctx_cache)
92 | fabric_client = FabricApiClient(credential=credential)
93 | lakehouse_client = LakehouseClient(client=fabric_client)
94 | ws = workspace or __ctx_cache.get(f"{ctx.client_id}_workspace")
95 | if not ws:
96 | return "Workspace not set. Please set a workspace using the 'set_workspace' command."
97 | return await lakehouse_client.create_lakehouse(
98 | name=name, workspace=ws, description=description
99 | )
100 | except Exception as e:
101 | logger.error(f"Error creating lakehouse: {e}")
102 | return f"Error creating lakehouse: {e}"
103 |
--------------------------------------------------------------------------------
/helpers/clients/sql_client.py:
--------------------------------------------------------------------------------
1 | import polars as pl
2 | from sqlalchemy import create_engine, Engine
3 | from itertools import chain, repeat
4 | import urllib
5 | import struct
6 | from typing import Optional
7 | from azure.identity import DefaultAzureCredential
8 | from helpers.clients import FabricApiClient, LakehouseClient, WarehouseClient
9 |
10 |
11 | # prepare connection string
12 | sql_endpoint = "lkxke5qat5vu7fpnluz5o7cnme-qlbrb7caj77uthvfhqdxwd5v54.datawarehouse.fabric.microsoft.com"
13 | database = "EDR_WAREHOUSE"
14 | DRIVER = "{{ODBC Driver 18 for SQL Server}}"
15 |
16 |
17 | def get_sqlalchemy_connection_string(driver: str, server: str, database: str) -> Engine:
18 | """
19 |     Construct a SQLAlchemy engine for the given server and database, authenticating with an Azure AD access token.
20 |
21 | Args:
22 | driver (str): The database driver (e.g., 'mssql+pyodbc').
23 | server (str): The server address.
24 | database (str): The database name.
25 |
26 | Returns:
27 | Engine: A SQLAlchemy engine object.
28 | """
29 |     connection_string = f"Driver={{ODBC Driver 18 for SQL Server}};Server={server},1433;Database={database};Encrypt=Yes;TrustServerCertificate=No"
30 | params = urllib.parse.quote(connection_string)
31 | # authentication
32 | resource_url = "https://database.windows.net/.default"
33 | azure_credentials = DefaultAzureCredential()
34 | token_object = azure_credentials.get_token(resource_url)
35 | # Retrieve an access token
36 | token_as_bytes = bytes(
37 | token_object.token, "UTF-8"
38 | ) # Convert the token to a UTF-8 byte string
39 | encoded_bytes = bytes(
40 | chain.from_iterable(zip(token_as_bytes, repeat(0)))
41 | ) # Encode the bytes to a Windows byte string
42 | token_bytes = (
43 |         struct.pack("<i", len(encoded_bytes)) + encoded_bytes
44 |     )  # Prefix the token with its length, as the ODBC driver expects
45 | 
46 |     # SQL_COPT_SS_ACCESS_TOKEN (1256): connection attribute used to hand the
47 |     # Azure AD access token to pyodbc when the connection is opened
48 |     SQL_COPT_SS_ACCESS_TOKEN = 1256
49 |     engine = create_engine(
50 |         f"mssql+pyodbc:///?odbc_connect={params}",
51 |         connect_args={"attrs_before": {SQL_COPT_SS_ACCESS_TOKEN: token_bytes}},
52 |     )
53 |     return engine
54 | 
55 | 
56 | async def get_sql_endpoint(
57 |     workspace: Optional[str] = None,
58 |     lakehouse: Optional[str] = None,
59 |     warehouse: Optional[str] = None,
60 |     type: Optional[str] = None,
61 | ) -> tuple:
63 | """
64 | Retrieve the SQL endpoint for a specified lakehouse or warehouse.
65 |
66 | Args:
67 | lakehouse: Name or ID of the lakehouse (optional).
68 | warehouse: Name or ID of the warehouse (optional).
69 | type: Type of resource ('lakehouse' or 'warehouse').
70 | workspace: Name or ID of the workspace (optional).
71 | Returns:
72 | A tuple (database, sql_endpoint) or (None, error_message) in case of error.
73 | """
74 | try:
75 | credential = DefaultAzureCredential()
76 | fabClient = FabricApiClient(credential)
77 | resource_name = None
78 | endpoint = None
79 | workspace_name, workspace_id = await fabClient.resolve_workspace_name_and_id(
80 | workspace
81 | )
82 | if type and type.lower() == "lakehouse":
83 | client = LakehouseClient(fabClient)
84 | resource_name, resource_id = await fabClient.resolve_item_name_and_id(
85 | workspace=workspace_id, item=lakehouse, type="Lakehouse"
86 | )
87 | lakehouse_obj = await client.get_lakehouse(
88 | workspace=workspace, lakehouse=resource_id
89 | )
90 | endpoint = (
91 | lakehouse_obj.get("properties", {})
92 | .get("sqlEndpointProperties", {})
93 | .get("connectionString")
94 | )
95 | elif type and type.lower() == "warehouse":
96 | client = WarehouseClient(fabClient)
97 | resource_name, resource_id = await fabClient.resolve_item_name_and_id(
98 | workspace=workspace_id, item=warehouse, type="Warehouse"
99 | )
100 | warehouse_obj = await client.get_warehouse(
101 | workspace=workspace, warehouse=resource_id
102 | )
103 | endpoint = warehouse_obj.get("properties", {}).get("connectionString")
104 | if resource_name and endpoint:
105 | return resource_name, endpoint
106 | else:
107 | return (
108 | None,
109 | f"No SQL endpoint found for {type} '{lakehouse or warehouse}' in workspace '{workspace}'.",
110 | )
111 | except Exception as e:
112 | return None, f"Error retrieving SQL endpoint: {str(e)}"
113 |
114 |
115 | class SQLClient:
116 | def __init__(self, sql_endpoint: str, database: str):
117 | self.engine = get_sqlalchemy_connection_string(DRIVER, sql_endpoint, database)
118 |
119 | def run_query(self, query: str) -> pl.DataFrame:
120 | return pl.read_database(query, connection=self.engine)
121 |
122 | def load_data(self, df: pl.DataFrame, table_name: str, if_exists: str = "append"):
123 | pdf = df.to_pandas()
124 | pdf.to_sql(table_name, con=self.engine, if_exists=if_exists, index=False)
125 |
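126 | 
127 | if __name__ == "__main__":
128 |     # Illustrative usage sketch only: the workspace/lakehouse names below are
129 |     # placeholders, not values from this repository. Requires an authenticated
130 |     # Azure identity (e.g. via `az login`) with access to the target workspace.
131 |     import asyncio
132 | 
133 |     async def _demo() -> None:
134 |         database, endpoint = await get_sql_endpoint(
135 |             lakehouse="my_lakehouse", type="lakehouse", workspace="my_workspace"
136 |         )
137 |         if not endpoint or endpoint.startswith(("Error", "No SQL endpoint")):
138 |             print(f"Could not resolve SQL endpoint: {endpoint}")
139 |             return
140 |         client = SQLClient(sql_endpoint=endpoint, database=database)
141 |         print(client.run_query("SELECT 1 AS probe"))
142 | 
143 |     asyncio.run(_demo())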
--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
1 | # Microsoft Fabric MCP Architecture with LLM Integration
2 |
3 | ## Complete Architecture Diagram
4 |
5 | ```mermaid
6 | graph TB
7 | subgraph "Developer Environment"
8 | IDE[IDE/VSCode]
9 | DEV[Developer]
10 | PROJ[Project Files]
11 | end
12 |
13 | subgraph "AI Layer"
14 | LLM[Large Language Model<br/>Claude/GPT/etc.]
15 | CONTEXT[Conversation Context]
16 | REASONING[AI Reasoning Engine]
17 | end
18 |
19 | subgraph "MCP Layer"
20 | MCP[MCP Server]
21 | TOOLS[PySpark Tools]
22 | HELPERS[PySpark Helpers]
23 | TEMPLATES[Template Manager]
24 | VALIDATORS[Code Validators]
25 | GENERATORS[Code Generators]
26 | end
27 |
28 | subgraph "Microsoft Fabric"
29 | API[Fabric API]
30 | WS[Workspace]
31 | LH[Lakehouse]
32 | NB[Notebooks]
33 | TABLES[Delta Tables]
34 | SPARK[Spark Clusters]
35 | end
36 |
37 | subgraph "Operations Flow"
38 | CREATE[Create Notebooks]
39 | VALIDATE[Validate Code]
40 | GENERATE[Generate Code]
41 | ANALYZE[Analyze Performance]
42 | DEPLOY[Deploy to Fabric]
43 | end
44 |
45 | %% Developer interactions
46 | DEV --> IDE
47 | IDE --> PROJ
48 |
49 | %% LLM interactions
50 | IDE <--> LLM
51 | LLM <--> CONTEXT
52 | LLM --> REASONING
53 |
54 | %% MCP interactions
55 | LLM <--> MCP
56 | MCP --> TOOLS
57 | TOOLS --> HELPERS
58 | TOOLS --> TEMPLATES
59 | TOOLS --> VALIDATORS
60 | TOOLS --> GENERATORS
61 |
62 | %% Fabric interactions
63 | MCP <--> API
64 | API --> WS
65 | WS --> LH
66 | WS --> NB
67 | LH --> TABLES
68 | NB --> SPARK
69 |
70 | %% Operation flows
71 | TOOLS --> CREATE
72 | TOOLS --> VALIDATE
73 | TOOLS --> GENERATE
74 | TOOLS --> ANALYZE
75 | CREATE --> DEPLOY
76 |
77 | %% Data flow arrows
78 | REASONING -.->|"Intelligent Decisions"| TOOLS
79 | CONTEXT -.->|"Project Awareness"| VALIDATORS
80 |
81 | %% Styling
82 | classDef devEnv fill:#e1f5fe
83 | classDef aiLayer fill:#fff9c4
84 | classDef mcpLayer fill:#f3e5f5
85 | classDef fabricLayer fill:#e8f5e8
86 | classDef operations fill:#fff3e0
87 |
88 | class IDE,DEV,PROJ devEnv
89 | class LLM,CONTEXT,REASONING aiLayer
90 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer
91 | class API,WS,LH,NB,TABLES,SPARK fabricLayer
92 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations
93 | ```
94 |
95 | ## Architecture Components
96 |
97 | ### **1. Developer Environment**
98 | - **IDE/VSCode**: Primary development interface with MCP integration
99 | - **Developer**: Data engineer/scientist working on PySpark projects
100 | - **Project Files**: Local project structure and configuration
101 |
102 | ### **2. AI Layer**
103 | - **Large Language Model**: Claude, GPT, or other LLM providing intelligent assistance
104 | - **Conversation Context**: Maintains project context and conversation history
105 | - **AI Reasoning Engine**: Makes intelligent decisions about code generation and optimization
106 |
107 | ### **3. MCP Layer (This Server)**
108 | - **MCP Server**: Core server handling tool requests from the LLM
109 | - **PySpark Tools**: 11 specialized tools for notebook operations
110 | - **PySpark Helpers**: Template management and code generation
111 | - **Template Manager**: Pre-built notebook templates for different scenarios
112 | - **Code Validators**: Syntax, best practices, and Fabric compatibility checks
113 | - **Code Generators**: Intelligent PySpark code generation
114 |
115 | ### **4. Microsoft Fabric**
116 | - **Fabric API**: REST API for all Fabric operations
117 | - **Workspace**: Fabric workspace containing resources
118 | - **Lakehouse**: Data storage with Delta Lake tables
119 | - **Notebooks**: PySpark notebooks for data processing
120 | - **Delta Tables**: Structured data storage
121 | - **Spark Clusters**: Compute resources for PySpark execution
122 |
123 | ### **5. Operations Flow**
124 | - **Create Notebooks**: Generate notebooks from templates
125 | - **Validate Code**: Check syntax, performance, and compatibility
126 | - **Generate Code**: Create PySpark snippets for common operations
127 | - **Analyze Performance**: Evaluate and optimize notebook performance
128 | - **Deploy to Fabric**: Push notebooks and execute in Fabric environment
129 |
130 | ## Enhanced Interaction Flow with LLM
131 |
132 | 1. **Developer requests PySpark assistance in IDE**
133 | 2. **IDE communicates with LLM (Claude/GPT)**
134 | 3. **LLM analyzes request using conversation context and reasoning**
135 | 4. **LLM calls MCP server tools based on intelligent analysis** (a sketch of such a call appears after this list)
136 | 5. **MCP server processes request using specialized tools**
137 | 6. **Tools utilize helpers, templates, and validators**
138 | 7. **MCP server calls Fabric API for operations**
139 | 8. **Results flow back through MCP to LLM**
140 | 9. **LLM processes and formats results intelligently**
141 | 10. **Developer receives contextual, intelligent responses in IDE**
142 |
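
To make step 4 concrete, here is a rough sketch of what a single tool invocation can look like on the wire when the LLM's host application posts an MCP `tools/call` request to this server over HTTP. The host, port, endpoint path, and the `list_workspaces` tool name are taken from examples elsewhere in this repository; the exact payload and response framing depend on the MCP transport configuration, so treat this as illustrative rather than authoritative.

```python
# Illustrative only: one MCP tool call as the LLM's host might issue it over HTTP.
# Host, port, path, and tool name are assumptions based on this repo's defaults.
import requests

tool_call = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "tools/call",
    "params": {"name": "list_workspaces", "arguments": {}},
}

response = requests.post(
    "http://localhost:8081/mcp/",
    json=tool_call,
    headers={"Accept": "application/json, text/event-stream"},
    timeout=30,
)
print(response.status_code, response.text[:300])
```
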
143 | ## Key Benefits of LLM Integration
144 |
145 | ### **Intelligent Decision Making**
146 | - LLM analyzes developer intent and context
147 | - Chooses appropriate tools and templates automatically
148 | - Provides contextual recommendations based on project history
149 |
150 | ### **Natural Language Interface**
151 | - Developers can request features in natural language
152 | - LLM translates requests to appropriate MCP tool calls
153 | - Reduces need to remember specific tool names and parameters
154 |
155 | ### **Context Awareness**
156 | - LLM maintains conversation history and project context
157 | - Provides consistent recommendations across sessions
158 | - Learns from previous interactions and code patterns
159 |
160 | ### **Enhanced Code Generation**
161 | - LLM combines multiple tool outputs intelligently
162 | - Provides explanations and documentation with generated code
163 | - Adapts to developer's coding style and preferences
164 |
165 | ## Example LLM-Enhanced Workflows
166 |
167 | ### **Scenario 1: Natural Language Request**
168 | ```
169 | Developer: "Help me create a PySpark notebook that reads sales data from our lakehouse,
170 | cleans it, and creates a summary table with performance optimization."
171 |
172 | LLM Process:
173 | 1. Analyzes intent: notebook creation + data processing + optimization
174 | 2. Calls create_fabric_notebook() with ETL template
175 | 3. Calls generate_fabric_code() for lakehouse reading
176 | 4. Calls validate_fabric_code() for optimization checks
177 | 5. Provides complete solution with explanations
178 | ```
179 |
180 | ### **Scenario 2: Performance Optimization**
181 | ```
182 | Developer: "My PySpark notebook is running slowly. Can you help optimize it?"
183 |
184 | LLM Process:
185 | 1. Calls analyze_notebook_performance() on current notebook
186 | 2. Calls validate_fabric_code() for anti-pattern detection
187 | 3. Calls generate_fabric_code() for optimized alternatives
188 | 4. Provides detailed optimization report with before/after comparisons
189 | ```
190 |
191 | ### **Scenario 3: Best Practices Guidance**
192 | ```
193 | Developer: "Is this PySpark code following Fabric best practices?"
194 |
195 | LLM Process:
196 | 1. Calls validate_fabric_code() for compatibility checks
197 | 2. Analyzes results with reasoning engine
198 | 3. Provides detailed feedback with specific recommendations
199 | 4. Suggests alternative approaches using generate_fabric_code()
200 | ```
201 |
202 | This architecture leverages the power of LLMs to provide intelligent, context-aware assistance while utilizing specialized MCP tools for precise Fabric operations!
203 |
--------------------------------------------------------------------------------
/tools/table.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.context import mcp, __ctx_cache
2 | from mcp.server.fastmcp import Context
3 | from helpers.utils.authentication import get_azure_credentials
4 | from helpers.clients import (
5 | FabricApiClient,
6 | TableClient,
7 | SQLClient,
8 | get_sql_endpoint,
9 | )
10 |
11 | from typing import Optional
12 | from helpers.logging_config import get_logger
13 |
14 | logger = get_logger(__name__)
15 |
16 |
17 | @mcp.tool()
18 | async def set_table(table_name: str, ctx: Context) -> str:
19 | """Set the current table for the session.
20 |
21 | Args:
22 | table_name: Name of the table to set
23 | ctx: Context object containing client information
24 |
25 | Returns:
26 | A string confirming the table has been set.
27 | """
28 | __ctx_cache[f"{ctx.client_id}_table"] = table_name
29 | return f"Table set to '{table_name}'."
30 |
31 |
32 | @mcp.tool()
33 | async def list_tables(
34 | workspace: Optional[str] = None,
35 | lakehouse: Optional[str] = None,
36 | ctx: Context = None,
37 | ) -> str:
38 | """List all tables in a Fabric workspace.
39 |
40 | Args:
41 | workspace: Name or ID of the workspace (optional)
42 | lakehouse: Name or ID of the lakehouse (optional)
43 | ctx: Context object containing client information
44 |
45 | Returns:
46 | A string containing the list of tables or an error message.
47 | """
48 | try:
49 | client = TableClient(
50 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
51 | )
52 |
53 | tables = await client.list_tables(
54 | workspace_id=workspace
55 | if workspace
56 | else __ctx_cache[f"{ctx.client_id}_workspace"],
57 | rsc_id=lakehouse
58 | if lakehouse
59 | else __ctx_cache[f"{ctx.client_id}_lakehouse"],
60 | )
61 |
62 | return tables
63 |
64 | except Exception as e:
65 | return f"Error listing tables: {str(e)}"
66 |
67 |
68 | @mcp.tool()
69 | async def get_lakehouse_table_schema(
70 | workspace: Optional[str],
71 | lakehouse: Optional[str],
72 | table_name: str = None,
73 | ctx: Context = None,
74 | ) -> str:
75 | """Get schema for a specific table in a Fabric lakehouse.
76 |
77 | Args:
78 | workspace: Name or ID of the workspace
79 | lakehouse: Name or ID of the lakehouse
80 | table_name: Name of the table to retrieve
81 | ctx: Context object containing client information
82 |
83 | Returns:
84 | A string containing the schema of the specified table or an error message.
85 | """
86 | try:
87 | credential = get_azure_credentials(ctx.client_id, __ctx_cache)
88 | client = TableClient(FabricApiClient(credential))
89 |
90 | if table_name is None:
91 | return "Table name must be specified."
92 | if lakehouse is None:
93 | if f"{ctx.client_id}_lakehouse" in __ctx_cache:
94 | lakehouse = __ctx_cache[f"{ctx.client_id}_lakehouse"]
95 | else:
96 | return "Lakehouse must be specified or set in the context."
97 |
98 | if workspace is None:
99 | if f"{ctx.client_id}_workspace" in __ctx_cache:
100 | workspace = __ctx_cache[f"{ctx.client_id}_workspace"]
101 | else:
102 | return "Workspace must be specified or set in the context."
103 |
104 | schema = await client.get_table_schema(
105 | workspace, lakehouse, "lakehouse", table_name, credential
106 | )
107 |
108 | return schema
109 |
110 | except Exception as e:
111 | return f"Error retrieving table schema: {str(e)}"
112 |
113 |
114 | @mcp.tool()
115 | async def get_all_lakehouse_schemas(
116 | lakehouse: Optional[str], workspace: Optional[str] = None, ctx: Context = None
117 | ) -> str:
118 | """Get schemas for all Delta tables in a Fabric lakehouse.
119 |
120 | Args:
121 | workspace: Name or ID of the workspace
122 | lakehouse: Name or ID of the lakehouse
123 | ctx: Context object containing client information
124 |
125 | Returns:
126 | A string containing the schemas of all Delta tables or an error message.
127 | """
128 | try:
129 | credential = get_azure_credentials(ctx.client_id, __ctx_cache)
130 | client = TableClient(FabricApiClient(credential))
131 |
132 | if workspace is None:
133 | if f"{ctx.client_id}_workspace" in __ctx_cache:
134 | workspace = __ctx_cache[f"{ctx.client_id}_workspace"]
135 | else:
136 | return "Workspace must be specified or set in the context."
137 | if lakehouse is None:
138 | if f"{ctx.client_id}_lakehouse" in __ctx_cache:
139 | lakehouse = __ctx_cache[f"{ctx.client_id}_lakehouse"]
140 | else:
141 | return "Lakehouse must be specified or set in the context."
142 | schemas = await client.get_all_schemas(
143 | workspace, lakehouse, "lakehouse", credential
144 | )
145 |
146 | return schemas
147 |
148 | except Exception as e:
149 | return f"Error retrieving table schemas: {str(e)}"
150 |
151 |
152 | @mcp.tool()
153 | async def run_query(
154 | workspace: Optional[str] = None,
155 | lakehouse: Optional[str] = None,
156 | warehouse: Optional[str] = None,
157 | query: str = None,
158 |     type: Optional[str] = None,  # 'lakehouse' or 'warehouse'
159 | ctx: Context = None,
160 | ) -> str:
161 | """Read data from a table in a warehouse or lakehouse.
162 |
163 | Args:
164 | workspace: Name or ID of the workspace (optional).
165 | lakehouse: Name or ID of the lakehouse (optional).
166 | warehouse: Name or ID of the warehouse (optional).
167 | query: The SQL query to execute.
168 |         type: Type of resource ('lakehouse' or 'warehouse'); must be provided for the endpoint lookup to succeed.
169 | ctx: Context object containing client information.
170 | Returns:
171 |         The query result as a dictionary of columns, or an error message string.
172 | """
173 | try:
174 | if ctx is None:
175 | raise ValueError("Context (ctx) must be provided.")
176 | if query is None:
177 | raise ValueError("Query must be specified.")
178 | # Always resolve the SQL endpoint and database name
179 | database, sql_endpoint = await get_sql_endpoint(
180 | workspace=workspace,
181 | lakehouse=lakehouse,
182 | warehouse=warehouse,
183 | type=type,
184 | )
185 | if (
186 | not database
187 | or not sql_endpoint
188 | or sql_endpoint.startswith("Error")
189 | or sql_endpoint.startswith("No SQL endpoint")
190 | ):
191 | return f"Failed to resolve SQL endpoint: {sql_endpoint}"
192 | logger.info(f"Running query '{query}' on SQL endpoint {sql_endpoint}")
193 | client = SQLClient(sql_endpoint=sql_endpoint, database=database)
194 | df = client.run_query(query)
195 | if df.is_empty():
196 | return f"No data found for query '{query}'."
197 |
198 |         # Optional markdown rendering of the result is left disabled below
199 |
200 | # markdown = f"### Query: {query} (shape: {df.shape})\n\n"
201 | # with pl.Config() as cfg:
202 | # cfg.set_tbl_formatting('ASCII_MARKDOWN')
203 | # display(Markdown(repr(df)))
204 | # markdown += f"\n\n### Data Preview:\n\n"
205 | # markdown += df.head(10).to_pandas().to_markdown(index=False)
206 | # markdown += f"\n\nColumns: {', '.join(df.columns)}"
207 | return df.to_dict() # Return the DataFrame as a dictionary for easier handling
208 | except Exception as e:
209 | logger.error(f"Error reading data: {str(e)}")
210 | return f"Error reading data: {str(e)}"
211 |
--------------------------------------------------------------------------------
/test_security.py:
--------------------------------------------------------------------------------
1 | """
2 | Test script for the secure MCP server.
3 | Validates authentication, authorization, and security features.
4 | """
5 |
6 | import requests
7 | import json
8 | import time
9 | import subprocess
10 | import threading
11 | from typing import Optional
12 |
13 | class SecurityTester:
14 | """Test suite for MCP server security."""
15 |
16 | def __init__(self, base_url: str = "http://localhost:8081"):
17 | self.base_url = base_url.rstrip('/')
18 | self.session = requests.Session()
19 | self.session.verify = False # For self-signed certificates
20 |
21 | def test_health_check(self) -> bool:
22 | """Test health check endpoint."""
23 | try:
24 | response = self.session.get(f"{self.base_url}/health", timeout=5)
25 | response.raise_for_status()
26 | health_data = response.json()
27 | print(f"✅ Health check passed: {health_data}")
28 | return True
29 | except Exception as e:
30 | print(f"❌ Health check failed: {e}")
31 | return False
32 |
33 | def test_unauthenticated_access(self) -> bool:
34 | """Test that unauthenticated requests are rejected."""
35 | try:
36 | mcp_request = {
37 | "jsonrpc": "2.0",
38 | "id": 1,
39 | "method": "tools/list"
40 | }
41 |
42 | response = self.session.post(
43 | f"{self.base_url}/mcp",
44 | json=mcp_request,
45 | timeout=5
46 | )
47 |
48 | if response.status_code == 401:
49 | print("✅ Unauthenticated access properly rejected")
50 | return True
51 | else:
52 | print(f"❌ Unauthenticated access allowed (status: {response.status_code})")
53 | return False
54 |
55 | except Exception as e:
56 | print(f"❌ Error testing unauthenticated access: {e}")
57 | return False
58 |
59 | def test_authentication(self, username: str = "admin", password: str = "changeme") -> Optional[str]:
60 | """Test username/password authentication."""
61 | try:
62 | response = self.session.post(
63 | f"{self.base_url}/auth/login",
64 | json={"username": username, "password": password},
65 | timeout=5
66 | )
67 |
68 | if response.status_code == 200:
69 | token_data = response.json()
70 | token = token_data.get("access_token")
71 | print(f"✅ Authentication successful: {username}")
72 | return token
73 | else:
74 | print(f"❌ Authentication failed: {response.status_code} - {response.text}")
75 | return None
76 |
77 | except Exception as e:
78 | print(f"❌ Authentication error: {e}")
79 | return None
80 |
81 | def test_authenticated_access(self, token: str) -> bool:
82 | """Test authenticated MCP tool access."""
83 | try:
84 | headers = {"Authorization": f"Bearer {token}"}
85 | mcp_request = {
86 | "jsonrpc": "2.0",
87 | "id": 1,
88 | "method": "tools/list"
89 | }
90 |
91 | response = self.session.post(
92 | f"{self.base_url}/mcp",
93 | json=mcp_request,
94 | headers=headers,
95 | timeout=5
96 | )
97 |
98 | if response.status_code == 200:
99 | result = response.json()
100 | print(f"✅ Authenticated access successful")
101 | if 'result' in result and 'tools' in result['result']:
102 | tools = result['result']['tools']
103 | print(f" Available tools: {[t['name'] for t in tools]}")
104 | return True
105 | else:
106 | print(f"❌ Authenticated access failed: {response.status_code}")
107 | return False
108 |
109 | except Exception as e:
110 | print(f"❌ Authenticated access error: {e}")
111 | return False
112 |
113 | def test_token_verification(self, token: str) -> bool:
114 | """Test token verification endpoint."""
115 | try:
116 | headers = {"Authorization": f"Bearer {token}"}
117 | response = self.session.get(
118 | f"{self.base_url}/auth/verify",
119 | headers=headers,
120 | timeout=5
121 | )
122 |
123 | if response.status_code == 200:
124 | verify_data = response.json()
125 | print(f"✅ Token verification passed: {verify_data}")
126 | return True
127 | else:
128 | print(f"❌ Token verification failed: {response.status_code}")
129 | return False
130 |
131 | except Exception as e:
132 | print(f"❌ Token verification error: {e}")
133 | return False
134 |
135 | def test_invalid_credentials(self) -> bool:
136 | """Test that invalid credentials are rejected."""
137 | try:
138 | response = self.session.post(
139 | f"{self.base_url}/auth/login",
140 | json={"username": "invalid", "password": "invalid"},
141 | timeout=5
142 | )
143 |
144 | if response.status_code == 401:
145 | print("✅ Invalid credentials properly rejected")
146 | return True
147 | else:
148 | print(f"❌ Invalid credentials accepted (status: {response.status_code})")
149 | return False
150 |
151 | except Exception as e:
152 | print(f"❌ Error testing invalid credentials: {e}")
153 | return False
154 |
155 | def test_rate_limiting(self, token: str) -> bool:
156 | """Test rate limiting functionality."""
157 | try:
158 | headers = {"Authorization": f"Bearer {token}"}
159 |
160 | # Make multiple rapid requests
161 | success_count = 0
162 | rate_limited = False
163 |
164 | for i in range(10):
165 | response = self.session.get(
166 | f"{self.base_url}/health",
167 | headers=headers,
168 | timeout=5
169 | )
170 |
171 | if response.status_code == 200:
172 | success_count += 1
173 | elif response.status_code == 429: # Too Many Requests
174 | rate_limited = True
175 | break
176 |
177 | time.sleep(0.1) # Small delay between requests
178 |
179 | if success_count > 0:
180 | print(f"✅ Rate limiting configured (processed {success_count} requests)")
181 | if rate_limited:
182 | print(" Rate limit triggered as expected")
183 | return True
184 | else:
185 | print("❌ No requests succeeded")
186 | return False
187 |
188 | except Exception as e:
189 | print(f"❌ Rate limiting test error: {e}")
190 | return False
191 |
192 | def test_security_headers(self) -> bool:
193 | """Test that security headers are present."""
194 | try:
195 | response = self.session.get(f"{self.base_url}/", timeout=5)
196 | headers = response.headers
197 |
198 | security_headers = {
199 | 'X-Content-Type-Options': 'nosniff',
200 | 'X-Frame-Options': 'DENY',
201 | 'X-XSS-Protection': '1; mode=block',
202 | 'Content-Security-Policy': "default-src 'self'"
203 | }
204 |
205 | missing_headers = []
206 | for header, expected_value in security_headers.items():
207 | if header not in headers:
208 | missing_headers.append(header)
209 | elif headers[header] != expected_value:
210 | print(f"⚠️ Security header {header} has unexpected value: {headers[header]}")
211 |
212 | if missing_headers:
213 | print(f"❌ Missing security headers: {missing_headers}")
214 | return False
215 | else:
216 | print("✅ All security headers present")
217 | return True
218 |
219 | except Exception as e:
220 | print(f"❌ Security headers test error: {e}")
221 | return False
222 |
223 | def run_all_tests(self) -> bool:
224 | """Run all security tests."""
225 | print("🔒 Starting MCP Server Security Tests")
226 | print("=" * 50)
227 |
228 | test_results = []
229 |
230 | # Test health check
231 | test_results.append(self.test_health_check())
232 |
233 | # Test security headers
234 | test_results.append(self.test_security_headers())
235 |
236 | # Test unauthenticated access
237 | test_results.append(self.test_unauthenticated_access())
238 |
239 | # Test invalid credentials
240 | test_results.append(self.test_invalid_credentials())
241 |
242 | # Test authentication
243 | token = self.test_authentication()
244 | if token:
245 | test_results.append(True)
246 |
247 | # Test authenticated access
248 | test_results.append(self.test_authenticated_access(token))
249 |
250 | # Test token verification
251 | test_results.append(self.test_token_verification(token))
252 |
253 | # Test rate limiting
254 | test_results.append(self.test_rate_limiting(token))
255 | else:
256 | test_results.extend([False, False, False, False])
257 |
258 | # Results summary
259 | passed = sum(test_results)
260 | total = len(test_results)
261 |
262 | print("\n" + "=" * 50)
263 | print(f"Test Results: {passed}/{total} passed")
264 |
265 | if passed == total:
266 | print("🎉 All security tests passed!")
267 | return True
268 | else:
269 | print("⚠️ Some security tests failed. Please review the output above.")
270 | return False
271 |
272 | def main():
273 | """Main test runner."""
274 | import argparse
275 |
276 | parser = argparse.ArgumentParser(description="Test MCP server security")
277 | parser.add_argument("--url", default="http://localhost:8081", help="Server URL")
278 | parser.add_argument("--start-server", action="store_true", help="Start secure server before testing")
279 | parser.add_argument("--server-args", default="", help="Additional server arguments")
280 | args = parser.parse_args()
281 |
282 | server_process = None
283 |
284 | if args.start_server:
285 | print("🚀 Starting secure MCP server...")
286 | server_cmd = f"python secure_fabric_mcp.py {args.server_args}"
287 | server_process = subprocess.Popen(
288 | server_cmd.split(),
289 | stdout=subprocess.PIPE,
290 | stderr=subprocess.PIPE
291 | )
292 |
293 | # Wait for server to start
294 | print("⏳ Waiting for server to start...")
295 | time.sleep(5)
296 |
297 | try:
298 | # Run tests
299 | tester = SecurityTester(args.url)
300 | success = tester.run_all_tests()
301 |
302 | if success:
303 | print("\n✅ Security validation completed successfully!")
304 | else:
305 | print("\n❌ Security validation failed!")
306 |
307 | finally:
308 | if server_process:
309 | print("\n🛑 Stopping server...")
310 | server_process.terminate()
311 | server_process.wait()
312 |
313 | if __name__ == "__main__":
314 | main()
315 |
--------------------------------------------------------------------------------
/docs/pyspark_guide.md:
--------------------------------------------------------------------------------
1 | # PySpark Development Guide for Microsoft Fabric MCP
2 |
3 | This guide explains how to use the enhanced PySpark capabilities in the Microsoft Fabric MCP server for developing, testing, and optimizing PySpark notebooks.
4 |
5 | ## Overview
6 |
7 | The MCP server now provides comprehensive PySpark development support with:
8 | - 📓 **Advanced notebook templates** for different use cases
9 | - 🔧 **Code generation** for common PySpark operations
10 | - ✅ **Code validation** and best practices checking
11 | - 🎯 **Fabric-specific optimizations**
12 | - 📊 **Performance analysis** tools
13 | - 🚀 **Execution monitoring** capabilities
14 |
15 | ## Architecture Diagram
16 |
17 | ```mermaid
18 | graph TB
19 | subgraph "Developer Environment"
20 | IDE[IDE/VSCode]
21 | DEV[Developer]
22 | PROJ[Project Files]
23 | end
24 |
25 | subgraph "MCP Layer"
26 | MCP[MCP Server]
27 | TOOLS[PySpark Tools]
28 | HELPERS[PySpark Helpers]
29 | TEMPLATES[Template Manager]
30 | VALIDATORS[Code Validators]
31 | GENERATORS[Code Generators]
32 | end
33 |
34 | subgraph "Microsoft Fabric"
35 | API[Fabric API]
36 | WS[Workspace]
37 | LH[Lakehouse]
38 | NB[Notebooks]
39 | TABLES[Delta Tables]
40 | SPARK[Spark Clusters]
41 | end
42 |
43 | subgraph "Operations Flow"
44 | CREATE[Create Notebooks]
45 | VALIDATE[Validate Code]
46 | GENERATE[Generate Code]
47 | ANALYZE[Analyze Performance]
48 | DEPLOY[Deploy to Fabric]
49 | end
50 |
51 | %% Developer interactions
52 | DEV --> IDE
53 | IDE --> PROJ
54 |
55 | %% MCP interactions
56 | IDE <--> MCP
57 | MCP --> TOOLS
58 | TOOLS --> HELPERS
59 | TOOLS --> TEMPLATES
60 | TOOLS --> VALIDATORS
61 | TOOLS --> GENERATORS
62 |
63 | %% Fabric interactions
64 | MCP <--> API
65 | API --> WS
66 | WS --> LH
67 | WS --> NB
68 | LH --> TABLES
69 | NB --> SPARK
70 |
71 | %% Operation flows
72 | TOOLS --> CREATE
73 | TOOLS --> VALIDATE
74 | TOOLS --> GENERATE
75 | TOOLS --> ANALYZE
76 | CREATE --> DEPLOY
77 |
78 | %% Styling
79 | classDef devEnv fill:#e1f5fe
80 | classDef mcpLayer fill:#f3e5f5
81 | classDef fabricLayer fill:#e8f5e8
82 | classDef operations fill:#fff3e0
83 |
84 | class IDE,DEV,PROJ devEnv
85 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer
86 | class API,WS,LH,NB,TABLES,SPARK fabricLayer
87 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations
88 | ```
89 |
90 | ### Architecture Components
91 |
92 | #### **1. Developer Environment**
93 | - **IDE/VSCode**: Primary development interface with MCP integration
94 | - **Developer**: Data engineer/scientist working on PySpark projects
95 | - **Project Files**: Local project structure and configuration
96 |
97 | #### **2. MCP Layer (This Server)**
98 | - **MCP Server**: Core server handling tool requests
99 | - **PySpark Tools**: 11 specialized tools for notebook operations
100 | - **PySpark Helpers**: Template management and code generation
101 | - **Template Manager**: Pre-built notebook templates for different scenarios
102 | - **Code Validators**: Syntax, best practices, and Fabric compatibility checks
103 | - **Code Generators**: Intelligent PySpark code generation
104 |
105 | #### **3. Microsoft Fabric**
106 | - **Fabric API**: REST API for all Fabric operations
107 | - **Workspace**: Fabric workspace containing resources
108 | - **Lakehouse**: Data storage with Delta Lake tables
109 | - **Notebooks**: PySpark notebooks for data processing
110 | - **Delta Tables**: Structured data storage
111 | - **Spark Clusters**: Compute resources for PySpark execution
112 |
113 | #### **4. Operations Flow**
114 | - **Create Notebooks**: Generate notebooks from templates
115 | - **Validate Code**: Check syntax, performance, and compatibility
116 | - **Generate Code**: Create PySpark snippets for common operations
117 | - **Analyze Performance**: Evaluate and optimize notebook performance
118 | - **Deploy to Fabric**: Push notebooks and execute in Fabric environment
119 |
120 | ### Interaction Flow
121 |
122 | 1. **Developer writes/requests PySpark code in IDE**
123 | 2. **IDE communicates with MCP server via protocol**
124 | 3. **MCP server processes request using specialized tools**
125 | 4. **Tools utilize helpers, templates, and validators**
126 | 5. **MCP server calls Fabric API for operations**
127 | 6. **Results flow back through MCP to IDE**
128 | 7. **Developer receives generated code, validation results, or analysis**
129 |
130 | ### Benefits of This Architecture
131 |
132 | - **Seamless Integration**: Work directly from your IDE without switching contexts
133 | - **Intelligent Assistance**: AI-powered code generation and validation
134 | - **Fabric Optimization**: Specialized tools for Microsoft Fabric environment
135 | - **Performance Focus**: Built-in performance analysis and optimization
136 | - **Template-Driven**: Quick start with proven patterns
137 | - **Real-time Feedback**: Immediate validation and suggestions
138 |
139 | ## Available Tools
140 |
141 | ### 1. Notebook Management
142 |
143 | #### `list_notebooks`
144 | List all notebooks in a workspace.
145 | ```
146 | Usage: list_notebooks(workspace="my_workspace")
147 | ```
148 |
149 | #### `get_notebook_content`
150 | Retrieve the content of a specific notebook.
151 | ```
152 | Usage: get_notebook_content(workspace="my_workspace", notebook_id="notebook_id")
153 | ```
154 |
155 | #### `create_pyspark_notebook`
156 | Create a notebook from built-in PySpark templates.
157 | ```
158 | Usage: create_pyspark_notebook(
159 | workspace="my_workspace",
160 | notebook_name="my_pyspark_notebook",
161 | template_type="basic" # Options: basic, etl, analytics, ml
162 | )
163 | ```
164 |
165 | #### `create_fabric_notebook`
166 | Create a notebook optimized for Microsoft Fabric with advanced templates.
167 | ```
168 | Usage: create_fabric_notebook(
169 | workspace="my_workspace",
170 | notebook_name="fabric_optimized_notebook",
171 | template_type="fabric_integration" # Options: fabric_integration, streaming
172 | )
173 | ```
174 |
175 | ### 2. Code Generation
176 |
177 | #### `generate_pyspark_code`
178 | Generate PySpark code for common operations.
179 | ```
180 | Usage: generate_pyspark_code(
181 | operation="read_table",
182 | source_table="lakehouse.my_table",
183 | columns="id,name,age"
184 | )
185 |
186 | Available operations:
187 | - read_table: Read data from tables
188 | - write_table: Write data to tables
189 | - transform: Data transformations
190 | - join: Table joins
191 | - aggregate: Data aggregations
192 | - schema_inference: Schema analysis
193 | - data_quality: Data quality checks
194 | - performance_optimization: Performance tuning
195 | ```
196 |
197 | #### `generate_fabric_code`
198 | Generate Fabric-specific PySpark code.
199 | ```
200 | Usage: generate_fabric_code(
201 | operation="read_lakehouse",
202 | lakehouse_name="my_lakehouse",
203 | table_name="my_table"
204 | )
205 |
206 | Available operations:
207 | - read_lakehouse: Read from Fabric Lakehouse
208 | - write_lakehouse: Write to Fabric Lakehouse
209 | - merge_delta: Delta Lake merge operations
210 | - performance_monitor: Performance monitoring
211 | ```
212 |
213 | ### 3. Code Validation
214 |
215 | #### `validate_pyspark_code`
216 | Validate PySpark code for syntax and best practices.
217 | ```
218 | Usage: validate_pyspark_code(code="df = spark.table('my_table')")
219 | ```
220 |
221 | #### `validate_fabric_code`
222 | Validate code specifically for Microsoft Fabric compatibility.
223 | ```
224 | Usage: validate_fabric_code(code="df = spark.table('my_table')")
225 | ```
226 |
227 | ### 4. Performance Analysis
228 |
229 | #### `analyze_notebook_performance`
230 | Analyze a notebook's performance and provide optimization recommendations.
231 | ```
232 | Usage: analyze_notebook_performance(
233 | workspace="my_workspace",
234 | notebook_id="notebook_id"
235 | )
236 | ```
237 |
238 | ### 5. Notebook Editing
239 |
240 | #### `update_notebook_cell`
241 | Update a specific cell in a notebook.
242 | ```
243 | Usage: update_notebook_cell(
244 | workspace="my_workspace",
245 | notebook_id="notebook_id",
246 | cell_index=0,
247 | cell_content="print('Hello, Fabric!')",
248 | cell_type="code"
249 | )
250 | ```
251 |
252 | ## Template Types
253 |
254 | ### Basic Templates (`create_pyspark_notebook`)
255 |
256 | 1. **basic**: Fundamental PySpark operations
257 | - Spark session initialization
258 | - Basic DataFrame operations
259 | - Sample data creation
260 |
261 | 2. **etl**: ETL pipeline template
262 | - Extract, Transform, Load patterns
263 | - Data cleaning and processing
264 | - Delta Lake integration
265 |
266 | 3. **analytics**: Data analytics template
267 | - Aggregations and window functions
268 | - Advanced analytics patterns
269 | - Statistical operations
270 |
271 | 4. **ml**: Machine learning template
272 | - MLlib pipeline creation
273 | - Feature engineering
274 | - Model training and evaluation
275 |
276 | ### Advanced Templates (`create_fabric_notebook`)
277 |
278 | 1. **fabric_integration**: Microsoft Fabric integration
279 | - Lakehouse connectivity
280 | - Delta Lake operations
281 | - Fabric-specific utilities
282 |
283 | 2. **streaming**: Structured Streaming template
284 | - Real-time data processing
285 | - Stream-to-Delta operations
286 | - Windowed aggregations
287 |
288 | ## Best Practices
289 |
290 | ### 1. Fabric-Specific Optimizations
291 |
292 | ✅ **Use managed tables:**
293 | ```python
294 | df = spark.table("lakehouse.my_table") # Preferred
295 | # instead of direct file paths
296 | ```
297 |
298 | ✅ **Use Delta Lake format:**
299 | ```python
300 | df.write.format("delta").mode("overwrite").saveAsTable("my_table")
301 | ```
302 |
303 | ✅ **Leverage notebookutils:**
304 | ```python
305 | import notebookutils as nbu
306 | workspace_id = nbu.runtime.context.workspaceId
307 | ```
308 |
309 | ### 2. Performance Optimizations
310 |
311 | ✅ **Cache frequently used DataFrames:**
312 | ```python
313 | df.cache() # Cache before multiple actions
314 | ```
315 |
316 | ✅ **Use broadcast for small tables:**
317 | ```python
318 | from pyspark.sql.functions import broadcast
319 | result = large_df.join(broadcast(small_df), "key")
320 | ```
321 |
322 | ✅ **Partition large datasets:**
323 | ```python
324 | df.write.partitionBy("year", "month").saveAsTable("partitioned_table")
325 | ```
326 |
327 | ### 3. Code Quality
328 |
329 | ✅ **Define explicit schemas:**
330 | ```python
331 | schema = StructType([
332 | StructField("id", IntegerType(), True),
333 | StructField("name", StringType(), True)
334 | ])
335 | df = spark.createDataFrame(data, schema)
336 | ```
337 |
338 | ✅ **Handle null values:**
339 | ```python
340 | df.filter(col("column").isNotNull())
341 | ```
342 |
343 | ❌ **Avoid these anti-patterns:**
344 | ```python
345 | # Don't collect large datasets
346 | for row in df.collect(): # Avoid this
347 | process(row)
348 |
349 | # Don't use .toPandas() on large data
350 | pandas_df = large_df.toPandas() # Risk of OOM
351 | ```
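
When you genuinely need rows on the driver, the sketch below shows safer counterparts to the patterns above. The table and column names (`my_table`, `date`, `amount`) are placeholders, and `spark` is assumed to be the session that Fabric provides in a notebook.

```python
# Safer alternatives to the anti-patterns above (illustrative; names are placeholders).
from pyspark.sql import functions as F

df = spark.table("my_table")  # assumes the Fabric-provided spark session

# Bring back only a bounded sample instead of collecting every row
preview = df.limit(100).toPandas()

# Aggregate on the cluster first, then collect the (now small) result
daily_totals = (
    df.groupBy("date")
    .agg(F.sum("amount").alias("total_amount"))
    .collect()
)
```

Both patterns keep the heavy lifting on the executors and only move small results to the driver.
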
352 |
353 | ## Workflow Examples
354 |
355 | ### 1. Creating and Optimizing a PySpark Notebook
356 |
357 | ```python
358 | # 1. Create a new notebook from template
359 | create_fabric_notebook(
360 | workspace="analytics_workspace",
361 | notebook_name="sales_analysis",
362 | template_type="fabric_integration"
363 | )
364 |
365 | # 2. Generate code for specific operations
366 | generate_fabric_code(
367 | operation="read_lakehouse",
368 | lakehouse_name="sales_lakehouse",
369 | table_name="transactions"
370 | )
371 |
372 | # 3. Validate the generated code
373 | validate_fabric_code(code="df = spark.table('sales_lakehouse.transactions')")
374 |
375 | # 4. Analyze performance
376 | analyze_notebook_performance(
377 | workspace="analytics_workspace",
378 | notebook_id="sales_analysis_notebook_id"
379 | )
380 | ```
381 |
382 | ### 2. ETL Pipeline Development
383 |
384 | ```python
385 | # 1. Create ETL notebook
386 | create_pyspark_notebook(
387 | workspace="etl_workspace",
388 | notebook_name="daily_etl",
389 | template_type="etl"
390 | )
391 |
392 | # 2. Generate transformation code
393 | generate_pyspark_code(
394 | operation="transform",
395 | columns="customer_id,product_id,amount",
396 | filter_condition="amount > 0"
397 | )
398 |
399 | # 3. Generate Delta merge code
400 | generate_fabric_code(
401 | operation="merge_delta",
402 | target_table="sales_summary"
403 | )
404 | ```
405 |
406 | ### 3. Performance Monitoring
407 |
408 | ```python
409 | # 1. Generate performance monitoring code
410 | generate_fabric_code(operation="performance_monitor")
411 |
412 | # 2. Validate for performance issues
413 | validate_fabric_code(code="""
414 | df1 = spark.table("large_table")
415 | df2 = spark.table("small_table")
416 | result = df1.join(df2, "key")
417 | result.collect() # This will be flagged
418 | """)
419 |
420 | # 3. Analyze existing notebook
421 | analyze_notebook_performance(
422 | workspace="my_workspace",
423 | notebook_id="existing_notebook_id"
424 | )
425 | ```
426 |
427 | ## Error Handling
428 |
429 | The MCP tools provide comprehensive error handling:
430 |
431 | - **Syntax validation**: Checks Python syntax before execution
432 | - **Fabric compatibility**: Ensures code works in Fabric environment
433 | - **Performance warnings**: Identifies potential performance issues
434 | - **Best practice suggestions**: Recommends improvements
435 |
436 | ## Integration with IDE
437 |
438 | When using the MCP in your IDE:
439 |
440 | 1. **Autocomplete**: The MCP provides intelligent code generation
441 | 2. **Validation**: Real-time code validation and suggestions
442 | 3. **Templates**: Quick notebook creation from templates
443 | 4. **Performance insights**: Analyze and optimize existing notebooks
444 |
445 | ## Troubleshooting
446 |
447 | ### Common Issues
448 |
449 | 1. **Context not provided**: Ensure `ctx` parameter is passed to all functions
450 | 2. **Invalid workspace**: Verify workspace name or ID exists
451 | 3. **Notebook not found**: Check notebook ID or name spelling
452 | 4. **Template not found**: Use valid template types listed above
453 |
454 | ### Getting Help
455 |
456 | Use the validation tools to identify issues:
457 | - `validate_pyspark_code()` for general PySpark validation
458 | - `validate_fabric_code()` for Fabric-specific validation
459 | - `analyze_notebook_performance()` for performance insights
460 |
461 | ## Advanced Features
462 |
463 | ### Custom Templates
464 |
465 | The helper module supports extending templates. You can create custom templates by modifying the `PySparkTemplateManager` class in `helpers/pyspark_helpers.py`.
466 |
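As a rough sketch of what such an extension might look like (the method name and cell contents below are invented for illustration; they simply mirror the cell-dictionary structure that `get_fabric_integration_template()` already returns):

```python
# Hypothetical method to add to PySparkTemplateManager in helpers/pyspark_helpers.py.
# It follows the same {"cells": [...]} structure as the existing templates.
from typing import Any, Dict


class PySparkTemplateManager:  # excerpt for illustration only
    @staticmethod
    def get_data_profiling_template() -> Dict[str, Any]:
        """Hypothetical template: a quick data-profiling notebook."""
        return {
            "cells": [
                {
                    "cell_type": "markdown",
                    "source": ["# Data Profiling\n"],
                    "metadata": {},
                },
                {
                    "cell_type": "code",
                    "source": [
                        "df = spark.table(\"lakehouse.my_table\")  # placeholder table\n",
                        "df.describe().show()\n",
                    ],
                    "execution_count": None,
                    "outputs": [],
                    "metadata": {},
                },
            ]
        }
```
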
467 | ### Code Generation Extensions
468 |
469 | Add new code generation patterns by extending the `PySparkCodeGenerator` class with additional methods for specific use cases.
470 |
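For instance (a sketch only, shown as a standalone function; inside `PySparkCodeGenerator` it would also take `self`, and the real class's conventions may differ):

```python
# Hypothetical generator in the spirit of PySparkCodeGenerator's methods:
# it returns a PySpark snippet as a string, ready to drop into a notebook cell.
def generate_deduplicate_code(table_name: str, key_columns: str) -> str:
    """Return a PySpark snippet that drops duplicate rows on the given key columns."""
    keys = ", ".join(f'"{col.strip()}"' for col in key_columns.split(","))
    return (
        f'df = spark.table("{table_name}")\n'
        f"deduped = df.dropDuplicates([{keys}])\n"
        f'deduped.write.format("delta").mode("overwrite").saveAsTable("{table_name}_deduped")\n'
    )


print(generate_deduplicate_code("lakehouse.sales", "order_id, line_id"))
```
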
471 | ### Performance Metrics
472 |
473 | The performance analysis tool provides:
474 | - Operation counts per cell
475 | - Performance issue detection
476 | - Optimization opportunity identification
477 | - Scoring system (0-100)
478 |
479 | This comprehensive PySpark development environment helps you write, test, and optimize PySpark notebooks efficiently in Microsoft Fabric!
480 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Microsoft Fabric MCP Server
2 |
3 | A comprehensive Python-based MCP (Model Context Protocol) server for interacting with Microsoft Fabric APIs, featuring advanced PySpark notebook development, testing, and optimization capabilities with LLM integration.
4 |
5 | ## 🚀 Features
6 |
7 | ### **Core Fabric Operations**
8 | - ✅ Workspace, lakehouse, warehouse, and table management
9 | - ✅ Delta table schemas and metadata retrieval
10 | - ✅ SQL query execution and data loading
11 | - ✅ Report and semantic model operations
12 |
13 | ### **Advanced PySpark Development**
14 | - 📓 **Intelligent notebook creation** with 6 specialized templates
15 | - 🔧 **Smart code generation** for common PySpark operations
16 | - ✅ **Comprehensive validation** with syntax and best practices checking
17 | - 🎯 **Fabric-specific optimizations** and compatibility checks
18 | - 📊 **Performance analysis** with scoring and optimization recommendations
19 | - 🚀 **Real-time monitoring** and execution insights
20 |
21 | ### **LLM Integration**
22 | - 🤖 **Natural language interface** for PySpark development
23 | - 🧠 **Context-aware assistance** with conversation memory
24 | - 🎨 **Intelligent code formatting** and explanations
25 | - 📈 **Smart optimization suggestions** based on project patterns
26 |
27 | ## 🏗️ Architecture
28 |
29 | ```mermaid
30 | graph TB
31 | subgraph "Developer Environment"
32 | IDE[IDE/VSCode]
33 | DEV[Developer]
34 | PROJ[Project Files]
35 | end
36 |
37 | subgraph "AI Layer"
38 | LLM[Large Language Model<br/>Claude/GPT/etc.]
39 | CONTEXT[Conversation Context]
40 | REASONING[AI Reasoning Engine]
41 | end
42 |
43 | subgraph "MCP Layer"
44 | MCP[MCP Server]
45 | TOOLS[PySpark Tools]
46 | HELPERS[PySpark Helpers]
47 | TEMPLATES[Template Manager]
48 | VALIDATORS[Code Validators]
49 | GENERATORS[Code Generators]
50 | end
51 |
52 | subgraph "Microsoft Fabric"
53 | API[Fabric API]
54 | WS[Workspace]
55 | LH[Lakehouse]
56 | NB[Notebooks]
57 | TABLES[Delta Tables]
58 | SPARK[Spark Clusters]
59 | end
60 |
61 | subgraph "Operations Flow"
62 | CREATE[Create Notebooks]
63 | VALIDATE[Validate Code]
64 | GENERATE[Generate Code]
65 | ANALYZE[Analyze Performance]
66 | DEPLOY[Deploy to Fabric]
67 | end
68 |
69 | %% Developer interactions
70 | DEV --> IDE
71 | IDE --> PROJ
72 |
73 | %% LLM interactions
74 | IDE <--> LLM
75 | LLM <--> CONTEXT
76 | LLM --> REASONING
77 |
78 | %% MCP interactions
79 | LLM <--> MCP
80 | MCP --> TOOLS
81 | TOOLS --> HELPERS
82 | TOOLS --> TEMPLATES
83 | TOOLS --> VALIDATORS
84 | TOOLS --> GENERATORS
85 |
86 | %% Fabric interactions
87 | MCP <--> API
88 | API --> WS
89 | WS --> LH
90 | WS --> NB
91 | LH --> TABLES
92 | NB --> SPARK
93 |
94 | %% Operation flows
95 | TOOLS --> CREATE
96 | TOOLS --> VALIDATE
97 | TOOLS --> GENERATE
98 | TOOLS --> ANALYZE
99 | CREATE --> DEPLOY
100 |
101 | %% Data flow arrows
102 | REASONING -.->|"Intelligent Decisions"| TOOLS
103 | CONTEXT -.->|"Project Awareness"| VALIDATORS
104 |
105 | %% Styling
106 | classDef devEnv fill:#e1f5fe
107 | classDef aiLayer fill:#fff9c4
108 | classDef mcpLayer fill:#f3e5f5
109 | classDef fabricLayer fill:#e8f5e8
110 | classDef operations fill:#fff3e0
111 |
112 | class IDE,DEV,PROJ devEnv
113 | class LLM,CONTEXT,REASONING aiLayer
114 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer
115 | class API,WS,LH,NB,TABLES,SPARK fabricLayer
116 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations
117 | ```
118 |
119 | ### **Interaction Flow**
120 | 1. **Developer requests assistance in IDE**
121 | 2. **IDE communicates with LLM (Claude/GPT)**
122 | 3. **LLM analyzes using context and reasoning**
123 | 4. **LLM calls MCP server tools intelligently**
124 | 5. **MCP tools interact with Fabric API**
125 | 6. **Results flow back through LLM with intelligent formatting**
126 | 7. **Developer receives contextual, smart responses**
127 |
128 | ## 📋 Requirements
129 |
130 | - **Python 3.12+**
131 | - **Azure credentials** for authentication
132 | - **uv** (from astral): [Installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installing-uv)
133 | - **Azure CLI**: [Installation instructions](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest)
134 | - **Optional: Node.js** for MCP inspector: [Installation instructions](https://nodejs.org/en/download)
135 |
136 | ## 🔧 Installation
137 |
138 | 1. **Clone the repository:**
139 | ```bash
140 | git clone https://github.com/your-repo/fabric-mcp.git
141 | cd fabric-mcp
142 | ```
143 |
144 | 2. **Set up virtual environment:**
145 | ```bash
146 | uv sync
147 | ```
148 |
149 | 3. **(Optional) Install dependencies with pip instead** (requires a `requirements.txt`, e.g. exported from `pyproject.toml`; `uv sync` above already installs everything):
150 | ```bash
151 | pip install -r requirements.txt
152 | ```
153 |
154 | ## 🚀 Usage
155 |
156 | 1. **Using STDIO**
157 |
158 | ### **Connect to Microsoft Fabric**
159 |
160 | ```bash
161 | az login --scope https://api.fabric.microsoft.com/.default
162 | ```
163 |
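Before starting the server, you can optionally confirm that your local credentials work by requesting a Fabric-scoped token with `DefaultAzureCredential` (the same credential type the server uses). This is just a sanity check:

```python
# Optional sanity check: verify local Azure credentials can obtain a Fabric API token.
from azure.identity import DefaultAzureCredential

credential = DefaultAzureCredential()
token = credential.get_token("https://api.fabric.microsoft.com/.default")
print(f"Token acquired; expires at (epoch seconds): {token.expires_on}")
```
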
164 | ### **Running with MCP Inspector**
165 |
166 | ```bash
167 | uv run --with mcp mcp dev fabric_mcp.py
168 | ```
169 | This starts the server with inspector at `http://localhost:6274`.
170 |
171 | ### **VSCode Integration**
172 |
173 | Add to your `launch.json` (replace `<path-to-repo>` with the path to your local clone):
174 | ```json
175 | {
176 | "mcp": {
177 | "servers": {
178 | "ms-fabric-mcp": {
179 | "type": "stdio",
180 | "command": "\\.venv\\Scripts\\python.exe",
181 | "args": ["\\fabric_mcp.py"]
182 | }
183 | }
184 | }
185 | }
186 | ```
187 |
188 | 2. **Using HTTP**
189 | ### **Start the MCP Server**
190 | ```bash
191 | uv run python .\fabric_mcp.py --port 8081
192 | ```
193 |
194 | ### **VSCode Integration**
195 |
196 | Add to your `launch.json` (replace `<host>` with the address of the machine running the server, e.g. `localhost`):
197 | ```json
198 | {
199 | "mcp": {
200 | "servers": {
201 | "ms-fabric-mcp": {
202 | "type": "http",
203 | "url": "http://:8081/mcp/",
204 | "headers": {
205 | "Accept": "application/json,text/event-stream",
206 | }
207 | }
208 | }
209 | }
210 | }
211 | ```
212 |
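Once the server is running, a quick connectivity check is to post a raw JSON-RPC request to the endpoint, mirroring what `test_security.py` does. Adjust the host and port to your deployment; depending on the transport configuration the server may expect an `initialize` handshake first, so treat this purely as a reachability probe:

```python
# Reachability probe for the HTTP MCP endpoint (illustrative; adjust host/port).
import requests

payload = {"jsonrpc": "2.0", "id": 1, "method": "tools/list"}
response = requests.post(
    "http://localhost:8081/mcp/",
    json=payload,
    headers={
        "Accept": "application/json, text/event-stream",
        "Content-Type": "application/json",
    },
    timeout=10,
)
print(response.status_code)
print(response.text[:500])  # truncated preview of the JSON-RPC response
```
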
213 | ## 🛠️ Complete Tool Reference
214 |
215 | ### **1. Workspace Management**
216 |
217 | #### `list_workspaces`
218 | List all available Fabric workspaces.
219 | ```python
220 | # Usage in LLM: "List all my Fabric workspaces"
221 | ```
222 |
223 | #### `set_workspace`
224 | Set the current workspace context for the session.
225 | ```python
226 | set_workspace(workspace="Analytics-Workspace")
227 | ```
228 |
229 | ### **2. Lakehouse Operations**
230 |
231 | #### `list_lakehouses`
232 | List all lakehouses in a workspace.
233 | ```python
234 | list_lakehouses(workspace="Analytics-Workspace")
235 | ```
236 |
237 | #### `create_lakehouse`
238 | Create a new lakehouse.
239 | ```python
240 | create_lakehouse(
241 | name="Sales-Data-Lake",
242 | workspace="Analytics-Workspace",
243 | description="Sales data lakehouse"
244 | )
245 | ```
246 |
247 | #### `set_lakehouse`
248 | Set current lakehouse context.
249 | ```python
250 | set_lakehouse(lakehouse="Sales-Data-Lake")
251 | ```
252 |
253 | ### **3. Warehouse Operations**
254 |
255 | #### `list_warehouses`
256 | List all warehouses in a workspace.
257 | ```python
258 | list_warehouses(workspace="Analytics-Workspace")
259 | ```
260 |
261 | #### `create_warehouse`
262 | Create a new warehouse.
263 | ```python
264 | create_warehouse(
265 | name="Sales-DW",
266 | workspace="Analytics-Workspace",
267 | description="Sales data warehouse"
268 | )
269 | ```
270 |
271 | #### `set_warehouse`
272 | Set current warehouse context.
273 | ```python
274 | set_warehouse(warehouse="Sales-DW")
275 | ```
276 |
277 | ### **4. Table Operations**
278 |
279 | #### `list_tables`
280 | List all tables in a lakehouse.
281 | ```python
282 | list_tables(workspace="Analytics-Workspace", lakehouse="Sales-Data-Lake")
283 | ```
284 |
285 | #### `get_lakehouse_table_schema`
286 | Get schema for a specific table.
287 | ```python
288 | get_lakehouse_table_schema(
289 | workspace="Analytics-Workspace",
290 | lakehouse="Sales-Data-Lake",
291 | table_name="transactions"
292 | )
293 | ```
294 |
295 | #### `get_all_lakehouse_schemas`
296 | Get schemas for all tables in a lakehouse.
297 | ```python
298 | get_all_lakehouse_schemas(
299 | workspace="Analytics-Workspace",
300 | lakehouse="Sales-Data-Lake"
301 | )
302 | ```
303 |
304 | #### `set_table`
305 | Set current table context.
306 | ```python
307 | set_table(table_name="transactions")
308 | ```
309 |
310 | ### **5. SQL Operations**
311 |
312 | #### `get_sql_endpoint`
313 | Get SQL endpoint for lakehouse or warehouse.
314 | ```python
315 | get_sql_endpoint(
316 | workspace="Analytics-Workspace",
317 | lakehouse="Sales-Data-Lake",
318 | type="lakehouse"
319 | )
320 | ```
321 |
322 | #### `run_query`
323 | Execute SQL queries.
324 | ```python
325 | run_query(
326 | workspace="Analytics-Workspace",
327 | lakehouse="Sales-Data-Lake",
328 | query="SELECT COUNT(*) FROM transactions",
329 | type="lakehouse"
330 | )
331 | ```
332 |
333 | ### **6. Data Loading**
334 |
335 | #### `load_data_from_url`
336 | Load data from URL into tables.
337 | ```python
338 | load_data_from_url(
339 | url="https://example.com/data.csv",
340 | destination_table="new_data",
341 | workspace="Analytics-Workspace",
342 | lakehouse="Sales-Data-Lake"
343 | )
344 | ```
345 |
346 | ### **7. Reports & Models**
347 |
348 | #### `list_reports`
349 | List all reports in a workspace.
350 | ```python
351 | list_reports(workspace="Analytics-Workspace")
352 | ```
353 |
354 | #### `get_report`
355 | Get specific report details.
356 | ```python
357 | get_report(workspace="Analytics-Workspace", report_id="report-id")
358 | ```
359 |
360 | #### `list_semantic_models`
361 | List semantic models in workspace.
362 | ```python
363 | list_semantic_models(workspace="Analytics-Workspace")
364 | ```
365 |
366 | #### `get_semantic_model`
367 | Get specific semantic model.
368 | ```python
369 | get_semantic_model(workspace="Analytics-Workspace", model_id="model-id")
370 | ```
371 |
372 | ### **8. Basic Notebook Operations**
373 |
374 | #### `list_notebooks`
375 | List all notebooks in a workspace.
376 | ```python
377 | list_notebooks(workspace="Analytics-Workspace")
378 | ```
379 |
380 | #### `get_notebook_content`
381 | Retrieve notebook content.
382 | ```python
383 | get_notebook_content(
384 | workspace="Analytics-Workspace",
385 | notebook_id="notebook-id"
386 | )
387 | ```
388 |
389 | #### `update_notebook_cell`
390 | Update specific notebook cells.
391 | ```python
392 | update_notebook_cell(
393 | workspace="Analytics-Workspace",
394 | notebook_id="notebook-id",
395 | cell_index=0,
396 | cell_content="print('Hello, Fabric!')",
397 | cell_type="code"
398 | )
399 | ```
400 |
401 | ### **9. Advanced PySpark Notebook Creation**
402 |
403 | #### `create_pyspark_notebook`
404 | Create notebooks from basic templates.
405 | ```python
406 | create_pyspark_notebook(
407 | workspace="Analytics-Workspace",
408 | notebook_name="Data-Analysis",
409 | template_type="analytics" # Options: basic, etl, analytics, ml
410 | )
411 | ```
412 |
413 | #### `create_fabric_notebook`
414 | Create Fabric-optimized notebooks.
415 | ```python
416 | create_fabric_notebook(
417 | workspace="Analytics-Workspace",
418 | notebook_name="Fabric-Pipeline",
419 | template_type="fabric_integration" # Options: fabric_integration, streaming
420 | )
421 | ```
422 |
423 | ### **10. PySpark Code Generation**
424 |
425 | #### `generate_pyspark_code`
426 | Generate code for common operations.
427 | ```python
428 | generate_pyspark_code(
429 | operation="read_table",
430 | source_table="sales.transactions",
431 | columns="id,amount,date"
432 | )
433 |
434 | # Available operations:
435 | # - read_table, write_table, transform, join, aggregate
436 | # - schema_inference, data_quality, performance_optimization
437 | ```
438 |
439 | #### `generate_fabric_code`
440 | Generate Fabric-specific code.
441 | ```python
442 | generate_fabric_code(
443 | operation="read_lakehouse",
444 | lakehouse_name="Sales-Data-Lake",
445 | table_name="transactions"
446 | )
447 |
448 | # Available operations:
449 | # - read_lakehouse, write_lakehouse, merge_delta, performance_monitor
450 | ```
451 |
452 | ### **11. Code Validation & Analysis**
453 |
454 | #### `validate_pyspark_code`
455 | Validate PySpark code syntax and best practices.
456 | ```python
457 | validate_pyspark_code(code="""
458 | df = spark.table('transactions')
459 | df.show()
460 | """)
461 | ```
462 |
463 | #### `validate_fabric_code`
464 | Validate Fabric compatibility.
465 | ```python
466 | validate_fabric_code(code="""
467 | df = spark.table('lakehouse.transactions')
468 | df.write.format('delta').saveAsTable('summary')
469 | """)
470 | ```
471 |
472 | #### `analyze_notebook_performance`
473 | Comprehensive performance analysis.
474 | ```python
475 | analyze_notebook_performance(
476 | workspace="Analytics-Workspace",
477 | notebook_id="notebook-id"
478 | )
479 | ```
480 |
481 | ### **12. Context Management**
482 |
483 | #### `clear_context`
484 | Clear current session context.
485 | ```python
486 | clear_context()
487 | ```
488 |
489 | ## 📊 PySpark Templates
490 |
491 | ### **Basic Templates**
492 | 1. **basic**: Fundamental PySpark operations and DataFrame usage
493 | 2. **etl**: Complete ETL pipeline with data cleaning and Delta Lake
494 | 3. **analytics**: Advanced analytics with aggregations and window functions
495 | 4. **ml**: Machine learning pipeline with MLlib and feature engineering
496 |
497 | ### **Advanced Templates**
498 | 1. **fabric_integration**: Lakehouse connectivity and Fabric-specific utilities
499 | 2. **streaming**: Real-time processing with Structured Streaming
500 |
501 | ## 🎯 Best Practices
502 |
503 | ### **Fabric Optimization**
504 | ```python
505 | # ✅ Use managed tables
506 | df = spark.table("lakehouse.my_table")
507 |
508 | # ✅ Use Delta Lake format
509 | df.write.format("delta").mode("overwrite").saveAsTable("my_table")
510 |
511 | # ✅ Leverage notebookutils
512 | import notebookutils as nbu
513 | workspace_id = nbu.runtime.context.workspaceId
514 | ```
515 |
516 | ### **Performance Optimization**
517 | ```python
518 | # ✅ Cache frequently used DataFrames
519 | df.cache()
520 |
521 | # ✅ Use broadcast for small tables
522 | from pyspark.sql.functions import broadcast
523 | result = large_df.join(broadcast(small_df), "key")
524 |
525 | # ✅ Partition large datasets
526 | df.write.partitionBy("year", "month").saveAsTable("partitioned_table")
527 | ```
528 |
529 | ### **Code Quality**
530 | ```python
531 | # ✅ Define explicit schemas
532 | schema = StructType([
533 | StructField("id", IntegerType(), True),
534 | StructField("name", StringType(), True)
535 | ])
536 |
537 | # ✅ Handle null values
538 | df.filter(col("column").isNotNull())
539 | ```
540 |
541 | ## 🔄 Example LLM-Enhanced Workflows
542 |
543 | ### **Natural Language Requests**
544 | ```
545 | Human: "Create a PySpark notebook that reads sales data, cleans it, and optimizes performance"
546 |
547 | LLM Response:
548 | 1. Creates Fabric-optimized notebook with ETL template
549 | 2. Generates lakehouse reading code
550 | 3. Adds data cleaning transformations
551 | 4. Includes performance optimization patterns
552 | 5. Validates code for best practices
553 | ```
554 |
555 | ### **Performance Analysis**
556 | ```
557 | Human: "My PySpark notebook is slow. Help me optimize it."
558 |
559 | LLM Response:
560 | 1. Analyzes notebook performance (scoring 0-100)
561 | 2. Identifies anti-patterns and bottlenecks
562 | 3. Suggests specific optimizations
563 | 4. Generates optimized code alternatives
564 | 5. Provides before/after comparisons
565 | ```
566 |
567 | ## 🔍 Troubleshooting
568 |
569 | ### **Common Issues**
570 | - **Authentication**: Ensure `az login` with correct scope
571 | - **Context**: Use `clear_context()` to reset session state
572 | - **Workspace**: Verify workspace names and permissions
573 | - **Templates**: Check available template types in documentation
574 |
575 | ### **Getting Help**
576 | - Use validation tools for code issues
577 | - Check performance analysis for optimization opportunities
578 | - Leverage LLM natural language interface for guidance
579 |
580 | ## 📈 Performance Metrics
581 |
582 | The analysis tools provide:
583 | - **Operation counts** per notebook cell
584 | - **Performance issues** detection and flagging
585 | - **Optimization opportunities** identification
586 | - **Scoring system** (0-100) for code quality
587 | - **Fabric compatibility** assessment
588 |
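589 | The same kinds of pattern checks are implemented by the validation helpers in
590 | `helpers/pyspark_helpers.py`. As a rough sketch, the performance checker can also be
591 | called directly:
592 | 
593 | ```python
594 | from helpers.pyspark_helpers import PySparkValidator
595 | 
596 | code = 'df = spark.read.parquet("Files/raw").collect()'
597 | report = PySparkValidator.check_performance_patterns(code)
598 | print(report["warnings"])        # flags .collect() on potentially large data
599 | print(report["optimizations"])   # optimization hints (empty for this snippet)
600 | ```
601 | 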
589 | ## 🤝 Contributing
590 |
591 | This project welcomes contributions! Please see our contributing guidelines for details.
592 |
593 | ## 📄 License
594 |
595 | This project is licensed under the MIT License. See the LICENSE file for details.
596 |
597 | ## 🙏 Acknowledgments
598 |
599 | Inspired by: https://github.com/Augustab/microsoft_fabric_mcp/tree/main
600 |
601 | ---
602 |
603 | **Ready to supercharge your Microsoft Fabric development with intelligent PySpark assistance!** 🚀
604 |
--------------------------------------------------------------------------------
/helpers/pyspark_helpers.py:
--------------------------------------------------------------------------------
1 | """
2 | PySpark helper utilities for Microsoft Fabric MCP Server.
3 | This module provides templates, code generation, and execution helpers for PySpark notebooks.
4 | """
5 |
6 | import json
7 | from typing import Dict, List, Any, Optional
8 | from helpers.logging_config import get_logger
9 |
10 | logger = get_logger(__name__)
11 |
12 | class PySparkTemplateManager:
13 | """Manages PySpark notebook templates and code generation."""
14 |
15 | @staticmethod
16 | def get_fabric_integration_template() -> Dict[str, Any]:
17 | """Template for Fabric-specific PySpark operations."""
18 | return {
19 | "cells": [
20 | {
21 | "cell_type": "markdown",
22 | "source": [
23 | "# Microsoft Fabric PySpark Integration\n",
24 | "\n",
25 | "This notebook demonstrates integration with Microsoft Fabric resources using PySpark.\n"
26 | ],
27 | "metadata": {}
28 | },
29 | {
30 | "cell_type": "code",
31 | "source": [
32 | "# Initialize Fabric integration\n",
33 | "from pyspark.sql import SparkSession\n",
34 | "from pyspark.sql.functions import *\n",
35 | "from pyspark.sql.types import *\n",
36 | "from delta.tables import DeltaTable\n",
37 | "import notebookutils as nbu\n",
38 | "\n",
39 | "# Spark session is pre-configured in Fabric\n",
40 | "print(f\"Spark version: {spark.version}\")\n",
41 | "print(f\"Available cores: {spark.sparkContext.defaultParallelism}\")\n",
42 | "\n",
43 | "# Get current workspace and lakehouse context\n",
44 | "print(f\"Current workspace: {nbu.runtime.context.workspaceId}\")\n"
45 | ],
46 | "execution_count": None,
47 | "outputs": [],
48 | "metadata": {}
49 | },
50 | {
51 | "cell_type": "code",
52 | "source": [
53 | "# Connect to Fabric Lakehouse\n",
54 | "# List available tables in the default lakehouse\n",
55 | "try:\n",
56 | " tables = spark.sql(\"SHOW TABLES\").collect()\n",
57 | " print(\"Available tables in current lakehouse:\")\n",
58 | " for table in tables:\n",
59 | " print(f\" - {table.database}.{table.tableName}\")\n",
60 | "except Exception as e:\n",
61 | " print(f\"No default lakehouse attached or no tables found: {e}\")\n"
62 | ],
63 | "execution_count": None,
64 | "outputs": [],
65 | "metadata": {}
66 | },
67 | {
68 | "cell_type": "code",
69 | "source": [
70 | "# Read from Fabric Lakehouse table\n",
71 | "# Replace 'your_table_name' with actual table name\n",
72 | "# df = spark.table(\"your_table_name\")\n",
73 | "\n",
74 | "# Alternative: Read from files in Lakehouse\n",
75 | "# df = spark.read.format(\"delta\").load(\"Tables/your_table_name\")\n",
76 | "\n",
77 | "# For demo, create sample data\n",
78 | "sample_data = [\n",
79 | " (1, \"Product A\", 100.0, \"2024-01-01\"),\n",
80 | " (2, \"Product B\", 150.0, \"2024-01-02\"),\n",
81 | " (3, \"Product C\", 200.0, \"2024-01-03\")\n",
82 | "]\n",
83 | "\n",
84 | "schema = StructType([\n",
85 | " StructField(\"id\", IntegerType(), True),\n",
86 | " StructField(\"product_name\", StringType(), True),\n",
87 | " StructField(\"price\", DoubleType(), True),\n",
88 | " StructField(\"date_created\", StringType(), True)\n",
89 | "])\n",
90 | "\n",
91 | "df = spark.createDataFrame(sample_data, schema)\n",
92 | "df = df.withColumn(\"date_created\", to_date(col(\"date_created\"), \"yyyy-MM-dd\"))\n",
93 | "df.show()\n"
94 | ],
95 | "execution_count": None,
96 | "outputs": [],
97 | "metadata": {}
98 | },
99 | {
100 | "cell_type": "code",
101 | "source": [
102 | "# Write to Fabric Lakehouse as Delta table\n",
103 | "table_name = \"fabric_demo_products\"\n",
104 | "\n",
105 | "# Option 1: Write as managed table\n",
106 | "df.write \\\n",
107 | " .format(\"delta\") \\\n",
108 | " .mode(\"overwrite\") \\\n",
109 | " .option(\"overwriteSchema\", \"true\") \\\n",
110 | " .saveAsTable(table_name)\n",
111 | "\n",
112 | "print(f\"Successfully wrote {df.count()} records to table '{table_name}'\")\n",
113 | "\n",
114 | "# Verify the table was created\n",
115 | "result = spark.table(table_name)\n",
116 | "print(\"\\nTable verification:\")\n",
117 | "result.show()\n"
118 | ],
119 | "execution_count": None,
120 | "outputs": [],
121 | "metadata": {}
122 | },
123 | {
124 | "cell_type": "code",
125 | "source": [
126 | "# Advanced Delta Lake operations in Fabric\n",
127 | "from delta.tables import DeltaTable\n",
128 | "\n",
129 | "# Create DeltaTable reference\n",
130 | "delta_table = DeltaTable.forName(spark, table_name)\n",
131 | "\n",
132 | "# Show table history\n",
133 | "print(\"Table history:\")\n",
134 | "delta_table.history().show(truncate=False)\n",
135 | "\n",
136 | "# Perform merge operation (upsert)\n",
137 | "new_data = [\n",
138 | " (1, \"Product A Updated\", 110.0, \"2024-01-01\"), # Update existing\n",
139 | " (4, \"Product D\", 250.0, \"2024-01-04\") # Insert new\n",
140 | "]\n",
141 | "\n",
142 | "new_df = spark.createDataFrame(new_data, schema)\n",
143 | "new_df = new_df.withColumn(\"date_created\", to_date(col(\"date_created\"), \"yyyy-MM-dd\"))\n",
144 | "\n",
145 | "# Merge operation\n",
146 | "delta_table.alias(\"target\") \\\n",
147 | " .merge(\n",
148 | " new_df.alias(\"source\"),\n",
149 | " \"target.id = source.id\"\n",
150 | " ) \\\n",
151 | " .whenMatchedUpdateAll() \\\n",
152 | " .whenNotMatchedInsertAll() \\\n",
153 | " .execute()\n",
154 | "\n",
155 | "print(\"\\nAfter merge operation:\")\n",
156 | "spark.table(table_name).show()\n"
157 | ],
158 | "execution_count": None,
159 | "outputs": [],
160 | "metadata": {}
161 | }
162 | ]
163 | }
164 |
165 | @staticmethod
166 | def get_streaming_template() -> Dict[str, Any]:
167 | """Template for PySpark Structured Streaming in Fabric."""
168 | return {
169 | "cells": [
170 | {
171 | "cell_type": "markdown",
172 | "source": [
173 | "# PySpark Structured Streaming in Fabric\n",
174 | "\n",
175 | "This notebook demonstrates real-time data processing using PySpark Structured Streaming.\n"
176 | ],
177 | "metadata": {}
178 | },
179 | {
180 | "cell_type": "code",
181 | "source": [
182 | "# Import streaming libraries\n",
183 | "from pyspark.sql import SparkSession\n",
184 | "from pyspark.sql.functions import *\n",
185 | "from pyspark.sql.types import *\n",
186 | "import time\n",
187 | "\n",
188 | "print(f\"Spark version: {spark.version}\")\n",
189 | "print(\"Structured Streaming capabilities enabled\")\n"
190 | ],
191 | "execution_count": None,
192 | "outputs": [],
193 | "metadata": {}
194 | },
195 | {
196 | "cell_type": "code",
197 | "source": [
198 | "# Define schema for streaming data\n",
199 | "streaming_schema = StructType([\n",
200 | " StructField(\"timestamp\", TimestampType(), True),\n",
201 | " StructField(\"user_id\", StringType(), True),\n",
202 | " StructField(\"event_type\", StringType(), True),\n",
203 | " StructField(\"value\", DoubleType(), True)\n",
204 | "])\n",
205 | "\n",
206 | "# Create a streaming DataFrame (example with rate source for demo)\n",
207 | "streaming_df = spark \\\n",
208 | " .readStream \\\n",
209 | " .format(\"rate\") \\\n",
210 | " .option(\"rowsPerSecond\", 10) \\\n",
211 | " .load()\n",
212 | "\n",
213 | "# Transform the rate stream to simulate real events\n",
214 | "events_df = streaming_df \\\n",
215 | " .withColumn(\"user_id\", concat(lit(\"user_\"), (col(\"value\") % 100).cast(\"string\"))) \\\n",
216 | " .withColumn(\"event_type\", \n",
217 | " when(col(\"value\") % 3 == 0, \"purchase\")\n",
218 | " .when(col(\"value\") % 3 == 1, \"view\")\n",
219 | " .otherwise(\"click\")\n",
220 | " ) \\\n",
221 | " .withColumn(\"event_value\", (col(\"value\") % 1000).cast(\"double\")) \\\n",
222 | " .select(\"timestamp\", \"user_id\", \"event_type\", \"event_value\")\n",
223 | "\n",
224 | "print(\"Streaming DataFrame created\")\n",
225 | "print(f\"Schema: {events_df.schema}\")\n"
226 | ],
227 | "execution_count": None,
228 | "outputs": [],
229 | "metadata": {}
230 | },
231 | {
232 | "cell_type": "code",
233 | "source": [
234 | "# Streaming aggregations\n",
235 | "# Count events by type in 30-second windows\n",
236 | "windowed_counts = events_df \\\n",
237 | " .withWatermark(\"timestamp\", \"30 seconds\") \\\n",
238 | " .groupBy(\n",
239 | " window(col(\"timestamp\"), \"30 seconds\"),\n",
240 | " col(\"event_type\")\n",
241 | " ) \\\n",
242 | " .count() \\\n",
243 | " .orderBy(\"window\")\n",
244 | "\n",
245 | "# Start streaming query (console output)\n",
246 | "query = windowed_counts \\\n",
247 | " .writeStream \\\n",
248 | " .outputMode(\"complete\") \\\n",
249 | " .format(\"console\") \\\n",
250 | " .option(\"truncate\", False) \\\n",
251 | " .trigger(processingTime=\"10 seconds\") \\\n",
252 | " .start()\n",
253 | "\n",
254 | "print(\"Streaming query started. Check output below...\")\n",
255 | "print(f\"Query ID: {query.id}\")\n"
256 | ],
257 | "execution_count": None,
258 | "outputs": [],
259 | "metadata": {}
260 | },
261 | {
262 | "cell_type": "code",
263 | "source": [
264 | "# Let the stream run for a short time\n",
265 | "import time\n",
266 | "time.sleep(30) # Run for 30 seconds\n",
267 | "\n",
268 | "# Stop the query\n",
269 | "query.stop()\n",
270 | "print(\"Streaming query stopped\")\n",
271 | "\n",
272 | "# Show query progress\n",
273 | "print(\"\\nQuery progress:\")\n",
274 | "print(query.lastProgress)\n"
275 | ],
276 | "execution_count": None,
277 | "outputs": [],
278 | "metadata": {}
279 | },
280 | {
281 | "cell_type": "code",
282 | "source": [
283 | "# Stream to Delta Lake table\n",
284 | "streaming_table = \"streaming_events\"\n",
285 | "\n",
286 | "# Create another streaming query that writes to Delta\n",
287 | "delta_query = events_df \\\n",
288 | " .writeStream \\\n",
289 | " .format(\"delta\") \\\n",
290 | " .outputMode(\"append\") \\\n",
291 | " .option(\"checkpointLocation\", \"/tmp/checkpoint/streaming_events\") \\\n",
292 | " .table(streaming_table)\n",
293 | "\n",
294 | "print(f\"Started streaming to Delta table: {streaming_table}\")\n",
295 | "print(f\"Query ID: {delta_query.id}\")\n",
296 | "\n",
297 | "# Let it run briefly\n",
298 | "time.sleep(20)\n",
299 | "\n",
300 | "# Stop and check results\n",
301 | "delta_query.stop()\n",
302 | "\n",
303 | "# Read from the Delta table\n",
304 | "result_df = spark.table(streaming_table)\n",
305 | "print(f\"\\nTotal records in Delta table: {result_df.count()}\")\n",
306 | "result_df.show(20)\n"
307 | ],
308 | "execution_count": None,
309 | "outputs": [],
310 | "metadata": {}
311 | }
312 | ]
313 | }
314 |
315 | class PySparkCodeGenerator:
316 | """Generates PySpark code snippets for common operations."""
317 |
318 | @staticmethod
319 | def generate_fabric_lakehouse_reader(lakehouse_name: str, table_name: str) -> str:
320 | """Generate code to read from a Fabric Lakehouse table."""
321 | return f"""# Read from Fabric Lakehouse table
322 | df = spark.table("{lakehouse_name}.{table_name}")
323 |
324 | # Alternative: Read from Delta files directly
325 | # df = spark.read.format("delta").load("Tables/{table_name}")
326 |
327 | # Show basic info
328 | print(f"Records: {{df.count()}}")
329 | print(f"Columns: {{len(df.columns)}}")
330 | df.printSchema()
331 | df.show(10)"""
332 |
333 | @staticmethod
334 | def generate_fabric_lakehouse_writer(table_name: str, mode: str = "overwrite") -> str:
335 | """Generate code to write to a Fabric Lakehouse table."""
336 | return f"""# Write to Fabric Lakehouse table
337 | df.write \\
338 | .format("delta") \\
339 | .mode("{mode}") \\
340 | .option("overwriteSchema", "true") \\
341 | .saveAsTable("{table_name}")
342 |
343 | print(f"Successfully wrote {{df.count()}} records to table '{table_name}'")
344 |
345 | # Verify the write
346 | verification_df = spark.table("{table_name}")
347 | print(f"Verification - Table now has {{verification_df.count()}} records")"""
348 |
349 | @staticmethod
350 | def generate_delta_merge_operation(target_table: str, source_df_name: str, join_condition: str) -> str:
351 | """Generate code for Delta Lake merge operations."""
352 | return f"""# Delta Lake merge operation (UPSERT)
353 | from delta.tables import DeltaTable
354 |
355 | # Create DeltaTable reference
356 | target_table = DeltaTable.forName(spark, "{target_table}")
357 |
358 | # Perform merge operation
359 | target_table.alias("target") \\
360 | .merge(
361 | {source_df_name}.alias("source"),
362 | "{join_condition}"
363 | ) \\
364 | .whenMatchedUpdateAll() \\
365 | .whenNotMatchedInsertAll() \\
366 | .execute()
367 |
368 | print("Merge operation completed successfully")
369 | print(f"Table now has {{spark.table('{target_table}').count()}} records")"""
370 |
371 | @staticmethod
372 | def generate_performance_monitoring() -> str:
373 | """Generate code for monitoring PySpark performance."""
374 | return """# PySpark Performance Monitoring
375 |
376 | # 1. Check Spark configuration
377 | print("=== Spark Configuration ===")
378 | for key, value in spark.sparkContext.getConf().getAll():
379 | if 'spark.sql' in key or 'spark.serializer' in key:
380 | print(f"{key}: {value}")
381 |
382 | # 2. Monitor DataFrame operations
383 | from pyspark.sql.utils import AnalysisException
384 | import time
385 |
386 | def monitor_operation(df, operation_name):
387 | start_time = time.time()
388 | try:
389 | count = df.count()
390 | end_time = time.time()
391 | duration = end_time - start_time
392 | print(f"{operation_name}: {count} records in {duration:.2f} seconds")
393 | return count, duration
394 | except Exception as e:
395 | print(f"Error in {operation_name}: {e}")
396 | return 0, 0
397 |
398 | # Example usage:
399 | # count, duration = monitor_operation(df, "DataFrame Count")
400 |
401 | # 3. Show execution plan
402 | print("\\n=== Execution Plan ===")
403 | df.explain(True)
404 |
405 | # 4. Cache analysis
406 | print("\\n=== Storage Levels ===")
407 | print(f"DataFrame cached: {df.is_cached}")
408 | if df.is_cached:
409 | print(f"Storage level: {df.storageLevel}")"""
410 |
411 | class PySparkValidator:
412 | """Validates PySpark code and suggests optimizations."""
413 |
414 | @staticmethod
415 | def validate_fabric_compatibility(code: str) -> Dict[str, List[str]]:
416 | """Check if code is compatible with Microsoft Fabric."""
417 | issues = []
418 | suggestions = []
419 |
420 | # Check for Fabric-specific patterns
421 | if 'SparkSession.builder' in code:
422 | issues.append("❌ Don't create SparkSession in Fabric - use pre-configured 'spark' variable")
423 |
424 | if 'notebookutils' not in code and any(pattern in code for pattern in ['lakehouse', 'workspace']):
425 | suggestions.append("💡 Consider using 'notebookutils' for Fabric integration")
426 |
427 | if '.saveAsTable(' in code and 'format("delta")' not in code:
428 | suggestions.append("💡 Specify Delta format explicitly when saving tables in Fabric")
429 |
430 | if 'jdbc' in code.lower():
431 | suggestions.append("💡 Consider using Fabric's built-in connectors instead of JDBC")
432 |
433 | return {
434 | "issues": issues,
435 | "suggestions": suggestions
436 | }
437 |
438 | @staticmethod
439 | def check_performance_patterns(code: str) -> Dict[str, List[str]]:
440 | """Check for performance anti-patterns and optimizations."""
441 | warnings = []
442 | optimizations = []
443 |
444 | # Performance anti-patterns
445 | if '.collect()' in code:
446 | warnings.append("⚠️ .collect() can cause OOM on large datasets")
447 |
448 | if 'rdd.' in code and 'parallelize' not in code:
449 | warnings.append("⚠️ RDD operations are less optimized than DataFrame operations")
450 |
451 | if code.count('spark.read') > 3 and '.cache()' not in code:
452 | optimizations.append("💡 Consider caching frequently accessed DataFrames")
453 |
454 | if '.join(' in code and 'broadcast' not in code:
455 | optimizations.append("💡 Consider broadcast joins for small dimension tables")
456 |
457 | if '.write.' in code and 'partitionBy' not in code:
458 | optimizations.append("💡 Consider partitioning large datasets for better performance")
459 |
460 | return {
461 | "warnings": warnings,
462 | "optimizations": optimizations
463 | }
464 |
465 | def create_notebook_from_template(template_name: str, custom_params: Optional[Dict] = None) -> Dict[str, Any]:
466 | """Create a complete notebook from a template."""
467 | template_manager = PySparkTemplateManager()
468 |
469 | templates = {
470 | "fabric_integration": template_manager.get_fabric_integration_template(),
471 | "streaming": template_manager.get_streaming_template(),
472 | }
473 |
474 | if template_name not in templates:
475 | raise ValueError(f"Unknown template: {template_name}. Available: {list(templates.keys())}")
476 |
477 | template = templates[template_name]
478 |
479 | # Create notebook structure
480 | notebook = {
481 | "nbformat": 4,
482 | "nbformat_minor": 5,
483 | "cells": template["cells"],
484 | "metadata": {
485 | "language_info": {"name": "python"},
486 | "kernel_info": {"name": "synapse_pyspark"},
487 | "description": f"PySpark notebook created from {template_name} template"
488 | }
489 | }
490 |
491 | return notebook
492 |
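493 | # Example usage (sketch): build a notebook dict from a template and serialize it
494 | # before handing it to a notebook client, e.g.:
495 | #   notebook = create_notebook_from_template("streaming")
496 | #   payload = json.dumps(notebook)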
--------------------------------------------------------------------------------
/helpers/clients/fabric_client.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | from typing import Dict, Any, List, Optional, Tuple, Union
3 | import base64
4 | from urllib.parse import quote
5 | from functools import lru_cache
6 | import requests
7 | from azure.identity import DefaultAzureCredential
8 | from helpers.logging_config import get_logger
9 | from helpers.utils import _is_valid_uuid
10 | import json
11 | from uuid import UUID
12 |
13 | logger = get_logger(__name__)
14 | # from sempy_labs._helper_functions import create_item
15 |
16 |
17 |
18 | class FabricApiConfig(BaseModel):
19 | """Configuration for Fabric API"""
20 |
21 | base_url: str = "https://api.fabric.microsoft.com/v1"
22 | max_results: int = 100
23 |
24 |
25 | class FabricApiClient:
26 | """Client for communicating with the Fabric API"""
27 |
28 | def __init__(self, credential=None, config=None):
29 | self.credential = credential or DefaultAzureCredential()
30 | self.config = config or FabricApiConfig()
31 |         # Caches for resolved IDs (plain dicts; lru_cache would cache coroutine objects)
32 |         self._workspace_cache: Dict[str, str] = {}
33 |         self._lakehouse_cache: Dict[Tuple[str, str], str] = {}
34 |
35 | def _get_headers(self) -> Dict[str, str]:
36 | """Get headers for Fabric API calls"""
37 | return {
38 | "Authorization": f"Bearer {self.credential.get_token('https://api.fabric.microsoft.com/.default').token}"
39 | }
40 |
41 | def _build_url(
42 | self, endpoint: str, continuation_token: Optional[str] = None
43 | ) -> str:
44 | # If the endpoint starts with http, use it as-is.
45 | url = (
46 | endpoint
47 | if endpoint.startswith("http")
48 | else f"{self.config.base_url}/{endpoint.lstrip('/')}"
49 | )
50 | if continuation_token:
51 | separator = "&" if "?" in url else "?"
52 | encoded_token = quote(continuation_token)
53 | url += f"{separator}continuationToken={encoded_token}"
54 | return url
55 |
56 | async def _make_request(
57 | self,
58 | endpoint: str,
59 | params: Optional[Dict] = None,
60 | method: str = "GET",
61 | use_pagination: bool = False,
62 | data_key: str = "value",
63 | lro: bool = False,
64 | lro_poll_interval: int = 2, # seconds between polls
65 | lro_timeout: int = 300, # max seconds to wait
66 | ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
67 | """
68 | Make an asynchronous call to the Fabric API.
69 |
70 | If use_pagination is True, it will automatically handle paginated responses.
71 |
72 | If lro is True, will poll for long-running operation completion.
73 | """
74 | import time
75 |
76 | params = params or {}
77 |
78 | if not use_pagination:
79 | url = self._build_url(endpoint=endpoint)
80 | try:
81 | if method.upper() == "POST":
82 | # logger.debug(f"Authorization header: {self._get_headers()}")
83 | # logger.debug(f"Request URL: {url}")
84 | # logger.debug(f"Request parameters: {params}")
85 | response = requests.post(
86 | url,
87 | headers=self._get_headers(),
88 | json=params,
89 | timeout=120,
90 | )
91 | else:
92 | if "maxResults" not in params:
93 | params["maxResults"] = self.config.max_results
94 | response = requests.request(
95 | method=method.upper(),
96 | url=url,
97 | headers=self._get_headers(),
98 | params=params,
99 | timeout=120,
100 | )
101 |
102 | # LRO support: check for 202 and Operation-Location
103 | if lro and response.status_code == 202:
104 | op_url = response.headers.get(
105 | "Operation-Location"
106 | ) or response.headers.get("operation-location")
107 | if not op_url:
108 | logger.error("LRO: No Operation-Location header found.")
109 | return None
110 | logger.info(f"LRO: Polling {op_url} for operation status...")
111 | start_time = time.time()
112 | while True:
113 | poll_resp = requests.get(
114 | op_url, headers=self._get_headers(), timeout=60
115 | )
116 | if poll_resp.status_code not in (200, 201, 202):
117 | logger.error(
118 | f"LRO: Poll failed with status {poll_resp.status_code}"
119 | )
120 | return None
121 | poll_data = poll_resp.json()
122 | status = poll_data.get("status") or poll_data.get(
123 | "operationStatus"
124 | )
125 | if status in (
126 | "Succeeded",
127 | "succeeded",
128 | "Completed",
129 | "completed",
130 | ):
131 | logger.info("LRO: Operation succeeded.")
132 | return poll_data
133 | if status in ("Failed", "failed", "Canceled", "canceled"):
134 | logger.error(
135 | f"LRO: Operation failed or canceled. Status: {status}"
136 | )
137 | return poll_data
138 | if time.time() - start_time > lro_timeout:
139 | logger.error("LRO: Polling timed out.")
140 | return None
141 | logger.debug(
142 | f"LRO: Status {status}, waiting {lro_poll_interval}s..."
143 | )
144 | time.sleep(lro_poll_interval)
145 | response.raise_for_status()
146 | return response.json()
147 | except requests.RequestException as e:
148 | logger.error(f"API call failed: {str(e)}")
149 | if e.response is not None:
150 | logger.error(f"Response content: {e.response.text}")
151 | return None
152 | else:
153 | results = []
154 | continuation_token = None
155 | while True:
156 | url = self._build_url(
157 | endpoint=endpoint, continuation_token=continuation_token
158 | )
159 | request_params = params.copy()
160 | # Remove any existing continuationToken in parameters to avoid conflict.
161 | request_params.pop("continuationToken", None)
162 | try:
163 | if method.upper() == "POST":
164 | response = requests.post(
165 | url,
166 | headers=self._get_headers(),
167 | json=request_params,
168 | timeout=120,
169 | )
170 | else:
171 | if "maxResults" not in request_params:
172 | request_params["maxResults"] = self.config.max_results
173 | response = requests.request(
174 | method=method.upper(),
175 | url=url,
176 | headers=self._get_headers(),
177 | params=request_params,
178 | timeout=120,
179 | )
180 | response.raise_for_status()
181 | data = response.json()
182 | except requests.RequestException as e:
183 | logger.error(f"API call failed: {str(e)}")
184 | if e.response is not None:
185 | logger.error(f"Response content: {e.response.text}")
186 | return results if results else None
187 |
188 | if not isinstance(data, dict) or data_key not in data:
189 | raise ValueError(f"Unexpected response format: {data}")
190 |
191 | results.extend(data[data_key])
192 | continuation_token = data.get("continuationToken")
193 | if not continuation_token:
194 | break
195 | return results
196 |
197 | async def get_workspaces(self) -> List[Dict]:
198 | """Get all available workspaces"""
199 | return await self._make_request("workspaces", use_pagination=True)
200 |
201 | async def get_lakehouses(self, workspace_id: str) -> List[Dict]:
202 | """Get all lakehouses in a workspace"""
203 | return await self.get_items(workspace_id=workspace_id, item_type="Lakehouse")
204 |
205 | async def get_warehouses(self, workspace_id: str) -> List[Dict]:
206 | """Get all warehouses in a workspace
207 | Args:
208 | workspace_id: ID of the workspace
209 | Returns:
210 | A list of dictionaries containing warehouse details or an error message.
211 | """
212 | return await self.get_items(workspace_id=workspace_id, item_type="Warehouse")
213 |
214 | async def get_tables(self, workspace_id: str, rsc_id: str, type: str) -> List[Dict]:
215 |         """Get all tables in a lakehouse or warehouse
216 |         Args:
217 |             workspace_id: ID of the workspace
218 |             rsc_id: ID of the lakehouse or warehouse
219 | type: Type of the resource (e.g., "Lakehouse" or "Warehouse")
220 | Returns:
221 | A list of dictionaries containing table details or an error message.
222 | """
223 | return await self._make_request(
224 | f"workspaces/{workspace_id}/{type}s/{rsc_id}/tables",
225 | use_pagination=True,
226 | data_key="data",
227 | )
228 |
229 | async def get_reports(self, workspace_id: str) -> List[Dict]:
230 |         """Get all reports in a workspace
231 | Args:
232 | workspace_id: ID of the workspace
233 | Returns:
234 | A list of dictionaries containing report details or an error message.
235 | """
236 | return await self._make_request(
237 | f"workspaces/{workspace_id}/reports",
238 | use_pagination=True,
239 | data_key="value",
240 | )
241 |
242 | async def get_report(self, workspace_id: str, report_id: str) -> Dict:
243 | """Get a specific report by ID
244 |
245 | Args:
246 | workspace_id: ID of the workspace
247 | report_id: ID of the report
248 |
249 | Returns:
250 | A dictionary containing the report details or an error message.
251 | """
252 | return await self._make_request(
253 | f"workspaces/{workspace_id}/reports/{report_id}"
254 | )
255 |
256 | async def get_semantic_models(self, workspace_id: str) -> List[Dict]:
257 |         """Get all semantic models in a workspace"""
258 | return await self._make_request(
259 | f"workspaces/{workspace_id}/semanticModels",
260 | use_pagination=True,
261 | data_key="value",
262 | )
263 |
264 | async def get_semantic_model(self, workspace_id: str, model_id: str) -> Dict:
265 | """Get a specific semantic model by ID"""
266 | return await self._make_request(
267 | f"workspaces/{workspace_id}/semanticModels/{model_id}"
268 | )
269 |
270 | async def resolve_workspace(self, workspace: str) -> str:
271 | """Convert workspace name or ID to workspace ID with caching"""
272 |         if workspace not in self._workspace_cache:
273 |             self._workspace_cache[workspace] = await self._resolve_workspace(workspace)
274 |         return self._workspace_cache[workspace]
273 |
274 | async def _resolve_workspace(self, workspace: str) -> str:
275 | """Internal method to convert workspace name or ID to workspace ID"""
276 | if _is_valid_uuid(workspace):
277 | return workspace
278 |
279 | workspaces = await self.get_workspaces()
280 | matching_workspaces = [
281 | w for w in workspaces if w["displayName"].lower() == workspace.lower()
282 | ]
283 |
284 | if not matching_workspaces:
285 | raise ValueError(f"No workspaces found with name: {workspace}")
286 | if len(matching_workspaces) > 1:
287 | raise ValueError(f"Multiple workspaces found with name: {workspace}")
288 |
289 | return matching_workspaces[0]["id"]
290 |
291 | async def resolve_lakehouse(self, workspace_id: str, lakehouse: str) -> str:
292 | """Convert lakehouse name or ID to lakehouse ID with caching"""
293 |         key = (workspace_id, lakehouse)
294 |         if key not in self._lakehouse_cache:
295 |             self._lakehouse_cache[key] = await self._resolve_lakehouse(workspace_id, lakehouse)
296 |         return self._lakehouse_cache[key]
294 |
295 | async def _resolve_lakehouse(self, workspace_id: str, lakehouse: str) -> str:
296 | """Internal method to convert lakehouse name or ID to lakehouse ID"""
297 | if _is_valid_uuid(lakehouse):
298 | return lakehouse
299 |
300 | lakehouses = await self.get_lakehouses(workspace_id)
301 | matching_lakehouses = [
302 | lh for lh in lakehouses if lh["displayName"].lower() == lakehouse.lower()
303 | ]
304 |
305 | if not matching_lakehouses:
306 | raise ValueError(f"No lakehouse found with name: {lakehouse}")
307 | if len(matching_lakehouses) > 1:
308 | raise ValueError(f"Multiple lakehouses found with name: {lakehouse}")
309 |
310 | return matching_lakehouses[0]["id"]
311 |
312 | async def get_items(
313 | self,
314 | workspace_id: str,
315 | item_type: Optional[str] = None,
316 | params: Optional[Dict] = None,
317 | ) -> List[Dict]:
318 | """Get all items in a workspace"""
319 | if not _is_valid_uuid(workspace_id):
320 | raise ValueError("Invalid workspace ID.")
321 | if item_type:
322 | params = params or {}
323 | params["type"] = item_type
324 | return await self._make_request(
325 | f"workspaces/{workspace_id}/items", params=params, use_pagination=True
326 | )
327 |
328 | async def get_item(
329 | self,
330 | item_id: str,
331 | workspace_id: str,
332 | item_type: Optional[str] = None,
333 | ) -> Dict:
334 | """Get a specific item by ID"""
335 |
336 |         if not _is_valid_uuid(workspace_id):
337 |             _, workspace_id = await self.resolve_workspace_name_and_id(workspace_id)
338 |         if not _is_valid_uuid(item_id):
339 |             _, item_id = await self.resolve_item_name_and_id(
340 |                 item_id, type=item_type, workspace=workspace_id
341 |             )
342 | return await self._make_request(
343 | f"workspaces/{workspace_id}/{item_type}s/{item_id}"
344 | )
345 |
346 | async def create_item(
347 | self,
348 | name: str,
349 | type: str,
350 | description: Optional[str] = None,
351 | definition: Optional[dict] = None,
352 | workspace: Optional[str | UUID] = None,
353 | lro: Optional[bool] = False,
354 | ):
355 | """
356 | Creates an item in a Fabric workspace.
357 |
358 | Parameters
359 | ----------
360 | name : str
361 | The name of the item to be created.
362 | type : str
363 | The type of the item to be created.
364 | description : str, default=None
365 | A description of the item to be created.
366 | definition : dict, default=None
367 | The definition of the item to be created.
368 | workspace : str | uuid.UUID, default=None
369 | The Fabric workspace name or ID.
370 | Defaults to None which resolves to the workspace of the attached lakehouse
371 | or if no lakehouse attached, resolves to the workspace of the notebook.
372 | """
373 | from sempy_labs._utils import item_types
374 |
375 | if _is_valid_uuid(workspace):
376 | workspace_id = workspace
377 | else:
378 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id(
379 | workspace
380 | )
381 | item_type = item_types.get(type)[0].lower()
382 |
383 | payload = {
384 | "displayName": name,
385 | }
386 | if description:
387 | payload["description"] = description
388 | if definition:
389 | payload["definition"] = definition
390 |
391 | try:
392 | response = await self._make_request(
393 | endpoint=f"workspaces/{workspace_id}/{item_type}s",
394 | method="post",
395 | params=payload,
396 | lro=lro,
397 | lro_poll_interval=0.5,
398 | )
399 | except requests.RequestException as e:
400 | logger.error(f"API call failed: {str(e)}")
401 | if e.response is not None:
402 | logger.error(f"Response content: {e.response.text}")
403 | raise ValueError(
404 | f"Failed to create item '{name}' of type '{item_type}' in the '{workspace_id}' workspace."
405 | )
406 |
407 | # Check if response contains an error
408 | if isinstance(response, dict):
409 | if "error" in response:
410 | error_msg = response.get("error", {}).get("message", "Unknown error")
411 | logger.error(f"API error creating item: {error_msg}")
412 | raise ValueError(f"Failed to create item '{name}': {error_msg}")
413 |
414 | # Check if item was created successfully
415 | if "id" in response:
416 | logger.info(f"Successfully created item '{name}' with ID: {response['id']}")
417 | return response
418 |
419 | # If no ID and no error, log the full response for debugging
420 | logger.warning(f"Unexpected response format: {response}")
421 |
422 | # Legacy check - may not be reliable for all item types
423 | if hasattr(response, 'get') and response.get("displayName") and response.get("displayName") != name:
424 | logger.warning(f"Response displayName '{response.get('displayName')}' doesn't match requested name '{name}', but this may be normal")
425 |
426 | return response
427 |
428 | async def resolve_item_name_and_id(
429 | self,
430 | item: str | UUID,
431 | type: Optional[str] = None,
432 | workspace: Optional[str | UUID] = None,
433 | ) -> Tuple[str, UUID]:
434 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id(
435 | workspace
436 | )
437 | item_id = await self.resolve_item_id(
438 | item=item, type=type, workspace=workspace_id
439 | )
440 | item_data = await self._make_request(
441 | f"workspaces/{workspace_id}/items/{item_id}"
442 | )
443 | item_name = item_data.get("displayName")
444 | return item_name, item_id
445 |
446 | async def resolve_item_id(
447 | self,
448 | item: str | UUID,
449 | type: Optional[str] = None,
450 | workspace: Optional[str | UUID] = None,
451 | ) -> UUID:
452 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id(
453 | workspace
454 | )
455 | item_id = None
456 |
457 | if _is_valid_uuid(item):
458 | # Check (optional)
459 | item_id = item
460 | try:
461 |                 await self._make_request(
462 | endpoint=f"workspaces/{workspace_id}/items/{item_id}"
463 | )
464 | except requests.RequestException:
465 | raise ValueError(
466 | f"The '{item_id}' item was not found in the '{workspace_name}' workspace."
467 | )
468 | else:
469 | if type is None:
470 | raise ValueError(
471 | "The 'type' parameter is required if specifying an item name."
472 | )
473 | responses = await self._make_request(
474 | endpoint=f"workspaces/{workspace_id}/items?type={type}",
475 | use_pagination=True,
476 | )
477 | for v in responses:
478 | display_name = v["displayName"]
479 | if display_name == item:
480 | item_id = v.get("id")
481 | break
482 |
483 | if item_id is None:
484 | raise ValueError(
485 | f"There's no item '{item}' of type '{type}' in the '{workspace_name}' workspace."
486 | )
487 |
488 | return item_id
489 |
490 | async def resolve_workspace_name_and_id(
491 | self,
492 | workspace: Optional[str | UUID] = None,
493 | ) -> Tuple[str, UUID]:
494 | """
495 | Obtains the name and ID of the Fabric workspace.
496 |
497 | Parameters
498 | ----------
499 | workspace : str | uuid.UUID, default=None
500 | The Fabric workspace name or ID.
501 | Defaults to None which resolves to the workspace of the attached lakehouse
502 | or if no lakehouse attached, resolves to the workspace of the notebook.
503 |
504 | Returns
505 | -------
506 | str, uuid.UUID
507 | The name and ID of the Fabric workspace.
508 | """
509 | logger.debug(f"Resolving workspace name and ID for: {workspace}")
510 | if workspace is None:
511 | raise ValueError("Workspace must be specified.")
512 | elif _is_valid_uuid(workspace):
513 | workspace_id = workspace
514 | workspace_name = await self.resolve_workspace_name(workspace_id)
515 | return workspace_name, workspace_id
516 | else:
517 | responses = await self._make_request(
518 | endpoint="workspaces", use_pagination=True
519 | )
520 | workspace_id = None
521 | workspace_name = None
522 | for r in responses:
523 | display_name = r.get("displayName")
524 | if display_name == workspace:
525 | workspace_name = workspace
526 | workspace_id = r.get("id")
527 | return workspace_name, workspace_id
528 |
529 | if workspace_name is None or workspace_id is None:
530 | raise ValueError("Workspace not found")
531 |
532 | return workspace_name, workspace_id
533 |
534 | async def resolve_workspace_name(self, workspace_id: Optional[UUID] = None) -> str:
535 | try:
536 | response = await self._make_request(endpoint=f"workspaces/{workspace_id}")
537 | if not response or "displayName" not in response:
538 | raise ValueError(
539 | f"Workspace '{workspace_id}' not found or API response invalid: {response}"
540 | )
541 | except requests.RequestException:
542 | raise ValueError(f"The '{workspace_id}' workspace was not found.")
543 |
544 | return response.get("displayName")
545 |
546 | async def get_notebooks(self, workspace_id: str) -> List[Dict]:
547 | """Get all notebooks in a workspace"""
548 | return await self.get_items(workspace_id=workspace_id, item_type="Notebook")
549 |
550 | async def get_notebook(self, workspace_id: str, notebook_id: str) -> Dict:
551 | """Get a specific notebook by ID"""
552 | return await self.get_item(
553 | item_id=notebook_id, workspace_id=workspace_id, item_type="notebook"
554 | )
555 |
556 | async def create_notebook(
557 | self, workspace_id: str, notebook_name: str, ipynb_name: str, content: str
558 | ) -> Dict:
559 | """Create a new notebook."""
560 | if not _is_valid_uuid(workspace_id):
561 | raise ValueError("Invalid workspace ID.")
562 |
563 | # Define the notebook definition
564 | logger.debug(
565 | f"Defining notebook '{notebook_name}' in workspace '{workspace_id}'."
566 | )
567 | definition = {
568 | "format": "ipynb",
569 | "parts": [
570 | {
571 | "path": f"{ipynb_name}.ipynb",
572 | "payload": base64.b64encode(
573 | content
574 | if isinstance(content, bytes)
575 | else content.encode("utf-8")
576 | ).decode("utf-8"),
577 | "payloadType": "InlineBase64",
578 | },
579 | # {
580 | # "path": ".platform",
581 | # "payload": base64.b64encode("dotPlatformBase64String".encode("utf-8")).decode("utf-8"),
582 | # "payloadType": "InlineBase64",
583 | # },
584 | ],
585 | }
586 | logger.info(
587 |             f"Creating notebook '{notebook_name}' in workspace '{workspace_id}'."
588 | )
589 | return await self.create_item(
590 | workspace=workspace_id,
591 | type="Notebook",
592 | name=notebook_name,
593 | definition=definition,
594 | )
595 |
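596 | # Example usage (sketch; requires credentials that can reach the Fabric API):
597 | #   client = FabricApiClient()
598 | #   workspaces = await client.get_workspaces()
599 | #   ws_id = await client.resolve_workspace("My Workspace")  # placeholder name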
--------------------------------------------------------------------------------
/tools/notebook.py:
--------------------------------------------------------------------------------
1 | from helpers.utils.context import mcp, __ctx_cache
2 | from mcp.server.fastmcp import Context
3 | from helpers.utils.authentication import get_azure_credentials
4 | from helpers.clients import (
5 | FabricApiClient,
6 | NotebookClient,
7 | )
8 | import json
9 | from helpers.logging_config import get_logger
10 |
11 |
12 | from typing import Optional, Dict, List, Any
13 | import base64
14 | import re
15 |
16 | logger = get_logger(__name__)
17 |
18 |
19 | @mcp.tool()
20 | async def list_notebooks(workspace: Optional[str] = None, ctx: Context = None) -> str:
21 | """List all notebooks in a Fabric workspace.
22 |
23 | Args:
24 | workspace: Name or ID of the workspace (optional)
25 | ctx: Context object containing client information
26 | Returns:
27 | A string containing the list of notebooks or an error message.
28 | """
29 |
30 | try:
31 | if ctx is None:
32 | raise ValueError("Context (ctx) must be provided.")
33 |
34 | notebook_client = NotebookClient(
35 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
36 | )
37 | return await notebook_client.list_notebooks(workspace)
38 | except Exception as e:
39 | logger.error(f"Error listing notebooks: {str(e)}")
40 | return f"Error listing notebooks: {str(e)}"
41 |
42 |
43 | @mcp.tool()
44 | async def create_notebook(
45 | workspace: str,
46 | # notebook_name: str,
47 | # content: str,
48 | ctx: Context = None,
49 | ) -> str:
50 |     """Create a new notebook in a Fabric workspace.
51 | 
52 |     Note: the notebook_name and content parameters are currently disabled; a
53 |     sample 'Hello, Fabric!' notebook named 'test_notebook_2' is created instead.
54 |     Args:
55 |         workspace: Name or ID of the workspace
56 |         ctx: Context object containing client information
57 |     Returns:
58 |         A string containing the ID of the created notebook or an error message.
59 |     """
60 | notebook_json = {
61 | "nbformat": 4,
62 | "nbformat_minor": 5,
63 | "cells": [
64 | {
65 | "cell_type": "code",
66 | "source": ["print('Hello, Fabric!')\n"],
67 | "execution_count": None,
68 | "outputs": [],
69 | "metadata": {},
70 | }
71 | ],
72 | "metadata": {"language_info": {"name": "python"}},
73 | }
74 | notebook_content = json.dumps(notebook_json)
75 | try:
76 | if ctx is None:
77 | raise ValueError("Context (ctx) must be provided.")
78 |
79 | notebook_client = NotebookClient(
80 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
81 | )
82 | response = await notebook_client.create_notebook(
83 | workspace, "test_notebook_2", notebook_content
84 | )
85 | return response.get("id", "") # Return the notebook ID or an empty string
86 | except Exception as e:
87 | logger.error(f"Error creating notebook: {str(e)}")
88 | return f"Error creating notebook: {str(e)}"
89 |
90 |
91 | @mcp.tool()
92 | async def get_notebook_content(
93 | workspace: str,
94 | notebook_id: str,
95 | ctx: Context = None
96 | ) -> str:
97 | """Get the content of a specific notebook in a Fabric workspace.
98 |
99 | Args:
100 | workspace: Name or ID of the workspace
101 | notebook_id: ID or name of the notebook
102 | ctx: Context object containing client information
103 | Returns:
104 | A string containing the notebook content in JSON format or an error message.
105 | """
106 | try:
107 | if ctx is None:
108 | raise ValueError("Context (ctx) must be provided.")
109 |
110 | notebook_client = NotebookClient(
111 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
112 | )
113 |
114 | # Get the notebook details
115 | notebook = await notebook_client.get_notebook(workspace, notebook_id)
116 |
117 | if isinstance(notebook, str): # Error message
118 | return notebook
119 |
120 | # Extract and decode the notebook content
121 | definition = notebook.get("definition", {})
122 | parts = definition.get("parts", [])
123 |
124 | for part in parts:
125 | if part.get("path", "").endswith(".ipynb"):
126 | payload = part.get("payload", "")
127 | if payload:
128 | # Decode base64 content
129 | decoded_content = base64.b64decode(payload).decode("utf-8")
130 | return decoded_content
131 |
132 | return "No notebook content found in the definition."
133 |
134 | except Exception as e:
135 | logger.error(f"Error getting notebook content: {str(e)}")
136 | return f"Error getting notebook content: {str(e)}"
137 |
138 |
139 | @mcp.tool()
140 | async def create_pyspark_notebook(
141 | workspace: str,
142 | notebook_name: str,
143 | template_type: str = "basic",
144 | ctx: Context = None,
145 | ) -> str:
146 | """Create a new PySpark notebook from a template in a Fabric workspace.
147 |
148 | Args:
149 | workspace: Name or ID of the workspace
150 | notebook_name: Name of the new notebook
151 | template_type: Type of PySpark template ('basic', 'etl', 'analytics', 'ml')
152 | ctx: Context object containing client information
153 | Returns:
154 | A string containing the ID of the created notebook or an error message.
155 | """
156 | try:
157 | if ctx is None:
158 | raise ValueError("Context (ctx) must be provided.")
159 |
160 | # Define PySpark templates
161 | templates = {
162 | "basic": {
163 | "cells": [
164 | {
165 | "cell_type": "markdown",
166 | "source": [
167 | "# PySpark Notebook\n",
168 | "\n",
169 | "This notebook demonstrates basic PySpark operations in Microsoft Fabric.\n"
170 | ],
171 | "metadata": {}
172 | },
173 | {
174 | "cell_type": "code",
175 | "source": [
176 | "# Initialize Spark session\n",
177 | "from pyspark.sql import SparkSession\n",
178 | "from pyspark.sql.functions import *\n",
179 | "from pyspark.sql.types import *\n",
180 | "\n",
181 | "# Spark session is already available as 'spark' in Fabric\n",
182 | "print(f\"Spark version: {spark.version}\")\n",
183 | "print(f\"Available cores: {spark.sparkContext.defaultParallelism}\")\n"
184 | ],
185 | "execution_count": None,
186 | "outputs": [],
187 | "metadata": {}
188 | },
189 | {
190 | "cell_type": "code",
191 | "source": [
192 | "# Sample data creation\n",
193 | "sample_data = [\n",
194 | " (1, \"John\", 25, \"Engineering\"),\n",
195 | " (2, \"Jane\", 30, \"Marketing\"),\n",
196 | " (3, \"Bob\", 35, \"Sales\"),\n",
197 | " (4, \"Alice\", 28, \"Engineering\")\n",
198 | "]\n",
199 | "\n",
200 | "schema = StructType([\n",
201 | " StructField(\"id\", IntegerType(), True),\n",
202 | " StructField(\"name\", StringType(), True),\n",
203 | " StructField(\"age\", IntegerType(), True),\n",
204 | " StructField(\"department\", StringType(), True)\n",
205 | "])\n",
206 | "\n",
207 | "df = spark.createDataFrame(sample_data, schema)\n",
208 | "df.show()\n"
209 | ],
210 | "execution_count": None,
211 | "outputs": [],
212 | "metadata": {}
213 | }
214 | ]
215 | },
216 | "etl": {
217 | "cells": [
218 | {
219 | "cell_type": "markdown",
220 | "source": [
221 | "# PySpark ETL Pipeline\n",
222 | "\n",
223 | "This notebook demonstrates an ETL pipeline using PySpark in Microsoft Fabric.\n"
224 | ],
225 | "metadata": {}
226 | },
227 | {
228 | "cell_type": "code",
229 | "source": [
230 | "# Import necessary libraries\n",
231 | "from pyspark.sql import SparkSession\n",
232 | "from pyspark.sql.functions import *\n",
233 | "from pyspark.sql.types import *\n",
234 | "from delta.tables import DeltaTable\n",
235 | "\n",
236 | "print(f\"Spark version: {spark.version}\")\n"
237 | ],
238 | "execution_count": None,
239 | "outputs": [],
240 | "metadata": {}
241 | },
242 | {
243 | "cell_type": "code",
244 | "source": [
245 | "# Extract: Read data from source\n",
246 | "# Example: Reading from a lakehouse table\n",
247 | "# df_source = spark.table(\"lakehouse.table_name\")\n",
248 | "\n",
249 | "# For demo purposes, create sample data\n",
250 | "raw_data = [\n",
251 | " (\"2024-01-01\", \"Product A\", 100, 25.50),\n",
252 | " (\"2024-01-01\", \"Product B\", 150, 30.00),\n",
253 | " (\"2024-01-02\", \"Product A\", 120, 25.50),\n",
254 | " (\"2024-01-02\", \"Product C\", 80, 45.00)\n",
255 | "]\n",
256 | "\n",
257 | "schema = StructType([\n",
258 | " StructField(\"date\", StringType(), True),\n",
259 | " StructField(\"product\", StringType(), True),\n",
260 | " StructField(\"quantity\", IntegerType(), True),\n",
261 | " StructField(\"price\", DoubleType(), True)\n",
262 | "])\n",
263 | "\n",
264 | "df_raw = spark.createDataFrame(raw_data, schema)\n",
265 | "print(\"Raw data:\")\n",
266 | "df_raw.show()\n"
267 | ],
268 | "execution_count": None,
269 | "outputs": [],
270 | "metadata": {}
271 | },
272 | {
273 | "cell_type": "code",
274 | "source": [
275 | "# Transform: Clean and process data\n",
276 | "df_transformed = df_raw \\\n",
277 | " .withColumn(\"date\", to_date(col(\"date\"), \"yyyy-MM-dd\")) \\\n",
278 | " .withColumn(\"revenue\", col(\"quantity\") * col(\"price\")) \\\n",
279 | " .withColumn(\"year\", year(col(\"date\"))) \\\n",
280 | " .withColumn(\"month\", month(col(\"date\")))\n",
281 | "\n",
282 | "print(\"Transformed data:\")\n",
283 | "df_transformed.show()\n",
284 | "df_transformed.printSchema()\n"
285 | ],
286 | "execution_count": None,
287 | "outputs": [],
288 | "metadata": {}
289 | },
290 | {
291 | "cell_type": "code",
292 | "source": [
293 | "# Load: Write processed data to target\n",
294 | "# Example: Writing to a Delta table in lakehouse\n",
295 | "# df_transformed.write \\\n",
296 | "# .format(\"delta\") \\\n",
297 | "# .mode(\"overwrite\") \\\n",
298 | "# .saveAsTable(\"lakehouse.processed_sales\")\n",
299 | "\n",
300 | "print(\"ETL pipeline completed successfully!\")\n",
301 | "print(f\"Processed {df_transformed.count()} records\")\n"
302 | ],
303 | "execution_count": None,
304 | "outputs": [],
305 | "metadata": {}
306 | }
307 | ]
308 | },
309 | "analytics": {
310 | "cells": [
311 | {
312 | "cell_type": "markdown",
313 | "source": [
314 | "# PySpark Data Analytics\n",
315 | "\n",
316 | "This notebook demonstrates data analytics using PySpark in Microsoft Fabric.\n"
317 | ],
318 | "metadata": {}
319 | },
320 | {
321 | "cell_type": "code",
322 | "source": [
323 | "# Import libraries for analytics\n",
324 | "from pyspark.sql import SparkSession\n",
325 | "from pyspark.sql.functions import *\n",
326 | "from pyspark.sql.types import *\n",
327 | "from pyspark.sql.window import Window\n",
328 | "\n",
329 | "print(f\"Spark version: {spark.version}\")\n"
330 | ],
331 | "execution_count": None,
332 | "outputs": [],
333 | "metadata": {}
334 | },
335 | {
336 | "cell_type": "code",
337 | "source": [
338 | "# Create sample sales data for analytics\n",
339 | "sales_data = [\n",
340 | " (\"2024-01-01\", \"North\", \"Product A\", 1000, 100),\n",
341 | " (\"2024-01-01\", \"South\", \"Product A\", 800, 80),\n",
342 | " (\"2024-01-02\", \"North\", \"Product B\", 1200, 120),\n",
343 | " (\"2024-01-02\", \"South\", \"Product B\", 900, 90),\n",
344 | " (\"2024-01-03\", \"East\", \"Product A\", 1100, 110),\n",
345 | " (\"2024-01-03\", \"West\", \"Product C\", 700, 70)\n",
346 | "]\n",
347 | "\n",
348 | "schema = StructType([\n",
349 | " StructField(\"date\", StringType(), True),\n",
350 | " StructField(\"region\", StringType(), True),\n",
351 | " StructField(\"product\", StringType(), True),\n",
352 | " StructField(\"revenue\", IntegerType(), True),\n",
353 | " StructField(\"quantity\", IntegerType(), True)\n",
354 | "])\n",
355 | "\n",
356 | "df_sales = spark.createDataFrame(sales_data, schema)\n",
357 | "df_sales = df_sales.withColumn(\"date\", to_date(col(\"date\"), \"yyyy-MM-dd\"))\n",
358 | "df_sales.show()\n"
359 | ],
360 | "execution_count": None,
361 | "outputs": [],
362 | "metadata": {}
363 | },
364 | {
365 | "cell_type": "code",
366 | "source": [
367 | "# Aggregation analysis\n",
368 | "print(\"=== Revenue by Region ===\")\n",
369 | "df_sales.groupBy(\"region\") \\\n",
370 | " .agg(sum(\"revenue\").alias(\"total_revenue\"),\n",
371 | " sum(\"quantity\").alias(\"total_quantity\"),\n",
372 | " count(\"*\").alias(\"transaction_count\")) \\\n",
373 | " .orderBy(desc(\"total_revenue\")) \\\n",
374 | " .show()\n",
375 | "\n",
376 | "print(\"=== Revenue by Product ===\")\n",
377 | "df_sales.groupBy(\"product\") \\\n",
378 | " .agg(sum(\"revenue\").alias(\"total_revenue\"),\n",
379 | " avg(\"revenue\").alias(\"avg_revenue\")) \\\n",
380 | " .orderBy(desc(\"total_revenue\")) \\\n",
381 | " .show()\n"
382 | ],
383 | "execution_count": None,
384 | "outputs": [],
385 | "metadata": {}
386 | },
387 | {
388 | "cell_type": "code",
389 | "source": [
390 | "# Window functions for advanced analytics\n",
391 | "windowSpec = Window.partitionBy(\"region\").orderBy(\"date\")\n",
392 | "\n",
393 | "df_analytics = df_sales \\\n",
394 | " .withColumn(\"running_total\", sum(\"revenue\").over(windowSpec)) \\\n",
395 | " .withColumn(\"row_number\", row_number().over(windowSpec)) \\\n",
396 | " .withColumn(\"rank\", rank().over(windowSpec.orderBy(desc(\"revenue\"))))\n",
397 | "\n",
398 | "print(\"=== Advanced Analytics with Window Functions ===\")\n",
399 | "df_analytics.select(\"date\", \"region\", \"product\", \"revenue\", \n",
400 | " \"running_total\", \"row_number\", \"rank\") \\\n",
401 | " .orderBy(\"region\", \"date\") \\\n",
402 | " .show()\n"
403 | ],
404 | "execution_count": None,
405 | "outputs": [],
406 | "metadata": {}
407 | }
408 | ]
409 | },
410 | "ml": {
411 | "cells": [
412 | {
413 | "cell_type": "markdown",
414 | "source": [
415 | "# PySpark Machine Learning\n",
416 | "\n",
417 | "This notebook demonstrates machine learning with PySpark MLlib in Microsoft Fabric.\n"
418 | ],
419 | "metadata": {}
420 | },
421 | {
422 | "cell_type": "code",
423 | "source": [
424 | "# Import ML libraries\n",
425 | "from pyspark.sql import SparkSession\n",
426 | "from pyspark.sql.functions import *\n",
427 | "from pyspark.sql.types import *\n",
428 | "from pyspark.ml.feature import VectorAssembler, StandardScaler\n",
429 | "from pyspark.ml.regression import LinearRegression\n",
430 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
431 | "from pyspark.ml import Pipeline\n",
432 | "\n",
433 | "print(f\"Spark version: {spark.version}\")\n"
434 | ],
435 | "execution_count": None,
436 | "outputs": [],
437 | "metadata": {}
438 | },
439 | {
440 | "cell_type": "code",
441 | "source": [
442 | "# Create sample dataset for regression\n",
443 | "ml_data = [\n",
444 | " (1, 2.0, 3.0, 4.0, 10.0),\n",
445 | " (2, 3.0, 4.0, 5.0, 15.0),\n",
446 | " (3, 4.0, 5.0, 6.0, 20.0),\n",
447 | " (4, 5.0, 6.0, 7.0, 25.0),\n",
448 | " (5, 6.0, 7.0, 8.0, 30.0),\n",
449 | " (6, 7.0, 8.0, 9.0, 35.0)\n",
450 | "]\n",
451 | "\n",
452 | "schema = StructType([\n",
453 | " StructField(\"id\", IntegerType(), True),\n",
454 | " StructField(\"feature1\", DoubleType(), True),\n",
455 | " StructField(\"feature2\", DoubleType(), True),\n",
456 | " StructField(\"feature3\", DoubleType(), True),\n",
457 | " StructField(\"label\", DoubleType(), True)\n",
458 | "])\n",
459 | "\n",
460 | "df_ml = spark.createDataFrame(ml_data, schema)\n",
461 | "print(\"Sample ML dataset:\")\n",
462 | "df_ml.show()\n"
463 | ],
464 | "execution_count": None,
465 | "outputs": [],
466 | "metadata": {}
467 | },
468 | {
469 | "cell_type": "code",
470 | "source": [
471 | "# Feature engineering pipeline\n",
472 | "feature_cols = [\"feature1\", \"feature2\", \"feature3\"]\n",
473 | "\n",
474 | "# Assemble features into a vector\n",
475 | "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"raw_features\")\n",
476 | "\n",
477 | "# Scale features\n",
478 | "scaler = StandardScaler(inputCol=\"raw_features\", outputCol=\"features\")\n",
479 | "\n",
480 | "# Linear regression model\n",
481 | "lr = LinearRegression(featuresCol=\"features\", labelCol=\"label\")\n",
482 | "\n",
483 | "# Create pipeline\n",
484 | "pipeline = Pipeline(stages=[assembler, scaler, lr])\n",
485 | "\n",
486 | "print(\"ML Pipeline created with stages: Feature Assembly -> Scaling -> Linear Regression\")\n"
487 | ],
488 | "execution_count": None,
489 | "outputs": [],
490 | "metadata": {}
491 | },
492 | {
493 | "cell_type": "code",
494 | "source": [
495 | "# Split data and train model\n",
496 | "train_data, test_data = df_ml.randomSplit([0.8, 0.2], seed=42)\n",
497 | "\n",
498 | "print(f\"Training data count: {train_data.count()}\")\n",
499 | "print(f\"Test data count: {test_data.count()}\")\n",
500 | "\n",
501 | "# Train the pipeline\n",
502 | "model = pipeline.fit(train_data)\n",
503 | "\n",
504 | "# Make predictions\n",
505 | "predictions = model.transform(test_data)\n",
506 | "\n",
507 | "print(\"\\nPredictions:\")\n",
508 | "predictions.select(\"id\", \"label\", \"prediction\").show()\n"
509 | ],
510 | "execution_count": None,
511 | "outputs": [],
512 | "metadata": {}
513 | },
514 | {
515 | "cell_type": "code",
516 | "source": [
517 | "# Evaluate model performance\n",
518 | "evaluator = RegressionEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"rmse\")\n",
519 | "rmse = evaluator.evaluate(predictions)\n",
520 | "\n",
521 | "evaluator_r2 = RegressionEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"r2\")\n",
522 | "r2 = evaluator_r2.evaluate(predictions)\n",
523 | "\n",
524 | "print(f\"Root Mean Square Error (RMSE): {rmse:.3f}\")\n",
525 | "print(f\"R-squared (R2): {r2:.3f}\")\n",
526 | "\n",
527 | "# Get model coefficients\n",
528 | "lr_model = model.stages[-1]\n",
529 | "print(f\"\\nModel coefficients: {lr_model.coefficients}\")\n",
530 | "print(f\"Model intercept: {lr_model.intercept:.3f}\")\n"
531 | ],
532 | "execution_count": None,
533 | "outputs": [],
534 | "metadata": {}
535 | }
536 | ]
537 | }
538 | }
539 |
540 | if template_type not in templates:
541 | return f"Invalid template type. Available templates: {', '.join(templates.keys())}"
542 |
543 | # Create notebook JSON structure
544 | notebook_json = {
545 | "nbformat": 4,
546 | "nbformat_minor": 5,
547 | "cells": templates[template_type]["cells"],
548 | "metadata": {
549 | "language_info": {"name": "python"},
550 | "kernel_info": {"name": "synapse_pyspark"},
551 | "description": f"PySpark notebook created from {template_type} template"
552 | },
553 | }
554 |
555 | notebook_content = json.dumps(notebook_json, indent=2)
556 |
557 | notebook_client = NotebookClient(
558 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
559 | )
560 | response = await notebook_client.create_notebook(
561 | workspace, notebook_name, notebook_content
562 | )
563 |
564 | if isinstance(response, dict) and response.get("id"):
565 | return f"Created PySpark notebook '{notebook_name}' with ID: {response['id']}"
566 | else:
567 | return f"Failed to create notebook: {response}"
568 |
569 | except Exception as e:
570 | logger.error(f"Error creating PySpark notebook: {str(e)}")
571 | return f"Error creating PySpark notebook: {str(e)}"
572 |
573 | @mcp.tool()
574 | async def generate_pyspark_code(
575 | operation: str,
576 | source_table: Optional[str] = None,
577 | target_table: Optional[str] = None,
578 | columns: Optional[str] = None,
579 | filter_condition: Optional[str] = None,
580 | ctx: Context = None,
581 | ) -> str:
582 | """Generate PySpark code for common operations.
583 |
584 | Args:
585 |         operation: Type of operation ('read_table', 'write_table', 'transform', 'join', 'aggregate', 'schema_inference', 'data_quality')
586 | source_table: Source table name (format: lakehouse.table_name)
587 | target_table: Target table name (format: lakehouse.table_name)
588 | columns: Comma-separated list of columns
589 | filter_condition: Filter condition for data
590 | ctx: Context object containing client information
591 | Returns:
592 | A string containing the generated PySpark code or an error message.
593 | """
594 | try:
595 | code_templates = {
596 | "read_table": f"""# Read data from table
597 | df = spark.table("{source_table or 'lakehouse.table_name'}")
598 | df.show()
599 | df.printSchema()""",
600 |
601 | "write_table": f"""# Write data to table
602 | df.write \\
603 | .format("delta") \\
604 | .mode("overwrite") \\
605 | .saveAsTable("{target_table or 'lakehouse.output_table'}")
606 |
607 | print(f"Successfully wrote {{df.count()}} records to {target_table or 'lakehouse.output_table'}")""",
608 |
609 | "transform": f"""# Data transformation
610 | from pyspark.sql.functions import *
611 |
612 | df_transformed = df \\
613 |     .select({columns or '"*"'}) \\
614 | {f'.filter({filter_condition})' if filter_condition else ''} \\
615 | .withColumn("processed_date", current_timestamp())
616 |
617 | df_transformed.show()""",
618 |
619 | "join": f"""# Join tables
620 | df1 = spark.table("{source_table or 'lakehouse.table1'}")
621 | df2 = spark.table("{target_table or 'lakehouse.table2'}")
622 |
623 | # Inner join (modify join condition as needed)
624 | df_joined = df1.join(df2, df1.id == df2.id, "inner")
625 |
626 | df_joined.show()""",
627 |
628 | "aggregate": f"""# Data aggregation
629 | from pyspark.sql.functions import *
630 |
631 | df_agg = df \\
632 | .groupBy({columns or '"column1"'}) \\
633 | .agg(
634 | count("*").alias("count"),
635 | sum("amount").alias("total_amount"),
636 | avg("amount").alias("avg_amount"),
637 | max("date").alias("max_date")
638 | ) \\
639 | .orderBy(desc("total_amount"))
640 |
641 | df_agg.show()""",
642 |
643 | "schema_inference": f"""# Schema inference and data profiling
644 | print("=== Schema Information ===")
645 | df.printSchema()
646 |
647 | print("\\n=== Data Profile ===")
648 | print(f"Record count: {{df.count()}}")
649 | print(f"Column count: {{len(df.columns)}}")
650 |
651 | print("\\n=== Column Statistics ===")
652 | df.describe().show()
653 |
654 | print("\\n=== Null Value Analysis ===")
655 | from pyspark.sql.functions import col, sum as spark_sum, isnan, when, count
656 |
657 | null_counts = df.select([
658 | spark_sum(when(col(c).isNull() | isnan(col(c)), 1).otherwise(0)).alias(c)
659 | for c in df.columns
660 | ])
661 | null_counts.show()""",
662 |
663 | "data_quality": f"""# Data quality checks
664 | from pyspark.sql.functions import *
665 |
666 | print("=== Data Quality Report ===")
667 |
668 | # Check for duplicates
669 | duplicate_count = df.count() - df.distinct().count()
670 | print(f"Duplicate rows: {{duplicate_count}}")
671 |
672 | # Check for null values
673 | total_rows = df.count()
674 | for column in df.columns:
675 | null_count = df.filter(col(column).isNull()).count()
676 | null_percentage = (null_count / total_rows) * 100
677 | print(f"{{column}}: {{null_count}} nulls ({{null_percentage:.2f}}%)")
678 |
679 | # Check data ranges (for numeric columns)
680 | numeric_columns = [field.name for field in df.schema.fields
681 | if field.dataType.simpleString() in ['int', 'double', 'float', 'bigint']]
682 |
683 | if numeric_columns:
684 | print("\\n=== Numeric Column Ranges ===")
685 |     range_exprs = []
686 |     for c in numeric_columns:
687 |         range_exprs.append(min(col(c)).alias(f"{{c}}_min"))
688 |         range_exprs.append(max(col(c)).alias(f"{{c}}_max"))
689 |     df.select(range_exprs).show()""",
690 |
691 | "performance_optimization": f"""# Performance optimization techniques
692 |
693 | # 1. Cache frequently used DataFrames
694 | df.cache()
695 | print(f"Cached DataFrame with {{df.count()}} records")
696 |
697 | # 2. Repartition for better parallelism
698 | optimal_partitions = spark.sparkContext.defaultParallelism * 2
699 | df_repartitioned = df.repartition(optimal_partitions)
700 |
701 | # 3. Use broadcast for small dimension tables (< 200MB)
702 | from pyspark.sql.functions import broadcast
703 | # df_joined = large_df.join(broadcast(small_df), "key")
704 |
705 | # 4. Optimize file formats - use Delta Lake
706 | df.write \\
707 | .format("delta") \\
708 | .mode("overwrite") \\
709 | .option("optimizeWrite", "true") \\
710 | .option("autoOptimize", "true") \\
711 | .saveAsTable("{target_table or 'lakehouse.optimized_table'}")
712 |
713 | # 5. Show execution plan
714 | df.explain(True)"""
715 | }
716 |
717 | if operation not in code_templates:
718 | available_ops = ", ".join(code_templates.keys())
719 | return f"Invalid operation. Available operations: {available_ops}"
720 |
721 | generated_code = code_templates[operation]
722 |
723 | return f"""```python
724 | {generated_code}
725 | ```
726 |
727 | **Generated PySpark code for '{operation}' operation**
728 |
729 | This code can be copied into a notebook cell and executed. Remember to:
730 | - Replace placeholder table names with actual table names
731 | - Adjust column names and conditions as needed
732 | - Test with a small dataset first
733 | - Review the execution plan for performance optimization"""
734 |
735 | except Exception as e:
736 | logger.error(f"Error generating PySpark code: {str(e)}")
737 | return f"Error generating PySpark code: {str(e)}"
738 |
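# Hedged example, not part of the module; the table name is hypothetical. A call such as
#
#   await generate_pyspark_code(operation="read_table", source_table="sales_lakehouse.orders")
#
# renders the read_table template above, so the fenced code block in the response contains:
#
#   # Read data from table
#   df = spark.table("sales_lakehouse.orders")
#   df.show()
#   df.printSchema()
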
739 | @mcp.tool()
740 | async def validate_pyspark_code(
741 | code: str,
742 | ctx: Context = None,
743 | ) -> str:
744 | """Validate PySpark code for syntax and best practices.
745 |
746 | Args:
747 | code: PySpark code to validate
748 | ctx: Context object containing client information
749 | Returns:
750 | A string containing validation results and suggestions.
751 | """
752 | try:
753 | validation_results = []
754 | warnings = []
755 | suggestions = []
756 |
757 | # Basic syntax validation
758 | try:
759 | compile(code, '', 'exec')
760 | validation_results.append("✅ Syntax validation: PASSED")
761 | except SyntaxError as e:
762 | validation_results.append(f"❌ Syntax validation: FAILED - {e}")
763 | return "\n".join(validation_results)
764 |
765 | # PySpark best practices checks
766 | lines = code.split('\n')
767 |
768 | # Check for common imports
769 | has_spark_imports = any('from pyspark' in line or 'import pyspark' in line for line in lines)
770 | if not has_spark_imports:
771 | warnings.append("⚠️ No PySpark imports detected. Add: from pyspark.sql import SparkSession")
772 |
773 | # Check for DataFrame operations
774 | has_df_operations = any('df.' in line or '.show()' in line for line in lines)
775 | if has_df_operations:
776 | validation_results.append("✅ DataFrame operations detected")
777 |
778 | # Check for performance anti-patterns
779 | if '.collect()' in code:
780 | warnings.append("⚠️ .collect() detected - avoid on large datasets, use .show() or .take() instead")
781 |
782 | if '.toPandas()' in code:
783 | warnings.append("⚠️ .toPandas() detected - ensure dataset fits in driver memory")
784 |
785 | if 'for row in df.collect()' in code:
786 | warnings.append("❌ Anti-pattern: iterating over collected DataFrame. Use DataFrame operations instead")
787 |
788 | # Check for caching opportunities
789 | df_count = code.count('df.')
790 | if df_count > 3 and '.cache()' not in code and '.persist()' not in code:
791 | suggestions.append("💡 Consider caching DataFrame with .cache() for repeated operations")
792 |
793 | # Check for schema definition
794 | if 'createDataFrame' in code and 'StructType' not in code:
795 | suggestions.append("💡 Consider defining explicit schema when creating DataFrames")
796 |
797 | # Check for null handling
798 | if '.filter(' in code and 'isNull' not in code and 'isNotNull' not in code:
799 | suggestions.append("💡 Consider adding null value handling in filters")
800 |
801 | # Check for partitioning
802 | if '.write.' in code and 'partitionBy' not in code:
803 | suggestions.append("💡 Consider partitioning data when writing large datasets")
804 |
805 | # Check for Delta Lake usage
806 | if '.write.' in code and 'format("delta")' not in code:
807 | suggestions.append("💡 Consider using Delta Lake format for ACID transactions and time travel")
808 |
809 | # Compile results
810 | result = "# PySpark Code Validation Report\n\n"
811 | result += "## Validation Results\n"
812 | result += "\n".join(validation_results) + "\n\n"
813 |
814 | if warnings:
815 | result += "## Warnings\n"
816 | result += "\n".join(warnings) + "\n\n"
817 |
818 | if suggestions:
819 | result += "## Optimization Suggestions\n"
820 | result += "\n".join(suggestions) + "\n\n"
821 |
822 | if not warnings and not suggestions:
823 | result += "## Summary\n✅ Code looks good! No issues detected.\n"
824 | else:
825 | result += f"## Summary\n📊 Found {len(warnings)} warnings and {len(suggestions)} optimization opportunities.\n"
826 |
827 | return result
828 |
829 | except Exception as e:
830 | logger.error(f"Error validating PySpark code: {str(e)}")
831 | return f"Error validating PySpark code: {str(e)}"
832 |
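# Hedged example, not part of the module; the input snippet is hypothetical. Running
# validate_pyspark_code on
#
#   df = spark.table("lakehouse.orders")
#   rows = df.collect()
#   pdf = df.toPandas()
#
# passes the syntax check, notes the DataFrame operations, warns that no PySpark import was
# detected, and raises the .collect() and .toPandas() warnings defined above.
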
833 | @mcp.tool()
834 | async def update_notebook_cell(
835 | workspace: str,
836 | notebook_id: str,
837 | cell_index: int,
838 | cell_content: str,
839 | cell_type: str = "code",
840 | ctx: Context = None,
841 | ) -> str:
842 | """Update a specific cell in a notebook.
843 |
844 | Args:
845 | workspace: Name or ID of the workspace
846 | notebook_id: ID or name of the notebook
847 | cell_index: Index of the cell to update (0-based)
848 | cell_content: New content for the cell
849 | cell_type: Type of cell ('code' or 'markdown')
850 | ctx: Context object containing client information
851 | Returns:
852 |         A string describing the prepared cell update or an error message.
853 | """
854 | try:
855 | if ctx is None:
856 | raise ValueError("Context (ctx) must be provided.")
857 |
858 | # Get current notebook content
859 | current_content = await get_notebook_content(workspace, notebook_id, ctx)
860 |
861 | if current_content.startswith("Error"):
862 | return current_content
863 |
864 | # Parse the notebook JSON
865 | notebook_data = json.loads(current_content)
866 | cells = notebook_data.get("cells", [])
867 |
868 |         if cell_index < 0 or cell_index >= len(cells):
869 | return f"Cell index {cell_index} is out of range. Notebook has {len(cells)} cells."
870 |
871 | # Update the cell
872 | cells[cell_index] = {
873 | "cell_type": cell_type,
874 | "source": cell_content.split('\n') if isinstance(cell_content, str) else cell_content,
875 | "execution_count": None,
876 | "outputs": [],
877 | "metadata": {}
878 | }
879 |
880 | # Update the notebook
881 | updated_content = json.dumps(notebook_data, indent=2)
882 |
883 | notebook_client = NotebookClient(
884 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
885 | )
886 |
887 |         # Persisting the change requires an update method on NotebookClient that is
888 |         # not implemented yet, so report what would be written instead of claiming success.
889 |         return f"Prepared update for cell {cell_index} with {cell_type} content (length: {len(cell_content)} characters); notebook update API not yet implemented"
890 |
891 | except Exception as e:
892 | logger.error(f"Error updating notebook cell: {str(e)}")
893 | return f"Error updating notebook cell: {str(e)}"
894 |
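# Illustrative sketch, not part of the module: given cell_content of
# "df = spark.table('lakehouse.orders')\ndf.show()", update_notebook_cell builds this
# replacement cell before serializing the notebook:
#
#   {
#       "cell_type": "code",
#       "source": ["df = spark.table('lakehouse.orders')", "df.show()"],
#       "execution_count": None,
#       "outputs": [],
#       "metadata": {},
#   }
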
895 | @mcp.tool()
896 | async def create_fabric_notebook(
897 | workspace: str,
898 | notebook_name: str,
899 | template_type: str = "fabric_integration",
900 | ctx: Context = None,
901 | ) -> str:
902 | """Create a new notebook optimized for Microsoft Fabric using advanced templates.
903 |
904 | Args:
905 | workspace: Name or ID of the workspace
906 | notebook_name: Name of the new notebook
907 | template_type: Type of Fabric template ('fabric_integration', 'streaming')
908 | ctx: Context object containing client information
909 | Returns:
910 | A string containing the ID of the created notebook or an error message.
911 | """
912 | try:
913 | if ctx is None:
914 | raise ValueError("Context (ctx) must be provided.")
915 |
916 | from helpers.pyspark_helpers import create_notebook_from_template
917 |
918 | # Create notebook from advanced template
919 | notebook_data = create_notebook_from_template(template_type)
920 | notebook_content = json.dumps(notebook_data, indent=2)
921 |
922 | notebook_client = NotebookClient(
923 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
924 | )
925 | response = await notebook_client.create_notebook(
926 | workspace, notebook_name, notebook_content
927 | )
928 |
929 | if isinstance(response, dict) and response.get("id"):
930 | return f"Created Fabric-optimized notebook '{notebook_name}' with ID: {response['id']} using {template_type} template"
931 | else:
932 | return f"Failed to create notebook: {response}"
933 |
934 | except Exception as e:
935 | logger.error(f"Error creating Fabric notebook: {str(e)}")
936 | return f"Error creating Fabric notebook: {str(e)}"
937 |
938 | @mcp.tool()
939 | async def generate_fabric_code(
940 | operation: str,
941 | lakehouse_name: Optional[str] = None,
942 | table_name: Optional[str] = None,
943 | target_table: Optional[str] = None,
944 | ctx: Context = None,
945 | ) -> str:
946 | """Generate Fabric-specific PySpark code for lakehouse operations.
947 |
948 | Args:
949 | operation: Type of operation ('read_lakehouse', 'write_lakehouse', 'merge_delta', 'performance_monitor')
950 | lakehouse_name: Name of the lakehouse
951 | table_name: Name of the source table
952 | target_table: Name of the target table (for write/merge operations)
953 | ctx: Context object containing client information
954 | Returns:
955 | A string containing the generated Fabric-specific PySpark code.
956 | """
957 | try:
958 | from helpers.pyspark_helpers import PySparkCodeGenerator
959 |
960 | generator = PySparkCodeGenerator()
961 |
962 | if operation == "read_lakehouse":
963 | if not lakehouse_name or not table_name:
964 | return "Error: lakehouse_name and table_name are required for read_lakehouse operation"
965 | code = generator.generate_fabric_lakehouse_reader(lakehouse_name, table_name)
966 |
967 | elif operation == "write_lakehouse":
968 | if not table_name:
969 | return "Error: table_name is required for write_lakehouse operation"
970 | code = generator.generate_fabric_lakehouse_writer(table_name)
971 |
972 | elif operation == "merge_delta":
973 | if not target_table:
974 | return "Error: target_table is required for merge_delta operation"
975 | source_df = "new_df" # Default source DataFrame name
976 | join_condition = "target.id = source.id" # Default join condition
977 | code = generator.generate_delta_merge_operation(target_table, source_df, join_condition)
978 |
979 | elif operation == "performance_monitor":
980 | code = generator.generate_performance_monitoring()
981 |
982 | else:
983 | available_ops = ["read_lakehouse", "write_lakehouse", "merge_delta", "performance_monitor"]
984 | return f"Invalid operation. Available operations: {', '.join(available_ops)}"
985 |
986 | return f"""```python
987 | {code}
988 | ```
989 |
990 | **Generated Fabric-specific PySpark code for '{operation}' operation**
991 |
992 | This code is optimized for Microsoft Fabric and includes:
993 | - Proper Delta Lake integration
994 | - Fabric lakehouse connectivity
995 | - Performance monitoring capabilities
996 | - Best practices for Fabric environment"""
997 |
998 | except Exception as e:
999 | logger.error(f"Error generating Fabric code: {str(e)}")
1000 | return f"Error generating Fabric code: {str(e)}"
1001 |
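# Hedged example, not part of the module; the table name is hypothetical. Because the
# merge_delta branch fills in the default source DataFrame and join condition shown above,
#
#   await generate_fabric_code(operation="merge_delta", target_table="lakehouse.dim_customer")
#
# delegates to generator.generate_delta_merge_operation("lakehouse.dim_customer", "new_df",
# "target.id = source.id") and wraps the returned code in the fenced response.
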
1002 | @mcp.tool()
1003 | async def validate_fabric_code(
1004 | code: str,
1005 | ctx: Context = None,
1006 | ) -> str:
1007 | """Validate PySpark code for Microsoft Fabric compatibility and performance.
1008 |
1009 | Args:
1010 | code: PySpark code to validate for Fabric compatibility
1011 | ctx: Context object containing client information
1012 | Returns:
1013 | A string containing detailed validation results and Fabric-specific recommendations.
1014 | """
1015 | try:
1016 | from helpers.pyspark_helpers import PySparkValidator
1017 |
1018 | validator = PySparkValidator()
1019 |
1020 | # Basic syntax validation
1021 | validation_results = []
1022 | try:
1023 | compile(code, '', 'exec')
1024 | validation_results.append("✅ Syntax validation: PASSED")
1025 | except SyntaxError as e:
1026 | validation_results.append(f"❌ Syntax validation: FAILED - {e}")
1027 | return "\n".join(validation_results)
1028 |
1029 | # Fabric compatibility checks
1030 | fabric_results = validator.validate_fabric_compatibility(code)
1031 |
1032 | # Performance pattern checks
1033 | performance_results = validator.check_performance_patterns(code)
1034 |
1035 | # Additional Fabric-specific checks
1036 | fabric_warnings = []
1037 | fabric_suggestions = []
1038 |
1039 | # Check for Fabric best practices
1040 | if 'spark.table(' in code:
1041 | validation_results.append("✅ Using Fabric managed tables")
1042 |
1043 | if 'notebookutils' in code:
1044 | validation_results.append("✅ Using Fabric notebook utilities")
1045 |
1046 | if 'format("delta")' in code:
1047 | validation_results.append("✅ Using Delta Lake format")
1048 |
1049 | # Check for potential issues
1050 | if 'spark.sql("USE' in code:
1051 | fabric_warnings.append("⚠️ Explicit USE statements may not be necessary in Fabric")
1052 |
1053 | if 'hdfs://' in code or 's3://' in code:
1054 | fabric_warnings.append("⚠️ Direct file system paths detected - consider using Fabric's managed storage")
1055 |
1056 | # Compile comprehensive report
1057 | result = "# Microsoft Fabric PySpark Code Validation Report\n\n"
1058 |
1059 | result += "## Basic Validation\n"
1060 | result += "\n".join(validation_results) + "\n\n"
1061 |
1062 | if fabric_results["issues"]:
1063 | result += "## Fabric Compatibility Issues\n"
1064 | result += "\n".join(fabric_results["issues"]) + "\n\n"
1065 |
1066 | all_warnings = fabric_warnings + performance_results["warnings"]
1067 | if all_warnings:
1068 | result += "## Warnings\n"
1069 | result += "\n".join(all_warnings) + "\n\n"
1070 |
1071 | all_suggestions = fabric_results["suggestions"] + fabric_suggestions + performance_results["optimizations"]
1072 | if all_suggestions:
1073 | result += "## Fabric Optimization Suggestions\n"
1074 | result += "\n".join(all_suggestions) + "\n\n"
1075 |
1076 | # Summary
1077 | total_issues = len(fabric_results["issues"])
1078 | total_warnings = len(all_warnings)
1079 | total_suggestions = len(all_suggestions)
1080 |
1081 | result += "## Summary\n"
1082 | if total_issues == 0 and total_warnings == 0:
1083 | result += "✅ Code is Fabric-ready! No critical issues detected.\n"
1084 | else:
1085 | result += f"📊 Found {total_issues} critical issues, {total_warnings} warnings, and {total_suggestions} optimization opportunities.\n"
1086 |
1087 | result += "\n### Fabric-Specific Recommendations:\n"
1088 | result += "- Use `spark.table()` for managed tables in lakehouses\n"
1089 | result += "- Leverage `notebookutils` for Fabric integration\n"
1090 | result += "- Always use Delta Lake format for optimal performance\n"
1091 | result += "- Consider partitioning strategies for large datasets\n"
1092 | result += "- Use broadcast joins for dimension tables < 200MB\n"
1093 |
1094 | return result
1095 |
1096 | except Exception as e:
1097 | logger.error(f"Error validating Fabric code: {str(e)}")
1098 | return f"Error validating Fabric code: {str(e)}"
1099 |
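# Hedged example, not part of the module; the input snippet is hypothetical. A snippet like
#
#   df = spark.table("lakehouse.orders")
#   df.write.format("delta").mode("overwrite").save("s3://bucket/orders")
#
# earns the "Using Fabric managed tables" and "Using Delta Lake format" checks above while
# triggering the warning about direct file system paths because of the s3:// URI.
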
1100 | @mcp.tool()
1101 | async def analyze_notebook_performance(
1102 | workspace: str,
1103 | notebook_id: str,
1104 | ctx: Context = None,
1105 | ) -> str:
1106 | """Analyze a notebook's code for performance optimization opportunities in Fabric.
1107 |
1108 | Args:
1109 | workspace: Name or ID of the workspace
1110 | notebook_id: ID or name of the notebook
1111 | ctx: Context object containing client information
1112 | Returns:
1113 | A string containing performance analysis and optimization recommendations.
1114 | """
1115 | try:
1116 | if ctx is None:
1117 | raise ValueError("Context (ctx) must be provided.")
1118 |
1119 | # Get notebook content
1120 | notebook_content = await get_notebook_content(workspace, notebook_id, ctx)
1121 |
1122 | if notebook_content.startswith("Error"):
1123 | return notebook_content
1124 |
1125 | # Parse notebook and extract code cells
1126 | notebook_data = json.loads(notebook_content)
1127 | cells = notebook_data.get("cells", [])
1128 |
1129 | code_cells = [cell for cell in cells if cell.get("cell_type") == "code"]
1130 |
1131 | if not code_cells:
1132 | return "No code cells found in the notebook."
1133 |
1134 | # Analyze each code cell
1135 | analysis_results = []
1136 | total_operations = 0
1137 | performance_issues = []
1138 | optimization_opportunities = []
1139 |
1140 | from helpers.pyspark_helpers import PySparkValidator
1141 | validator = PySparkValidator()
1142 |
1143 | for i, cell in enumerate(code_cells):
1144 |             source = cell.get("source", [])
1145 |             cell_source = source if isinstance(source, str) else "\n".join(source)
1146 | if not cell_source.strip():
1147 | continue
1148 |
1149 | analysis_results.append(f"### Cell {i + 1}")
1150 |
1151 | # Count operations
1152 | operations = [
1153 | ("DataFrame reads", cell_source.count("spark.read") + cell_source.count("spark.table")),
1154 | ("DataFrame writes", cell_source.count(".write.")),
1155 | ("Transformations", cell_source.count(".withColumn") + cell_source.count(".select") + cell_source.count(".filter")),
1156 | ("Actions", cell_source.count(".show()") + cell_source.count(".count()") + cell_source.count(".collect()"))
1157 | ]
1158 |
1159 | for op_name, count in operations:
1160 | if count > 0:
1161 | analysis_results.append(f"- {op_name}: {count}")
1162 | total_operations += count
1163 |
1164 | # Check for performance patterns
1165 | perf_results = validator.check_performance_patterns(cell_source)
1166 | performance_issues.extend(perf_results["warnings"])
1167 | optimization_opportunities.extend(perf_results["optimizations"])
1168 |
1169 | # Fabric-specific analysis
1170 | fabric_results = validator.validate_fabric_compatibility(cell_source)
1171 | optimization_opportunities.extend(fabric_results["suggestions"])
1172 |
1173 | # Generate comprehensive report
1174 | report = f"# Notebook Performance Analysis Report\n\n"
1175 | report += f"**Notebook:** {notebook_id}\n"
1176 | report += f"**Total Code Cells:** {len(code_cells)}\n"
1177 | report += f"**Total Operations:** {total_operations}\n\n"
1178 |
1179 | if analysis_results:
1180 | report += "## Cell-by-Cell Analysis\n"
1181 | report += "\n".join(analysis_results) + "\n\n"
1182 |
1183 | if performance_issues:
1184 | report += "## Performance Issues Found\n"
1185 | for issue in set(performance_issues): # Remove duplicates
1186 | report += f"- {issue}\n"
1187 | report += "\n"
1188 |
1189 | if optimization_opportunities:
1190 | report += "## Optimization Opportunities\n"
1191 | for opportunity in set(optimization_opportunities): # Remove duplicates
1192 | report += f"- {opportunity}\n"
1193 | report += "\n"
1194 |
1195 | # Performance score calculation
1196 | score = 100
1197 | score -= len(set(performance_issues)) * 10 # -10 points per unique issue
1198 | score -= len(set(optimization_opportunities)) * 5 # -5 points per optimization opportunity
1199 | score = max(score, 0) # Ensure score doesn't go negative
1200 |
1201 | report += f"## Performance Score: {score}/100\n\n"
1202 |
1203 | if score >= 80:
1204 | report += "✅ **Excellent** - Your notebook is well-optimized for Fabric!\n"
1205 | elif score >= 60:
1206 | report += "⚠️ **Good** - Some optimization opportunities exist.\n"
1207 | elif score >= 40:
1208 | report += "🔧 **Needs Improvement** - Several performance issues should be addressed.\n"
1209 | else:
1210 | report += "❌ **Poor** - Significant performance optimization required.\n"
1211 |
1212 | return report
1213 |
1214 | except Exception as e:
1215 | logger.error(f"Error analyzing notebook performance: {str(e)}")
1216 | return f"Error analyzing notebook performance: {str(e)}"
1217 |
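# Illustrative arithmetic, not part of the module: with 3 unique performance issues and
# 4 unique optimization opportunities, the score computed in analyze_notebook_performance
# is 100 - 3 * 10 - 4 * 5 = 50, which lands in the "Needs Improvement" band (40-59).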
--------------------------------------------------------------------------------