├── .dockerignore ├── docs ├── mcp_vscode.png ├── mcp_inspector.png ├── architecture.md └── pyspark_guide.md ├── helpers ├── utils │ ├── __init__.py │ ├── context.py │ ├── validators.py │ ├── authentication.py │ └── table_tools.py ├── logging_config.py ├── clients │ ├── __init__.py │ ├── report_client.py │ ├── workspace_client.py │ ├── warehouse_client.py │ ├── lakehouse_client.py │ ├── notebook_client.py │ ├── table_client.py │ ├── semanticModel_client.py │ ├── sql_client.py │ └── fabric_client.py ├── formatters │ ├── metadata_formatter.py │ └── schema_formatter.py └── pyspark_helpers.py ├── .gitignore ├── Dockerfile ├── pyproject.toml ├── fabric_mcp.py ├── tools ├── __init__.py ├── workspace.py ├── sql_endpoint.py ├── report.py ├── semantic_model.py ├── warehouse.py ├── load_data.py ├── lakehouse.py ├── table.py └── notebook.py ├── test_notebook_creation.py ├── test_security.py └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | Inprogress 3 | .ruff_cache/ 4 | # Python bytecode files 5 | __pycache__/ -------------------------------------------------------------------------------- /docs/mcp_vscode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datumnova/ms-fabric-mcp/HEAD/docs/mcp_vscode.png -------------------------------------------------------------------------------- /docs/mcp_inspector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datumnova/ms-fabric-mcp/HEAD/docs/mcp_inspector.png -------------------------------------------------------------------------------- /helpers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.validators import _is_valid_uuid 2 | 3 | __all__ = [ 4 | "_is_valid_uuid", 5 | ] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | 12 | # Ruff cache 13 | .ruff_cache/ 14 | Inprogress 15 | -------------------------------------------------------------------------------- /helpers/utils/context.py: -------------------------------------------------------------------------------- 1 | from mcp.server.fastmcp import FastMCP 2 | from cachetools import TTLCache 3 | 4 | 5 | # Create MCP instance with context manager 6 | mcp = FastMCP("Fabric MCP Server ", json_response=True, stateless_http=True) 7 | mcp.settings.log_level = "debug" 8 | 9 | # Shared cache and context 10 | __ctx_cache = TTLCache(maxsize=100, ttl=300) # Cache for 5 minutes 11 | ctx = mcp.get_context() 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | # Install uv. 4 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ 5 | 6 | # Copy the application into the container. 7 | COPY . /app 8 | 9 | # Install the application dependencies. 10 | WORKDIR /app 11 | RUN uv sync --frozen --no-cache 12 | 13 | # Run the application. 
14 | CMD ["uv", "run", "python", "fabric_mcp.py", "--port", "8081"] 15 | # CMD ["/app/.venv/bin/fastapi", "run", "app/fabric_mcp.py", "--port", "80", "--host", "0.0.0.0"] -------------------------------------------------------------------------------- /helpers/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_logger(name: str) -> logging.Logger: 5 | """Set up and return a logger.""" 6 | logger = logging.getLogger(name) 7 | handler = logging.StreamHandler() 8 | formatter = logging.Formatter( 9 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 10 | ) 11 | handler.setFormatter(formatter) 12 | logger.addHandler(handler) 13 | logger.setLevel(logging.DEBUG) 14 | logger.propagate = True 15 | return logger 16 | -------------------------------------------------------------------------------- /helpers/utils/validators.py: -------------------------------------------------------------------------------- 1 | from uuid import UUID 2 | 3 | 4 | def _is_valid_uuid( 5 | guid: str, 6 | ): 7 | """ 8 | Validates if a string is a valid GUID in version 4 9 | 10 | Parameters 11 | ---------- 12 | guid : str 13 | GUID to be validated. 14 | 15 | Returns 16 | ------- 17 | bool 18 | Boolean that indicates if the string is a GUID or not. 19 | """ 20 | 21 | try: 22 | UUID(str(guid), version=4) 23 | return True 24 | except ValueError: 25 | return False 26 | -------------------------------------------------------------------------------- /helpers/utils/authentication.py: -------------------------------------------------------------------------------- 1 | from azure.identity import DefaultAzureCredential 2 | from cachetools import TTLCache 3 | 4 | 5 | def get_azure_credentials(client_id: str, cache: TTLCache) -> DefaultAzureCredential: 6 | """ 7 | Get Azure credentials using DefaultAzureCredential. 8 | This function is used to authenticate with Azure services. 9 | """ 10 | if f"{client_id}_creds" in cache: 11 | return cache[f"{client_id}_creds"] 12 | # If credentials are not cached, create a new DefaultAzureCredential instance 13 | # and store it in the cache. 
14 | else: 15 | cache[f"{client_id}_creds"] = DefaultAzureCredential() 16 | return cache[f"{client_id}_creds"] 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ms-fabric-mcp" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "mcp[cli]", 9 | "azure-identity", 10 | "deltalake", 11 | "requests", 12 | "cachetools", 13 | "semantic-link-labs", 14 | "azure-storage-blob", 15 | "polars", 16 | "sqlalchemy", 17 | "pyodbc", 18 | "tabulate", 19 | "fastapi[standard]", 20 | "python-jose[cryptography]", 21 | "passlib[bcrypt]", 22 | "python-multipart", 23 | "fastapi-mcp", 24 | ] 25 | 26 | [tool.setuptools] 27 | packages = ["helpers", "helpers.clients", "helpers.formatters", "helpers.utils"] 28 | 29 | [project.scripts] 30 | mcp = "mcp.cli:app [cli]" 31 | -------------------------------------------------------------------------------- /helpers/clients/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.clients.lakehouse_client import LakehouseClient 2 | from helpers.clients.warehouse_client import WarehouseClient 3 | from helpers.clients.table_client import TableClient 4 | from helpers.clients.workspace_client import WorkspaceClient 5 | from helpers.clients.semanticModel_client import SemanticModelClient 6 | from helpers.clients.report_client import ReportClient 7 | from helpers.clients.fabric_client import FabricApiClient 8 | from helpers.clients.sql_client import SQLClient, get_sql_endpoint 9 | from helpers.clients.notebook_client import NotebookClient 10 | 11 | 12 | __all__ = [ 13 | "LakehouseClient", 14 | "WarehouseClient", 15 | "TableClient", 16 | "WorkspaceClient", 17 | "FabricApiClient", 18 | "SemanticModelClient", 19 | "ReportClient", 20 | "NotebookClient", 21 | "SQLClient", 22 | "get_sql_endpoint", 23 | ] 24 | -------------------------------------------------------------------------------- /helpers/clients/report_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | 4 | logger = get_logger(__name__) 5 | 6 | 7 | class ReportClient: 8 | def __init__(self, client: FabricApiClient): 9 | self.client = client 10 | 11 | async def list_reports(self, workspace_id: str): 12 | """List all reports in a workspace.""" 13 | reports = await self.client.get_reports(workspace_id) 14 | 15 | if not reports: 16 | return f"No reports found in workspace '{workspace_id}'." 17 | 18 | return reports 19 | 20 | async def get_report(self, workspace_id: str, report_id: str) -> dict: 21 | """Get a specific report by ID.""" 22 | report = await self.client.get_report(workspace_id, report_id) 23 | 24 | if not report: 25 | return ( 26 | f"No report found with ID '{report_id}' in workspace '{workspace_id}'." 
27 | ) 28 | 29 | return report 30 | -------------------------------------------------------------------------------- /fabric_mcp.py: -------------------------------------------------------------------------------- 1 | from tools import * 2 | from helpers.logging_config import get_logger 3 | from helpers.utils.context import mcp, __ctx_cache 4 | import uvicorn 5 | import argparse 6 | import logging 7 | 8 | 9 | 10 | logger = get_logger(__name__) 11 | logger.level = logging.INFO 12 | 13 | 14 | @mcp.tool() 15 | async def clear_context() -> str: 16 | """Clear the current session context. 17 | 18 | Returns: 19 | A string confirming the context has been cleared. 20 | """ 21 | __ctx_cache.clear() 22 | return "Context cleared." 23 | 24 | 25 | if __name__ == "__main__": 26 | # Initialize and run the server 27 | logger.info("Starting MCP server...") 28 | parser = argparse.ArgumentParser(description="Run MCP Streamable HTTP based server") 29 | parser.add_argument("--port", type=int, default=8081, help="Localhost port to listen on") 30 | args = parser.parse_args() 31 | 32 | # Start the server with Streamable HTTP transport 33 | uvicorn.run(mcp.streamable_http_app, host="0.0.0.0", port=args.port) 34 | # mcp.run(transport="stdio") 35 | -------------------------------------------------------------------------------- /helpers/clients/workspace_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | 4 | logger = get_logger(__name__) 5 | 6 | 7 | class WorkspaceClient: 8 | def __init__(self, client: FabricApiClient): 9 | self.client = client 10 | 11 | async def list_workspaces(self): 12 | """List all available workspaces.""" 13 | workspaces = await self.client.get_workspaces() 14 | if not workspaces: 15 | raise ValueError("No workspaces found.") 16 | 17 | markdown = "# Fabric Workspaces\n\n" 18 | markdown += "| ID | Name | Capacity |\n" 19 | markdown += "|-----|------|----------|\n" 20 | 21 | for ws in workspaces: 22 | markdown += f"| {ws['id']} | {ws['displayName']} | {ws.get('capacityId', 'N/A')} |\n" 23 | 24 | return markdown 25 | 26 | async def resolve_workspace(self, workspace_name: str): 27 | """Resolve workspace name to workspace ID.""" 28 | return await self.client.resolve_workspace_name_and_id(workspace=workspace_name) 29 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | from tools.workspace import set_workspace, list_workspaces 2 | from tools.warehouse import set_warehouse, list_warehouses 3 | from tools.lakehouse import set_lakehouse, list_lakehouses 4 | from tools.table import ( 5 | set_table, 6 | list_tables, 7 | get_lakehouse_table_schema, 8 | get_all_lakehouse_schemas, 9 | run_query, 10 | ) 11 | from tools.semantic_model import ( 12 | list_semantic_models, 13 | get_semantic_model, 14 | ) 15 | from tools.report import ( 16 | list_reports, 17 | get_report, 18 | ) 19 | from tools.load_data import load_data_from_url 20 | from tools.notebook import list_notebooks, create_notebook 21 | 22 | __all__ = [ 23 | "set_workspace", 24 | "list_workspaces", 25 | "set_warehouse", 26 | "list_warehouses", 27 | "set_lakehouse", 28 | "list_lakehouses", 29 | "set_table", 30 | "list_tables", 31 | "get_lakehouse_table_schema", 32 | "get_all_lakehouse_schemas", 33 | "list_semantic_models", 34 | "get_semantic_model", 35 | 
"list_reports", 36 | "get_report", 37 | "load_data_from_url", 38 | "run_query", 39 | "list_notebooks", 40 | "create_notebook", 41 | ] 42 | -------------------------------------------------------------------------------- /helpers/formatters/metadata_formatter.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | 4 | 5 | def format_metadata_to_markdown(metadata: object) -> str: 6 | """Convert Delta table metadata to a responsive markdown format with HTML.""" 7 | md = "#### Metadata\n\n" 8 | md += "
<dl>\n"
9 |     md += f"<dt>ID:</dt><dd>{metadata.id}</dd>\n"
10 |     if metadata.name:
11 |         md += f"<dt>Name:</dt><dd>{metadata.name}</dd>\n"
12 |     if metadata.description:
13 |         md += f"<dt>Description:</dt><dd>{metadata.description}</dd>\n"
14 |     if metadata.partition_columns:
15 |         md += f"<dt>Partition Columns:</dt><dd>{', '.join(metadata.partition_columns)}</dd>\n"
16 |     if metadata.created_time:
17 |         created_time = datetime.fromtimestamp(metadata.created_time / 1000)
18 |         md += f"<dt>Created:</dt><dd>{created_time.strftime('%Y-%m-%d %H:%M:%S')}</dd>\n"
19 |     if metadata.configuration:
20 |         md += "<dt>Configuration:</dt>\n"
21 |         md += "<dd>\n"
22 |         md += "    <details>\n"
23 |         md += "      <summary>View JSON</summary>\n"
24 |         md += "      <pre>\n"
25 |         md += json.dumps(metadata.configuration, indent=2)
26 |         md += "\n      </pre>\n"
27 |         md += "    </details>\n"
28 |         md += "</dd>\n"
29 |     md += "</dl>
\n" 30 | return md 31 | -------------------------------------------------------------------------------- /tools/workspace.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | WorkspaceClient, 7 | ) 8 | 9 | 10 | @mcp.tool() 11 | async def set_workspace(workspace: str, ctx: Context) -> str: 12 | """Set the current workspace for the session. 13 | 14 | Args: 15 | workspace: Name or ID of the workspace 16 | ctx: Context object containing client information 17 | Returns: 18 | A string confirming the workspace has been set. 19 | """ 20 | __ctx_cache[f"{ctx.client_id}_workspace"] = workspace 21 | return f"Workspace set to '{workspace}'." 22 | 23 | 24 | @mcp.tool() 25 | async def list_workspaces(ctx: Context) -> str: 26 | """List all available Fabric workspaces. 27 | 28 | Args: 29 | ctx: Context object containing client information 30 | 31 | Returns: 32 | A string containing the list of workspaces or an error message. 33 | """ 34 | try: 35 | client = WorkspaceClient( 36 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 37 | ) 38 | 39 | workspaces = await client.list_workspaces() 40 | 41 | return workspaces 42 | 43 | except Exception as e: 44 | return f"Error listing workspaces: {str(e)}" 45 | -------------------------------------------------------------------------------- /helpers/formatters/schema_formatter.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from helpers.formatters.metadata_formatter import format_metadata_to_markdown 3 | 4 | 5 | def format_schema_to_markdown( 6 | table_info: Dict, schema: object, metadata: object 7 | ) -> str: 8 | """Convert a Delta table schema and metadata to a responsive markdown format with HTML.""" 9 | md = f"

<div><strong>Delta Table: {table_info['name']}</strong></div>\n"
10 |     md += f"<div><strong>Type: {table_info['type']}</strong></div>\n"
11 |     md += f"<div><strong>Location: {table_info['location']}</strong></div>\n\n"
12 | 
13 |     # Responsive schema table wrapped in a scrollable div
14 |     md += "<div><strong>Schema</strong></div>\n"
15 |     md += '<div style="overflow-x: auto;">\n'
16 |     md += '<table style="width: 100%;">\n'
17 |     md += "  <tr>\n"
18 |     md += "    <th>Column Name</th>\n"
19 |     md += "    <th>Data Type</th>\n"
20 |     md += "    <th>Nullable</th>\n"
21 |     md += "  </tr>\n"
22 | 
23 |     for field in schema.fields:
24 |         md += "  <tr>\n"
25 |         md += f"    <td>{field.name}</td>\n"
26 |         md += f"    <td>{field.type}</td>\n"
27 |         md += f"    <td>{field.nullable}</td>\n"
28 |         md += "  </tr>\n"
29 | 
30 |     md += "</table>\n"
31 |     md += "</div>\n\n"
32 | 
33 |     # Collapsible metadata section for a dynamic feel
34 |     md += "<details>\n"
35 |     md += "  <summary>View Metadata</summary>\n\n"
36 |     md += format_metadata_to_markdown(metadata)
37 |     md += "\n</details>
\n" 38 | 39 | return md + "\n" 40 | -------------------------------------------------------------------------------- /helpers/clients/warehouse_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | from typing import Optional, Dict, Any 4 | 5 | logger = get_logger(__name__) 6 | 7 | 8 | class WarehouseClient: 9 | def __init__(self, client: FabricApiClient): 10 | self.client = client 11 | 12 | async def list_warehouses(self, workspace: str): 13 | """List all warehouses in a lakehouse.""" 14 | warehouses = await self.client.get_warehouses(workspace) 15 | 16 | if not warehouses: 17 | return f"No warehouses found in workspace '{workspace}'." 18 | 19 | markdown = f"# Warehouses in workspace '{workspace}'\n\n" 20 | markdown += "| ID | Name |\n" 21 | markdown += "|-----|------|\n" 22 | 23 | for wh in warehouses: 24 | markdown += f"| {wh['id']} | {wh['displayName']} |\n" 25 | 26 | return markdown 27 | 28 | async def get_warehouse( 29 | self, 30 | workspace: str, 31 | warehouse: str, 32 | ) -> Optional[Dict[str, Any]]: 33 | """Get details of a specific warehouse.""" 34 | if not warehouse: 35 | raise ValueError("Warehouse name cannot be empty.") 36 | 37 | return await self.client.get_item( 38 | workspace_id=workspace, item_id=warehouse, item_type="warehouse" 39 | ) 40 | 41 | async def create_warehouse( 42 | self, 43 | name: str, 44 | workspace: str, 45 | description: Optional[str] = None, 46 | ): 47 | """Create a new warehouse.""" 48 | if not name: 49 | raise ValueError("Warehouse name cannot be empty.") 50 | 51 | return await self.client.create_item( 52 | name=name, workspace=workspace, description=description, type="Warehouse" 53 | ) 54 | -------------------------------------------------------------------------------- /tools/sql_endpoint.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from helpers.utils.context import mcp, __ctx_cache 3 | from mcp.server.fastmcp import Context 4 | from helpers.clients import get_sql_endpoint 5 | 6 | 7 | @mcp.tool() 8 | async def get_sql_endpoint( 9 | workspace: Optional[str] = None, 10 | lakehouse: Optional[str] = None, 11 | warehouse: Optional[str] = None, 12 | type: Optional[str] = None, 13 | ctx: Context = None, 14 | ) -> str: 15 | """ 16 | Retrieve the SQL endpoint for a specified lakehouse or warehouse. 17 | 18 | Args: 19 | workspace: Name or ID of the workspace (optional). 20 | lakehouse: Name or ID of the lakehouse (optional). 21 | warehouse: Name or ID of the warehouse (optional). 22 | type: Type of resource ('lakehouse' or 'warehouse'). If not provided, it will be inferred. 23 | ctx: Context object containing client information. 24 | 25 | Returns: 26 | A string containing the resource type, name/ID, and its SQL endpoint. 27 | """ 28 | try: 29 | if ctx is None: 30 | raise ValueError("Context (ctx) must be provided.") 31 | 32 | if workspace is None: 33 | workspace = __ctx_cache.get(f"{ctx.client_id}_workspace") 34 | if workspace is None: 35 | raise ValueError("Workspace must be specified or set in context.") 36 | if lakehouse is None and warehouse is None: 37 | lakehouse = __ctx_cache.get(f"{ctx.client_id}_lakehouse") 38 | warehouse = __ctx_cache.get(f"{ctx.client_id}_warehouse") 39 | if warehouse is None and lakehouse is None: 40 | raise ValueError( 41 | "Either lakehouse or warehouse must be specified or set in context." 
42 | ) 43 | 44 | name, endpoint = await get_sql_endpoint( 45 | workspace=workspace, 46 | lakehouse=lakehouse, 47 | warehouse=warehouse, # Add warehouse to the call 48 | type=type, 49 | ) 50 | 51 | return ( 52 | endpoint 53 | if endpoint 54 | else f"No SQL endpoint found for {type} '{lakehouse or warehouse}' in workspace '{workspace}'." 55 | ) 56 | except Exception as e: 57 | return f"Error retrieving SQL endpoint: {str(e)}" 58 | -------------------------------------------------------------------------------- /helpers/utils/table_tools.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Optional 2 | from azure.identity import DefaultAzureCredential 3 | from deltalake import DeltaTable 4 | from helpers.logging_config import get_logger 5 | import asyncio 6 | 7 | logger = get_logger(__name__) 8 | 9 | 10 | async def get_delta_schemas( 11 | tables: List[Dict], credential: DefaultAzureCredential 12 | ) -> List[Tuple[Dict, object, object]]: 13 | """Get schema and metadata for each Delta table""" 14 | delta_tables = [] 15 | logger.info(f"Starting schema extraction for {len(tables)} tables") 16 | 17 | # Get token for Azure Storage (not Fabric API) 18 | token = credential.get_token("https://storage.azure.com/.default").token 19 | storage_options = {"bearer_token": token, "use_fabric_endpoint": "true"} 20 | 21 | for table in tables: 22 | task = asyncio.create_task(get_delta_table(table, storage_options)) 23 | delta_tables.append(task) 24 | logger.debug(f"Created task for table: {table['name']}") 25 | # Wait for all tasks to complete 26 | delta_tables = await asyncio.gather(*delta_tables) 27 | logger.info(f"Completed schema extraction for {len(delta_tables)} tables") 28 | # Filter out None values 29 | delta_tables = [dt for dt in delta_tables if dt is not None] 30 | return delta_tables 31 | 32 | 33 | async def get_delta_table( 34 | table: Dict, storage_options: Optional[Dict] = None 35 | ) -> Optional[Tuple[Dict, object, object]]: 36 | """Get Delta table schema and metadata""" 37 | logger.debug(f"Processing table: {table['name']}") 38 | 39 | # Check if the table is a Delta table 40 | 41 | if table["format"].lower() == "delta": 42 | try: 43 | table_path = table["location"] 44 | logger.debug(f"Processing Delta table: {table['name']} at {table_path}") 45 | 46 | # Create DeltaTable instance with storage options 47 | delta_table = DeltaTable(table_path, storage_options=storage_options) 48 | 49 | # Get both schema and metadata 50 | result = (table, delta_table.schema(), delta_table.metadata()) 51 | logger.info(f"Processed table: {table['name']}") 52 | return result 53 | 54 | except Exception as e: 55 | logger.error(f"Could not process table {table['name']}: {str(e)}") 56 | return None 57 | -------------------------------------------------------------------------------- /helpers/clients/lakehouse_client.py: -------------------------------------------------------------------------------- 1 | from helpers.utils import _is_valid_uuid 2 | from helpers.logging_config import get_logger 3 | from helpers.clients.fabric_client import FabricApiClient 4 | from typing import Optional, Dict, Any 5 | 6 | logger = get_logger(__name__) 7 | 8 | 9 | class LakehouseClient: 10 | def __init__(self, client: FabricApiClient): 11 | self.client = client 12 | 13 | async def list_lakehouses(self, workspace: str): 14 | """List all lakehouses in a workspace.""" 15 | if not _is_valid_uuid(workspace): 16 | raise ValueError("Invalid workspace ID.") 17 | lakehouses = 
await self.client.get_lakehouses(workspace) 18 | 19 | if not lakehouses: 20 | return f"No lakehouses found in workspace '{workspace}'." 21 | 22 | markdown = f"# Lakehouses in workspace '{workspace}'\n\n" 23 | markdown += "| ID | Name |\n" 24 | markdown += "|-----|------|\n" 25 | 26 | for lh in lakehouses: 27 | markdown += f"| {lh['id']} | {lh['displayName']} |\n" 28 | 29 | return markdown 30 | 31 | async def get_lakehouse( 32 | self, 33 | workspace: str, 34 | lakehouse: str, 35 | ) -> Optional[Dict[str, Any]]: 36 | """Get details of a specific lakehouse.""" 37 | if not _is_valid_uuid(workspace): 38 | raise ValueError("Invalid workspace ID.") 39 | 40 | if not lakehouse: 41 | raise ValueError("Lakehouse name cannot be empty.") 42 | 43 | response = await self.client.get_item(workspace_id=workspace, item_id=lakehouse) 44 | logger.info(f"Lakehouse details: {response}") 45 | return response 46 | 47 | async def resolve_lakehouse(self, workspace_id: str, lakehouse_name: str): 48 | """Resolve lakehouse name to lakehouse ID.""" 49 | return await self.client.resolve_item_name_and_id( 50 | workspace=workspace_id, item=lakehouse_name, type="Lakehouse" 51 | ) 52 | 53 | async def create_lakehouse( 54 | self, 55 | name: str, 56 | workspace: str, 57 | description: Optional[str] = None, 58 | ): 59 | """Create a new lakehouse.""" 60 | if not _is_valid_uuid(workspace): 61 | raise ValueError("Invalid workspace ID.") 62 | 63 | if not name: 64 | raise ValueError("Lakehouse name cannot be empty.") 65 | 66 | return await self.client.create_item( 67 | name=name, workspace=workspace, description="description", type="Lakehouse" 68 | ) 69 | -------------------------------------------------------------------------------- /tools/report.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | ReportClient, 7 | ) 8 | from helpers.logging_config import get_logger 9 | from typing import Optional 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | @mcp.tool() 15 | async def list_reports(workspace: Optional[str] = None, ctx: Context = None) -> str: 16 | """List all reports in a Fabric workspace. 17 | 18 | Args: 19 | workspace: Name or ID of the workspace (optional) 20 | ctx: Context object containing client information 21 | Returns: 22 | A string containing the list of reports or an error message. 23 | """ 24 | try: 25 | client = ReportClient( 26 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 27 | ) 28 | 29 | reports = await client.list_reports( 30 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"] 31 | ) 32 | 33 | markdown = f"# Reports in workspace '{workspace}'\n\n" 34 | markdown += "| ID | Name | Description |\n" 35 | markdown += "|-----|------|-------------|\n" 36 | 37 | for report in reports: 38 | markdown += f"| {report.get('id', 'N/A')} | {report.get('displayName', 'N/A')} | {report.get('description', 'N/A')} |\n" 39 | 40 | return markdown 41 | 42 | except Exception as e: 43 | return f"Error listing reports: {str(e)}" 44 | 45 | 46 | @mcp.tool() 47 | async def get_report( 48 | workspace: Optional[str] = None, 49 | report_id: Optional[str] = None, 50 | ctx: Context = None, 51 | ) -> str: 52 | """Get a specific report by ID. 
53 | 54 | Args: 55 | workspace: Name or ID of the workspace (optional) 56 | report_id: ID of the report (optional) 57 | ctx: Context object containing client information 58 | 59 | Returns: 60 | A string containing the report details or an error message. 61 | """ 62 | try: 63 | client = ReportClient( 64 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 65 | ) 66 | 67 | report = await client.get_report( 68 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"], 69 | report_id, 70 | ) 71 | 72 | if not report: 73 | return f"No report found with ID '{report_id}' in workspace '{workspace}'." 74 | 75 | return f"Report details:\n\n{report}" 76 | 77 | except Exception as e: 78 | return f"Error getting report: {str(e)}" 79 | -------------------------------------------------------------------------------- /tools/semantic_model.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | SemanticModelClient, 7 | ) 8 | from helpers.logging_config import get_logger 9 | 10 | from typing import Optional 11 | 12 | logger = get_logger(__name__) 13 | 14 | 15 | @mcp.tool() 16 | async def list_semantic_models( 17 | workspace: Optional[str] = None, ctx: Context = None 18 | ) -> str: 19 | """List all semantic models in a Fabric workspace. 20 | 21 | Args: 22 | workspace: Name or ID of the workspace (optional) 23 | ctx: Context object containing client information 24 | 25 | Returns: 26 | A string containing the list of semantic models or an error message. 27 | """ 28 | try: 29 | client = SemanticModelClient( 30 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 31 | ) 32 | 33 | models = await client.list_semantic_models( 34 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"] 35 | ) 36 | 37 | markdown = f"# Semantic Models in workspace '{workspace}'\n\n" 38 | markdown += "| ID | Name | Folder ID | Description |\n" 39 | markdown += "|-----|------|-----------|-------------|\n" 40 | 41 | for model in models: 42 | markdown += f"| {model.get('id', 'N/A')} | {model.get('displayName', 'N/A')} | {model.get('folderId', 'N/A')} | {model.get('description', 'N/A')} |\n" 43 | 44 | return markdown 45 | 46 | except Exception as e: 47 | return f"Error listing semantic models: {str(e)}" 48 | 49 | 50 | @mcp.tool() 51 | async def get_semantic_model( 52 | workspace: Optional[str] = None, 53 | model_id: Optional[str] = None, 54 | ctx: Context = None, 55 | ) -> str: 56 | """Get a specific semantic model by ID. 57 | 58 | Args: 59 | workspace: Name or ID of the workspace (optional) 60 | model_id: ID of the semantic model (optional) 61 | ctx: Context object containing client information 62 | 63 | Returns: 64 | A string containing the details of the semantic model or an error message. 
65 | """ 66 | try: 67 | client = SemanticModelClient( 68 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 69 | ) 70 | 71 | model = await client.get_semantic_model( 72 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"], 73 | model_id if model_id else __ctx_cache[f"{ctx.client_id}_semantic_model"], 74 | ) 75 | 76 | return f"Semantic Model '{model['displayName']}' details:\n\n{model}" 77 | 78 | except Exception as e: 79 | return f"Error retrieving semantic model: {str(e)}" 80 | -------------------------------------------------------------------------------- /tools/warehouse.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | WarehouseClient, 7 | ) 8 | 9 | from typing import Optional 10 | 11 | 12 | @mcp.tool() 13 | async def set_warehouse(warehouse: str, ctx: Context) -> str: 14 | """Set the current warehouse for the session. 15 | 16 | Args: 17 | warehouse: Name or ID of the warehouse 18 | ctx: Context object containing client information 19 | 20 | Returns: 21 | A string confirming the warehouse has been set. 22 | """ 23 | __ctx_cache[f"{ctx.client_id}_warehouse"] = warehouse 24 | return f"Warehouse set to '{warehouse}'." 25 | 26 | 27 | @mcp.tool() 28 | async def list_warehouses(workspace: Optional[str] = None, ctx: Context = None) -> str: 29 | """List all warehouses in a Fabric workspace. 30 | 31 | Args: 32 | workspace: Name or ID of the workspace (optional) 33 | ctx: Context object containing client information 34 | 35 | Returns: 36 | A string containing the list of warehouses or an error message. 37 | """ 38 | try: 39 | client = WarehouseClient( 40 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 41 | ) 42 | 43 | warehouses = await client.list_warehouses( 44 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"] 45 | ) 46 | 47 | return warehouses 48 | 49 | except Exception as e: 50 | return f"Error listing warehouses: {str(e)}" 51 | 52 | 53 | @mcp.tool() 54 | async def create_warehouse( 55 | name: str, 56 | workspace: Optional[str] = None, 57 | description: Optional[str] = None, 58 | ctx: Context = None, 59 | ) -> str: 60 | """Create a new warehouse in a Fabric workspace. 61 | 62 | Args: 63 | name: Name of the warehouse 64 | workspace: Name or ID of the workspace (optional) 65 | description: Description of the warehouse (optional) 66 | ctx: Context object containing client information 67 | Returns: 68 | A string confirming the warehouse has been created or an error message. 69 | """ 70 | try: 71 | client = WarehouseClient( 72 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 73 | ) 74 | 75 | response = await client.create_warehouse( 76 | name=name, 77 | workspace=workspace 78 | if workspace 79 | else __ctx_cache[f"{ctx.client_id}_workspace"], 80 | description=description, 81 | ) 82 | 83 | return f"Warehouse '{response['id']}' created successfully." 
84 | 85 | except Exception as e: 86 | return f"Error creating warehouse: {str(e)}" 87 | -------------------------------------------------------------------------------- /helpers/clients/notebook_client.py: -------------------------------------------------------------------------------- 1 | from helpers.utils import _is_valid_uuid 2 | from helpers.logging_config import get_logger 3 | from helpers.clients.fabric_client import FabricApiClient 4 | from typing import Dict, Any 5 | 6 | logger = get_logger(__name__) 7 | 8 | 9 | class NotebookClient: 10 | def __init__(self, client: FabricApiClient): 11 | self.client = client 12 | 13 | async def list_notebooks(self, workspace: str): 14 | """List all notebooks in a workspace.""" 15 | if not _is_valid_uuid(workspace): 16 | raise ValueError("Invalid workspace ID.") 17 | notebooks = await self.client.get_notebooks(workspace) 18 | 19 | if not notebooks: 20 | return f"No notebooks found in workspace '{workspace}'." 21 | 22 | markdown = f"# Notebooks in workspace '{workspace}'\n\n" 23 | markdown += "| ID | Name |\n" 24 | markdown += "|-----|------|\n" 25 | 26 | for nb in notebooks: 27 | markdown += f"| {nb['id']} | {nb['displayName']} |\n" 28 | 29 | return markdown 30 | 31 | async def get_notebook(self, workspace: str, notebook_id: str) -> Dict[str, Any]: 32 | """Get a specific notebook by ID.""" 33 | if not _is_valid_uuid(workspace): 34 | raise ValueError("Invalid workspace ID.") 35 | if not _is_valid_uuid(notebook_id): 36 | raise ValueError("Invalid notebook ID.") 37 | 38 | notebook = await self.client.get_notebook(workspace, notebook_id) 39 | 40 | if not notebook: 41 | return ( 42 | f"No notebook found with ID '{notebook_id}' in workspace '{workspace}'." 43 | ) 44 | 45 | return notebook 46 | 47 | async def create_notebook( 48 | self, workspace: str, notebook_name: str, content: str 49 | ) -> Dict[str, Any]: 50 | """Create a new notebook.""" 51 | try: 52 | workspace, workspace_id = await self.client.resolve_workspace_name_and_id( 53 | workspace 54 | ) 55 | if not workspace_id: 56 | raise ValueError("Invalid workspace ID.") 57 | 58 | logger.info(f"Creating notebook '{notebook_name}' in workspace '{workspace}' (ID: {workspace_id}).") 59 | 60 | try: 61 | response = await self.client.create_notebook( 62 | workspace_id=workspace_id, 63 | notebook_name=notebook_name, 64 | ipynb_name=notebook_name, 65 | content=content, 66 | ) 67 | except Exception as e: 68 | error_msg = f"Failed to create notebook '{notebook_name}' in workspace '{workspace}': {str(e)}" 69 | logger.error(error_msg) 70 | return error_msg 71 | 72 | 73 | logger.info(f"Successfully created notebook '{notebook_name}' with ID: {response['id']}") 74 | return response 75 | 76 | except Exception as e: 77 | error_msg = f"Error creating notebook '{notebook_name}': {str(e)}" 78 | logger.error(error_msg) 79 | return error_msg 80 | -------------------------------------------------------------------------------- /tools/load_data.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | LakehouseClient, 7 | WarehouseClient, 8 | get_sql_endpoint, 9 | ) 10 | from helpers.logging_config import get_logger 11 | import tempfile 12 | import os 13 | import requests 14 | from typing import Optional 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | @mcp.tool() 20 | 
async def load_data_from_url( 21 | url: str, 22 | destination_table: str, 23 | workspace: Optional[str] = None, 24 | lakehouse: Optional[str] = None, 25 | warehouse: Optional[str] = None, 26 | ctx: Context = None, 27 | ) -> str: 28 | """Load data from a URL into a table in a warehouse or lakehouse. 29 | 30 | Args: 31 | url: The URL to download data from (CSV or Parquet supported). 32 | destination_table: The name of the table to load data into. 33 | workspace: Name or ID of the workspace (optional). 34 | lakehouse: Name or ID of the lakehouse (optional). 35 | warehouse: Name or ID of the warehouse (optional). 36 | ctx: Context object containing client information. 37 | Returns: 38 | A string confirming the data load or an error message. 39 | """ 40 | try: 41 | # Download the file 42 | response = requests.get(url) 43 | if response.status_code != 200: 44 | return f"Failed to download file from URL: {url}" 45 | file_ext = url.split("?")[0].split(".")[-1].lower() 46 | if file_ext not in ("csv", "parquet"): 47 | return f"Unsupported file type: {file_ext}. Only CSV and Parquet are supported." 48 | with tempfile.NamedTemporaryFile( 49 | delete=False, suffix=f".{file_ext}" 50 | ) as tmp_file: 51 | tmp_file.write(response.content) 52 | tmp_path = tmp_file.name 53 | # Choose destination: lakehouse or warehouse 54 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 55 | resource_id = None 56 | resource_type = None 57 | if lakehouse: 58 | client = LakehouseClient(FabricApiClient(credential)) 59 | resource_id = lakehouse 60 | resource_type = "lakehouse" 61 | elif warehouse: 62 | client = WarehouseClient(FabricApiClient(credential)) 63 | resource_id = warehouse 64 | resource_type = "warehouse" 65 | else: 66 | return "Either lakehouse or warehouse must be specified." 67 | # Here you would call the appropriate method to upload/ingest the file into the table. 68 | # This is a placeholder for the actual implementation, which depends on the client API. 69 | # For now, just return a success message with file info. 70 | os.remove(tmp_path) 71 | return f"Data from {url} loaded into table '{destination_table}' in {resource_type} '{resource_id}'. (File type: {file_ext})" 72 | except Exception as e: 73 | return f"Error loading data: {str(e)}" 74 | 75 | 76 | # @mcp.resource( 77 | # uri="tables://{table_name}", 78 | # ) 79 | -------------------------------------------------------------------------------- /helpers/clients/table_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | from helpers.utils.table_tools import get_delta_schemas 4 | from azure.identity import DefaultAzureCredential 5 | from helpers.formatters.schema_formatter import format_schema_to_markdown 6 | from datetime import datetime 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | class TableClient: 12 | def __init__(self, client: FabricApiClient): 13 | self.client = client 14 | 15 | async def list_tables( 16 | self, workspace_id: str, rsc_id: str, rsc_type: str = "lakehouse" 17 | ): 18 | """List all tables in a lakehouse.""" 19 | tables = await self.client.get_tables(workspace_id, rsc_id, rsc_type) 20 | 21 | if not tables: 22 | return f"No tables found in {rsc_type} '{rsc_id}'." 
23 | 24 | return tables 25 | 26 | async def get_table_schema( 27 | self, 28 | workspace: str, 29 | rsc_id: str, 30 | rsc_type: str, 31 | table_name: str, 32 | credential: DefaultAzureCredential, 33 | ): 34 | """Retrieve schema for a specific table.""" 35 | 36 | tables = await self.list_tables(workspace, rsc_id, rsc_type) 37 | 38 | # Find the specific table 39 | matching_tables = [t for t in tables if t["name"].lower() == table_name.lower()] 40 | 41 | if not matching_tables: 42 | return f"No table found with name '{table_name}' in {rsc_type} '{rsc_id}'." 43 | 44 | table = matching_tables[0] 45 | 46 | # Check that it is a Delta table 47 | if table["format"].lower() != "delta": 48 | return f"The table '{table_name}' is not a Delta table (format: {table['format']})." 49 | 50 | # Get schema 51 | delta_tables = await get_delta_schemas([table], credential) 52 | 53 | if not delta_tables: 54 | return f"Could not retrieve schema for table '{table['name']}'." 55 | 56 | # Format result as markdown 57 | table_info, schema, metadata = delta_tables[0] 58 | markdown = format_schema_to_markdown(table_info, schema, metadata) 59 | 60 | return markdown 61 | 62 | async def get_all_schemas( 63 | self, 64 | workspace: str, 65 | rsc_id: str, 66 | rsc_type: str, 67 | credential: DefaultAzureCredential, 68 | ): 69 | """Get schemas for all Delta tables in a Fabric lakehouse.""" 70 | # Get all tables 71 | tables = await self.list_tables(workspace, rsc_id, rsc_type) 72 | 73 | if isinstance(tables, str): 74 | return tables 75 | 76 | if not tables: 77 | return f"No tables found in {rsc_type} '{rsc_id}'." 78 | 79 | # Filter to only Delta tables 80 | delta_format_tables = [t for t in tables if t["format"].lower() == "delta"] 81 | 82 | if not delta_format_tables: 83 | return f"No Delta tables found in {rsc_type} '{rsc_id}'." 84 | 85 | # Get schema for all tables 86 | delta_tables = await get_delta_schemas(delta_format_tables, credential) 87 | 88 | if not delta_tables: 89 | return "Could not retrieve schemas for any tables." 
90 | 91 | # Format the result as markdown 92 | markdown = "# Delta Table Schemas\n\n" 93 | markdown += f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" 94 | markdown += f"Workspace: {workspace}\n" 95 | markdown += f"Lakehouse: {rsc_id}\n\n" 96 | 97 | for table_info, schema, metadata in delta_tables: 98 | markdown += format_schema_to_markdown(table_info, schema, metadata) 99 | 100 | return markdown 101 | -------------------------------------------------------------------------------- /test_notebook_creation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test script to validate the notebook creation fixes 4 | """ 5 | import asyncio 6 | import sys 7 | import json 8 | from helpers.clients.fabric_client import FabricApiClient 9 | from helpers.clients.notebook_client import NotebookClient 10 | from helpers.utils.authentication import get_azure_credentials 11 | from helpers.logging_config import get_logger 12 | 13 | logger = get_logger(__name__) 14 | 15 | async def test_notebook_creation(): 16 | """Test notebook creation with improved error handling""" 17 | try: 18 | # Initialize clients 19 | credentials = get_azure_credentials("test-client-id", {}) 20 | fabric_client = FabricApiClient(credentials) 21 | notebook_client = NotebookClient(fabric_client) 22 | 23 | # Test workspace - using "My workspace" 24 | workspace_id = "645f0acc-fd1e-42fe-ae6e-e919b6c63322" 25 | notebook_name = "Test Debug Notebook" 26 | 27 | # Create a simple notebook content 28 | notebook_json = { 29 | "nbformat": 4, 30 | "nbformat_minor": 5, 31 | "cells": [ 32 | { 33 | "cell_type": "code", 34 | "source": ["print('Hello, Fabric!')\n"], 35 | "execution_count": None, 36 | "outputs": [], 37 | "metadata": {}, 38 | } 39 | ], 40 | "metadata": {"language_info": {"name": "python"}}, 41 | } 42 | notebook_content = json.dumps(notebook_json) 43 | 44 | print(f"Testing notebook creation in workspace: {workspace_id}") 45 | print(f"Notebook name: {notebook_name}") 46 | 47 | # Test the notebook creation 48 | result = await notebook_client.create_notebook( 49 | workspace=workspace_id, 50 | notebook_name=notebook_name, 51 | content=notebook_content 52 | ) 53 | 54 | print(f"Result: {result}") 55 | 56 | if isinstance(result, dict) and result.get("id"): 57 | print(f"✅ SUCCESS: Created notebook with ID: {result['id']}") 58 | return True 59 | else: 60 | print(f"❌ FAILED: {result}") 61 | return False 62 | 63 | except Exception as e: 64 | print(f"❌ ERROR: {str(e)}") 65 | logger.error(f"Test failed: {str(e)}", exc_info=True) 66 | return False 67 | 68 | async def test_workspace_resolution(): 69 | """Test workspace name resolution""" 70 | try: 71 | credentials = get_azure_credentials("test-client-id", {}) 72 | fabric_client = FabricApiClient(credentials) 73 | 74 | # Test workspace resolution 75 | workspace_name, workspace_id = await fabric_client.resolve_workspace_name_and_id("My workspace") 76 | print(f"✅ Workspace resolution: '{workspace_name}' -> {workspace_id}") 77 | return True 78 | 79 | except Exception as e: 80 | print(f"❌ Workspace resolution failed: {str(e)}") 81 | return False 82 | 83 | if __name__ == "__main__": 84 | print("=" * 50) 85 | print("Testing Fabric MCP Notebook Creation Fixes") 86 | print("=" * 50) 87 | 88 | # Test workspace resolution first 89 | print("\n1. Testing workspace resolution...") 90 | success1 = asyncio.run(test_workspace_resolution()) 91 | 92 | # Test notebook creation 93 | print("\n2. 
Testing notebook creation...") 94 | success2 = asyncio.run(test_notebook_creation()) 95 | 96 | print("\n" + "=" * 50) 97 | if success1 and success2: 98 | print("✅ ALL TESTS PASSED") 99 | sys.exit(0) 100 | else: 101 | print("❌ SOME TESTS FAILED") 102 | sys.exit(1) 103 | -------------------------------------------------------------------------------- /helpers/clients/semanticModel_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | 4 | logger = get_logger(__name__) 5 | 6 | 7 | class SemanticModelClient: 8 | def __init__(self, client: FabricApiClient): 9 | self.client = client 10 | 11 | async def list_semantic_models(self, workspace_id: str): 12 | """List all semantic models in a workspace.""" 13 | models = await self.client.get_semantic_models(workspace_id) 14 | 15 | if not models: 16 | return f"No semantic models found in workspace '{workspace_id}'." 17 | 18 | return models 19 | 20 | async def get_semantic_model(self, workspace_id: str, model_id: str): 21 | """Get a specific semantic model by ID.""" 22 | model = await self.client.get_semantic_model(workspace_id, model_id) 23 | 24 | if not model: 25 | return f"No semantic model found with ID '{model_id}' in workspace '{workspace_id}'." 26 | 27 | return model 28 | 29 | # async def get_model_schema( 30 | # self, 31 | # workspace: str, 32 | # rsc_id: str, 33 | # rsc_type: str, 34 | # table_name: str, 35 | # credential: DefaultAzureCredential, 36 | # ): 37 | # """Retrieve schema for a specific model.""" 38 | 39 | # models = await self.list_semantic_models(workspace) 40 | 41 | # # Find the specific table 42 | # matching_tables = [t for t in tables if t["name"].lower() == table_name.lower()] 43 | 44 | # if not matching_tables: 45 | # return f"No table found with name '{table_name}' in {rsc_type} '{rsc_id}'." 46 | 47 | # table = matching_tables[0] 48 | 49 | # # Check that it is a Delta table 50 | # if table["format"].lower() != "delta": 51 | # return f"The table '{table_name}' is not a Delta table (format: {table['format']})." 52 | 53 | # # Get schema 54 | # delta_tables = await get_delta_schemas([table], credential) 55 | 56 | # if not delta_tables: 57 | # return f"Could not retrieve schema for table '{table}'." 58 | 59 | # # Format result as markdown 60 | # table_info, schema, metadata = delta_tables[0] 61 | # markdown = format_schema_to_markdown(table_info, schema, metadata) 62 | 63 | # return markdown 64 | 65 | # async def get_all_schemas( 66 | # self, 67 | # workspace: str, 68 | # rsc_id: str, 69 | # rsc_type: str, 70 | # credential: DefaultAzureCredential, 71 | # ): 72 | # """Get schemas for all Delta tables in a Fabric lakehouse.""" 73 | # # Get all tables 74 | # tables = await self.list_tables(workspace, rsc_id, rsc_type) 75 | 76 | # if isinstance(tables, str): 77 | # return tables 78 | 79 | # if not tables: 80 | # return f"No tables found in {rsc_type} '{rsc_id}'." 81 | 82 | # # Filter to only Delta tables 83 | # delta_format_tables = [t for t in tables if t["format"].lower() == "delta"] 84 | 85 | # if not delta_format_tables: 86 | # return f"No Delta tables found in {rsc_type} '{rsc_id}'." 87 | 88 | # # Get schema for all tables 89 | # delta_tables = await get_delta_schemas(delta_format_tables, credential) 90 | 91 | # logger.debug(f"Delta Tables response: {tables}") 92 | # if not delta_tables: 93 | # return "Could not retrieve schemas for any tables." 
94 | 95 | # # Format the result as markdown 96 | # markdown = "# Delta Table Schemas\n\n" 97 | # markdown += f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" 98 | # markdown += f"Workspace: {workspace}\n" 99 | # markdown += f"Lakehouse: {rsc_id}\n\n" 100 | 101 | # for table_info, schema, metadata in delta_tables: 102 | # markdown += format_schema_to_markdown(table_info, schema, metadata) 103 | 104 | # return markdown 105 | -------------------------------------------------------------------------------- /tools/lakehouse.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | LakehouseClient, 7 | ) 8 | from helpers.logging_config import get_logger 9 | 10 | # import sempy_labs as labs 11 | # import sempy_labs.lakehouse as slh 12 | 13 | from typing import Optional 14 | 15 | logger = get_logger(__name__) 16 | 17 | 18 | @mcp.tool() 19 | async def set_lakehouse(lakehouse: str, ctx: Context) -> str: 20 | """Set the current lakehouse for the session. 21 | 22 | Args: 23 | lakehouse: Name or ID of the lakehouse 24 | ctx: Context object containing client information 25 | 26 | Returns: 27 | A string confirming the lakehouse has been set. 28 | """ 29 | __ctx_cache[f"{ctx.client_id}_lakehouse"] = lakehouse 30 | return f"Lakehouse set to '{lakehouse}'." 31 | 32 | 33 | @mcp.tool() 34 | async def list_lakehouses(workspace: Optional[str] = None, ctx: Context = None) -> str: 35 | """List all lakehouses in a Fabric workspace. 36 | 37 | Args: 38 | workspace: Name or ID of the workspace (optional) 39 | ctx: Context object containing client information 40 | 41 | Returns: 42 | A string containing the list of lakehouses or an error message. 43 | """ 44 | try: 45 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 46 | fabric_client = FabricApiClient(credential=credential) 47 | lakehouse_client = LakehouseClient(client=fabric_client) 48 | ws = workspace or __ctx_cache.get(f"{ctx.client_id}_workspace") 49 | if not ws: 50 | return "Workspace not set. Please set a workspace using the 'set_workspace' command." 51 | return await lakehouse_client.list_lakehouses(workspace=ws) 52 | except Exception as e: 53 | logger.error(f"Error listing lakehouses: {e}") 54 | return f"Error listing lakehouses: {e}" 55 | 56 | 57 | # @mcp.tool() 58 | # async def list_lakehouses_semantic_link(workspace: Optional[str] = None, ctx: Context = None) -> str: 59 | # """List all lakehouses in a Fabric workspace using semantic-link-labs.""" 60 | # try: 61 | # manager = LakehouseManager() 62 | # lakehouses = manager.list_lakehouses(workspace_id=workspace or __ctx_cache.get(f"{ctx.client_id}_workspace")) 63 | # markdown = f"# Lakehouses (semantic-link-labs) in workspace '{workspace}'\n\n" 64 | # markdown += "| ID | Name |\n" 65 | # markdown += "|-----|------|\n" 66 | # for lh in lakehouses: 67 | # markdown += f"| {lh.get('id', 'N/A')} | {lh.get('displayName', 'N/A')} |\n" 68 | # return markdown 69 | # except Exception as e: 70 | # return f"Error listing lakehouses with semantic-link-labs: {str(e)}" 71 | 72 | 73 | @mcp.tool() 74 | async def create_lakehouse( 75 | name: str, 76 | workspace: Optional[str] = None, 77 | description: Optional[str] = None, 78 | ctx: Context = None, 79 | ) -> str: 80 | """Create a new lakehouse in a Fabric workspace. 
81 | 82 | Args: 83 | name: Name of the lakehouse 84 | workspace: Name or ID of the workspace (optional) 85 | description: Description of the lakehouse (optional) 86 | ctx: Context object containing client information 87 | Returns: 88 | A string confirming the lakehouse has been created or an error message. 89 | """ 90 | try: 91 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 92 | fabric_client = FabricApiClient(credential=credential) 93 | lakehouse_client = LakehouseClient(client=fabric_client) 94 | ws = workspace or __ctx_cache.get(f"{ctx.client_id}_workspace") 95 | if not ws: 96 | return "Workspace not set. Please set a workspace using the 'set_workspace' command." 97 | return await lakehouse_client.create_lakehouse( 98 | name=name, workspace=ws, description=description 99 | ) 100 | except Exception as e: 101 | logger.error(f"Error creating lakehouse: {e}") 102 | return f"Error creating lakehouse: {e}" 103 | -------------------------------------------------------------------------------- /helpers/clients/sql_client.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from sqlalchemy import create_engine, Engine 3 | from itertools import chain, repeat 4 | import urllib 5 | import struct 6 | from typing import Optional 7 | from azure.identity import DefaultAzureCredential 8 | from helpers.clients import FabricApiClient, LakehouseClient, WarehouseClient 9 | 10 | 11 | # prepare connection string 12 | sql_endpoint = "lkxke5qat5vu7fpnluz5o7cnme-qlbrb7caj77uthvfhqdxwd5v54.datawarehouse.fabric.microsoft.com" 13 | database = "EDR_WAREHOUSE" 14 | DRIVER = "{{ODBC Driver 18 for SQL Server}}" 15 | 16 | 17 | def get_sqlalchemy_connection_string(driver: str, server: str, database: str) -> Engine: 18 | """ 19 | Constructs a SQLAlchemy connection string based on the provided parameters. 20 | 21 | Args: 22 | driver (str): The database driver (e.g., 'mssql+pyodbc'). 23 | server (str): The server address. 24 | database (str): The database name. 25 | 26 | Returns: 27 | Engine: A SQLAlchemy engine object. 28 | """ 29 | connection_string = f"Driver={{ODBC Driver 18 for SQL Server}};Server={sql_endpoint},1433;Database={database};Encrypt=Yes;TrustServerCertificate=No" 30 | params = urllib.parse.quote(connection_string) 31 | # authentication 32 | resource_url = "https://database.windows.net/.default" 33 | azure_credentials = DefaultAzureCredential() 34 | token_object = azure_credentials.get_token(resource_url) 35 | # Retrieve an access token 36 | token_as_bytes = bytes( 37 | token_object.token, "UTF-8" 38 | ) # Convert the token to a UTF-8 byte string 39 | encoded_bytes = bytes( 40 | chain.from_iterable(zip(token_as_bytes, repeat(0))) 41 | ) # Encode the bytes to a Windows byte string 42 | token_bytes = ( 43 | struct.pack(" tuple: 63 | """ 64 | Retrieve the SQL endpoint for a specified lakehouse or warehouse. 65 | 66 | Args: 67 | lakehouse: Name or ID of the lakehouse (optional). 68 | warehouse: Name or ID of the warehouse (optional). 69 | type: Type of resource ('lakehouse' or 'warehouse'). 70 | workspace: Name or ID of the workspace (optional). 71 | Returns: 72 | A tuple (database, sql_endpoint) or (None, error_message) in case of error. 
73 | """ 74 | try: 75 | credential = DefaultAzureCredential() 76 | fabClient = FabricApiClient(credential) 77 | resource_name = None 78 | endpoint = None 79 | workspace_name, workspace_id = await fabClient.resolve_workspace_name_and_id( 80 | workspace 81 | ) 82 | if type and type.lower() == "lakehouse": 83 | client = LakehouseClient(fabClient) 84 | resource_name, resource_id = await fabClient.resolve_item_name_and_id( 85 | workspace=workspace_id, item=lakehouse, type="Lakehouse" 86 | ) 87 | lakehouse_obj = await client.get_lakehouse( 88 | workspace=workspace, lakehouse=resource_id 89 | ) 90 | endpoint = ( 91 | lakehouse_obj.get("properties", {}) 92 | .get("sqlEndpointProperties", {}) 93 | .get("connectionString") 94 | ) 95 | elif type and type.lower() == "warehouse": 96 | client = WarehouseClient(fabClient) 97 | resource_name, resource_id = await fabClient.resolve_item_name_and_id( 98 | workspace=workspace_id, item=warehouse, type="Warehouse" 99 | ) 100 | warehouse_obj = await client.get_warehouse( 101 | workspace=workspace, warehouse=resource_id 102 | ) 103 | endpoint = warehouse_obj.get("properties", {}).get("connectionString") 104 | if resource_name and endpoint: 105 | return resource_name, endpoint 106 | else: 107 | return ( 108 | None, 109 | f"No SQL endpoint found for {type} '{lakehouse or warehouse}' in workspace '{workspace}'.", 110 | ) 111 | except Exception as e: 112 | return None, f"Error retrieving SQL endpoint: {str(e)}" 113 | 114 | 115 | class SQLClient: 116 | def __init__(self, sql_endpoint: str, database: str): 117 | self.engine = get_sqlalchemy_connection_string(DRIVER, sql_endpoint, database) 118 | 119 | def run_query(self, query: str) -> pl.DataFrame: 120 | return pl.read_database(query, connection=self.engine) 121 | 122 | def load_data(self, df: pl.DataFrame, table_name: str, if_exists: str = "append"): 123 | pdf = df.to_pandas() 124 | pdf.to_sql(table_name, con=self.engine, if_exists=if_exists, index=False) 125 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Microsoft Fabric MCP Architecture with LLM Integration 2 | 3 | ## Complete Architecture Diagram 4 | 5 | ```mermaid 6 | graph TB 7 | subgraph "Developer Environment" 8 | IDE[IDE/VSCode] 9 | DEV[Developer] 10 | PROJ[Project Files] 11 | end 12 | 13 | subgraph "AI Layer" 14 | LLM[Large Language Model
Claude/GPT/etc.] 15 | CONTEXT[Conversation Context] 16 | REASONING[AI Reasoning Engine] 17 | end 18 | 19 | subgraph "MCP Layer" 20 | MCP[MCP Server] 21 | TOOLS[PySpark Tools] 22 | HELPERS[PySpark Helpers] 23 | TEMPLATES[Template Manager] 24 | VALIDATORS[Code Validators] 25 | GENERATORS[Code Generators] 26 | end 27 | 28 | subgraph "Microsoft Fabric" 29 | API[Fabric API] 30 | WS[Workspace] 31 | LH[Lakehouse] 32 | NB[Notebooks] 33 | TABLES[Delta Tables] 34 | SPARK[Spark Clusters] 35 | end 36 | 37 | subgraph "Operations Flow" 38 | CREATE[Create Notebooks] 39 | VALIDATE[Validate Code] 40 | GENERATE[Generate Code] 41 | ANALYZE[Analyze Performance] 42 | DEPLOY[Deploy to Fabric] 43 | end 44 | 45 | %% Developer interactions 46 | DEV --> IDE 47 | IDE --> PROJ 48 | 49 | %% LLM interactions 50 | IDE <--> LLM 51 | LLM <--> CONTEXT 52 | LLM --> REASONING 53 | 54 | %% MCP interactions 55 | LLM <--> MCP 56 | MCP --> TOOLS 57 | TOOLS --> HELPERS 58 | TOOLS --> TEMPLATES 59 | TOOLS --> VALIDATORS 60 | TOOLS --> GENERATORS 61 | 62 | %% Fabric interactions 63 | MCP <--> API 64 | API --> WS 65 | WS --> LH 66 | WS --> NB 67 | LH --> TABLES 68 | NB --> SPARK 69 | 70 | %% Operation flows 71 | TOOLS --> CREATE 72 | TOOLS --> VALIDATE 73 | TOOLS --> GENERATE 74 | TOOLS --> ANALYZE 75 | CREATE --> DEPLOY 76 | 77 | %% Data flow arrows 78 | REASONING -.->|"Intelligent Decisions"| TOOLS 79 | CONTEXT -.->|"Project Awareness"| VALIDATORS 80 | 81 | %% Styling 82 | classDef devEnv fill:#e1f5fe 83 | classDef aiLayer fill:#fff9c4 84 | classDef mcpLayer fill:#f3e5f5 85 | classDef fabricLayer fill:#e8f5e8 86 | classDef operations fill:#fff3e0 87 | 88 | class IDE,DEV,PROJ devEnv 89 | class LLM,CONTEXT,REASONING aiLayer 90 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer 91 | class API,WS,LH,NB,TABLES,SPARK fabricLayer 92 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations 93 | ``` 94 | 95 | ## Architecture Components 96 | 97 | ### **1. Developer Environment** 98 | - **IDE/VSCode**: Primary development interface with MCP integration 99 | - **Developer**: Data engineer/scientist working on PySpark projects 100 | - **Project Files**: Local project structure and configuration 101 | 102 | ### **2. AI Layer** 103 | - **Large Language Model**: Claude, GPT, or other LLM providing intelligent assistance 104 | - **Conversation Context**: Maintains project context and conversation history 105 | - **AI Reasoning Engine**: Makes intelligent decisions about code generation and optimization 106 | 107 | ### **3. MCP Layer (This Server)** 108 | - **MCP Server**: Core server handling tool requests from the LLM 109 | - **PySpark Tools**: 11 specialized tools for notebook operations 110 | - **PySpark Helpers**: Template management and code generation 111 | - **Template Manager**: Pre-built notebook templates for different scenarios 112 | - **Code Validators**: Syntax, best practices, and Fabric compatibility checks 113 | - **Code Generators**: Intelligent PySpark code generation 114 | 115 | ### **4. Microsoft Fabric** 116 | - **Fabric API**: REST API for all Fabric operations 117 | - **Workspace**: Fabric workspace containing resources 118 | - **Lakehouse**: Data storage with Delta Lake tables 119 | - **Notebooks**: PySpark notebooks for data processing 120 | - **Delta Tables**: Structured data storage 121 | - **Spark Clusters**: Compute resources for PySpark execution 122 | 123 | ### **5. 
Operations Flow** 124 | - **Create Notebooks**: Generate notebooks from templates 125 | - **Validate Code**: Check syntax, performance, and compatibility 126 | - **Generate Code**: Create PySpark snippets for common operations 127 | - **Analyze Performance**: Evaluate and optimize notebook performance 128 | - **Deploy to Fabric**: Push notebooks and execute in Fabric environment 129 | 130 | ## Enhanced Interaction Flow with LLM 131 | 132 | 1. **Developer requests PySpark assistance in IDE** 133 | 2. **IDE communicates with LLM (Claude/GPT)** 134 | 3. **LLM analyzes request using conversation context and reasoning** 135 | 4. **LLM calls MCP server tools based on intelligent analysis** 136 | 5. **MCP server processes request using specialized tools** 137 | 6. **Tools utilize helpers, templates, and validators** 138 | 7. **MCP server calls Fabric API for operations** 139 | 8. **Results flow back through MCP to LLM** 140 | 9. **LLM processes and formats results intelligently** 141 | 10. **Developer receives contextual, intelligent responses in IDE** 142 | 143 | ## Key Benefits of LLM Integration 144 | 145 | ### **Intelligent Decision Making** 146 | - LLM analyzes developer intent and context 147 | - Chooses appropriate tools and templates automatically 148 | - Provides contextual recommendations based on project history 149 | 150 | ### **Natural Language Interface** 151 | - Developers can request features in natural language 152 | - LLM translates requests to appropriate MCP tool calls 153 | - Reduces need to remember specific tool names and parameters 154 | 155 | ### **Context Awareness** 156 | - LLM maintains conversation history and project context 157 | - Provides consistent recommendations across sessions 158 | - Learns from previous interactions and code patterns 159 | 160 | ### **Enhanced Code Generation** 161 | - LLM combines multiple tool outputs intelligently 162 | - Provides explanations and documentation with generated code 163 | - Adapts to developer's coding style and preferences 164 | 165 | ## Example LLM-Enhanced Workflows 166 | 167 | ### **Scenario 1: Natural Language Request** 168 | ``` 169 | Developer: "Help me create a PySpark notebook that reads sales data from our lakehouse, 170 | cleans it, and creates a summary table with performance optimization." 171 | 172 | LLM Process: 173 | 1. Analyzes intent: notebook creation + data processing + optimization 174 | 2. Calls create_fabric_notebook() with ETL template 175 | 3. Calls generate_fabric_code() for lakehouse reading 176 | 4. Calls validate_fabric_code() for optimization checks 177 | 5. Provides complete solution with explanations 178 | ``` 179 | 180 | ### **Scenario 2: Performance Optimization** 181 | ``` 182 | Developer: "My PySpark notebook is running slowly. Can you help optimize it?" 183 | 184 | LLM Process: 185 | 1. Calls analyze_notebook_performance() on current notebook 186 | 2. Calls validate_fabric_code() for anti-pattern detection 187 | 3. Calls generate_fabric_code() for optimized alternatives 188 | 4. Provides detailed optimization report with before/after comparisons 189 | ``` 190 | 191 | ### **Scenario 3: Best Practices Guidance** 192 | ``` 193 | Developer: "Is this PySpark code following Fabric best practices?" 194 | 195 | LLM Process: 196 | 1. Calls validate_fabric_code() for compatibility checks 197 | 2. Analyzes results with reasoning engine 198 | 3. Provides detailed feedback with specific recommendations 199 | 4. 
Suggests alternative approaches using generate_fabric_code() 200 | ``` 201 | 202 | This architecture leverages the power of LLMs to provide intelligent, context-aware assistance while utilizing specialized MCP tools for precise Fabric operations! 203 | -------------------------------------------------------------------------------- /tools/table.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | TableClient, 7 | SQLClient, 8 | get_sql_endpoint, 9 | ) 10 | 11 | from typing import Optional 12 | from helpers.logging_config import get_logger 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | @mcp.tool() 18 | async def set_table(table_name: str, ctx: Context) -> str: 19 | """Set the current table for the session. 20 | 21 | Args: 22 | table_name: Name of the table to set 23 | ctx: Context object containing client information 24 | 25 | Returns: 26 | A string confirming the table has been set. 27 | """ 28 | __ctx_cache[f"{ctx.client_id}_table"] = table_name 29 | return f"Table set to '{table_name}'." 30 | 31 | 32 | @mcp.tool() 33 | async def list_tables( 34 | workspace: Optional[str] = None, 35 | lakehouse: Optional[str] = None, 36 | ctx: Context = None, 37 | ) -> str: 38 | """List all tables in a Fabric workspace. 39 | 40 | Args: 41 | workspace: Name or ID of the workspace (optional) 42 | lakehouse: Name or ID of the lakehouse (optional) 43 | ctx: Context object containing client information 44 | 45 | Returns: 46 | A string containing the list of tables or an error message. 47 | """ 48 | try: 49 | client = TableClient( 50 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 51 | ) 52 | 53 | tables = await client.list_tables( 54 | workspace_id=workspace 55 | if workspace 56 | else __ctx_cache[f"{ctx.client_id}_workspace"], 57 | rsc_id=lakehouse 58 | if lakehouse 59 | else __ctx_cache[f"{ctx.client_id}_lakehouse"], 60 | ) 61 | 62 | return tables 63 | 64 | except Exception as e: 65 | return f"Error listing tables: {str(e)}" 66 | 67 | 68 | @mcp.tool() 69 | async def get_lakehouse_table_schema( 70 | workspace: Optional[str], 71 | lakehouse: Optional[str], 72 | table_name: str = None, 73 | ctx: Context = None, 74 | ) -> str: 75 | """Get schema for a specific table in a Fabric lakehouse. 76 | 77 | Args: 78 | workspace: Name or ID of the workspace 79 | lakehouse: Name or ID of the lakehouse 80 | table_name: Name of the table to retrieve 81 | ctx: Context object containing client information 82 | 83 | Returns: 84 | A string containing the schema of the specified table or an error message. 85 | """ 86 | try: 87 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 88 | client = TableClient(FabricApiClient(credential)) 89 | 90 | if table_name is None: 91 | return "Table name must be specified." 92 | if lakehouse is None: 93 | if f"{ctx.client_id}_lakehouse" in __ctx_cache: 94 | lakehouse = __ctx_cache[f"{ctx.client_id}_lakehouse"] 95 | else: 96 | return "Lakehouse must be specified or set in the context." 97 | 98 | if workspace is None: 99 | if f"{ctx.client_id}_workspace" in __ctx_cache: 100 | workspace = __ctx_cache[f"{ctx.client_id}_workspace"] 101 | else: 102 | return "Workspace must be specified or set in the context." 
103 | 104 | schema = await client.get_table_schema( 105 | workspace, lakehouse, "lakehouse", table_name, credential 106 | ) 107 | 108 | return schema 109 | 110 | except Exception as e: 111 | return f"Error retrieving table schema: {str(e)}" 112 | 113 | 114 | @mcp.tool() 115 | async def get_all_lakehouse_schemas( 116 | lakehouse: Optional[str], workspace: Optional[str] = None, ctx: Context = None 117 | ) -> str: 118 | """Get schemas for all Delta tables in a Fabric lakehouse. 119 | 120 | Args: 121 | workspace: Name or ID of the workspace 122 | lakehouse: Name or ID of the lakehouse 123 | ctx: Context object containing client information 124 | 125 | Returns: 126 | A string containing the schemas of all Delta tables or an error message. 127 | """ 128 | try: 129 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 130 | client = TableClient(FabricApiClient(credential)) 131 | 132 | if workspace is None: 133 | if f"{ctx.client_id}_workspace" in __ctx_cache: 134 | workspace = __ctx_cache[f"{ctx.client_id}_workspace"] 135 | else: 136 | return "Workspace must be specified or set in the context." 137 | if lakehouse is None: 138 | if f"{ctx.client_id}_lakehouse" in __ctx_cache: 139 | lakehouse = __ctx_cache[f"{ctx.client_id}_lakehouse"] 140 | else: 141 | return "Lakehouse must be specified or set in the context." 142 | schemas = await client.get_all_schemas( 143 | workspace, lakehouse, "lakehouse", credential 144 | ) 145 | 146 | return schemas 147 | 148 | except Exception as e: 149 | return f"Error retrieving table schemas: {str(e)}" 150 | 151 | 152 | @mcp.tool() 153 | async def run_query( 154 | workspace: Optional[str] = None, 155 | lakehouse: Optional[str] = None, 156 | warehouse: Optional[str] = None, 157 | query: str = None, 158 | type: Optional[str] = None, # Add type hint for 'type' 159 | ctx: Context = None, 160 | ) -> str: 161 | """Read data from a table in a warehouse or lakehouse. 162 | 163 | Args: 164 | workspace: Name or ID of the workspace (optional). 165 | lakehouse: Name or ID of the lakehouse (optional). 166 | warehouse: Name or ID of the warehouse (optional). 167 | query: The SQL query to execute. 168 | type: Type of resource ('lakehouse' or 'warehouse'). If not provided, it will be inferred. 169 | ctx: Context object containing client information. 170 | Returns: 171 | A string confirming the data read or an error message. 172 | """ 173 | try: 174 | if ctx is None: 175 | raise ValueError("Context (ctx) must be provided.") 176 | if query is None: 177 | raise ValueError("Query must be specified.") 178 | # Always resolve the SQL endpoint and database name 179 | database, sql_endpoint = await get_sql_endpoint( 180 | workspace=workspace, 181 | lakehouse=lakehouse, 182 | warehouse=warehouse, 183 | type=type, 184 | ) 185 | if ( 186 | not database 187 | or not sql_endpoint 188 | or sql_endpoint.startswith("Error") 189 | or sql_endpoint.startswith("No SQL endpoint") 190 | ): 191 | return f"Failed to resolve SQL endpoint: {sql_endpoint}" 192 | logger.info(f"Running query '{query}' on SQL endpoint {sql_endpoint}") 193 | client = SQLClient(sql_endpoint=sql_endpoint, database=database) 194 | df = client.run_query(query) 195 | if df.is_empty(): 196 | return f"No data found for query '{query}'." 
197 | 198 | # Convert to markdown for user-friendly display 199 | 200 | # markdown = f"### Query: {query} (shape: {df.shape})\n\n" 201 | # with pl.Config() as cfg: 202 | # cfg.set_tbl_formatting('ASCII_MARKDOWN') 203 | # display(Markdown(repr(df))) 204 | # markdown += f"\n\n### Data Preview:\n\n" 205 | # markdown += df.head(10).to_pandas().to_markdown(index=False) 206 | # markdown += f"\n\nColumns: {', '.join(df.columns)}" 207 | return df.to_dict() # Return the DataFrame as a dictionary for easier handling 208 | except Exception as e: 209 | logger.error(f"Error reading data: {str(e)}") 210 | return f"Error reading data: {str(e)}" 211 | -------------------------------------------------------------------------------- /test_security.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test script for the secure MCP server. 3 | Validates authentication, authorization, and security features. 4 | """ 5 | 6 | import requests 7 | import json 8 | import time 9 | import subprocess 10 | import threading 11 | from typing import Optional 12 | 13 | class SecurityTester: 14 | """Test suite for MCP server security.""" 15 | 16 | def __init__(self, base_url: str = "http://localhost:8081"): 17 | self.base_url = base_url.rstrip('/') 18 | self.session = requests.Session() 19 | self.session.verify = False # For self-signed certificates 20 | 21 | def test_health_check(self) -> bool: 22 | """Test health check endpoint.""" 23 | try: 24 | response = self.session.get(f"{self.base_url}/health", timeout=5) 25 | response.raise_for_status() 26 | health_data = response.json() 27 | print(f"✅ Health check passed: {health_data}") 28 | return True 29 | except Exception as e: 30 | print(f"❌ Health check failed: {e}") 31 | return False 32 | 33 | def test_unauthenticated_access(self) -> bool: 34 | """Test that unauthenticated requests are rejected.""" 35 | try: 36 | mcp_request = { 37 | "jsonrpc": "2.0", 38 | "id": 1, 39 | "method": "tools/list" 40 | } 41 | 42 | response = self.session.post( 43 | f"{self.base_url}/mcp", 44 | json=mcp_request, 45 | timeout=5 46 | ) 47 | 48 | if response.status_code == 401: 49 | print("✅ Unauthenticated access properly rejected") 50 | return True 51 | else: 52 | print(f"❌ Unauthenticated access allowed (status: {response.status_code})") 53 | return False 54 | 55 | except Exception as e: 56 | print(f"❌ Error testing unauthenticated access: {e}") 57 | return False 58 | 59 | def test_authentication(self, username: str = "admin", password: str = "changeme") -> Optional[str]: 60 | """Test username/password authentication.""" 61 | try: 62 | response = self.session.post( 63 | f"{self.base_url}/auth/login", 64 | json={"username": username, "password": password}, 65 | timeout=5 66 | ) 67 | 68 | if response.status_code == 200: 69 | token_data = response.json() 70 | token = token_data.get("access_token") 71 | print(f"✅ Authentication successful: {username}") 72 | return token 73 | else: 74 | print(f"❌ Authentication failed: {response.status_code} - {response.text}") 75 | return None 76 | 77 | except Exception as e: 78 | print(f"❌ Authentication error: {e}") 79 | return None 80 | 81 | def test_authenticated_access(self, token: str) -> bool: 82 | """Test authenticated MCP tool access.""" 83 | try: 84 | headers = {"Authorization": f"Bearer {token}"} 85 | mcp_request = { 86 | "jsonrpc": "2.0", 87 | "id": 1, 88 | "method": "tools/list" 89 | } 90 | 91 | response = self.session.post( 92 | f"{self.base_url}/mcp", 93 | json=mcp_request, 94 | headers=headers, 95 | timeout=5 
96 | ) 97 | 98 | if response.status_code == 200: 99 | result = response.json() 100 | print(f"✅ Authenticated access successful") 101 | if 'result' in result and 'tools' in result['result']: 102 | tools = result['result']['tools'] 103 | print(f" Available tools: {[t['name'] for t in tools]}") 104 | return True 105 | else: 106 | print(f"❌ Authenticated access failed: {response.status_code}") 107 | return False 108 | 109 | except Exception as e: 110 | print(f"❌ Authenticated access error: {e}") 111 | return False 112 | 113 | def test_token_verification(self, token: str) -> bool: 114 | """Test token verification endpoint.""" 115 | try: 116 | headers = {"Authorization": f"Bearer {token}"} 117 | response = self.session.get( 118 | f"{self.base_url}/auth/verify", 119 | headers=headers, 120 | timeout=5 121 | ) 122 | 123 | if response.status_code == 200: 124 | verify_data = response.json() 125 | print(f"✅ Token verification passed: {verify_data}") 126 | return True 127 | else: 128 | print(f"❌ Token verification failed: {response.status_code}") 129 | return False 130 | 131 | except Exception as e: 132 | print(f"❌ Token verification error: {e}") 133 | return False 134 | 135 | def test_invalid_credentials(self) -> bool: 136 | """Test that invalid credentials are rejected.""" 137 | try: 138 | response = self.session.post( 139 | f"{self.base_url}/auth/login", 140 | json={"username": "invalid", "password": "invalid"}, 141 | timeout=5 142 | ) 143 | 144 | if response.status_code == 401: 145 | print("✅ Invalid credentials properly rejected") 146 | return True 147 | else: 148 | print(f"❌ Invalid credentials accepted (status: {response.status_code})") 149 | return False 150 | 151 | except Exception as e: 152 | print(f"❌ Error testing invalid credentials: {e}") 153 | return False 154 | 155 | def test_rate_limiting(self, token: str) -> bool: 156 | """Test rate limiting functionality.""" 157 | try: 158 | headers = {"Authorization": f"Bearer {token}"} 159 | 160 | # Make multiple rapid requests 161 | success_count = 0 162 | rate_limited = False 163 | 164 | for i in range(10): 165 | response = self.session.get( 166 | f"{self.base_url}/health", 167 | headers=headers, 168 | timeout=5 169 | ) 170 | 171 | if response.status_code == 200: 172 | success_count += 1 173 | elif response.status_code == 429: # Too Many Requests 174 | rate_limited = True 175 | break 176 | 177 | time.sleep(0.1) # Small delay between requests 178 | 179 | if success_count > 0: 180 | print(f"✅ Rate limiting configured (processed {success_count} requests)") 181 | if rate_limited: 182 | print(" Rate limit triggered as expected") 183 | return True 184 | else: 185 | print("❌ No requests succeeded") 186 | return False 187 | 188 | except Exception as e: 189 | print(f"❌ Rate limiting test error: {e}") 190 | return False 191 | 192 | def test_security_headers(self) -> bool: 193 | """Test that security headers are present.""" 194 | try: 195 | response = self.session.get(f"{self.base_url}/", timeout=5) 196 | headers = response.headers 197 | 198 | security_headers = { 199 | 'X-Content-Type-Options': 'nosniff', 200 | 'X-Frame-Options': 'DENY', 201 | 'X-XSS-Protection': '1; mode=block', 202 | 'Content-Security-Policy': "default-src 'self'" 203 | } 204 | 205 | missing_headers = [] 206 | for header, expected_value in security_headers.items(): 207 | if header not in headers: 208 | missing_headers.append(header) 209 | elif headers[header] != expected_value: 210 | print(f"⚠️ Security header {header} has unexpected value: {headers[header]}") 211 | 212 | if 
missing_headers: 213 | print(f"❌ Missing security headers: {missing_headers}") 214 | return False 215 | else: 216 | print("✅ All security headers present") 217 | return True 218 | 219 | except Exception as e: 220 | print(f"❌ Security headers test error: {e}") 221 | return False 222 | 223 | def run_all_tests(self) -> bool: 224 | """Run all security tests.""" 225 | print("🔒 Starting MCP Server Security Tests") 226 | print("=" * 50) 227 | 228 | test_results = [] 229 | 230 | # Test health check 231 | test_results.append(self.test_health_check()) 232 | 233 | # Test security headers 234 | test_results.append(self.test_security_headers()) 235 | 236 | # Test unauthenticated access 237 | test_results.append(self.test_unauthenticated_access()) 238 | 239 | # Test invalid credentials 240 | test_results.append(self.test_invalid_credentials()) 241 | 242 | # Test authentication 243 | token = self.test_authentication() 244 | if token: 245 | test_results.append(True) 246 | 247 | # Test authenticated access 248 | test_results.append(self.test_authenticated_access(token)) 249 | 250 | # Test token verification 251 | test_results.append(self.test_token_verification(token)) 252 | 253 | # Test rate limiting 254 | test_results.append(self.test_rate_limiting(token)) 255 | else: 256 | test_results.extend([False, False, False, False]) 257 | 258 | # Results summary 259 | passed = sum(test_results) 260 | total = len(test_results) 261 | 262 | print("\n" + "=" * 50) 263 | print(f"Test Results: {passed}/{total} passed") 264 | 265 | if passed == total: 266 | print("🎉 All security tests passed!") 267 | return True 268 | else: 269 | print("⚠️ Some security tests failed. Please review the output above.") 270 | return False 271 | 272 | def main(): 273 | """Main test runner.""" 274 | import argparse 275 | 276 | parser = argparse.ArgumentParser(description="Test MCP server security") 277 | parser.add_argument("--url", default="http://localhost:8081", help="Server URL") 278 | parser.add_argument("--start-server", action="store_true", help="Start secure server before testing") 279 | parser.add_argument("--server-args", default="", help="Additional server arguments") 280 | args = parser.parse_args() 281 | 282 | server_process = None 283 | 284 | if args.start_server: 285 | print("🚀 Starting secure MCP server...") 286 | server_cmd = f"python secure_fabric_mcp.py {args.server_args}" 287 | server_process = subprocess.Popen( 288 | server_cmd.split(), 289 | stdout=subprocess.PIPE, 290 | stderr=subprocess.PIPE 291 | ) 292 | 293 | # Wait for server to start 294 | print("⏳ Waiting for server to start...") 295 | time.sleep(5) 296 | 297 | try: 298 | # Run tests 299 | tester = SecurityTester(args.url) 300 | success = tester.run_all_tests() 301 | 302 | if success: 303 | print("\n✅ Security validation completed successfully!") 304 | else: 305 | print("\n❌ Security validation failed!") 306 | 307 | finally: 308 | if server_process: 309 | print("\n🛑 Stopping server...") 310 | server_process.terminate() 311 | server_process.wait() 312 | 313 | if __name__ == "__main__": 314 | main() 315 | -------------------------------------------------------------------------------- /docs/pyspark_guide.md: -------------------------------------------------------------------------------- 1 | # PySpark Development Guide for Microsoft Fabric MCP 2 | 3 | This guide explains how to use the enhanced PySpark capabilities in the Microsoft Fabric MCP server for developing, testing, and optimizing PySpark notebooks. 
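
Before diving into the tool reference, the sketch below shows how one of these tools can be invoked programmatically from a Python MCP client over stdio. The client-side calls (`stdio_client`, `ClientSession`) come from the standard `mcp` SDK; the launch command and the sample PySpark snippet are illustrative assumptions, not something prescribed by this repository.

```python
# Minimal sketch (stdio transport): validate a PySpark snippet through the MCP server.
# Assumes the `mcp` SDK is installed and the server is started from the repo root.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # Launching with plain `python fabric_mcp.py` is an assumption; adjust to your setup.
    params = StdioServerParameters(command="python", args=["fabric_mcp.py"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Call the validation tool documented below with a small code sample.
            result = await session.call_tool(
                "validate_pyspark_code",
                arguments={"code": "df = spark.table('my_table')\ndf.show()"},
            )
            print(result.content)


asyncio.run(main())
```

The same pattern applies to every tool listed in this guide; only the tool name and the `arguments` dictionary change.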
4 | 5 | ## Overview 6 | 7 | The MCP server now provides comprehensive PySpark development support with: 8 | - 📓 **Advanced notebook templates** for different use cases 9 | - 🔧 **Code generation** for common PySpark operations 10 | - ✅ **Code validation** and best practices checking 11 | - 🎯 **Fabric-specific optimizations** 12 | - 📊 **Performance analysis** tools 13 | - 🚀 **Execution monitoring** capabilities 14 | 15 | ## Architecture Diagram 16 | 17 | ```mermaid 18 | graph TB 19 | subgraph "Developer Environment" 20 | IDE[IDE/VSCode] 21 | DEV[Developer] 22 | PROJ[Project Files] 23 | end 24 | 25 | subgraph "MCP Layer" 26 | MCP[MCP Server] 27 | TOOLS[PySpark Tools] 28 | HELPERS[PySpark Helpers] 29 | TEMPLATES[Template Manager] 30 | VALIDATORS[Code Validators] 31 | GENERATORS[Code Generators] 32 | end 33 | 34 | subgraph "Microsoft Fabric" 35 | API[Fabric API] 36 | WS[Workspace] 37 | LH[Lakehouse] 38 | NB[Notebooks] 39 | TABLES[Delta Tables] 40 | SPARK[Spark Clusters] 41 | end 42 | 43 | subgraph "Operations Flow" 44 | CREATE[Create Notebooks] 45 | VALIDATE[Validate Code] 46 | GENERATE[Generate Code] 47 | ANALYZE[Analyze Performance] 48 | DEPLOY[Deploy to Fabric] 49 | end 50 | 51 | %% Developer interactions 52 | DEV --> IDE 53 | IDE --> PROJ 54 | 55 | %% MCP interactions 56 | IDE <--> MCP 57 | MCP --> TOOLS 58 | TOOLS --> HELPERS 59 | TOOLS --> TEMPLATES 60 | TOOLS --> VALIDATORS 61 | TOOLS --> GENERATORS 62 | 63 | %% Fabric interactions 64 | MCP <--> API 65 | API --> WS 66 | WS --> LH 67 | WS --> NB 68 | LH --> TABLES 69 | NB --> SPARK 70 | 71 | %% Operation flows 72 | TOOLS --> CREATE 73 | TOOLS --> VALIDATE 74 | TOOLS --> GENERATE 75 | TOOLS --> ANALYZE 76 | CREATE --> DEPLOY 77 | 78 | %% Styling 79 | classDef devEnv fill:#e1f5fe 80 | classDef mcpLayer fill:#f3e5f5 81 | classDef fabricLayer fill:#e8f5e8 82 | classDef operations fill:#fff3e0 83 | 84 | class IDE,DEV,PROJ devEnv 85 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer 86 | class API,WS,LH,NB,TABLES,SPARK fabricLayer 87 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations 88 | ``` 89 | 90 | ### Architecture Components 91 | 92 | #### **1. Developer Environment** 93 | - **IDE/VSCode**: Primary development interface with MCP integration 94 | - **Developer**: Data engineer/scientist working on PySpark projects 95 | - **Project Files**: Local project structure and configuration 96 | 97 | #### **2. MCP Layer (This Server)** 98 | - **MCP Server**: Core server handling tool requests 99 | - **PySpark Tools**: 11 specialized tools for notebook operations 100 | - **PySpark Helpers**: Template management and code generation 101 | - **Template Manager**: Pre-built notebook templates for different scenarios 102 | - **Code Validators**: Syntax, best practices, and Fabric compatibility checks 103 | - **Code Generators**: Intelligent PySpark code generation 104 | 105 | #### **3. Microsoft Fabric** 106 | - **Fabric API**: REST API for all Fabric operations 107 | - **Workspace**: Fabric workspace containing resources 108 | - **Lakehouse**: Data storage with Delta Lake tables 109 | - **Notebooks**: PySpark notebooks for data processing 110 | - **Delta Tables**: Structured data storage 111 | - **Spark Clusters**: Compute resources for PySpark execution 112 | 113 | #### **4. 
Operations Flow** 114 | - **Create Notebooks**: Generate notebooks from templates 115 | - **Validate Code**: Check syntax, performance, and compatibility 116 | - **Generate Code**: Create PySpark snippets for common operations 117 | - **Analyze Performance**: Evaluate and optimize notebook performance 118 | - **Deploy to Fabric**: Push notebooks and execute in Fabric environment 119 | 120 | ### Interaction Flow 121 | 122 | 1. **Developer writes/requests PySpark code in IDE** 123 | 2. **IDE communicates with MCP server via protocol** 124 | 3. **MCP server processes request using specialized tools** 125 | 4. **Tools utilize helpers, templates, and validators** 126 | 5. **MCP server calls Fabric API for operations** 127 | 6. **Results flow back through MCP to IDE** 128 | 7. **Developer receives generated code, validation results, or analysis** 129 | 130 | ### Benefits of This Architecture 131 | 132 | - **Seamless Integration**: Work directly from your IDE without switching contexts 133 | - **Intelligent Assistance**: AI-powered code generation and validation 134 | - **Fabric Optimization**: Specialized tools for Microsoft Fabric environment 135 | - **Performance Focus**: Built-in performance analysis and optimization 136 | - **Template-Driven**: Quick start with proven patterns 137 | - **Real-time Feedback**: Immediate validation and suggestions 138 | 139 | ## Available Tools 140 | 141 | ### 1. Notebook Management 142 | 143 | #### `list_notebooks` 144 | List all notebooks in a workspace. 145 | ``` 146 | Usage: list_notebooks(workspace="my_workspace") 147 | ``` 148 | 149 | #### `get_notebook_content` 150 | Retrieve the content of a specific notebook. 151 | ``` 152 | Usage: get_notebook_content(workspace="my_workspace", notebook_id="notebook_id") 153 | ``` 154 | 155 | #### `create_pyspark_notebook` 156 | Create a notebook from built-in PySpark templates. 157 | ``` 158 | Usage: create_pyspark_notebook( 159 | workspace="my_workspace", 160 | notebook_name="my_pyspark_notebook", 161 | template_type="basic" # Options: basic, etl, analytics, ml 162 | ) 163 | ``` 164 | 165 | #### `create_fabric_notebook` 166 | Create a notebook optimized for Microsoft Fabric with advanced templates. 167 | ``` 168 | Usage: create_fabric_notebook( 169 | workspace="my_workspace", 170 | notebook_name="fabric_optimized_notebook", 171 | template_type="fabric_integration" # Options: fabric_integration, streaming 172 | ) 173 | ``` 174 | 175 | ### 2. Code Generation 176 | 177 | #### `generate_pyspark_code` 178 | Generate PySpark code for common operations. 179 | ``` 180 | Usage: generate_pyspark_code( 181 | operation="read_table", 182 | source_table="lakehouse.my_table", 183 | columns="id,name,age" 184 | ) 185 | 186 | Available operations: 187 | - read_table: Read data from tables 188 | - write_table: Write data to tables 189 | - transform: Data transformations 190 | - join: Table joins 191 | - aggregate: Data aggregations 192 | - schema_inference: Schema analysis 193 | - data_quality: Data quality checks 194 | - performance_optimization: Performance tuning 195 | ``` 196 | 197 | #### `generate_fabric_code` 198 | Generate Fabric-specific PySpark code. 
199 | ``` 200 | Usage: generate_fabric_code( 201 | operation="read_lakehouse", 202 | lakehouse_name="my_lakehouse", 203 | table_name="my_table" 204 | ) 205 | 206 | Available operations: 207 | - read_lakehouse: Read from Fabric Lakehouse 208 | - write_lakehouse: Write to Fabric Lakehouse 209 | - merge_delta: Delta Lake merge operations 210 | - performance_monitor: Performance monitoring 211 | ``` 212 | 213 | ### 3. Code Validation 214 | 215 | #### `validate_pyspark_code` 216 | Validate PySpark code for syntax and best practices. 217 | ``` 218 | Usage: validate_pyspark_code(code="df = spark.table('my_table')") 219 | ``` 220 | 221 | #### `validate_fabric_code` 222 | Validate code specifically for Microsoft Fabric compatibility. 223 | ``` 224 | Usage: validate_fabric_code(code="df = spark.table('my_table')") 225 | ``` 226 | 227 | ### 4. Performance Analysis 228 | 229 | #### `analyze_notebook_performance` 230 | Analyze a notebook's performance and provide optimization recommendations. 231 | ``` 232 | Usage: analyze_notebook_performance( 233 | workspace="my_workspace", 234 | notebook_id="notebook_id" 235 | ) 236 | ``` 237 | 238 | ### 5. Notebook Editing 239 | 240 | #### `update_notebook_cell` 241 | Update a specific cell in a notebook. 242 | ``` 243 | Usage: update_notebook_cell( 244 | workspace="my_workspace", 245 | notebook_id="notebook_id", 246 | cell_index=0, 247 | cell_content="print('Hello, Fabric!')", 248 | cell_type="code" 249 | ) 250 | ``` 251 | 252 | ## Template Types 253 | 254 | ### Basic Templates (`create_pyspark_notebook`) 255 | 256 | 1. **basic**: Fundamental PySpark operations 257 | - Spark session initialization 258 | - Basic DataFrame operations 259 | - Sample data creation 260 | 261 | 2. **etl**: ETL pipeline template 262 | - Extract, Transform, Load patterns 263 | - Data cleaning and processing 264 | - Delta Lake integration 265 | 266 | 3. **analytics**: Data analytics template 267 | - Aggregations and window functions 268 | - Advanced analytics patterns 269 | - Statistical operations 270 | 271 | 4. **ml**: Machine learning template 272 | - MLlib pipeline creation 273 | - Feature engineering 274 | - Model training and evaluation 275 | 276 | ### Advanced Templates (`create_fabric_notebook`) 277 | 278 | 1. **fabric_integration**: Microsoft Fabric integration 279 | - Lakehouse connectivity 280 | - Delta Lake operations 281 | - Fabric-specific utilities 282 | 283 | 2. **streaming**: Structured Streaming template 284 | - Real-time data processing 285 | - Stream-to-Delta operations 286 | - Windowed aggregations 287 | 288 | ## Best Practices 289 | 290 | ### 1. Fabric-Specific Optimizations 291 | 292 | ✅ **Use managed tables:** 293 | ```python 294 | df = spark.table("lakehouse.my_table") # Preferred 295 | # instead of direct file paths 296 | ``` 297 | 298 | ✅ **Use Delta Lake format:** 299 | ```python 300 | df.write.format("delta").mode("overwrite").saveAsTable("my_table") 301 | ``` 302 | 303 | ✅ **Leverage notebookutils:** 304 | ```python 305 | import notebookutils as nbu 306 | workspace_id = nbu.runtime.context.workspaceId 307 | ``` 308 | 309 | ### 2. 
Performance Optimizations 310 | 311 | ✅ **Cache frequently used DataFrames:** 312 | ```python 313 | df.cache() # Cache before multiple actions 314 | ``` 315 | 316 | ✅ **Use broadcast for small tables:** 317 | ```python 318 | from pyspark.sql.functions import broadcast 319 | result = large_df.join(broadcast(small_df), "key") 320 | ``` 321 | 322 | ✅ **Partition large datasets:** 323 | ```python 324 | df.write.partitionBy("year", "month").saveAsTable("partitioned_table") 325 | ``` 326 | 327 | ### 3. Code Quality 328 | 329 | ✅ **Define explicit schemas:** 330 | ```python 331 | schema = StructType([ 332 | StructField("id", IntegerType(), True), 333 | StructField("name", StringType(), True) 334 | ]) 335 | df = spark.createDataFrame(data, schema) 336 | ``` 337 | 338 | ✅ **Handle null values:** 339 | ```python 340 | df.filter(col("column").isNotNull()) 341 | ``` 342 | 343 | ❌ **Avoid these anti-patterns:** 344 | ```python 345 | # Don't collect large datasets 346 | for row in df.collect(): # Avoid this 347 | process(row) 348 | 349 | # Don't use .toPandas() on large data 350 | pandas_df = large_df.toPandas() # Risk of OOM 351 | ``` 352 | 353 | ## Workflow Examples 354 | 355 | ### 1. Creating and Optimizing a PySpark Notebook 356 | 357 | ```python 358 | # 1. Create a new notebook from template 359 | create_fabric_notebook( 360 | workspace="analytics_workspace", 361 | notebook_name="sales_analysis", 362 | template_type="fabric_integration" 363 | ) 364 | 365 | # 2. Generate code for specific operations 366 | generate_fabric_code( 367 | operation="read_lakehouse", 368 | lakehouse_name="sales_lakehouse", 369 | table_name="transactions" 370 | ) 371 | 372 | # 3. Validate the generated code 373 | validate_fabric_code(code="df = spark.table('sales_lakehouse.transactions')") 374 | 375 | # 4. Analyze performance 376 | analyze_notebook_performance( 377 | workspace="analytics_workspace", 378 | notebook_id="sales_analysis_notebook_id" 379 | ) 380 | ``` 381 | 382 | ### 2. ETL Pipeline Development 383 | 384 | ```python 385 | # 1. Create ETL notebook 386 | create_pyspark_notebook( 387 | workspace="etl_workspace", 388 | notebook_name="daily_etl", 389 | template_type="etl" 390 | ) 391 | 392 | # 2. Generate transformation code 393 | generate_pyspark_code( 394 | operation="transform", 395 | columns="customer_id,product_id,amount", 396 | filter_condition="amount > 0" 397 | ) 398 | 399 | # 3. Generate Delta merge code 400 | generate_fabric_code( 401 | operation="merge_delta", 402 | target_table="sales_summary" 403 | ) 404 | ``` 405 | 406 | ### 3. Performance Monitoring 407 | 408 | ```python 409 | # 1. Generate performance monitoring code 410 | generate_fabric_code(operation="performance_monitor") 411 | 412 | # 2. Validate for performance issues 413 | validate_fabric_code(code=""" 414 | df1 = spark.table("large_table") 415 | df2 = spark.table("small_table") 416 | result = df1.join(df2, "key") 417 | result.collect() # This will be flagged 418 | """) 419 | 420 | # 3. 
Analyze existing notebook 421 | analyze_notebook_performance( 422 | workspace="my_workspace", 423 | notebook_id="existing_notebook_id" 424 | ) 425 | ``` 426 | 427 | ## Error Handling 428 | 429 | The MCP tools provide comprehensive error handling: 430 | 431 | - **Syntax validation**: Checks Python syntax before execution 432 | - **Fabric compatibility**: Ensures code works in Fabric environment 433 | - **Performance warnings**: Identifies potential performance issues 434 | - **Best practice suggestions**: Recommends improvements 435 | 436 | ## Integration with IDE 437 | 438 | When using the MCP in your IDE: 439 | 440 | 1. **Autocomplete**: The MCP provides intelligent code generation 441 | 2. **Validation**: Real-time code validation and suggestions 442 | 3. **Templates**: Quick notebook creation from templates 443 | 4. **Performance insights**: Analyze and optimize existing notebooks 444 | 445 | ## Troubleshooting 446 | 447 | ### Common Issues 448 | 449 | 1. **Context not provided**: Ensure `ctx` parameter is passed to all functions 450 | 2. **Invalid workspace**: Verify workspace name or ID exists 451 | 3. **Notebook not found**: Check notebook ID or name spelling 452 | 4. **Template not found**: Use valid template types listed above 453 | 454 | ### Getting Help 455 | 456 | Use the validation tools to identify issues: 457 | - `validate_pyspark_code()` for general PySpark validation 458 | - `validate_fabric_code()` for Fabric-specific validation 459 | - `analyze_notebook_performance()` for performance insights 460 | 461 | ## Advanced Features 462 | 463 | ### Custom Templates 464 | 465 | The helper module supports extending templates. You can create custom templates by modifying the `PySparkTemplateManager` class in `helpers/pyspark_helpers.py`. 466 | 467 | ### Code Generation Extensions 468 | 469 | Add new code generation patterns by extending the `PySparkCodeGenerator` class with additional methods for specific use cases. 470 | 471 | ### Performance Metrics 472 | 473 | The performance analysis tool provides: 474 | - Operation counts per cell 475 | - Performance issue detection 476 | - Optimization opportunity identification 477 | - Scoring system (0-100) 478 | 479 | This comprehensive PySpark development environment helps you write, test, and optimize PySpark notebooks efficiently in Microsoft Fabric! 480 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Microsoft Fabric MCP Server 2 | 3 | A comprehensive Python-based MCP (Model Context Protocol) server for interacting with Microsoft Fabric APIs, featuring advanced PySpark notebook development, testing, and optimization capabilities with LLM integration. 
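
For a first smoke test, here is a minimal, illustrative sketch of talking to the server from a Python MCP client over stdio. The `mcp` SDK calls are standard, while the launch command is an assumption; see the Usage section below for the supported STDIO and HTTP configurations.

```python
# Illustrative quick check: connect over stdio, list the server's tools, call one of them.
# Assumes `uv sync` has been run and the `mcp` SDK is importable in this environment.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # The launch command is an assumption; adjust it to your environment (see Usage).
    params = StdioServerParameters(command="python", args=["fabric_mcp.py"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print("Available tools:", [tool.name for tool in tools.tools])
            # Call one of the core Fabric tools documented in the reference below.
            workspaces = await session.call_tool("list_workspaces", arguments={})
            print(workspaces.content)


asyncio.run(main())
```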
4 | 5 | ## 🚀 Features 6 | 7 | ### **Core Fabric Operations** 8 | - ✅ Workspace, lakehouse, warehouse, and table management 9 | - ✅ Delta table schemas and metadata retrieval 10 | - ✅ SQL query execution and data loading 11 | - ✅ Report and semantic model operations 12 | 13 | ### **Advanced PySpark Development** 14 | - 📓 **Intelligent notebook creation** with 6 specialized templates 15 | - 🔧 **Smart code generation** for common PySpark operations 16 | - ✅ **Comprehensive validation** with syntax and best practices checking 17 | - 🎯 **Fabric-specific optimizations** and compatibility checks 18 | - 📊 **Performance analysis** with scoring and optimization recommendations 19 | - 🚀 **Real-time monitoring** and execution insights 20 | 21 | ### **LLM Integration** 22 | - 🤖 **Natural language interface** for PySpark development 23 | - 🧠 **Context-aware assistance** with conversation memory 24 | - 🎨 **Intelligent code formatting** and explanations 25 | - 📈 **Smart optimization suggestions** based on project patterns 26 | 27 | ## 🏗️ Architecture 28 | 29 | ```mermaid 30 | graph TB 31 | subgraph "Developer Environment" 32 | IDE[IDE/VSCode] 33 | DEV[Developer] 34 | PROJ[Project Files] 35 | end 36 | 37 | subgraph "AI Layer" 38 | LLM[Large Language Model
Claude/GPT/etc.] 39 | CONTEXT[Conversation Context] 40 | REASONING[AI Reasoning Engine] 41 | end 42 | 43 | subgraph "MCP Layer" 44 | MCP[MCP Server] 45 | TOOLS[PySpark Tools] 46 | HELPERS[PySpark Helpers] 47 | TEMPLATES[Template Manager] 48 | VALIDATORS[Code Validators] 49 | GENERATORS[Code Generators] 50 | end 51 | 52 | subgraph "Microsoft Fabric" 53 | API[Fabric API] 54 | WS[Workspace] 55 | LH[Lakehouse] 56 | NB[Notebooks] 57 | TABLES[Delta Tables] 58 | SPARK[Spark Clusters] 59 | end 60 | 61 | subgraph "Operations Flow" 62 | CREATE[Create Notebooks] 63 | VALIDATE[Validate Code] 64 | GENERATE[Generate Code] 65 | ANALYZE[Analyze Performance] 66 | DEPLOY[Deploy to Fabric] 67 | end 68 | 69 | %% Developer interactions 70 | DEV --> IDE 71 | IDE --> PROJ 72 | 73 | %% LLM interactions 74 | IDE <--> LLM 75 | LLM <--> CONTEXT 76 | LLM --> REASONING 77 | 78 | %% MCP interactions 79 | LLM <--> MCP 80 | MCP --> TOOLS 81 | TOOLS --> HELPERS 82 | TOOLS --> TEMPLATES 83 | TOOLS --> VALIDATORS 84 | TOOLS --> GENERATORS 85 | 86 | %% Fabric interactions 87 | MCP <--> API 88 | API --> WS 89 | WS --> LH 90 | WS --> NB 91 | LH --> TABLES 92 | NB --> SPARK 93 | 94 | %% Operation flows 95 | TOOLS --> CREATE 96 | TOOLS --> VALIDATE 97 | TOOLS --> GENERATE 98 | TOOLS --> ANALYZE 99 | CREATE --> DEPLOY 100 | 101 | %% Data flow arrows 102 | REASONING -.->|"Intelligent Decisions"| TOOLS 103 | CONTEXT -.->|"Project Awareness"| VALIDATORS 104 | 105 | %% Styling 106 | classDef devEnv fill:#e1f5fe 107 | classDef aiLayer fill:#fff9c4 108 | classDef mcpLayer fill:#f3e5f5 109 | classDef fabricLayer fill:#e8f5e8 110 | classDef operations fill:#fff3e0 111 | 112 | class IDE,DEV,PROJ devEnv 113 | class LLM,CONTEXT,REASONING aiLayer 114 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer 115 | class API,WS,LH,NB,TABLES,SPARK fabricLayer 116 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations 117 | ``` 118 | 119 | ### **Interaction Flow** 120 | 1. **Developer requests assistance in IDE** 121 | 2. **IDE communicates with LLM (Claude/GPT)** 122 | 3. **LLM analyzes using context and reasoning** 123 | 4. **LLM calls MCP server tools intelligently** 124 | 5. **MCP tools interact with Fabric API** 125 | 6. **Results flow back through LLM with intelligent formatting** 126 | 7. **Developer receives contextual, smart responses** 127 | 128 | ## 📋 Requirements 129 | 130 | - **Python 3.12+** 131 | - **Azure credentials** for authentication 132 | - **uv** (from astral): [Installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installing-uv) 133 | - **Azure CLI**: [Installation instructions](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) 134 | - **Optional: Node.js** for MCP inspector: [Installation instructions](https://nodejs.org/en/download) 135 | 136 | ## 🔧 Installation 137 | 138 | 1. **Clone the repository:** 139 | ```bash 140 | git clone https://github.com/your-repo/fabric-mcp.git 141 | cd fabric-mcp 142 | ``` 143 | 144 | 2. **Set up virtual environment:** 145 | ```bash 146 | uv sync 147 | ``` 148 | 149 | 3. **Install dependencies:** 150 | ```bash 151 | pip install -r requirements.txt 152 | ``` 153 | 154 | ## 🚀 Usage 155 | 156 | 1. 
**Using STDIO** 157 | 158 | ### **Connect to Microsoft Fabric** 159 | 160 | ```bash 161 | az login --scope https://api.fabric.microsoft.com/.default 162 | ``` 163 | 164 | ### **Running with MCP Inspector** 165 | 166 | ```bash 167 | uv run --with mcp mcp dev fabric_mcp.py 168 | ``` 169 | This starts the server with inspector at `http://localhost:6274`. 170 | 171 | ### **VSCode Integration** 172 | 173 | Add to your `launch.json`: 174 | ```json 175 | { 176 | "mcp": { 177 | "servers": { 178 | "ms-fabric-mcp": { 179 | "type": "stdio", 180 | "command": "\\.venv\\Scripts\\python.exe", 181 | "args": ["\\fabric_mcp.py"] 182 | } 183 | } 184 | } 185 | } 186 | ``` 187 | 188 | 2. **Using HTTP** 189 | ### **Start the MCP Server** 190 | ```bash 191 | uv run python .\fabric_mcp.py --port 8081 192 | ``` 193 | 194 | ### **VSCode Integration** 195 | 196 | Add to your `launch.json`: 197 | ```json 198 | { 199 | "mcp": { 200 | "servers": { 201 | "ms-fabric-mcp": { 202 | "type": "http", 203 | "url": "http://:8081/mcp/", 204 | "headers": { 205 | "Accept": "application/json,text/event-stream", 206 | } 207 | } 208 | } 209 | } 210 | } 211 | ``` 212 | 213 | ## 🛠️ Complete Tool Reference 214 | 215 | ### **1. Workspace Management** 216 | 217 | #### `list_workspaces` 218 | List all available Fabric workspaces. 219 | ```python 220 | # Usage in LLM: "List all my Fabric workspaces" 221 | ``` 222 | 223 | #### `set_workspace` 224 | Set the current workspace context for the session. 225 | ```python 226 | set_workspace(workspace="Analytics-Workspace") 227 | ``` 228 | 229 | ### **2. Lakehouse Operations** 230 | 231 | #### `list_lakehouses` 232 | List all lakehouses in a workspace. 233 | ```python 234 | list_lakehouses(workspace="Analytics-Workspace") 235 | ``` 236 | 237 | #### `create_lakehouse` 238 | Create a new lakehouse. 239 | ```python 240 | create_lakehouse( 241 | name="Sales-Data-Lake", 242 | workspace="Analytics-Workspace", 243 | description="Sales data lakehouse" 244 | ) 245 | ``` 246 | 247 | #### `set_lakehouse` 248 | Set current lakehouse context. 249 | ```python 250 | set_lakehouse(lakehouse="Sales-Data-Lake") 251 | ``` 252 | 253 | ### **3. Warehouse Operations** 254 | 255 | #### `list_warehouses` 256 | List all warehouses in a workspace. 257 | ```python 258 | list_warehouses(workspace="Analytics-Workspace") 259 | ``` 260 | 261 | #### `create_warehouse` 262 | Create a new warehouse. 263 | ```python 264 | create_warehouse( 265 | name="Sales-DW", 266 | workspace="Analytics-Workspace", 267 | description="Sales data warehouse" 268 | ) 269 | ``` 270 | 271 | #### `set_warehouse` 272 | Set current warehouse context. 273 | ```python 274 | set_warehouse(warehouse="Sales-DW") 275 | ``` 276 | 277 | ### **4. Table Operations** 278 | 279 | #### `list_tables` 280 | List all tables in a lakehouse. 281 | ```python 282 | list_tables(workspace="Analytics-Workspace", lakehouse="Sales-Data-Lake") 283 | ``` 284 | 285 | #### `get_lakehouse_table_schema` 286 | Get schema for a specific table. 287 | ```python 288 | get_lakehouse_table_schema( 289 | workspace="Analytics-Workspace", 290 | lakehouse="Sales-Data-Lake", 291 | table_name="transactions" 292 | ) 293 | ``` 294 | 295 | #### `get_all_lakehouse_schemas` 296 | Get schemas for all tables in a lakehouse. 297 | ```python 298 | get_all_lakehouse_schemas( 299 | workspace="Analytics-Workspace", 300 | lakehouse="Sales-Data-Lake" 301 | ) 302 | ``` 303 | 304 | #### `set_table` 305 | Set current table context. 
306 | ```python 307 | set_table(table_name="transactions") 308 | ``` 309 | 310 | ### **5. SQL Operations** 311 | 312 | #### `get_sql_endpoint` 313 | Get SQL endpoint for lakehouse or warehouse. 314 | ```python 315 | get_sql_endpoint( 316 | workspace="Analytics-Workspace", 317 | lakehouse="Sales-Data-Lake", 318 | type="lakehouse" 319 | ) 320 | ``` 321 | 322 | #### `run_query` 323 | Execute SQL queries. 324 | ```python 325 | run_query( 326 | workspace="Analytics-Workspace", 327 | lakehouse="Sales-Data-Lake", 328 | query="SELECT COUNT(*) FROM transactions", 329 | type="lakehouse" 330 | ) 331 | ``` 332 | 333 | ### **6. Data Loading** 334 | 335 | #### `load_data_from_url` 336 | Load data from URL into tables. 337 | ```python 338 | load_data_from_url( 339 | url="https://example.com/data.csv", 340 | destination_table="new_data", 341 | workspace="Analytics-Workspace", 342 | lakehouse="Sales-Data-Lake" 343 | ) 344 | ``` 345 | 346 | ### **7. Reports & Models** 347 | 348 | #### `list_reports` 349 | List all reports in a workspace. 350 | ```python 351 | list_reports(workspace="Analytics-Workspace") 352 | ``` 353 | 354 | #### `get_report` 355 | Get specific report details. 356 | ```python 357 | get_report(workspace="Analytics-Workspace", report_id="report-id") 358 | ``` 359 | 360 | #### `list_semantic_models` 361 | List semantic models in workspace. 362 | ```python 363 | list_semantic_models(workspace="Analytics-Workspace") 364 | ``` 365 | 366 | #### `get_semantic_model` 367 | Get specific semantic model. 368 | ```python 369 | get_semantic_model(workspace="Analytics-Workspace", model_id="model-id") 370 | ``` 371 | 372 | ### **8. Basic Notebook Operations** 373 | 374 | #### `list_notebooks` 375 | List all notebooks in a workspace. 376 | ```python 377 | list_notebooks(workspace="Analytics-Workspace") 378 | ``` 379 | 380 | #### `get_notebook_content` 381 | Retrieve notebook content. 382 | ```python 383 | get_notebook_content( 384 | workspace="Analytics-Workspace", 385 | notebook_id="notebook-id" 386 | ) 387 | ``` 388 | 389 | #### `update_notebook_cell` 390 | Update specific notebook cells. 391 | ```python 392 | update_notebook_cell( 393 | workspace="Analytics-Workspace", 394 | notebook_id="notebook-id", 395 | cell_index=0, 396 | cell_content="print('Hello, Fabric!')", 397 | cell_type="code" 398 | ) 399 | ``` 400 | 401 | ### **9. Advanced PySpark Notebook Creation** 402 | 403 | #### `create_pyspark_notebook` 404 | Create notebooks from basic templates. 405 | ```python 406 | create_pyspark_notebook( 407 | workspace="Analytics-Workspace", 408 | notebook_name="Data-Analysis", 409 | template_type="analytics" # Options: basic, etl, analytics, ml 410 | ) 411 | ``` 412 | 413 | #### `create_fabric_notebook` 414 | Create Fabric-optimized notebooks. 415 | ```python 416 | create_fabric_notebook( 417 | workspace="Analytics-Workspace", 418 | notebook_name="Fabric-Pipeline", 419 | template_type="fabric_integration" # Options: fabric_integration, streaming 420 | ) 421 | ``` 422 | 423 | ### **10. PySpark Code Generation** 424 | 425 | #### `generate_pyspark_code` 426 | Generate code for common operations. 427 | ```python 428 | generate_pyspark_code( 429 | operation="read_table", 430 | source_table="sales.transactions", 431 | columns="id,amount,date" 432 | ) 433 | 434 | # Available operations: 435 | # - read_table, write_table, transform, join, aggregate 436 | # - schema_inference, data_quality, performance_optimization 437 | ``` 438 | 439 | #### `generate_fabric_code` 440 | Generate Fabric-specific code. 
441 | ```python 442 | generate_fabric_code( 443 | operation="read_lakehouse", 444 | lakehouse_name="Sales-Data-Lake", 445 | table_name="transactions" 446 | ) 447 | 448 | # Available operations: 449 | # - read_lakehouse, write_lakehouse, merge_delta, performance_monitor 450 | ``` 451 | 452 | ### **11. Code Validation & Analysis** 453 | 454 | #### `validate_pyspark_code` 455 | Validate PySpark code syntax and best practices. 456 | ```python 457 | validate_pyspark_code(code=""" 458 | df = spark.table('transactions') 459 | df.show() 460 | """) 461 | ``` 462 | 463 | #### `validate_fabric_code` 464 | Validate Fabric compatibility. 465 | ```python 466 | validate_fabric_code(code=""" 467 | df = spark.table('lakehouse.transactions') 468 | df.write.format('delta').saveAsTable('summary') 469 | """) 470 | ``` 471 | 472 | #### `analyze_notebook_performance` 473 | Comprehensive performance analysis. 474 | ```python 475 | analyze_notebook_performance( 476 | workspace="Analytics-Workspace", 477 | notebook_id="notebook-id" 478 | ) 479 | ``` 480 | 481 | ### **12. Context Management** 482 | 483 | #### `clear_context` 484 | Clear current session context. 485 | ```python 486 | clear_context() 487 | ``` 488 | 489 | ## 📊 PySpark Templates 490 | 491 | ### **Basic Templates** 492 | 1. **basic**: Fundamental PySpark operations and DataFrame usage 493 | 2. **etl**: Complete ETL pipeline with data cleaning and Delta Lake 494 | 3. **analytics**: Advanced analytics with aggregations and window functions 495 | 4. **ml**: Machine learning pipeline with MLlib and feature engineering 496 | 497 | ### **Advanced Templates** 498 | 1. **fabric_integration**: Lakehouse connectivity and Fabric-specific utilities 499 | 2. **streaming**: Real-time processing with Structured Streaming 500 | 501 | ## 🎯 Best Practices 502 | 503 | ### **Fabric Optimization** 504 | ```python 505 | # ✅ Use managed tables 506 | df = spark.table("lakehouse.my_table") 507 | 508 | # ✅ Use Delta Lake format 509 | df.write.format("delta").mode("overwrite").saveAsTable("my_table") 510 | 511 | # ✅ Leverage notebookutils 512 | import notebookutils as nbu 513 | workspace_id = nbu.runtime.context.workspaceId 514 | ``` 515 | 516 | ### **Performance Optimization** 517 | ```python 518 | # ✅ Cache frequently used DataFrames 519 | df.cache() 520 | 521 | # ✅ Use broadcast for small tables 522 | from pyspark.sql.functions import broadcast 523 | result = large_df.join(broadcast(small_df), "key") 524 | 525 | # ✅ Partition large datasets 526 | df.write.partitionBy("year", "month").saveAsTable("partitioned_table") 527 | ``` 528 | 529 | ### **Code Quality** 530 | ```python 531 | # ✅ Define explicit schemas 532 | schema = StructType([ 533 | StructField("id", IntegerType(), True), 534 | StructField("name", StringType(), True) 535 | ]) 536 | 537 | # ✅ Handle null values 538 | df.filter(col("column").isNotNull()) 539 | ``` 540 | 541 | ## 🔄 Example LLM-Enhanced Workflows 542 | 543 | ### **Natural Language Requests** 544 | ``` 545 | Human: "Create a PySpark notebook that reads sales data, cleans it, and optimizes performance" 546 | 547 | LLM Response: 548 | 1. Creates Fabric-optimized notebook with ETL template 549 | 2. Generates lakehouse reading code 550 | 3. Adds data cleaning transformations 551 | 4. Includes performance optimization patterns 552 | 5. Validates code for best practices 553 | ``` 554 | 555 | ### **Performance Analysis** 556 | ``` 557 | Human: "My PySpark notebook is slow. Help me optimize it." 558 | 559 | LLM Response: 560 | 1. 
Analyzes notebook performance (scoring 0-100) 561 | 2. Identifies anti-patterns and bottlenecks 562 | 3. Suggests specific optimizations 563 | 4. Generates optimized code alternatives 564 | 5. Provides before/after comparisons 565 | ``` 566 | 567 | ## 🔍 Troubleshooting 568 | 569 | ### **Common Issues** 570 | - **Authentication**: Ensure `az login` with correct scope 571 | - **Context**: Use `clear_context()` to reset session state 572 | - **Workspace**: Verify workspace names and permissions 573 | - **Templates**: Check available template types in documentation 574 | 575 | ### **Getting Help** 576 | - Use validation tools for code issues 577 | - Check performance analysis for optimization opportunities 578 | - Leverage LLM natural language interface for guidance 579 | 580 | ## 📈 Performance Metrics 581 | 582 | The analysis tools provide: 583 | - **Operation counts** per notebook cell 584 | - **Performance issues** detection and flagging 585 | - **Optimization opportunities** identification 586 | - **Scoring system** (0-100) for code quality 587 | - **Fabric compatibility** assessment 588 | 589 | ## 🤝 Contributing 590 | 591 | This project welcomes contributions! Please see our contributing guidelines for details. 592 | 593 | ## 📄 License 594 | 595 | This project is licensed under the MIT License. See the LICENSE file for details. 596 | 597 | ## 🙏 Acknowledgments 598 | 599 | Inspired by: https://github.com/Augustab/microsoft_fabric_mcp/tree/main 600 | 601 | --- 602 | 603 | **Ready to supercharge your Microsoft Fabric development with intelligent PySpark assistance!** 🚀 604 | -------------------------------------------------------------------------------- /helpers/pyspark_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | PySpark helper utilities for Microsoft Fabric MCP Server. 3 | This module provides templates, code generation, and execution helpers for PySpark notebooks. 
4 | """ 5 | 6 | import json 7 | from typing import Dict, List, Any, Optional 8 | from helpers.logging_config import get_logger 9 | 10 | logger = get_logger(__name__) 11 | 12 | class PySparkTemplateManager: 13 | """Manages PySpark notebook templates and code generation.""" 14 | 15 | @staticmethod 16 | def get_fabric_integration_template() -> Dict[str, Any]: 17 | """Template for Fabric-specific PySpark operations.""" 18 | return { 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "source": [ 23 | "# Microsoft Fabric PySpark Integration\n", 24 | "\n", 25 | "This notebook demonstrates integration with Microsoft Fabric resources using PySpark.\n" 26 | ], 27 | "metadata": {} 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "# Initialize Fabric integration\n", 33 | "from pyspark.sql import SparkSession\n", 34 | "from pyspark.sql.functions import *\n", 35 | "from pyspark.sql.types import *\n", 36 | "from delta.tables import DeltaTable\n", 37 | "import notebookutils as nbu\n", 38 | "\n", 39 | "# Spark session is pre-configured in Fabric\n", 40 | "print(f\"Spark version: {spark.version}\")\n", 41 | "print(f\"Available cores: {spark.sparkContext.defaultParallelism}\")\n", 42 | "\n", 43 | "# Get current workspace and lakehouse context\n", 44 | "print(f\"Current workspace: {nbu.runtime.context.workspaceId}\")\n" 45 | ], 46 | "execution_count": None, 47 | "outputs": [], 48 | "metadata": {} 49 | }, 50 | { 51 | "cell_type": "code", 52 | "source": [ 53 | "# Connect to Fabric Lakehouse\n", 54 | "# List available tables in the default lakehouse\n", 55 | "try:\n", 56 | " tables = spark.sql(\"SHOW TABLES\").collect()\n", 57 | " print(\"Available tables in current lakehouse:\")\n", 58 | " for table in tables:\n", 59 | " print(f\" - {table.database}.{table.tableName}\")\n", 60 | "except Exception as e:\n", 61 | " print(f\"No default lakehouse attached or no tables found: {e}\")\n" 62 | ], 63 | "execution_count": None, 64 | "outputs": [], 65 | "metadata": {} 66 | }, 67 | { 68 | "cell_type": "code", 69 | "source": [ 70 | "# Read from Fabric Lakehouse table\n", 71 | "# Replace 'your_table_name' with actual table name\n", 72 | "# df = spark.table(\"your_table_name\")\n", 73 | "\n", 74 | "# Alternative: Read from files in Lakehouse\n", 75 | "# df = spark.read.format(\"delta\").load(\"Tables/your_table_name\")\n", 76 | "\n", 77 | "# For demo, create sample data\n", 78 | "sample_data = [\n", 79 | " (1, \"Product A\", 100.0, \"2024-01-01\"),\n", 80 | " (2, \"Product B\", 150.0, \"2024-01-02\"),\n", 81 | " (3, \"Product C\", 200.0, \"2024-01-03\")\n", 82 | "]\n", 83 | "\n", 84 | "schema = StructType([\n", 85 | " StructField(\"id\", IntegerType(), True),\n", 86 | " StructField(\"product_name\", StringType(), True),\n", 87 | " StructField(\"price\", DoubleType(), True),\n", 88 | " StructField(\"date_created\", StringType(), True)\n", 89 | "])\n", 90 | "\n", 91 | "df = spark.createDataFrame(sample_data, schema)\n", 92 | "df = df.withColumn(\"date_created\", to_date(col(\"date_created\"), \"yyyy-MM-dd\"))\n", 93 | "df.show()\n" 94 | ], 95 | "execution_count": None, 96 | "outputs": [], 97 | "metadata": {} 98 | }, 99 | { 100 | "cell_type": "code", 101 | "source": [ 102 | "# Write to Fabric Lakehouse as Delta table\n", 103 | "table_name = \"fabric_demo_products\"\n", 104 | "\n", 105 | "# Option 1: Write as managed table\n", 106 | "df.write \\\n", 107 | " .format(\"delta\") \\\n", 108 | " .mode(\"overwrite\") \\\n", 109 | " .option(\"overwriteSchema\", \"true\") \\\n", 110 | " .saveAsTable(table_name)\n", 
111 | "\n", 112 | "print(f\"Successfully wrote {df.count()} records to table '{table_name}'\")\n", 113 | "\n", 114 | "# Verify the table was created\n", 115 | "result = spark.table(table_name)\n", 116 | "print(\"\\nTable verification:\")\n", 117 | "result.show()\n" 118 | ], 119 | "execution_count": None, 120 | "outputs": [], 121 | "metadata": {} 122 | }, 123 | { 124 | "cell_type": "code", 125 | "source": [ 126 | "# Advanced Delta Lake operations in Fabric\n", 127 | "from delta.tables import DeltaTable\n", 128 | "\n", 129 | "# Create DeltaTable reference\n", 130 | "delta_table = DeltaTable.forName(spark, table_name)\n", 131 | "\n", 132 | "# Show table history\n", 133 | "print(\"Table history:\")\n", 134 | "delta_table.history().show(truncate=False)\n", 135 | "\n", 136 | "# Perform merge operation (upsert)\n", 137 | "new_data = [\n", 138 | " (1, \"Product A Updated\", 110.0, \"2024-01-01\"), # Update existing\n", 139 | " (4, \"Product D\", 250.0, \"2024-01-04\") # Insert new\n", 140 | "]\n", 141 | "\n", 142 | "new_df = spark.createDataFrame(new_data, schema)\n", 143 | "new_df = new_df.withColumn(\"date_created\", to_date(col(\"date_created\"), \"yyyy-MM-dd\"))\n", 144 | "\n", 145 | "# Merge operation\n", 146 | "delta_table.alias(\"target\") \\\n", 147 | " .merge(\n", 148 | " new_df.alias(\"source\"),\n", 149 | " \"target.id = source.id\"\n", 150 | " ) \\\n", 151 | " .whenMatchedUpdateAll() \\\n", 152 | " .whenNotMatchedInsertAll() \\\n", 153 | " .execute()\n", 154 | "\n", 155 | "print(\"\\nAfter merge operation:\")\n", 156 | "spark.table(table_name).show()\n" 157 | ], 158 | "execution_count": None, 159 | "outputs": [], 160 | "metadata": {} 161 | } 162 | ] 163 | } 164 | 165 | @staticmethod 166 | def get_streaming_template() -> Dict[str, Any]: 167 | """Template for PySpark Structured Streaming in Fabric.""" 168 | return { 169 | "cells": [ 170 | { 171 | "cell_type": "markdown", 172 | "source": [ 173 | "# PySpark Structured Streaming in Fabric\n", 174 | "\n", 175 | "This notebook demonstrates real-time data processing using PySpark Structured Streaming.\n" 176 | ], 177 | "metadata": {} 178 | }, 179 | { 180 | "cell_type": "code", 181 | "source": [ 182 | "# Import streaming libraries\n", 183 | "from pyspark.sql import SparkSession\n", 184 | "from pyspark.sql.functions import *\n", 185 | "from pyspark.sql.types import *\n", 186 | "import time\n", 187 | "\n", 188 | "print(f\"Spark version: {spark.version}\")\n", 189 | "print(\"Structured Streaming capabilities enabled\")\n" 190 | ], 191 | "execution_count": None, 192 | "outputs": [], 193 | "metadata": {} 194 | }, 195 | { 196 | "cell_type": "code", 197 | "source": [ 198 | "# Define schema for streaming data\n", 199 | "streaming_schema = StructType([\n", 200 | " StructField(\"timestamp\", TimestampType(), True),\n", 201 | " StructField(\"user_id\", StringType(), True),\n", 202 | " StructField(\"event_type\", StringType(), True),\n", 203 | " StructField(\"value\", DoubleType(), True)\n", 204 | "])\n", 205 | "\n", 206 | "# Create a streaming DataFrame (example with rate source for demo)\n", 207 | "streaming_df = spark \\\n", 208 | " .readStream \\\n", 209 | " .format(\"rate\") \\\n", 210 | " .option(\"rowsPerSecond\", 10) \\\n", 211 | " .load()\n", 212 | "\n", 213 | "# Transform the rate stream to simulate real events\n", 214 | "events_df = streaming_df \\\n", 215 | " .withColumn(\"user_id\", concat(lit(\"user_\"), (col(\"value\") % 100).cast(\"string\"))) \\\n", 216 | " .withColumn(\"event_type\", \n", 217 | " when(col(\"value\") % 3 == 0, 
\"purchase\")\n", 218 | " .when(col(\"value\") % 3 == 1, \"view\")\n", 219 | " .otherwise(\"click\")\n", 220 | " ) \\\n", 221 | " .withColumn(\"event_value\", (col(\"value\") % 1000).cast(\"double\")) \\\n", 222 | " .select(\"timestamp\", \"user_id\", \"event_type\", \"event_value\")\n", 223 | "\n", 224 | "print(\"Streaming DataFrame created\")\n", 225 | "print(f\"Schema: {events_df.schema}\")\n" 226 | ], 227 | "execution_count": None, 228 | "outputs": [], 229 | "metadata": {} 230 | }, 231 | { 232 | "cell_type": "code", 233 | "source": [ 234 | "# Streaming aggregations\n", 235 | "# Count events by type in 30-second windows\n", 236 | "windowed_counts = events_df \\\n", 237 | " .withWatermark(\"timestamp\", \"30 seconds\") \\\n", 238 | " .groupBy(\n", 239 | " window(col(\"timestamp\"), \"30 seconds\"),\n", 240 | " col(\"event_type\")\n", 241 | " ) \\\n", 242 | " .count() \\\n", 243 | " .orderBy(\"window\")\n", 244 | "\n", 245 | "# Start streaming query (console output)\n", 246 | "query = windowed_counts \\\n", 247 | " .writeStream \\\n", 248 | " .outputMode(\"complete\") \\\n", 249 | " .format(\"console\") \\\n", 250 | " .option(\"truncate\", False) \\\n", 251 | " .trigger(processingTime=\"10 seconds\") \\\n", 252 | " .start()\n", 253 | "\n", 254 | "print(\"Streaming query started. Check output below...\")\n", 255 | "print(f\"Query ID: {query.id}\")\n" 256 | ], 257 | "execution_count": None, 258 | "outputs": [], 259 | "metadata": {} 260 | }, 261 | { 262 | "cell_type": "code", 263 | "source": [ 264 | "# Let the stream run for a short time\n", 265 | "import time\n", 266 | "time.sleep(30) # Run for 30 seconds\n", 267 | "\n", 268 | "# Stop the query\n", 269 | "query.stop()\n", 270 | "print(\"Streaming query stopped\")\n", 271 | "\n", 272 | "# Show query progress\n", 273 | "print(\"\\nQuery progress:\")\n", 274 | "print(query.lastProgress)\n" 275 | ], 276 | "execution_count": None, 277 | "outputs": [], 278 | "metadata": {} 279 | }, 280 | { 281 | "cell_type": "code", 282 | "source": [ 283 | "# Stream to Delta Lake table\n", 284 | "streaming_table = \"streaming_events\"\n", 285 | "\n", 286 | "# Create another streaming query that writes to Delta\n", 287 | "delta_query = events_df \\\n", 288 | " .writeStream \\\n", 289 | " .format(\"delta\") \\\n", 290 | " .outputMode(\"append\") \\\n", 291 | " .option(\"checkpointLocation\", \"/tmp/checkpoint/streaming_events\") \\\n", 292 | " .table(streaming_table)\n", 293 | "\n", 294 | "print(f\"Started streaming to Delta table: {streaming_table}\")\n", 295 | "print(f\"Query ID: {delta_query.id}\")\n", 296 | "\n", 297 | "# Let it run briefly\n", 298 | "time.sleep(20)\n", 299 | "\n", 300 | "# Stop and check results\n", 301 | "delta_query.stop()\n", 302 | "\n", 303 | "# Read from the Delta table\n", 304 | "result_df = spark.table(streaming_table)\n", 305 | "print(f\"\\nTotal records in Delta table: {result_df.count()}\")\n", 306 | "result_df.show(20)\n" 307 | ], 308 | "execution_count": None, 309 | "outputs": [], 310 | "metadata": {} 311 | } 312 | ] 313 | } 314 | 315 | class PySparkCodeGenerator: 316 | """Generates PySpark code snippets for common operations.""" 317 | 318 | @staticmethod 319 | def generate_fabric_lakehouse_reader(lakehouse_name: str, table_name: str) -> str: 320 | """Generate code to read from a Fabric Lakehouse table.""" 321 | return f"""# Read from Fabric Lakehouse table 322 | df = spark.table("{lakehouse_name}.{table_name}") 323 | 324 | # Alternative: Read from Delta files directly 325 | # df = 
spark.read.format("delta").load("Tables/{table_name}") 326 | 327 | # Show basic info 328 | print(f"Records: {{df.count()}}") 329 | print(f"Columns: {{len(df.columns)}}") 330 | df.printSchema() 331 | df.show(10)""" 332 | 333 | @staticmethod 334 | def generate_fabric_lakehouse_writer(table_name: str, mode: str = "overwrite") -> str: 335 | """Generate code to write to a Fabric Lakehouse table.""" 336 | return f"""# Write to Fabric Lakehouse table 337 | df.write \\ 338 | .format("delta") \\ 339 | .mode("{mode}") \\ 340 | .option("overwriteSchema", "true") \\ 341 | .saveAsTable("{table_name}") 342 | 343 | print(f"Successfully wrote {{df.count()}} records to table '{table_name}'") 344 | 345 | # Verify the write 346 | verification_df = spark.table("{table_name}") 347 | print(f"Verification - Table now has {{verification_df.count()}} records")""" 348 | 349 | @staticmethod 350 | def generate_delta_merge_operation(target_table: str, source_df_name: str, join_condition: str) -> str: 351 | """Generate code for Delta Lake merge operations.""" 352 | return f"""# Delta Lake merge operation (UPSERT) 353 | from delta.tables import DeltaTable 354 | 355 | # Create DeltaTable reference 356 | target_table = DeltaTable.forName(spark, "{target_table}") 357 | 358 | # Perform merge operation 359 | target_table.alias("target") \\ 360 | .merge( 361 | {source_df_name}.alias("source"), 362 | "{join_condition}" 363 | ) \\ 364 | .whenMatchedUpdateAll() \\ 365 | .whenNotMatchedInsertAll() \\ 366 | .execute() 367 | 368 | print("Merge operation completed successfully") 369 | print(f"Table now has {{spark.table('{target_table}').count()}} records")""" 370 | 371 | @staticmethod 372 | def generate_performance_monitoring() -> str: 373 | """Generate code for monitoring PySpark performance.""" 374 | return """# PySpark Performance Monitoring 375 | 376 | # 1. Check Spark configuration 377 | print("=== Spark Configuration ===") 378 | for key, value in spark.sparkContext.getConf().getAll(): 379 | if 'spark.sql' in key or 'spark.serializer' in key: 380 | print(f"{key}: {value}") 381 | 382 | # 2. Monitor DataFrame operations 383 | from pyspark.sql.utils import AnalysisException 384 | import time 385 | 386 | def monitor_operation(df, operation_name): 387 | start_time = time.time() 388 | try: 389 | count = df.count() 390 | end_time = time.time() 391 | duration = end_time - start_time 392 | print(f"{operation_name}: {count} records in {duration:.2f} seconds") 393 | return count, duration 394 | except Exception as e: 395 | print(f"Error in {operation_name}: {e}") 396 | return 0, 0 397 | 398 | # Example usage: 399 | # count, duration = monitor_operation(df, "DataFrame Count") 400 | 401 | # 3. Show execution plan 402 | print("\\n=== Execution Plan ===") 403 | df.explain(True) 404 | 405 | # 4. 
Cache analysis 406 | print("\\n=== Storage Levels ===") 407 | print(f"DataFrame cached: {df.is_cached}") 408 | if df.is_cached: 409 | print(f"Storage level: {df.storageLevel}")""" 410 | 411 | class PySparkValidator: 412 | """Validates PySpark code and suggests optimizations.""" 413 | 414 | @staticmethod 415 | def validate_fabric_compatibility(code: str) -> Dict[str, List[str]]: 416 | """Check if code is compatible with Microsoft Fabric.""" 417 | issues = [] 418 | suggestions = [] 419 | 420 | # Check for Fabric-specific patterns 421 | if 'SparkSession.builder' in code: 422 | issues.append("❌ Don't create SparkSession in Fabric - use pre-configured 'spark' variable") 423 | 424 | if 'notebookutils' not in code and any(pattern in code for pattern in ['lakehouse', 'workspace']): 425 | suggestions.append("💡 Consider using 'notebookutils' for Fabric integration") 426 | 427 | if '.saveAsTable(' in code and 'format("delta")' not in code: 428 | suggestions.append("💡 Specify Delta format explicitly when saving tables in Fabric") 429 | 430 | if 'jdbc' in code.lower(): 431 | suggestions.append("💡 Consider using Fabric's built-in connectors instead of JDBC") 432 | 433 | return { 434 | "issues": issues, 435 | "suggestions": suggestions 436 | } 437 | 438 | @staticmethod 439 | def check_performance_patterns(code: str) -> Dict[str, List[str]]: 440 | """Check for performance anti-patterns and optimizations.""" 441 | warnings = [] 442 | optimizations = [] 443 | 444 | # Performance anti-patterns 445 | if '.collect()' in code: 446 | warnings.append("⚠️ .collect() can cause OOM on large datasets") 447 | 448 | if 'rdd.' in code and 'parallelize' not in code: 449 | warnings.append("⚠️ RDD operations are less optimized than DataFrame operations") 450 | 451 | if code.count('spark.read') > 3 and '.cache()' not in code: 452 | optimizations.append("💡 Consider caching frequently accessed DataFrames") 453 | 454 | if '.join(' in code and 'broadcast' not in code: 455 | optimizations.append("💡 Consider broadcast joins for small dimension tables") 456 | 457 | if '.write.' in code and 'partitionBy' not in code: 458 | optimizations.append("💡 Consider partitioning large datasets for better performance") 459 | 460 | return { 461 | "warnings": warnings, 462 | "optimizations": optimizations 463 | } 464 | 465 | def create_notebook_from_template(template_name: str, custom_params: Optional[Dict] = None) -> Dict[str, Any]: 466 | """Create a complete notebook from a template.""" 467 | template_manager = PySparkTemplateManager() 468 | 469 | templates = { 470 | "fabric_integration": template_manager.get_fabric_integration_template(), 471 | "streaming": template_manager.get_streaming_template(), 472 | } 473 | 474 | if template_name not in templates: 475 | raise ValueError(f"Unknown template: {template_name}. 
Available: {list(templates.keys())}") 476 | 477 | template = templates[template_name] 478 | 479 | # Create notebook structure 480 | notebook = { 481 | "nbformat": 4, 482 | "nbformat_minor": 5, 483 | "cells": template["cells"], 484 | "metadata": { 485 | "language_info": {"name": "python"}, 486 | "kernel_info": {"name": "synapse_pyspark"}, 487 | "description": f"PySpark notebook created from {template_name} template" 488 | } 489 | } 490 | 491 | return notebook 492 | -------------------------------------------------------------------------------- /helpers/clients/fabric_client.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Dict, Any, List, Optional, Tuple, Union 3 | import base64 4 | from urllib.parse import quote 5 | from functools import lru_cache 6 | import requests 7 | from azure.identity import DefaultAzureCredential 8 | from helpers.logging_config import get_logger 9 | from helpers.utils import _is_valid_uuid 10 | import json 11 | from uuid import UUID 12 | 13 | logger = get_logger(__name__) 14 | # from sempy_labs._helper_functions import create_item 15 | 16 | 17 | 18 | class FabricApiConfig(BaseModel): 19 | """Configuration for Fabric API""" 20 | 21 | base_url: str = "https://api.fabric.microsoft.com/v1" 22 | max_results: int = 100 23 | 24 | 25 | class FabricApiClient: 26 | """Client for communicating with the Fabric API""" 27 | 28 | def __init__(self, credential=None, config=None): 29 | self.credential = credential or DefaultAzureCredential() 30 | self.config = config or FabricApiConfig() 31 | # Initialize cached methods 32 | self._cached_resolve_workspace = lru_cache(maxsize=128)(self._resolve_workspace) 33 | self._cached_resolve_lakehouse = lru_cache(maxsize=128)(self._resolve_lakehouse) 34 | 35 | def _get_headers(self) -> Dict[str, str]: 36 | """Get headers for Fabric API calls""" 37 | return { 38 | "Authorization": f"Bearer {self.credential.get_token('https://api.fabric.microsoft.com/.default').token}" 39 | } 40 | 41 | def _build_url( 42 | self, endpoint: str, continuation_token: Optional[str] = None 43 | ) -> str: 44 | # If the endpoint starts with http, use it as-is. 45 | url = ( 46 | endpoint 47 | if endpoint.startswith("http") 48 | else f"{self.config.base_url}/{endpoint.lstrip('/')}" 49 | ) 50 | if continuation_token: 51 | separator = "&" if "?" in url else "?" 52 | encoded_token = quote(continuation_token) 53 | url += f"{separator}continuationToken={encoded_token}" 54 | return url 55 | 56 | async def _make_request( 57 | self, 58 | endpoint: str, 59 | params: Optional[Dict] = None, 60 | method: str = "GET", 61 | use_pagination: bool = False, 62 | data_key: str = "value", 63 | lro: bool = False, 64 | lro_poll_interval: int = 2, # seconds between polls 65 | lro_timeout: int = 300, # max seconds to wait 66 | ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: 67 | """ 68 | Make an asynchronous call to the Fabric API. 69 | 70 | If use_pagination is True, it will automatically handle paginated responses. 71 | 72 | If lro is True, will poll for long-running operation completion. 
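
        Example (a minimal sketch; ``workspace_id`` and the payload below are
        placeholders, and the calls are made from inside this class):

            # Paginated GET that accumulates every page of "value" entries
            items = await self._make_request(
                f"workspaces/{workspace_id}/items",
                use_pagination=True,
            )

            # POST that returns 202 + Operation-Location and is polled to completion
            created = await self._make_request(
                f"workspaces/{workspace_id}/notebooks",
                method="POST",
                params={"displayName": "demo_notebook"},
                lro=True,
            )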
73 | """ 74 | import time 75 | 76 | params = params or {} 77 | 78 | if not use_pagination: 79 | url = self._build_url(endpoint=endpoint) 80 | try: 81 | if method.upper() == "POST": 82 | # logger.debug(f"Authorization header: {self._get_headers()}") 83 | # logger.debug(f"Request URL: {url}") 84 | # logger.debug(f"Request parameters: {params}") 85 | response = requests.post( 86 | url, 87 | headers=self._get_headers(), 88 | json=params, 89 | timeout=120, 90 | ) 91 | else: 92 | if "maxResults" not in params: 93 | params["maxResults"] = self.config.max_results 94 | response = requests.request( 95 | method=method.upper(), 96 | url=url, 97 | headers=self._get_headers(), 98 | params=params, 99 | timeout=120, 100 | ) 101 | 102 | # LRO support: check for 202 and Operation-Location 103 | if lro and response.status_code == 202: 104 | op_url = response.headers.get( 105 | "Operation-Location" 106 | ) or response.headers.get("operation-location") 107 | if not op_url: 108 | logger.error("LRO: No Operation-Location header found.") 109 | return None 110 | logger.info(f"LRO: Polling {op_url} for operation status...") 111 | start_time = time.time() 112 | while True: 113 | poll_resp = requests.get( 114 | op_url, headers=self._get_headers(), timeout=60 115 | ) 116 | if poll_resp.status_code not in (200, 201, 202): 117 | logger.error( 118 | f"LRO: Poll failed with status {poll_resp.status_code}" 119 | ) 120 | return None 121 | poll_data = poll_resp.json() 122 | status = poll_data.get("status") or poll_data.get( 123 | "operationStatus" 124 | ) 125 | if status in ( 126 | "Succeeded", 127 | "succeeded", 128 | "Completed", 129 | "completed", 130 | ): 131 | logger.info("LRO: Operation succeeded.") 132 | return poll_data 133 | if status in ("Failed", "failed", "Canceled", "canceled"): 134 | logger.error( 135 | f"LRO: Operation failed or canceled. Status: {status}" 136 | ) 137 | return poll_data 138 | if time.time() - start_time > lro_timeout: 139 | logger.error("LRO: Polling timed out.") 140 | return None 141 | logger.debug( 142 | f"LRO: Status {status}, waiting {lro_poll_interval}s..." 143 | ) 144 | time.sleep(lro_poll_interval) 145 | response.raise_for_status() 146 | return response.json() 147 | except requests.RequestException as e: 148 | logger.error(f"API call failed: {str(e)}") 149 | if e.response is not None: 150 | logger.error(f"Response content: {e.response.text}") 151 | return None 152 | else: 153 | results = [] 154 | continuation_token = None 155 | while True: 156 | url = self._build_url( 157 | endpoint=endpoint, continuation_token=continuation_token 158 | ) 159 | request_params = params.copy() 160 | # Remove any existing continuationToken in parameters to avoid conflict. 
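                # The list endpoints return a 'continuationToken' in the response body;
                # _build_url re-encodes it onto the URL as a query parameter for the next page.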
161 | request_params.pop("continuationToken", None) 162 | try: 163 | if method.upper() == "POST": 164 | response = requests.post( 165 | url, 166 | headers=self._get_headers(), 167 | json=request_params, 168 | timeout=120, 169 | ) 170 | else: 171 | if "maxResults" not in request_params: 172 | request_params["maxResults"] = self.config.max_results 173 | response = requests.request( 174 | method=method.upper(), 175 | url=url, 176 | headers=self._get_headers(), 177 | params=request_params, 178 | timeout=120, 179 | ) 180 | response.raise_for_status() 181 | data = response.json() 182 | except requests.RequestException as e: 183 | logger.error(f"API call failed: {str(e)}") 184 | if e.response is not None: 185 | logger.error(f"Response content: {e.response.text}") 186 | return results if results else None 187 | 188 | if not isinstance(data, dict) or data_key not in data: 189 | raise ValueError(f"Unexpected response format: {data}") 190 | 191 | results.extend(data[data_key]) 192 | continuation_token = data.get("continuationToken") 193 | if not continuation_token: 194 | break 195 | return results 196 | 197 | async def get_workspaces(self) -> List[Dict]: 198 | """Get all available workspaces""" 199 | return await self._make_request("workspaces", use_pagination=True) 200 | 201 | async def get_lakehouses(self, workspace_id: str) -> List[Dict]: 202 | """Get all lakehouses in a workspace""" 203 | return await self.get_items(workspace_id=workspace_id, item_type="Lakehouse") 204 | 205 | async def get_warehouses(self, workspace_id: str) -> List[Dict]: 206 | """Get all warehouses in a workspace 207 | Args: 208 | workspace_id: ID of the workspace 209 | Returns: 210 | A list of dictionaries containing warehouse details or an error message. 211 | """ 212 | return await self.get_items(workspace_id=workspace_id, item_type="Warehouse") 213 | 214 | async def get_tables(self, workspace_id: str, rsc_id: str, type: str) -> List[Dict]: 215 | """Get all tables in a lakehouse 216 | Args: 217 | workspace_id: ID of the workspace 218 | rsc_id: ID of the lakehouse 219 | type: Type of the resource (e.g., "Lakehouse" or "Warehouse") 220 | Returns: 221 | A list of dictionaries containing table details or an error message. 222 | """ 223 | return await self._make_request( 224 | f"workspaces/{workspace_id}/{type}s/{rsc_id}/tables", 225 | use_pagination=True, 226 | data_key="data", 227 | ) 228 | 229 | async def get_reports(self, workspace_id: str) -> List[Dict]: 230 | """Get all reports in a lakehouse 231 | Args: 232 | workspace_id: ID of the workspace 233 | Returns: 234 | A list of dictionaries containing report details or an error message. 235 | """ 236 | return await self._make_request( 237 | f"workspaces/{workspace_id}/reports", 238 | use_pagination=True, 239 | data_key="value", 240 | ) 241 | 242 | async def get_report(self, workspace_id: str, report_id: str) -> Dict: 243 | """Get a specific report by ID 244 | 245 | Args: 246 | workspace_id: ID of the workspace 247 | report_id: ID of the report 248 | 249 | Returns: 250 | A dictionary containing the report details or an error message. 
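
        Example:
            A sketch only; ``client`` is a ``FabricApiClient`` instance and the
            workspace name is a placeholder:

                workspace_id = await client.resolve_workspace("Sales Analytics")
                report = await client.get_report(workspace_id, report_id)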
251 | """ 252 | return await self._make_request( 253 | f"workspaces/{workspace_id}/reports/{report_id}" 254 | ) 255 | 256 | async def get_semantic_models(self, workspace_id: str) -> List[Dict]: 257 | """Get all semantic models in a lakehouse""" 258 | return await self._make_request( 259 | f"workspaces/{workspace_id}/semanticModels", 260 | use_pagination=True, 261 | data_key="value", 262 | ) 263 | 264 | async def get_semantic_model(self, workspace_id: str, model_id: str) -> Dict: 265 | """Get a specific semantic model by ID""" 266 | return await self._make_request( 267 | f"workspaces/{workspace_id}/semanticModels/{model_id}" 268 | ) 269 | 270 | async def resolve_workspace(self, workspace: str) -> str: 271 | """Convert workspace name or ID to workspace ID with caching""" 272 | return await self._cached_resolve_workspace(workspace) 273 | 274 | async def _resolve_workspace(self, workspace: str) -> str: 275 | """Internal method to convert workspace name or ID to workspace ID""" 276 | if _is_valid_uuid(workspace): 277 | return workspace 278 | 279 | workspaces = await self.get_workspaces() 280 | matching_workspaces = [ 281 | w for w in workspaces if w["displayName"].lower() == workspace.lower() 282 | ] 283 | 284 | if not matching_workspaces: 285 | raise ValueError(f"No workspaces found with name: {workspace}") 286 | if len(matching_workspaces) > 1: 287 | raise ValueError(f"Multiple workspaces found with name: {workspace}") 288 | 289 | return matching_workspaces[0]["id"] 290 | 291 | async def resolve_lakehouse(self, workspace_id: str, lakehouse: str) -> str: 292 | """Convert lakehouse name or ID to lakehouse ID with caching""" 293 | return await self._cached_resolve_lakehouse(workspace_id, lakehouse) 294 | 295 | async def _resolve_lakehouse(self, workspace_id: str, lakehouse: str) -> str: 296 | """Internal method to convert lakehouse name or ID to lakehouse ID""" 297 | if _is_valid_uuid(lakehouse): 298 | return lakehouse 299 | 300 | lakehouses = await self.get_lakehouses(workspace_id) 301 | matching_lakehouses = [ 302 | lh for lh in lakehouses if lh["displayName"].lower() == lakehouse.lower() 303 | ] 304 | 305 | if not matching_lakehouses: 306 | raise ValueError(f"No lakehouse found with name: {lakehouse}") 307 | if len(matching_lakehouses) > 1: 308 | raise ValueError(f"Multiple lakehouses found with name: {lakehouse}") 309 | 310 | return matching_lakehouses[0]["id"] 311 | 312 | async def get_items( 313 | self, 314 | workspace_id: str, 315 | item_type: Optional[str] = None, 316 | params: Optional[Dict] = None, 317 | ) -> List[Dict]: 318 | """Get all items in a workspace""" 319 | if not _is_valid_uuid(workspace_id): 320 | raise ValueError("Invalid workspace ID.") 321 | if item_type: 322 | params = params or {} 323 | params["type"] = item_type 324 | return await self._make_request( 325 | f"workspaces/{workspace_id}/items", params=params, use_pagination=True 326 | ) 327 | 328 | async def get_item( 329 | self, 330 | item_id: str, 331 | workspace_id: str, 332 | item_type: Optional[str] = None, 333 | ) -> Dict: 334 | """Get a specific item by ID""" 335 | 336 | if not _is_valid_uuid(item_id): 337 | item_name, item_id = await self.resolve_item_name_and_id(item_id) 338 | if not _is_valid_uuid(workspace_id): 339 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id( 340 | workspace_id 341 | ) 342 | return await self._make_request( 343 | f"workspaces/{workspace_id}/{item_type}s/{item_id}" 344 | ) 345 | 346 | async def create_item( 347 | self, 348 | name: str, 349 | type: str, 350 | 
description: Optional[str] = None, 351 | definition: Optional[dict] = None, 352 | workspace: Optional[str | UUID] = None, 353 | lro: Optional[bool] = False, 354 | ): 355 | """ 356 | Creates an item in a Fabric workspace. 357 | 358 | Parameters 359 | ---------- 360 | name : str 361 | The name of the item to be created. 362 | type : str 363 | The type of the item to be created. 364 | description : str, default=None 365 | A description of the item to be created. 366 | definition : dict, default=None 367 | The definition of the item to be created. 368 | workspace : str | uuid.UUID, default=None 369 | The Fabric workspace name or ID. 370 | Defaults to None which resolves to the workspace of the attached lakehouse 371 | or if no lakehouse attached, resolves to the workspace of the notebook. 372 | """ 373 | from sempy_labs._utils import item_types 374 | 375 | if _is_valid_uuid(workspace): 376 | workspace_id = workspace 377 | else: 378 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id( 379 | workspace 380 | ) 381 | item_type = item_types.get(type)[0].lower() 382 | 383 | payload = { 384 | "displayName": name, 385 | } 386 | if description: 387 | payload["description"] = description 388 | if definition: 389 | payload["definition"] = definition 390 | 391 | try: 392 | response = await self._make_request( 393 | endpoint=f"workspaces/{workspace_id}/{item_type}s", 394 | method="post", 395 | params=payload, 396 | lro=lro, 397 | lro_poll_interval=0.5, 398 | ) 399 | except requests.RequestException as e: 400 | logger.error(f"API call failed: {str(e)}") 401 | if e.response is not None: 402 | logger.error(f"Response content: {e.response.text}") 403 | raise ValueError( 404 | f"Failed to create item '{name}' of type '{item_type}' in the '{workspace_id}' workspace." 
405 | ) 406 | 407 | # Check if response contains an error 408 | if isinstance(response, dict): 409 | if "error" in response: 410 | error_msg = response.get("error", {}).get("message", "Unknown error") 411 | logger.error(f"API error creating item: {error_msg}") 412 | raise ValueError(f"Failed to create item '{name}': {error_msg}") 413 | 414 | # Check if item was created successfully 415 | if "id" in response: 416 | logger.info(f"Successfully created item '{name}' with ID: {response['id']}") 417 | return response 418 | 419 | # If no ID and no error, log the full response for debugging 420 | logger.warning(f"Unexpected response format: {response}") 421 | 422 | # Legacy check - may not be reliable for all item types 423 | if hasattr(response, 'get') and response.get("displayName") and response.get("displayName") != name: 424 | logger.warning(f"Response displayName '{response.get('displayName')}' doesn't match requested name '{name}', but this may be normal") 425 | 426 | return response 427 | 428 | async def resolve_item_name_and_id( 429 | self, 430 | item: str | UUID, 431 | type: Optional[str] = None, 432 | workspace: Optional[str | UUID] = None, 433 | ) -> Tuple[str, UUID]: 434 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id( 435 | workspace 436 | ) 437 | item_id = await self.resolve_item_id( 438 | item=item, type=type, workspace=workspace_id 439 | ) 440 | item_data = await self._make_request( 441 | f"workspaces/{workspace_id}/items/{item_id}" 442 | ) 443 | item_name = item_data.get("displayName") 444 | return item_name, item_id 445 | 446 | async def resolve_item_id( 447 | self, 448 | item: str | UUID, 449 | type: Optional[str] = None, 450 | workspace: Optional[str | UUID] = None, 451 | ) -> UUID: 452 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id( 453 | workspace 454 | ) 455 | item_id = None 456 | 457 | if _is_valid_uuid(item): 458 | # Optionally verify that the item actually exists in the workspace 459 | item_id = item 460 | try: 461 | await self._make_request( 462 | endpoint=f"workspaces/{workspace_id}/items/{item_id}" 463 | ) 464 | except requests.RequestException: 465 | raise ValueError( 466 | f"The '{item_id}' item was not found in the '{workspace_name}' workspace." 467 | ) 468 | else: 469 | if type is None: 470 | raise ValueError( 471 | "The 'type' parameter is required if specifying an item name." 472 | ) 473 | responses = await self._make_request( 474 | endpoint=f"workspaces/{workspace_id}/items?type={type}", 475 | use_pagination=True, 476 | ) 477 | for v in responses: 478 | display_name = v["displayName"] 479 | if display_name == item: 480 | item_id = v.get("id") 481 | break 482 | 483 | if item_id is None: 484 | raise ValueError( 485 | f"There's no item '{item}' of type '{type}' in the '{workspace_name}' workspace." 486 | ) 487 | 488 | return item_id 489 | 490 | async def resolve_workspace_name_and_id( 491 | self, 492 | workspace: Optional[str | UUID] = None, 493 | ) -> Tuple[str, UUID]: 494 | """ 495 | Obtains the name and ID of the Fabric workspace. 496 | 497 | Parameters 498 | ---------- 499 | workspace : str | uuid.UUID, default=None 500 | The Fabric workspace name or ID. 501 | Although the parameter defaults to None, a workspace must be provided; 502 | passing None raises a ValueError. 503 | 504 | Returns 505 | ------- 506 | str, uuid.UUID 507 | The name and ID of the Fabric workspace.
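
        Examples
        --------
        A sketch only; ``client`` is a ``FabricApiClient`` instance and the
        display name is a placeholder. A name and a UUID resolve to the same pair:

            name, ws_id = await client.resolve_workspace_name_and_id("Sales Analytics")
            name, ws_id = await client.resolve_workspace_name_and_id(ws_id)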
508 | """ 509 | logger.debug(f"Resolving workspace name and ID for: {workspace}") 510 | if workspace is None: 511 | raise ValueError("Workspace must be specified.") 512 | elif _is_valid_uuid(workspace): 513 | workspace_id = workspace 514 | workspace_name = await self.resolve_workspace_name(workspace_id) 515 | return workspace_name, workspace_id 516 | else: 517 | responses = await self._make_request( 518 | endpoint="workspaces", use_pagination=True 519 | ) 520 | workspace_id = None 521 | workspace_name = None 522 | for r in responses: 523 | display_name = r.get("displayName") 524 | if display_name == workspace: 525 | workspace_name = workspace 526 | workspace_id = r.get("id") 527 | return workspace_name, workspace_id 528 | 529 | if workspace_name is None or workspace_id is None: 530 | raise ValueError("Workspace not found") 531 | 532 | return workspace_name, workspace_id 533 | 534 | async def resolve_workspace_name(self, workspace_id: Optional[UUID] = None) -> str: 535 | try: 536 | response = await self._make_request(endpoint=f"workspaces/{workspace_id}") 537 | if not response or "displayName" not in response: 538 | raise ValueError( 539 | f"Workspace '{workspace_id}' not found or API response invalid: {response}" 540 | ) 541 | except requests.RequestException: 542 | raise ValueError(f"The '{workspace_id}' workspace was not found.") 543 | 544 | return response.get("displayName") 545 | 546 | async def get_notebooks(self, workspace_id: str) -> List[Dict]: 547 | """Get all notebooks in a workspace""" 548 | return await self.get_items(workspace_id=workspace_id, item_type="Notebook") 549 | 550 | async def get_notebook(self, workspace_id: str, notebook_id: str) -> Dict: 551 | """Get a specific notebook by ID""" 552 | return await self.get_item( 553 | item_id=notebook_id, workspace_id=workspace_id, item_type="notebook" 554 | ) 555 | 556 | async def create_notebook( 557 | self, workspace_id: str, notebook_name: str, ipynb_name: str, content: str 558 | ) -> Dict: 559 | """Create a new notebook.""" 560 | if not _is_valid_uuid(workspace_id): 561 | raise ValueError("Invalid workspace ID.") 562 | 563 | # Define the notebook definition 564 | logger.debug( 565 | f"Defining notebook '{notebook_name}' in workspace '{workspace_id}'." 566 | ) 567 | definition = { 568 | "format": "ipynb", 569 | "parts": [ 570 | { 571 | "path": f"{ipynb_name}.ipynb", 572 | "payload": base64.b64encode( 573 | content 574 | if isinstance(content, bytes) 575 | else content.encode("utf-8") 576 | ).decode("utf-8"), 577 | "payloadType": "InlineBase64", 578 | }, 579 | # { 580 | # "path": ".platform", 581 | # "payload": base64.b64encode("dotPlatformBase64String".encode("utf-8")).decode("utf-8"), 582 | # "payloadType": "InlineBase64", 583 | # }, 584 | ], 585 | } 586 | logger.info( 587 | f"-------Creating notebook '{notebook_name}' in workspace '{workspace_id}'." 
588 | ) 589 | return await self.create_item( 590 | workspace=workspace_id, 591 | type="Notebook", 592 | name=notebook_name, 593 | definition=definition, 594 | ) 595 | -------------------------------------------------------------------------------- /tools/notebook.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | NotebookClient, 7 | ) 8 | import json 9 | from helpers.logging_config import get_logger 10 | 11 | 12 | from typing import Optional, Dict, List, Any 13 | import base64 14 | import re 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | @mcp.tool() 20 | async def list_notebooks(workspace: Optional[str] = None, ctx: Context = None) -> str: 21 | """List all notebooks in a Fabric workspace. 22 | 23 | Args: 24 | workspace: Name or ID of the workspace (optional) 25 | ctx: Context object containing client information 26 | Returns: 27 | A string containing the list of notebooks or an error message. 28 | """ 29 | 30 | try: 31 | if ctx is None: 32 | raise ValueError("Context (ctx) must be provided.") 33 | 34 | notebook_client = NotebookClient( 35 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 36 | ) 37 | return await notebook_client.list_notebooks(workspace) 38 | except Exception as e: 39 | logger.error(f"Error listing notebooks: {str(e)}") 40 | return f"Error listing notebooks: {str(e)}" 41 | 42 | 43 | @mcp.tool() 44 | async def create_notebook( 45 | workspace: str, 46 | # notebook_name: str, 47 | # content: str, 48 | ctx: Context = None, 49 | ) -> str: 50 | """Create a new notebook in a Fabric workspace. 51 | 52 | Args: 53 | workspace: Name or ID of the workspace 54 | notebook_name: Name of the new notebook 55 | content: Content of the notebook (in JSON format) 56 | ctx: Context object containing client information 57 | Returns: 58 | A string containing the ID of the created notebook or an error message. 59 | """ 60 | notebook_json = { 61 | "nbformat": 4, 62 | "nbformat_minor": 5, 63 | "cells": [ 64 | { 65 | "cell_type": "code", 66 | "source": ["print('Hello, Fabric!')\n"], 67 | "execution_count": None, 68 | "outputs": [], 69 | "metadata": {}, 70 | } 71 | ], 72 | "metadata": {"language_info": {"name": "python"}}, 73 | } 74 | notebook_content = json.dumps(notebook_json) 75 | try: 76 | if ctx is None: 77 | raise ValueError("Context (ctx) must be provided.") 78 | 79 | notebook_client = NotebookClient( 80 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 81 | ) 82 | response = await notebook_client.create_notebook( 83 | workspace, "test_notebook_2", notebook_content 84 | ) 85 | return response.get("id", "") # Return the notebook ID or an empty string 86 | except Exception as e: 87 | logger.error(f"Error creating notebook: {str(e)}") 88 | return f"Error creating notebook: {str(e)}" 89 | 90 | 91 | @mcp.tool() 92 | async def get_notebook_content( 93 | workspace: str, 94 | notebook_id: str, 95 | ctx: Context = None 96 | ) -> str: 97 | """Get the content of a specific notebook in a Fabric workspace. 98 | 99 | Args: 100 | workspace: Name or ID of the workspace 101 | notebook_id: ID or name of the notebook 102 | ctx: Context object containing client information 103 | Returns: 104 | A string containing the notebook content in JSON format or an error message. 
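
    Example:
        The returned string is the decoded ``.ipynb`` JSON, so it can be parsed
        directly (a sketch; the workspace name and notebook ID are placeholders):

            content = await get_notebook_content("Sales Analytics", "<notebook-id>", ctx=ctx)
            if not content.startswith("Error"):
                cells = json.loads(content).get("cells", [])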
105 | """ 106 | try: 107 | if ctx is None: 108 | raise ValueError("Context (ctx) must be provided.") 109 | 110 | notebook_client = NotebookClient( 111 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 112 | ) 113 | 114 | # Get the notebook details 115 | notebook = await notebook_client.get_notebook(workspace, notebook_id) 116 | 117 | if isinstance(notebook, str): # Error message 118 | return notebook 119 | 120 | # Extract and decode the notebook content 121 | definition = notebook.get("definition", {}) 122 | parts = definition.get("parts", []) 123 | 124 | for part in parts: 125 | if part.get("path", "").endswith(".ipynb"): 126 | payload = part.get("payload", "") 127 | if payload: 128 | # Decode base64 content 129 | decoded_content = base64.b64decode(payload).decode("utf-8") 130 | return decoded_content 131 | 132 | return "No notebook content found in the definition." 133 | 134 | except Exception as e: 135 | logger.error(f"Error getting notebook content: {str(e)}") 136 | return f"Error getting notebook content: {str(e)}" 137 | 138 | 139 | @mcp.tool() 140 | async def create_pyspark_notebook( 141 | workspace: str, 142 | notebook_name: str, 143 | template_type: str = "basic", 144 | ctx: Context = None, 145 | ) -> str: 146 | """Create a new PySpark notebook from a template in a Fabric workspace. 147 | 148 | Args: 149 | workspace: Name or ID of the workspace 150 | notebook_name: Name of the new notebook 151 | template_type: Type of PySpark template ('basic', 'etl', 'analytics', 'ml') 152 | ctx: Context object containing client information 153 | Returns: 154 | A string containing the ID of the created notebook or an error message. 155 | """ 156 | try: 157 | if ctx is None: 158 | raise ValueError("Context (ctx) must be provided.") 159 | 160 | # Define PySpark templates 161 | templates = { 162 | "basic": { 163 | "cells": [ 164 | { 165 | "cell_type": "markdown", 166 | "source": [ 167 | "# PySpark Notebook\n", 168 | "\n", 169 | "This notebook demonstrates basic PySpark operations in Microsoft Fabric.\n" 170 | ], 171 | "metadata": {} 172 | }, 173 | { 174 | "cell_type": "code", 175 | "source": [ 176 | "# Initialize Spark session\n", 177 | "from pyspark.sql import SparkSession\n", 178 | "from pyspark.sql.functions import *\n", 179 | "from pyspark.sql.types import *\n", 180 | "\n", 181 | "# Spark session is already available as 'spark' in Fabric\n", 182 | "print(f\"Spark version: {spark.version}\")\n", 183 | "print(f\"Available cores: {spark.sparkContext.defaultParallelism}\")\n" 184 | ], 185 | "execution_count": None, 186 | "outputs": [], 187 | "metadata": {} 188 | }, 189 | { 190 | "cell_type": "code", 191 | "source": [ 192 | "# Sample data creation\n", 193 | "sample_data = [\n", 194 | " (1, \"John\", 25, \"Engineering\"),\n", 195 | " (2, \"Jane\", 30, \"Marketing\"),\n", 196 | " (3, \"Bob\", 35, \"Sales\"),\n", 197 | " (4, \"Alice\", 28, \"Engineering\")\n", 198 | "]\n", 199 | "\n", 200 | "schema = StructType([\n", 201 | " StructField(\"id\", IntegerType(), True),\n", 202 | " StructField(\"name\", StringType(), True),\n", 203 | " StructField(\"age\", IntegerType(), True),\n", 204 | " StructField(\"department\", StringType(), True)\n", 205 | "])\n", 206 | "\n", 207 | "df = spark.createDataFrame(sample_data, schema)\n", 208 | "df.show()\n" 209 | ], 210 | "execution_count": None, 211 | "outputs": [], 212 | "metadata": {} 213 | } 214 | ] 215 | }, 216 | "etl": { 217 | "cells": [ 218 | { 219 | "cell_type": "markdown", 220 | "source": [ 221 | "# PySpark ETL Pipeline\n", 222 | "\n", 223 | 
"This notebook demonstrates an ETL pipeline using PySpark in Microsoft Fabric.\n" 224 | ], 225 | "metadata": {} 226 | }, 227 | { 228 | "cell_type": "code", 229 | "source": [ 230 | "# Import necessary libraries\n", 231 | "from pyspark.sql import SparkSession\n", 232 | "from pyspark.sql.functions import *\n", 233 | "from pyspark.sql.types import *\n", 234 | "from delta.tables import DeltaTable\n", 235 | "\n", 236 | "print(f\"Spark version: {spark.version}\")\n" 237 | ], 238 | "execution_count": None, 239 | "outputs": [], 240 | "metadata": {} 241 | }, 242 | { 243 | "cell_type": "code", 244 | "source": [ 245 | "# Extract: Read data from source\n", 246 | "# Example: Reading from a lakehouse table\n", 247 | "# df_source = spark.table(\"lakehouse.table_name\")\n", 248 | "\n", 249 | "# For demo purposes, create sample data\n", 250 | "raw_data = [\n", 251 | " (\"2024-01-01\", \"Product A\", 100, 25.50),\n", 252 | " (\"2024-01-01\", \"Product B\", 150, 30.00),\n", 253 | " (\"2024-01-02\", \"Product A\", 120, 25.50),\n", 254 | " (\"2024-01-02\", \"Product C\", 80, 45.00)\n", 255 | "]\n", 256 | "\n", 257 | "schema = StructType([\n", 258 | " StructField(\"date\", StringType(), True),\n", 259 | " StructField(\"product\", StringType(), True),\n", 260 | " StructField(\"quantity\", IntegerType(), True),\n", 261 | " StructField(\"price\", DoubleType(), True)\n", 262 | "])\n", 263 | "\n", 264 | "df_raw = spark.createDataFrame(raw_data, schema)\n", 265 | "print(\"Raw data:\")\n", 266 | "df_raw.show()\n" 267 | ], 268 | "execution_count": None, 269 | "outputs": [], 270 | "metadata": {} 271 | }, 272 | { 273 | "cell_type": "code", 274 | "source": [ 275 | "# Transform: Clean and process data\n", 276 | "df_transformed = df_raw \\\n", 277 | " .withColumn(\"date\", to_date(col(\"date\"), \"yyyy-MM-dd\")) \\\n", 278 | " .withColumn(\"revenue\", col(\"quantity\") * col(\"price\")) \\\n", 279 | " .withColumn(\"year\", year(col(\"date\"))) \\\n", 280 | " .withColumn(\"month\", month(col(\"date\")))\n", 281 | "\n", 282 | "print(\"Transformed data:\")\n", 283 | "df_transformed.show()\n", 284 | "df_transformed.printSchema()\n" 285 | ], 286 | "execution_count": None, 287 | "outputs": [], 288 | "metadata": {} 289 | }, 290 | { 291 | "cell_type": "code", 292 | "source": [ 293 | "# Load: Write processed data to target\n", 294 | "# Example: Writing to a Delta table in lakehouse\n", 295 | "# df_transformed.write \\\n", 296 | "# .format(\"delta\") \\\n", 297 | "# .mode(\"overwrite\") \\\n", 298 | "# .saveAsTable(\"lakehouse.processed_sales\")\n", 299 | "\n", 300 | "print(\"ETL pipeline completed successfully!\")\n", 301 | "print(f\"Processed {df_transformed.count()} records\")\n" 302 | ], 303 | "execution_count": None, 304 | "outputs": [], 305 | "metadata": {} 306 | } 307 | ] 308 | }, 309 | "analytics": { 310 | "cells": [ 311 | { 312 | "cell_type": "markdown", 313 | "source": [ 314 | "# PySpark Data Analytics\n", 315 | "\n", 316 | "This notebook demonstrates data analytics using PySpark in Microsoft Fabric.\n" 317 | ], 318 | "metadata": {} 319 | }, 320 | { 321 | "cell_type": "code", 322 | "source": [ 323 | "# Import libraries for analytics\n", 324 | "from pyspark.sql import SparkSession\n", 325 | "from pyspark.sql.functions import *\n", 326 | "from pyspark.sql.types import *\n", 327 | "from pyspark.sql.window import Window\n", 328 | "\n", 329 | "print(f\"Spark version: {spark.version}\")\n" 330 | ], 331 | "execution_count": None, 332 | "outputs": [], 333 | "metadata": {} 334 | }, 335 | { 336 | "cell_type": "code", 337 | "source": [ 
338 | "# Create sample sales data for analytics\n", 339 | "sales_data = [\n", 340 | " (\"2024-01-01\", \"North\", \"Product A\", 1000, 100),\n", 341 | " (\"2024-01-01\", \"South\", \"Product A\", 800, 80),\n", 342 | " (\"2024-01-02\", \"North\", \"Product B\", 1200, 120),\n", 343 | " (\"2024-01-02\", \"South\", \"Product B\", 900, 90),\n", 344 | " (\"2024-01-03\", \"East\", \"Product A\", 1100, 110),\n", 345 | " (\"2024-01-03\", \"West\", \"Product C\", 700, 70)\n", 346 | "]\n", 347 | "\n", 348 | "schema = StructType([\n", 349 | " StructField(\"date\", StringType(), True),\n", 350 | " StructField(\"region\", StringType(), True),\n", 351 | " StructField(\"product\", StringType(), True),\n", 352 | " StructField(\"revenue\", IntegerType(), True),\n", 353 | " StructField(\"quantity\", IntegerType(), True)\n", 354 | "])\n", 355 | "\n", 356 | "df_sales = spark.createDataFrame(sales_data, schema)\n", 357 | "df_sales = df_sales.withColumn(\"date\", to_date(col(\"date\"), \"yyyy-MM-dd\"))\n", 358 | "df_sales.show()\n" 359 | ], 360 | "execution_count": None, 361 | "outputs": [], 362 | "metadata": {} 363 | }, 364 | { 365 | "cell_type": "code", 366 | "source": [ 367 | "# Aggregation analysis\n", 368 | "print(\"=== Revenue by Region ===\")\n", 369 | "df_sales.groupBy(\"region\") \\\n", 370 | " .agg(sum(\"revenue\").alias(\"total_revenue\"),\n", 371 | " sum(\"quantity\").alias(\"total_quantity\"),\n", 372 | " count(\"*\").alias(\"transaction_count\")) \\\n", 373 | " .orderBy(desc(\"total_revenue\")) \\\n", 374 | " .show()\n", 375 | "\n", 376 | "print(\"=== Revenue by Product ===\")\n", 377 | "df_sales.groupBy(\"product\") \\\n", 378 | " .agg(sum(\"revenue\").alias(\"total_revenue\"),\n", 379 | " avg(\"revenue\").alias(\"avg_revenue\")) \\\n", 380 | " .orderBy(desc(\"total_revenue\")) \\\n", 381 | " .show()\n" 382 | ], 383 | "execution_count": None, 384 | "outputs": [], 385 | "metadata": {} 386 | }, 387 | { 388 | "cell_type": "code", 389 | "source": [ 390 | "# Window functions for advanced analytics\n", 391 | "windowSpec = Window.partitionBy(\"region\").orderBy(\"date\")\n", 392 | "\n", 393 | "df_analytics = df_sales \\\n", 394 | " .withColumn(\"running_total\", sum(\"revenue\").over(windowSpec)) \\\n", 395 | " .withColumn(\"row_number\", row_number().over(windowSpec)) \\\n", 396 | " .withColumn(\"rank\", rank().over(windowSpec.orderBy(desc(\"revenue\"))))\n", 397 | "\n", 398 | "print(\"=== Advanced Analytics with Window Functions ===\")\n", 399 | "df_analytics.select(\"date\", \"region\", \"product\", \"revenue\", \n", 400 | " \"running_total\", \"row_number\", \"rank\") \\\n", 401 | " .orderBy(\"region\", \"date\") \\\n", 402 | " .show()\n" 403 | ], 404 | "execution_count": None, 405 | "outputs": [], 406 | "metadata": {} 407 | } 408 | ] 409 | }, 410 | "ml": { 411 | "cells": [ 412 | { 413 | "cell_type": "markdown", 414 | "source": [ 415 | "# PySpark Machine Learning\n", 416 | "\n", 417 | "This notebook demonstrates machine learning with PySpark MLlib in Microsoft Fabric.\n" 418 | ], 419 | "metadata": {} 420 | }, 421 | { 422 | "cell_type": "code", 423 | "source": [ 424 | "# Import ML libraries\n", 425 | "from pyspark.sql import SparkSession\n", 426 | "from pyspark.sql.functions import *\n", 427 | "from pyspark.sql.types import *\n", 428 | "from pyspark.ml.feature import VectorAssembler, StandardScaler\n", 429 | "from pyspark.ml.regression import LinearRegression\n", 430 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 431 | "from pyspark.ml import Pipeline\n", 432 | "\n", 433 | 
"print(f\"Spark version: {spark.version}\")\n" 434 | ], 435 | "execution_count": None, 436 | "outputs": [], 437 | "metadata": {} 438 | }, 439 | { 440 | "cell_type": "code", 441 | "source": [ 442 | "# Create sample dataset for regression\n", 443 | "ml_data = [\n", 444 | " (1, 2.0, 3.0, 4.0, 10.0),\n", 445 | " (2, 3.0, 4.0, 5.0, 15.0),\n", 446 | " (3, 4.0, 5.0, 6.0, 20.0),\n", 447 | " (4, 5.0, 6.0, 7.0, 25.0),\n", 448 | " (5, 6.0, 7.0, 8.0, 30.0),\n", 449 | " (6, 7.0, 8.0, 9.0, 35.0)\n", 450 | "]\n", 451 | "\n", 452 | "schema = StructType([\n", 453 | " StructField(\"id\", IntegerType(), True),\n", 454 | " StructField(\"feature1\", DoubleType(), True),\n", 455 | " StructField(\"feature2\", DoubleType(), True),\n", 456 | " StructField(\"feature3\", DoubleType(), True),\n", 457 | " StructField(\"label\", DoubleType(), True)\n", 458 | "])\n", 459 | "\n", 460 | "df_ml = spark.createDataFrame(ml_data, schema)\n", 461 | "print(\"Sample ML dataset:\")\n", 462 | "df_ml.show()\n" 463 | ], 464 | "execution_count": None, 465 | "outputs": [], 466 | "metadata": {} 467 | }, 468 | { 469 | "cell_type": "code", 470 | "source": [ 471 | "# Feature engineering pipeline\n", 472 | "feature_cols = [\"feature1\", \"feature2\", \"feature3\"]\n", 473 | "\n", 474 | "# Assemble features into a vector\n", 475 | "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"raw_features\")\n", 476 | "\n", 477 | "# Scale features\n", 478 | "scaler = StandardScaler(inputCol=\"raw_features\", outputCol=\"features\")\n", 479 | "\n", 480 | "# Linear regression model\n", 481 | "lr = LinearRegression(featuresCol=\"features\", labelCol=\"label\")\n", 482 | "\n", 483 | "# Create pipeline\n", 484 | "pipeline = Pipeline(stages=[assembler, scaler, lr])\n", 485 | "\n", 486 | "print(\"ML Pipeline created with stages: Feature Assembly -> Scaling -> Linear Regression\")\n" 487 | ], 488 | "execution_count": None, 489 | "outputs": [], 490 | "metadata": {} 491 | }, 492 | { 493 | "cell_type": "code", 494 | "source": [ 495 | "# Split data and train model\n", 496 | "train_data, test_data = df_ml.randomSplit([0.8, 0.2], seed=42)\n", 497 | "\n", 498 | "print(f\"Training data count: {train_data.count()}\")\n", 499 | "print(f\"Test data count: {test_data.count()}\")\n", 500 | "\n", 501 | "# Train the pipeline\n", 502 | "model = pipeline.fit(train_data)\n", 503 | "\n", 504 | "# Make predictions\n", 505 | "predictions = model.transform(test_data)\n", 506 | "\n", 507 | "print(\"\\nPredictions:\")\n", 508 | "predictions.select(\"id\", \"label\", \"prediction\").show()\n" 509 | ], 510 | "execution_count": None, 511 | "outputs": [], 512 | "metadata": {} 513 | }, 514 | { 515 | "cell_type": "code", 516 | "source": [ 517 | "# Evaluate model performance\n", 518 | "evaluator = RegressionEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"rmse\")\n", 519 | "rmse = evaluator.evaluate(predictions)\n", 520 | "\n", 521 | "evaluator_r2 = RegressionEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"r2\")\n", 522 | "r2 = evaluator_r2.evaluate(predictions)\n", 523 | "\n", 524 | "print(f\"Root Mean Square Error (RMSE): {rmse:.3f}\")\n", 525 | "print(f\"R-squared (R2): {r2:.3f}\")\n", 526 | "\n", 527 | "# Get model coefficients\n", 528 | "lr_model = model.stages[-1]\n", 529 | "print(f\"\\nModel coefficients: {lr_model.coefficients}\")\n", 530 | "print(f\"Model intercept: {lr_model.intercept:.3f}\")\n" 531 | ], 532 | "execution_count": None, 533 | "outputs": [], 534 | "metadata": {} 535 | } 536 | ] 537 | } 538 | } 539 | 540 | 
if template_type not in templates: 541 | return f"Invalid template type. Available templates: {', '.join(templates.keys())}" 542 | 543 | # Create notebook JSON structure 544 | notebook_json = { 545 | "nbformat": 4, 546 | "nbformat_minor": 5, 547 | "cells": templates[template_type]["cells"], 548 | "metadata": { 549 | "language_info": {"name": "python"}, 550 | "kernel_info": {"name": "synapse_pyspark"}, 551 | "description": f"PySpark notebook created from {template_type} template" 552 | }, 553 | } 554 | 555 | notebook_content = json.dumps(notebook_json, indent=2) 556 | 557 | notebook_client = NotebookClient( 558 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 559 | ) 560 | response = await notebook_client.create_notebook( 561 | workspace, notebook_name, notebook_content 562 | ) 563 | 564 | if isinstance(response, dict) and response.get("id"): 565 | return f"Created PySpark notebook '{notebook_name}' with ID: {response['id']}" 566 | else: 567 | return f"Failed to create notebook: {response}" 568 | 569 | except Exception as e: 570 | logger.error(f"Error creating PySpark notebook: {str(e)}") 571 | return f"Error creating PySpark notebook: {str(e)}" 572 | 573 | @mcp.tool() 574 | async def generate_pyspark_code( 575 | operation: str, 576 | source_table: Optional[str] = None, 577 | target_table: Optional[str] = None, 578 | columns: Optional[str] = None, 579 | filter_condition: Optional[str] = None, 580 | ctx: Context = None, 581 | ) -> str: 582 | """Generate PySpark code for common operations. 583 | 584 | Args: 585 | operation: Type of operation ('read_table', 'write_table', 'transform', 'join', 'aggregate') 586 | source_table: Source table name (format: lakehouse.table_name) 587 | target_table: Target table name (format: lakehouse.table_name) 588 | columns: Comma-separated list of columns 589 | filter_condition: Filter condition for data 590 | ctx: Context object containing client information 591 | Returns: 592 | A string containing the generated PySpark code or an error message. 
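
    Example:
        A sketch of a typical call; the table name is a placeholder:

            code = await generate_pyspark_code(
                operation="read_table",
                source_table="sales_lakehouse.orders",
                ctx=ctx,
            )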
593 | """ 594 | try: 595 | code_templates = { 596 | "read_table": f"""# Read data from table 597 | df = spark.table("{source_table or 'lakehouse.table_name'}") 598 | df.show() 599 | df.printSchema()""", 600 | 601 | "write_table": f"""# Write data to table 602 | df.write \\ 603 | .format("delta") \\ 604 | .mode("overwrite") \\ 605 | .saveAsTable("{target_table or 'lakehouse.output_table'}") 606 | 607 | print(f"Successfully wrote {{df.count()}} records to {target_table or 'lakehouse.output_table'}")""", 608 | 609 | "transform": f"""# Data transformation 610 | from pyspark.sql.functions import * 611 | 612 | df_transformed = df \\ 613 | .select({columns or '*'}) \\ 614 | {f'.filter({filter_condition})' if filter_condition else ''} \\ 615 | .withColumn("processed_date", current_timestamp()) 616 | 617 | df_transformed.show()""", 618 | 619 | "join": f"""# Join tables 620 | df1 = spark.table("{source_table or 'lakehouse.table1'}") 621 | df2 = spark.table("{target_table or 'lakehouse.table2'}") 622 | 623 | # Inner join (modify join condition as needed) 624 | df_joined = df1.join(df2, df1.id == df2.id, "inner") 625 | 626 | df_joined.show()""", 627 | 628 | "aggregate": f"""# Data aggregation 629 | from pyspark.sql.functions import * 630 | 631 | df_agg = df \\ 632 | .groupBy({columns or '"column1"'}) \\ 633 | .agg( 634 | count("*").alias("count"), 635 | sum("amount").alias("total_amount"), 636 | avg("amount").alias("avg_amount"), 637 | max("date").alias("max_date") 638 | ) \\ 639 | .orderBy(desc("total_amount")) 640 | 641 | df_agg.show()""", 642 | 643 | "schema_inference": f"""# Schema inference and data profiling 644 | print("=== Schema Information ===") 645 | df.printSchema() 646 | 647 | print("\\n=== Data Profile ===") 648 | print(f"Record count: {{df.count()}}") 649 | print(f"Column count: {{len(df.columns)}}") 650 | 651 | print("\\n=== Column Statistics ===") 652 | df.describe().show() 653 | 654 | print("\\n=== Null Value Analysis ===") 655 | from pyspark.sql.functions import col, sum as spark_sum, isnan, when, count 656 | 657 | null_counts = df.select([ 658 | spark_sum(when(col(c).isNull() | isnan(col(c)), 1).otherwise(0)).alias(c) 659 | for c in df.columns 660 | ]) 661 | null_counts.show()""", 662 | 663 | "data_quality": f"""# Data quality checks 664 | from pyspark.sql.functions import * 665 | 666 | print("=== Data Quality Report ===") 667 | 668 | # Check for duplicates 669 | duplicate_count = df.count() - df.distinct().count() 670 | print(f"Duplicate rows: {{duplicate_count}}") 671 | 672 | # Check for null values 673 | total_rows = df.count() 674 | for column in df.columns: 675 | null_count = df.filter(col(column).isNull()).count() 676 | null_percentage = (null_count / total_rows) * 100 677 | print(f"{{column}}: {{null_count}} nulls ({{null_percentage:.2f}}%)") 678 | 679 | # Check data ranges (for numeric columns) 680 | numeric_columns = [field.name for field in df.schema.fields 681 | if field.dataType.simpleString() in ['int', 'double', 'float', 'bigint']] 682 | 683 | if numeric_columns: 684 | print("\\n=== Numeric Column Ranges ===") 685 | df.select([ 686 | min(col(c)).alias(f"{c}_min"), 687 | max(col(c)).alias(f"{c}_max") 688 | for c in numeric_columns 689 | ]).show()""", 690 | 691 | "performance_optimization": f"""# Performance optimization techniques 692 | 693 | # 1. Cache frequently used DataFrames 694 | df.cache() 695 | print(f"Cached DataFrame with {{df.count()}} records") 696 | 697 | # 2. 
Repartition for better parallelism 698 | optimal_partitions = spark.sparkContext.defaultParallelism * 2 699 | df_repartitioned = df.repartition(optimal_partitions) 700 | 701 | # 3. Use broadcast for small dimension tables (< 200MB) 702 | from pyspark.sql.functions import broadcast 703 | # df_joined = large_df.join(broadcast(small_df), "key") 704 | 705 | # 4. Optimize file formats - use Delta Lake 706 | df.write \\ 707 | .format("delta") \\ 708 | .mode("overwrite") \\ 709 | .option("optimizeWrite", "true") \\ 710 | .option("autoOptimize", "true") \\ 711 | .saveAsTable("{target_table or 'lakehouse.optimized_table'}") 712 | 713 | # 5. Show execution plan 714 | df.explain(True)""" 715 | } 716 | 717 | if operation not in code_templates: 718 | available_ops = ", ".join(code_templates.keys()) 719 | return f"Invalid operation. Available operations: {available_ops}" 720 | 721 | generated_code = code_templates[operation] 722 | 723 | return f"""```python 724 | {generated_code} 725 | ``` 726 | 727 | **Generated PySpark code for '{operation}' operation** 728 | 729 | This code can be copied into a notebook cell and executed. Remember to: 730 | - Replace placeholder table names with actual table names 731 | - Adjust column names and conditions as needed 732 | - Test with a small dataset first 733 | - Review the execution plan for performance optimization""" 734 | 735 | except Exception as e: 736 | logger.error(f"Error generating PySpark code: {str(e)}") 737 | return f"Error generating PySpark code: {str(e)}" 738 | 739 | @mcp.tool() 740 | async def validate_pyspark_code( 741 | code: str, 742 | ctx: Context = None, 743 | ) -> str: 744 | """Validate PySpark code for syntax and best practices. 745 | 746 | Args: 747 | code: PySpark code to validate 748 | ctx: Context object containing client information 749 | Returns: 750 | A string containing validation results and suggestions. 751 | """ 752 | try: 753 | validation_results = [] 754 | warnings = [] 755 | suggestions = [] 756 | 757 | # Basic syntax validation 758 | try: 759 | compile(code, '', 'exec') 760 | validation_results.append("✅ Syntax validation: PASSED") 761 | except SyntaxError as e: 762 | validation_results.append(f"❌ Syntax validation: FAILED - {e}") 763 | return "\n".join(validation_results) 764 | 765 | # PySpark best practices checks 766 | lines = code.split('\n') 767 | 768 | # Check for common imports 769 | has_spark_imports = any('from pyspark' in line or 'import pyspark' in line for line in lines) 770 | if not has_spark_imports: 771 | warnings.append("⚠️ No PySpark imports detected. Add: from pyspark.sql import SparkSession") 772 | 773 | # Check for DataFrame operations 774 | has_df_operations = any('df.' in line or '.show()' in line for line in lines) 775 | if has_df_operations: 776 | validation_results.append("✅ DataFrame operations detected") 777 | 778 | # Check for performance anti-patterns 779 | if '.collect()' in code: 780 | warnings.append("⚠️ .collect() detected - avoid on large datasets, use .show() or .take() instead") 781 | 782 | if '.toPandas()' in code: 783 | warnings.append("⚠️ .toPandas() detected - ensure dataset fits in driver memory") 784 | 785 | if 'for row in df.collect()' in code: 786 | warnings.append("❌ Anti-pattern: iterating over collected DataFrame. 
Use DataFrame operations instead") 787 | 788 | # Check for caching opportunities 789 | df_count = code.count('df.') 790 | if df_count > 3 and '.cache()' not in code and '.persist()' not in code: 791 | suggestions.append("💡 Consider caching DataFrame with .cache() for repeated operations") 792 | 793 | # Check for schema definition 794 | if 'createDataFrame' in code and 'StructType' not in code: 795 | suggestions.append("💡 Consider defining explicit schema when creating DataFrames") 796 | 797 | # Check for null handling 798 | if '.filter(' in code and 'isNull' not in code and 'isNotNull' not in code: 799 | suggestions.append("💡 Consider adding null value handling in filters") 800 | 801 | # Check for partitioning 802 | if '.write.' in code and 'partitionBy' not in code: 803 | suggestions.append("💡 Consider partitioning data when writing large datasets") 804 | 805 | # Check for Delta Lake usage 806 | if '.write.' in code and 'format("delta")' not in code: 807 | suggestions.append("💡 Consider using Delta Lake format for ACID transactions and time travel") 808 | 809 | # Compile results 810 | result = "# PySpark Code Validation Report\n\n" 811 | result += "## Validation Results\n" 812 | result += "\n".join(validation_results) + "\n\n" 813 | 814 | if warnings: 815 | result += "## Warnings\n" 816 | result += "\n".join(warnings) + "\n\n" 817 | 818 | if suggestions: 819 | result += "## Optimization Suggestions\n" 820 | result += "\n".join(suggestions) + "\n\n" 821 | 822 | if not warnings and not suggestions: 823 | result += "## Summary\n✅ Code looks good! No issues detected.\n" 824 | else: 825 | result += f"## Summary\n📊 Found {len(warnings)} warnings and {len(suggestions)} optimization opportunities.\n" 826 | 827 | return result 828 | 829 | except Exception as e: 830 | logger.error(f"Error validating PySpark code: {str(e)}") 831 | return f"Error validating PySpark code: {str(e)}" 832 | 833 | @mcp.tool() 834 | async def update_notebook_cell( 835 | workspace: str, 836 | notebook_id: str, 837 | cell_index: int, 838 | cell_content: str, 839 | cell_type: str = "code", 840 | ctx: Context = None, 841 | ) -> str: 842 | """Update a specific cell in a notebook. 843 | 844 | Args: 845 | workspace: Name or ID of the workspace 846 | notebook_id: ID or name of the notebook 847 | cell_index: Index of the cell to update (0-based) 848 | cell_content: New content for the cell 849 | cell_type: Type of cell ('code' or 'markdown') 850 | ctx: Context object containing client information 851 | Returns: 852 | A string confirming the update or an error message. 853 | """ 854 | try: 855 | if ctx is None: 856 | raise ValueError("Context (ctx) must be provided.") 857 | 858 | # Get current notebook content 859 | current_content = await get_notebook_content(workspace, notebook_id, ctx) 860 | 861 | if current_content.startswith("Error"): 862 | return current_content 863 | 864 | # Parse the notebook JSON 865 | notebook_data = json.loads(current_content) 866 | cells = notebook_data.get("cells", []) 867 | 868 | if cell_index >= len(cells): 869 | return f"Cell index {cell_index} is out of range. Notebook has {len(cells)} cells." 
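        # The replacement cell built below follows the Jupyter nbformat v4 cell layout
        # (cell_type, source, metadata, plus execution_count/outputs for code cells);
        # this assumes get_notebook_content returns the notebook in that same format.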
870 | 
871 |         # Update the cell
872 |         cells[cell_index] = {
873 |             "cell_type": cell_type,
874 |             "source": cell_content.splitlines(keepends=True) if isinstance(cell_content, str) else cell_content,
875 |             "execution_count": None,
876 |             "outputs": [],
877 |             "metadata": {}
878 |         }
879 | 
880 |         # Serialize the updated notebook (kept for when NotebookClient supports updates)
881 |         updated_content = json.dumps(notebook_data, indent=2)
882 | 
883 |         notebook_client = NotebookClient(
884 |             FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
885 |         )
886 | 
887 |         # NotebookClient does not expose an update method yet, so the change is only
888 |         # prepared locally and is not written back to the Fabric notebook.
889 |         return f"Prepared update for cell {cell_index} with {cell_type} content ({len(cell_content)} characters). Note: the notebook was not modified because NotebookClient does not yet support cell updates."
890 | 
891 |     except Exception as e:
892 |         logger.error(f"Error updating notebook cell: {str(e)}")
893 |         return f"Error updating notebook cell: {str(e)}"
894 | 
895 | @mcp.tool()
896 | async def create_fabric_notebook(
897 |     workspace: str,
898 |     notebook_name: str,
899 |     template_type: str = "fabric_integration",
900 |     ctx: Context = None,
901 | ) -> str:
902 |     """Create a new notebook optimized for Microsoft Fabric using advanced templates.
903 | 
904 |     Args:
905 |         workspace: Name or ID of the workspace
906 |         notebook_name: Name of the new notebook
907 |         template_type: Type of Fabric template ('fabric_integration', 'streaming')
908 |         ctx: Context object containing client information
909 |     Returns:
910 |         A string containing the ID of the created notebook or an error message.
911 |     """
912 |     try:
913 |         if ctx is None:
914 |             raise ValueError("Context (ctx) must be provided.")
915 | 
916 |         from helpers.pyspark_helpers import create_notebook_from_template
917 | 
918 |         # Create notebook from advanced template
919 |         notebook_data = create_notebook_from_template(template_type)
920 |         notebook_content = json.dumps(notebook_data, indent=2)
921 | 
922 |         notebook_client = NotebookClient(
923 |             FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
924 |         )
925 |         response = await notebook_client.create_notebook(
926 |             workspace, notebook_name, notebook_content
927 |         )
928 | 
929 |         if isinstance(response, dict) and response.get("id"):
930 |             return f"Created Fabric-optimized notebook '{notebook_name}' with ID: {response['id']} using {template_type} template"
931 |         else:
932 |             return f"Failed to create notebook: {response}"
933 | 
934 |     except Exception as e:
935 |         logger.error(f"Error creating Fabric notebook: {str(e)}")
936 |         return f"Error creating Fabric notebook: {str(e)}"
937 | 
938 | @mcp.tool()
939 | async def generate_fabric_code(
940 |     operation: str,
941 |     lakehouse_name: Optional[str] = None,
942 |     table_name: Optional[str] = None,
943 |     target_table: Optional[str] = None,
944 |     ctx: Context = None,
945 | ) -> str:
946 |     """Generate Fabric-specific PySpark code for lakehouse operations.
947 | 
948 |     Args:
949 |         operation: Type of operation ('read_lakehouse', 'write_lakehouse', 'merge_delta', 'performance_monitor')
950 |         lakehouse_name: Name of the lakehouse
951 |         table_name: Name of the source table
952 |         target_table: Name of the target table (for write/merge operations)
953 |         ctx: Context object containing client information
954 |     Returns:
955 |         A string containing the generated Fabric-specific PySpark code.
956 | """ 957 | try: 958 | from helpers.pyspark_helpers import PySparkCodeGenerator 959 | 960 | generator = PySparkCodeGenerator() 961 | 962 | if operation == "read_lakehouse": 963 | if not lakehouse_name or not table_name: 964 | return "Error: lakehouse_name and table_name are required for read_lakehouse operation" 965 | code = generator.generate_fabric_lakehouse_reader(lakehouse_name, table_name) 966 | 967 | elif operation == "write_lakehouse": 968 | if not table_name: 969 | return "Error: table_name is required for write_lakehouse operation" 970 | code = generator.generate_fabric_lakehouse_writer(table_name) 971 | 972 | elif operation == "merge_delta": 973 | if not target_table: 974 | return "Error: target_table is required for merge_delta operation" 975 | source_df = "new_df" # Default source DataFrame name 976 | join_condition = "target.id = source.id" # Default join condition 977 | code = generator.generate_delta_merge_operation(target_table, source_df, join_condition) 978 | 979 | elif operation == "performance_monitor": 980 | code = generator.generate_performance_monitoring() 981 | 982 | else: 983 | available_ops = ["read_lakehouse", "write_lakehouse", "merge_delta", "performance_monitor"] 984 | return f"Invalid operation. Available operations: {', '.join(available_ops)}" 985 | 986 | return f"""```python 987 | {code} 988 | ``` 989 | 990 | **Generated Fabric-specific PySpark code for '{operation}' operation** 991 | 992 | This code is optimized for Microsoft Fabric and includes: 993 | - Proper Delta Lake integration 994 | - Fabric lakehouse connectivity 995 | - Performance monitoring capabilities 996 | - Best practices for Fabric environment""" 997 | 998 | except Exception as e: 999 | logger.error(f"Error generating Fabric code: {str(e)}") 1000 | return f"Error generating Fabric code: {str(e)}" 1001 | 1002 | @mcp.tool() 1003 | async def validate_fabric_code( 1004 | code: str, 1005 | ctx: Context = None, 1006 | ) -> str: 1007 | """Validate PySpark code for Microsoft Fabric compatibility and performance. 1008 | 1009 | Args: 1010 | code: PySpark code to validate for Fabric compatibility 1011 | ctx: Context object containing client information 1012 | Returns: 1013 | A string containing detailed validation results and Fabric-specific recommendations. 
1014 | """ 1015 | try: 1016 | from helpers.pyspark_helpers import PySparkValidator 1017 | 1018 | validator = PySparkValidator() 1019 | 1020 | # Basic syntax validation 1021 | validation_results = [] 1022 | try: 1023 | compile(code, '', 'exec') 1024 | validation_results.append("✅ Syntax validation: PASSED") 1025 | except SyntaxError as e: 1026 | validation_results.append(f"❌ Syntax validation: FAILED - {e}") 1027 | return "\n".join(validation_results) 1028 | 1029 | # Fabric compatibility checks 1030 | fabric_results = validator.validate_fabric_compatibility(code) 1031 | 1032 | # Performance pattern checks 1033 | performance_results = validator.check_performance_patterns(code) 1034 | 1035 | # Additional Fabric-specific checks 1036 | fabric_warnings = [] 1037 | fabric_suggestions = [] 1038 | 1039 | # Check for Fabric best practices 1040 | if 'spark.table(' in code: 1041 | validation_results.append("✅ Using Fabric managed tables") 1042 | 1043 | if 'notebookutils' in code: 1044 | validation_results.append("✅ Using Fabric notebook utilities") 1045 | 1046 | if 'format("delta")' in code: 1047 | validation_results.append("✅ Using Delta Lake format") 1048 | 1049 | # Check for potential issues 1050 | if 'spark.sql("USE' in code: 1051 | fabric_warnings.append("⚠️ Explicit USE statements may not be necessary in Fabric") 1052 | 1053 | if 'hdfs://' in code or 's3://' in code: 1054 | fabric_warnings.append("⚠️ Direct file system paths detected - consider using Fabric's managed storage") 1055 | 1056 | # Compile comprehensive report 1057 | result = "# Microsoft Fabric PySpark Code Validation Report\n\n" 1058 | 1059 | result += "## Basic Validation\n" 1060 | result += "\n".join(validation_results) + "\n\n" 1061 | 1062 | if fabric_results["issues"]: 1063 | result += "## Fabric Compatibility Issues\n" 1064 | result += "\n".join(fabric_results["issues"]) + "\n\n" 1065 | 1066 | all_warnings = fabric_warnings + performance_results["warnings"] 1067 | if all_warnings: 1068 | result += "## Warnings\n" 1069 | result += "\n".join(all_warnings) + "\n\n" 1070 | 1071 | all_suggestions = fabric_results["suggestions"] + fabric_suggestions + performance_results["optimizations"] 1072 | if all_suggestions: 1073 | result += "## Fabric Optimization Suggestions\n" 1074 | result += "\n".join(all_suggestions) + "\n\n" 1075 | 1076 | # Summary 1077 | total_issues = len(fabric_results["issues"]) 1078 | total_warnings = len(all_warnings) 1079 | total_suggestions = len(all_suggestions) 1080 | 1081 | result += "## Summary\n" 1082 | if total_issues == 0 and total_warnings == 0: 1083 | result += "✅ Code is Fabric-ready! 
No critical issues detected.\n" 1084 | else: 1085 | result += f"📊 Found {total_issues} critical issues, {total_warnings} warnings, and {total_suggestions} optimization opportunities.\n" 1086 | 1087 | result += "\n### Fabric-Specific Recommendations:\n" 1088 | result += "- Use `spark.table()` for managed tables in lakehouses\n" 1089 | result += "- Leverage `notebookutils` for Fabric integration\n" 1090 | result += "- Always use Delta Lake format for optimal performance\n" 1091 | result += "- Consider partitioning strategies for large datasets\n" 1092 | result += "- Use broadcast joins for dimension tables < 200MB\n" 1093 | 1094 | return result 1095 | 1096 | except Exception as e: 1097 | logger.error(f"Error validating Fabric code: {str(e)}") 1098 | return f"Error validating Fabric code: {str(e)}" 1099 | 1100 | @mcp.tool() 1101 | async def analyze_notebook_performance( 1102 | workspace: str, 1103 | notebook_id: str, 1104 | ctx: Context = None, 1105 | ) -> str: 1106 | """Analyze a notebook's code for performance optimization opportunities in Fabric. 1107 | 1108 | Args: 1109 | workspace: Name or ID of the workspace 1110 | notebook_id: ID or name of the notebook 1111 | ctx: Context object containing client information 1112 | Returns: 1113 | A string containing performance analysis and optimization recommendations. 1114 | """ 1115 | try: 1116 | if ctx is None: 1117 | raise ValueError("Context (ctx) must be provided.") 1118 | 1119 | # Get notebook content 1120 | notebook_content = await get_notebook_content(workspace, notebook_id, ctx) 1121 | 1122 | if notebook_content.startswith("Error"): 1123 | return notebook_content 1124 | 1125 | # Parse notebook and extract code cells 1126 | notebook_data = json.loads(notebook_content) 1127 | cells = notebook_data.get("cells", []) 1128 | 1129 | code_cells = [cell for cell in cells if cell.get("cell_type") == "code"] 1130 | 1131 | if not code_cells: 1132 | return "No code cells found in the notebook." 
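        # The per-cell analysis below relies on simple substring counts as heuristics;
        # it approximates reads, writes, transformations, and actions rather than parsing
        # the code, so the resulting performance score is a rough indicator only.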
1133 | 1134 | # Analyze each code cell 1135 | analysis_results = [] 1136 | total_operations = 0 1137 | performance_issues = [] 1138 | optimization_opportunities = [] 1139 | 1140 | from helpers.pyspark_helpers import PySparkValidator 1141 | validator = PySparkValidator() 1142 | 1143 | for i, cell in enumerate(code_cells): 1144 | cell_source = "\n".join(cell.get("source", [])) 1145 | 1146 | if not cell_source.strip(): 1147 | continue 1148 | 1149 | analysis_results.append(f"### Cell {i + 1}") 1150 | 1151 | # Count operations 1152 | operations = [ 1153 | ("DataFrame reads", cell_source.count("spark.read") + cell_source.count("spark.table")), 1154 | ("DataFrame writes", cell_source.count(".write.")), 1155 | ("Transformations", cell_source.count(".withColumn") + cell_source.count(".select") + cell_source.count(".filter")), 1156 | ("Actions", cell_source.count(".show()") + cell_source.count(".count()") + cell_source.count(".collect()")) 1157 | ] 1158 | 1159 | for op_name, count in operations: 1160 | if count > 0: 1161 | analysis_results.append(f"- {op_name}: {count}") 1162 | total_operations += count 1163 | 1164 | # Check for performance patterns 1165 | perf_results = validator.check_performance_patterns(cell_source) 1166 | performance_issues.extend(perf_results["warnings"]) 1167 | optimization_opportunities.extend(perf_results["optimizations"]) 1168 | 1169 | # Fabric-specific analysis 1170 | fabric_results = validator.validate_fabric_compatibility(cell_source) 1171 | optimization_opportunities.extend(fabric_results["suggestions"]) 1172 | 1173 | # Generate comprehensive report 1174 | report = f"# Notebook Performance Analysis Report\n\n" 1175 | report += f"**Notebook:** {notebook_id}\n" 1176 | report += f"**Total Code Cells:** {len(code_cells)}\n" 1177 | report += f"**Total Operations:** {total_operations}\n\n" 1178 | 1179 | if analysis_results: 1180 | report += "## Cell-by-Cell Analysis\n" 1181 | report += "\n".join(analysis_results) + "\n\n" 1182 | 1183 | if performance_issues: 1184 | report += "## Performance Issues Found\n" 1185 | for issue in set(performance_issues): # Remove duplicates 1186 | report += f"- {issue}\n" 1187 | report += "\n" 1188 | 1189 | if optimization_opportunities: 1190 | report += "## Optimization Opportunities\n" 1191 | for opportunity in set(optimization_opportunities): # Remove duplicates 1192 | report += f"- {opportunity}\n" 1193 | report += "\n" 1194 | 1195 | # Performance score calculation 1196 | score = 100 1197 | score -= len(set(performance_issues)) * 10 # -10 points per unique issue 1198 | score -= len(set(optimization_opportunities)) * 5 # -5 points per optimization opportunity 1199 | score = max(score, 0) # Ensure score doesn't go negative 1200 | 1201 | report += f"## Performance Score: {score}/100\n\n" 1202 | 1203 | if score >= 80: 1204 | report += "✅ **Excellent** - Your notebook is well-optimized for Fabric!\n" 1205 | elif score >= 60: 1206 | report += "⚠️ **Good** - Some optimization opportunities exist.\n" 1207 | elif score >= 40: 1208 | report += "🔧 **Needs Improvement** - Several performance issues should be addressed.\n" 1209 | else: 1210 | report += "❌ **Poor** - Significant performance optimization required.\n" 1211 | 1212 | return report 1213 | 1214 | except Exception as e: 1215 | logger.error(f"Error analyzing notebook performance: {str(e)}") 1216 | return f"Error analyzing notebook performance: {str(e)}" 1217 | --------------------------------------------------------------------------------