├── .dockerignore ├── docs ├── mcp_vscode.png ├── mcp_inspector.png ├── architecture.md └── pyspark_guide.md ├── helpers ├── utils │ ├── __init__.py │ ├── context.py │ ├── validators.py │ ├── authentication.py │ └── table_tools.py ├── logging_config.py ├── clients │ ├── __init__.py │ ├── report_client.py │ ├── workspace_client.py │ ├── warehouse_client.py │ ├── lakehouse_client.py │ ├── notebook_client.py │ ├── table_client.py │ ├── semanticModel_client.py │ ├── sql_client.py │ └── fabric_client.py ├── formatters │ ├── metadata_formatter.py │ └── schema_formatter.py └── pyspark_helpers.py ├── .gitignore ├── Dockerfile ├── pyproject.toml ├── fabric_mcp.py ├── tools ├── __init__.py ├── workspace.py ├── sql_endpoint.py ├── report.py ├── semantic_model.py ├── warehouse.py ├── load_data.py ├── lakehouse.py ├── table.py └── notebook.py ├── test_notebook_creation.py ├── test_security.py └── README.md /.dockerignore: -------------------------------------------------------------------------------- 1 | .venv 2 | Inprogress 3 | .ruff_cache/ 4 | # Python bytecode files 5 | __pycache__/ -------------------------------------------------------------------------------- /docs/mcp_vscode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datumnova/ms-fabric-mcp/HEAD/docs/mcp_vscode.png -------------------------------------------------------------------------------- /docs/mcp_inspector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datumnova/ms-fabric-mcp/HEAD/docs/mcp_inspector.png -------------------------------------------------------------------------------- /helpers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.validators import _is_valid_uuid 2 | 3 | __all__ = [ 4 | "_is_valid_uuid", 5 | ] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python-generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # Virtual environments 10 | .venv 11 | 12 | # Ruff cache 13 | .ruff_cache/ 14 | Inprogress 15 | -------------------------------------------------------------------------------- /helpers/utils/context.py: -------------------------------------------------------------------------------- 1 | from mcp.server.fastmcp import FastMCP 2 | from cachetools import TTLCache 3 | 4 | 5 | # Create MCP instance with context manager 6 | mcp = FastMCP("Fabric MCP Server ", json_response=True, stateless_http=True) 7 | mcp.settings.log_level = "debug" 8 | 9 | # Shared cache and context 10 | __ctx_cache = TTLCache(maxsize=100, ttl=300) # Cache for 5 minutes 11 | ctx = mcp.get_context() 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | # Install uv. 4 | COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ 5 | 6 | # Copy the application into the container. 7 | COPY . /app 8 | 9 | # Install the application dependencies. 10 | WORKDIR /app 11 | RUN uv sync --frozen --no-cache 12 | 13 | # Run the application. 
14 | CMD ["uv", "run", "python", "fabric_mcp.py", "--port", "8081"] 15 | # CMD ["/app/.venv/bin/fastapi", "run", "app/fabric_mcp.py", "--port", "80", "--host", "0.0.0.0"] -------------------------------------------------------------------------------- /helpers/logging_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def get_logger(name: str) -> logging.Logger: 5 | """Set up and return a logger.""" 6 | logger = logging.getLogger(name) 7 | handler = logging.StreamHandler() 8 | formatter = logging.Formatter( 9 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 10 | ) 11 | handler.setFormatter(formatter) 12 | logger.addHandler(handler) 13 | logger.setLevel(logging.DEBUG) 14 | logger.propagate = True 15 | return logger 16 | -------------------------------------------------------------------------------- /helpers/utils/validators.py: -------------------------------------------------------------------------------- 1 | from uuid import UUID 2 | 3 | 4 | def _is_valid_uuid( 5 | guid: str, 6 | ): 7 | """ 8 | Validates if a string is a valid GUID in version 4 9 | 10 | Parameters 11 | ---------- 12 | guid : str 13 | GUID to be validated. 14 | 15 | Returns 16 | ------- 17 | bool 18 | Boolean that indicates if the string is a GUID or not. 19 | """ 20 | 21 | try: 22 | UUID(str(guid), version=4) 23 | return True 24 | except ValueError: 25 | return False 26 | -------------------------------------------------------------------------------- /helpers/utils/authentication.py: -------------------------------------------------------------------------------- 1 | from azure.identity import DefaultAzureCredential 2 | from cachetools import TTLCache 3 | 4 | 5 | def get_azure_credentials(client_id: str, cache: TTLCache) -> DefaultAzureCredential: 6 | """ 7 | Get Azure credentials using DefaultAzureCredential. 8 | This function is used to authenticate with Azure services. 9 | """ 10 | if f"{client_id}_creds" in cache: 11 | return cache[f"{client_id}_creds"] 12 | # If credentials are not cached, create a new DefaultAzureCredential instance 13 | # and store it in the cache. 
14 | else: 15 | cache[f"{client_id}_creds"] = DefaultAzureCredential() 16 | return cache[f"{client_id}_creds"] 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "ms-fabric-mcp" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "mcp[cli]", 9 | "azure-identity", 10 | "deltalake", 11 | "requests", 12 | "cachetools", 13 | "semantic-link-labs", 14 | "azure-storage-blob", 15 | "polars", 16 | "sqlalchemy", 17 | "pyodbc", 18 | "tabulate", 19 | "fastapi[standard]", 20 | "python-jose[cryptography]", 21 | "passlib[bcrypt]", 22 | "python-multipart", 23 | "fastapi-mcp", 24 | ] 25 | 26 | [tool.setuptools] 27 | packages = ["helpers", "helpers.clients", "helpers.formatters", "helpers.utils"] 28 | 29 | [project.scripts] 30 | mcp = "mcp.cli:app [cli]" 31 | -------------------------------------------------------------------------------- /helpers/clients/__init__.py: -------------------------------------------------------------------------------- 1 | from helpers.clients.lakehouse_client import LakehouseClient 2 | from helpers.clients.warehouse_client import WarehouseClient 3 | from helpers.clients.table_client import TableClient 4 | from helpers.clients.workspace_client import WorkspaceClient 5 | from helpers.clients.semanticModel_client import SemanticModelClient 6 | from helpers.clients.report_client import ReportClient 7 | from helpers.clients.fabric_client import FabricApiClient 8 | from helpers.clients.sql_client import SQLClient, get_sql_endpoint 9 | from helpers.clients.notebook_client import NotebookClient 10 | 11 | 12 | __all__ = [ 13 | "LakehouseClient", 14 | "WarehouseClient", 15 | "TableClient", 16 | "WorkspaceClient", 17 | "FabricApiClient", 18 | "SemanticModelClient", 19 | "ReportClient", 20 | "NotebookClient", 21 | "SQLClient", 22 | "get_sql_endpoint", 23 | ] 24 | -------------------------------------------------------------------------------- /helpers/clients/report_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | 4 | logger = get_logger(__name__) 5 | 6 | 7 | class ReportClient: 8 | def __init__(self, client: FabricApiClient): 9 | self.client = client 10 | 11 | async def list_reports(self, workspace_id: str): 12 | """List all reports in a workspace.""" 13 | reports = await self.client.get_reports(workspace_id) 14 | 15 | if not reports: 16 | return f"No reports found in workspace '{workspace_id}'." 17 | 18 | return reports 19 | 20 | async def get_report(self, workspace_id: str, report_id: str) -> dict: 21 | """Get a specific report by ID.""" 22 | report = await self.client.get_report(workspace_id, report_id) 23 | 24 | if not report: 25 | return ( 26 | f"No report found with ID '{report_id}' in workspace '{workspace_id}'." 
27 | ) 28 | 29 | return report 30 | -------------------------------------------------------------------------------- /fabric_mcp.py: -------------------------------------------------------------------------------- 1 | from tools import * 2 | from helpers.logging_config import get_logger 3 | from helpers.utils.context import mcp, __ctx_cache 4 | import uvicorn 5 | import argparse 6 | import logging 7 | 8 | 9 | 10 | logger = get_logger(__name__) 11 | logger.level = logging.INFO 12 | 13 | 14 | @mcp.tool() 15 | async def clear_context() -> str: 16 | """Clear the current session context. 17 | 18 | Returns: 19 | A string confirming the context has been cleared. 20 | """ 21 | __ctx_cache.clear() 22 | return "Context cleared." 23 | 24 | 25 | if __name__ == "__main__": 26 | # Initialize and run the server 27 | logger.info("Starting MCP server...") 28 | parser = argparse.ArgumentParser(description="Run MCP Streamable HTTP based server") 29 | parser.add_argument("--port", type=int, default=8081, help="Localhost port to listen on") 30 | args = parser.parse_args() 31 | 32 | # Start the server with Streamable HTTP transport 33 | uvicorn.run(mcp.streamable_http_app, host="0.0.0.0", port=args.port) 34 | # mcp.run(transport="stdio") 35 | -------------------------------------------------------------------------------- /helpers/clients/workspace_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | 4 | logger = get_logger(__name__) 5 | 6 | 7 | class WorkspaceClient: 8 | def __init__(self, client: FabricApiClient): 9 | self.client = client 10 | 11 | async def list_workspaces(self): 12 | """List all available workspaces.""" 13 | workspaces = await self.client.get_workspaces() 14 | if not workspaces: 15 | raise ValueError("No workspaces found.") 16 | 17 | markdown = "# Fabric Workspaces\n\n" 18 | markdown += "| ID | Name | Capacity |\n" 19 | markdown += "|-----|------|----------|\n" 20 | 21 | for ws in workspaces: 22 | markdown += f"| {ws['id']} | {ws['displayName']} | {ws.get('capacityId', 'N/A')} |\n" 23 | 24 | return markdown 25 | 26 | async def resolve_workspace(self, workspace_name: str): 27 | """Resolve workspace name to workspace ID.""" 28 | return await self.client.resolve_workspace_name_and_id(workspace=workspace_name) 29 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | from tools.workspace import set_workspace, list_workspaces 2 | from tools.warehouse import set_warehouse, list_warehouses 3 | from tools.lakehouse import set_lakehouse, list_lakehouses 4 | from tools.table import ( 5 | set_table, 6 | list_tables, 7 | get_lakehouse_table_schema, 8 | get_all_lakehouse_schemas, 9 | run_query, 10 | ) 11 | from tools.semantic_model import ( 12 | list_semantic_models, 13 | get_semantic_model, 14 | ) 15 | from tools.report import ( 16 | list_reports, 17 | get_report, 18 | ) 19 | from tools.load_data import load_data_from_url 20 | from tools.notebook import list_notebooks, create_notebook 21 | 22 | __all__ = [ 23 | "set_workspace", 24 | "list_workspaces", 25 | "set_warehouse", 26 | "list_warehouses", 27 | "set_lakehouse", 28 | "list_lakehouses", 29 | "set_table", 30 | "list_tables", 31 | "get_lakehouse_table_schema", 32 | "get_all_lakehouse_schemas", 33 | "list_semantic_models", 34 | "get_semantic_model", 35 | 
"list_reports", 36 | "get_report", 37 | "load_data_from_url", 38 | "run_query", 39 | "list_notebooks", 40 | "create_notebook", 41 | ] 42 | -------------------------------------------------------------------------------- /helpers/formatters/metadata_formatter.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | 4 | 5 | def format_metadata_to_markdown(metadata: object) -> str: 6 | """Convert Delta table metadata to a responsive markdown format with HTML.""" 7 | md = "#### Metadata\n\n" 8 | md += "
<dl>\n"
9 |     md += f"<dt>ID:</dt><dd>{metadata.id}</dd>\n"
10 |     if metadata.name:
11 |         md += f"<dt>Name:</dt><dd>{metadata.name}</dd>\n"
12 |     if metadata.description:
13 |         md += f"<dt>Description:</dt><dd>{metadata.description}</dd>\n"
14 |     if metadata.partition_columns:
15 |         md += f"<dt>Partition Columns:</dt><dd>{', '.join(metadata.partition_columns)}</dd>\n"
16 |     if metadata.created_time:
17 |         created_time = datetime.fromtimestamp(metadata.created_time / 1000)
18 |         md += f"<dt>Created:</dt><dd>{created_time.strftime('%Y-%m-%d %H:%M:%S')}</dd>\n"
19 |     if metadata.configuration:
20 |         md += "<dt>Configuration:</dt>\n"
21 |         md += "<dd>\n"
22 |         md += "    <details>\n"
23 |         md += "      <summary>View JSON</summary>\n"
24 |         md += "      <pre>\n"
25 |         md += json.dumps(metadata.configuration, indent=2)
26 |         md += "\n      </pre>\n"
27 |         md += "    </details>\n"
28 |         md += "</dd>\n"
29 |     md += "</dl>
\n" 30 | return md 31 | -------------------------------------------------------------------------------- /tools/workspace.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | WorkspaceClient, 7 | ) 8 | 9 | 10 | @mcp.tool() 11 | async def set_workspace(workspace: str, ctx: Context) -> str: 12 | """Set the current workspace for the session. 13 | 14 | Args: 15 | workspace: Name or ID of the workspace 16 | ctx: Context object containing client information 17 | Returns: 18 | A string confirming the workspace has been set. 19 | """ 20 | __ctx_cache[f"{ctx.client_id}_workspace"] = workspace 21 | return f"Workspace set to '{workspace}'." 22 | 23 | 24 | @mcp.tool() 25 | async def list_workspaces(ctx: Context) -> str: 26 | """List all available Fabric workspaces. 27 | 28 | Args: 29 | ctx: Context object containing client information 30 | 31 | Returns: 32 | A string containing the list of workspaces or an error message. 33 | """ 34 | try: 35 | client = WorkspaceClient( 36 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 37 | ) 38 | 39 | workspaces = await client.list_workspaces() 40 | 41 | return workspaces 42 | 43 | except Exception as e: 44 | return f"Error listing workspaces: {str(e)}" 45 | -------------------------------------------------------------------------------- /helpers/formatters/schema_formatter.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | from helpers.formatters.metadata_formatter import format_metadata_to_markdown 3 | 4 | 5 | def format_schema_to_markdown( 6 | table_info: Dict, schema: object, metadata: object 7 | ) -> str: 8 | """Convert a Delta table schema and metadata to a responsive markdown format with HTML.""" 9 | md = f"

<div><strong>Delta Table: {table_info['name']}</strong></div>\n"
10 |     md += f"<div><strong>Type: {table_info['type']}</strong></div>\n"
11 |     md += f"<div><strong>Location: {table_info['location']}</strong></div>\n\n"
12 | 
13 |     # Responsive schema table wrapped in a scrollable div
14 |     md += "<div><strong>Schema</strong></div>\n"
15 |     md += '<div style="overflow-x: auto;">\n'
16 |     md += '<table style="width: 100%;">\n'
17 |     md += "  <tr>\n"
18 |     md += "    <th>Column Name</th>\n"
19 |     md += "    <th>Data Type</th>\n"
20 |     md += "    <th>Nullable</th>\n"
21 |     md += "  </tr>\n"
22 | 
23 |     for field in schema.fields:
24 |         md += "  <tr>\n"
25 |         md += f"    <td>{field.name}</td>\n"
26 |         md += f"    <td>{field.type}</td>\n"
27 |         md += f"    <td>{field.nullable}</td>\n"
28 |         md += "  </tr>\n"
29 | 
30 |     md += "</table>\n"
31 |     md += "</div>\n\n"
32 | 
33 |     # Collapsible metadata section for a dynamic feel
34 |     md += "<details>\n"
35 |     md += "  <summary>View Metadata</summary>\n\n"
36 |     md += format_metadata_to_markdown(metadata)
37 |     md += "\n</details>
\n" 38 | 39 | return md + "\n" 40 | -------------------------------------------------------------------------------- /helpers/clients/warehouse_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | from typing import Optional, Dict, Any 4 | 5 | logger = get_logger(__name__) 6 | 7 | 8 | class WarehouseClient: 9 | def __init__(self, client: FabricApiClient): 10 | self.client = client 11 | 12 | async def list_warehouses(self, workspace: str): 13 | """List all warehouses in a lakehouse.""" 14 | warehouses = await self.client.get_warehouses(workspace) 15 | 16 | if not warehouses: 17 | return f"No warehouses found in workspace '{workspace}'." 18 | 19 | markdown = f"# Warehouses in workspace '{workspace}'\n\n" 20 | markdown += "| ID | Name |\n" 21 | markdown += "|-----|------|\n" 22 | 23 | for wh in warehouses: 24 | markdown += f"| {wh['id']} | {wh['displayName']} |\n" 25 | 26 | return markdown 27 | 28 | async def get_warehouse( 29 | self, 30 | workspace: str, 31 | warehouse: str, 32 | ) -> Optional[Dict[str, Any]]: 33 | """Get details of a specific warehouse.""" 34 | if not warehouse: 35 | raise ValueError("Warehouse name cannot be empty.") 36 | 37 | return await self.client.get_item( 38 | workspace_id=workspace, item_id=warehouse, item_type="warehouse" 39 | ) 40 | 41 | async def create_warehouse( 42 | self, 43 | name: str, 44 | workspace: str, 45 | description: Optional[str] = None, 46 | ): 47 | """Create a new warehouse.""" 48 | if not name: 49 | raise ValueError("Warehouse name cannot be empty.") 50 | 51 | return await self.client.create_item( 52 | name=name, workspace=workspace, description=description, type="Warehouse" 53 | ) 54 | -------------------------------------------------------------------------------- /tools/sql_endpoint.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from helpers.utils.context import mcp, __ctx_cache 3 | from mcp.server.fastmcp import Context 4 | from helpers.clients import get_sql_endpoint 5 | 6 | 7 | @mcp.tool() 8 | async def get_sql_endpoint( 9 | workspace: Optional[str] = None, 10 | lakehouse: Optional[str] = None, 11 | warehouse: Optional[str] = None, 12 | type: Optional[str] = None, 13 | ctx: Context = None, 14 | ) -> str: 15 | """ 16 | Retrieve the SQL endpoint for a specified lakehouse or warehouse. 17 | 18 | Args: 19 | workspace: Name or ID of the workspace (optional). 20 | lakehouse: Name or ID of the lakehouse (optional). 21 | warehouse: Name or ID of the warehouse (optional). 22 | type: Type of resource ('lakehouse' or 'warehouse'). If not provided, it will be inferred. 23 | ctx: Context object containing client information. 24 | 25 | Returns: 26 | A string containing the resource type, name/ID, and its SQL endpoint. 27 | """ 28 | try: 29 | if ctx is None: 30 | raise ValueError("Context (ctx) must be provided.") 31 | 32 | if workspace is None: 33 | workspace = __ctx_cache.get(f"{ctx.client_id}_workspace") 34 | if workspace is None: 35 | raise ValueError("Workspace must be specified or set in context.") 36 | if lakehouse is None and warehouse is None: 37 | lakehouse = __ctx_cache.get(f"{ctx.client_id}_lakehouse") 38 | warehouse = __ctx_cache.get(f"{ctx.client_id}_warehouse") 39 | if warehouse is None and lakehouse is None: 40 | raise ValueError( 41 | "Either lakehouse or warehouse must be specified or set in context." 
42 | ) 43 | 44 | name, endpoint = await get_sql_endpoint( 45 | workspace=workspace, 46 | lakehouse=lakehouse, 47 | warehouse=warehouse, # Add warehouse to the call 48 | type=type, 49 | ) 50 | 51 | return ( 52 | endpoint 53 | if endpoint 54 | else f"No SQL endpoint found for {type} '{lakehouse or warehouse}' in workspace '{workspace}'." 55 | ) 56 | except Exception as e: 57 | return f"Error retrieving SQL endpoint: {str(e)}" 58 | -------------------------------------------------------------------------------- /helpers/utils/table_tools.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple, Optional 2 | from azure.identity import DefaultAzureCredential 3 | from deltalake import DeltaTable 4 | from helpers.logging_config import get_logger 5 | import asyncio 6 | 7 | logger = get_logger(__name__) 8 | 9 | 10 | async def get_delta_schemas( 11 | tables: List[Dict], credential: DefaultAzureCredential 12 | ) -> List[Tuple[Dict, object, object]]: 13 | """Get schema and metadata for each Delta table""" 14 | delta_tables = [] 15 | logger.info(f"Starting schema extraction for {len(tables)} tables") 16 | 17 | # Get token for Azure Storage (not Fabric API) 18 | token = credential.get_token("https://storage.azure.com/.default").token 19 | storage_options = {"bearer_token": token, "use_fabric_endpoint": "true"} 20 | 21 | for table in tables: 22 | task = asyncio.create_task(get_delta_table(table, storage_options)) 23 | delta_tables.append(task) 24 | logger.debug(f"Created task for table: {table['name']}") 25 | # Wait for all tasks to complete 26 | delta_tables = await asyncio.gather(*delta_tables) 27 | logger.info(f"Completed schema extraction for {len(delta_tables)} tables") 28 | # Filter out None values 29 | delta_tables = [dt for dt in delta_tables if dt is not None] 30 | return delta_tables 31 | 32 | 33 | async def get_delta_table( 34 | table: Dict, storage_options: Optional[Dict] = None 35 | ) -> Optional[Tuple[Dict, object, object]]: 36 | """Get Delta table schema and metadata""" 37 | logger.debug(f"Processing table: {table['name']}") 38 | 39 | # Check if the table is a Delta table 40 | 41 | if table["format"].lower() == "delta": 42 | try: 43 | table_path = table["location"] 44 | logger.debug(f"Processing Delta table: {table['name']} at {table_path}") 45 | 46 | # Create DeltaTable instance with storage options 47 | delta_table = DeltaTable(table_path, storage_options=storage_options) 48 | 49 | # Get both schema and metadata 50 | result = (table, delta_table.schema(), delta_table.metadata()) 51 | logger.info(f"Processed table: {table['name']}") 52 | return result 53 | 54 | except Exception as e: 55 | logger.error(f"Could not process table {table['name']}: {str(e)}") 56 | return None 57 | -------------------------------------------------------------------------------- /helpers/clients/lakehouse_client.py: -------------------------------------------------------------------------------- 1 | from helpers.utils import _is_valid_uuid 2 | from helpers.logging_config import get_logger 3 | from helpers.clients.fabric_client import FabricApiClient 4 | from typing import Optional, Dict, Any 5 | 6 | logger = get_logger(__name__) 7 | 8 | 9 | class LakehouseClient: 10 | def __init__(self, client: FabricApiClient): 11 | self.client = client 12 | 13 | async def list_lakehouses(self, workspace: str): 14 | """List all lakehouses in a workspace.""" 15 | if not _is_valid_uuid(workspace): 16 | raise ValueError("Invalid workspace ID.") 17 | lakehouses = 
await self.client.get_lakehouses(workspace) 18 | 19 | if not lakehouses: 20 | return f"No lakehouses found in workspace '{workspace}'." 21 | 22 | markdown = f"# Lakehouses in workspace '{workspace}'\n\n" 23 | markdown += "| ID | Name |\n" 24 | markdown += "|-----|------|\n" 25 | 26 | for lh in lakehouses: 27 | markdown += f"| {lh['id']} | {lh['displayName']} |\n" 28 | 29 | return markdown 30 | 31 | async def get_lakehouse( 32 | self, 33 | workspace: str, 34 | lakehouse: str, 35 | ) -> Optional[Dict[str, Any]]: 36 | """Get details of a specific lakehouse.""" 37 | if not _is_valid_uuid(workspace): 38 | raise ValueError("Invalid workspace ID.") 39 | 40 | if not lakehouse: 41 | raise ValueError("Lakehouse name cannot be empty.") 42 | 43 | response = await self.client.get_item(workspace_id=workspace, item_id=lakehouse) 44 | logger.info(f"Lakehouse details: {response}") 45 | return response 46 | 47 | async def resolve_lakehouse(self, workspace_id: str, lakehouse_name: str): 48 | """Resolve lakehouse name to lakehouse ID.""" 49 | return await self.client.resolve_item_name_and_id( 50 | workspace=workspace_id, item=lakehouse_name, type="Lakehouse" 51 | ) 52 | 53 | async def create_lakehouse( 54 | self, 55 | name: str, 56 | workspace: str, 57 | description: Optional[str] = None, 58 | ): 59 | """Create a new lakehouse.""" 60 | if not _is_valid_uuid(workspace): 61 | raise ValueError("Invalid workspace ID.") 62 | 63 | if not name: 64 | raise ValueError("Lakehouse name cannot be empty.") 65 | 66 | return await self.client.create_item( 67 | name=name, workspace=workspace, description="description", type="Lakehouse" 68 | ) 69 | -------------------------------------------------------------------------------- /tools/report.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | ReportClient, 7 | ) 8 | from helpers.logging_config import get_logger 9 | from typing import Optional 10 | 11 | logger = get_logger(__name__) 12 | 13 | 14 | @mcp.tool() 15 | async def list_reports(workspace: Optional[str] = None, ctx: Context = None) -> str: 16 | """List all reports in a Fabric workspace. 17 | 18 | Args: 19 | workspace: Name or ID of the workspace (optional) 20 | ctx: Context object containing client information 21 | Returns: 22 | A string containing the list of reports or an error message. 23 | """ 24 | try: 25 | client = ReportClient( 26 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 27 | ) 28 | 29 | reports = await client.list_reports( 30 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"] 31 | ) 32 | 33 | markdown = f"# Reports in workspace '{workspace}'\n\n" 34 | markdown += "| ID | Name | Description |\n" 35 | markdown += "|-----|------|-------------|\n" 36 | 37 | for report in reports: 38 | markdown += f"| {report.get('id', 'N/A')} | {report.get('displayName', 'N/A')} | {report.get('description', 'N/A')} |\n" 39 | 40 | return markdown 41 | 42 | except Exception as e: 43 | return f"Error listing reports: {str(e)}" 44 | 45 | 46 | @mcp.tool() 47 | async def get_report( 48 | workspace: Optional[str] = None, 49 | report_id: Optional[str] = None, 50 | ctx: Context = None, 51 | ) -> str: 52 | """Get a specific report by ID. 
53 | 54 | Args: 55 | workspace: Name or ID of the workspace (optional) 56 | report_id: ID of the report (optional) 57 | ctx: Context object containing client information 58 | 59 | Returns: 60 | A string containing the report details or an error message. 61 | """ 62 | try: 63 | client = ReportClient( 64 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 65 | ) 66 | 67 | report = await client.get_report( 68 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"], 69 | report_id, 70 | ) 71 | 72 | if not report: 73 | return f"No report found with ID '{report_id}' in workspace '{workspace}'." 74 | 75 | return f"Report details:\n\n{report}" 76 | 77 | except Exception as e: 78 | return f"Error getting report: {str(e)}" 79 | -------------------------------------------------------------------------------- /tools/semantic_model.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | SemanticModelClient, 7 | ) 8 | from helpers.logging_config import get_logger 9 | 10 | from typing import Optional 11 | 12 | logger = get_logger(__name__) 13 | 14 | 15 | @mcp.tool() 16 | async def list_semantic_models( 17 | workspace: Optional[str] = None, ctx: Context = None 18 | ) -> str: 19 | """List all semantic models in a Fabric workspace. 20 | 21 | Args: 22 | workspace: Name or ID of the workspace (optional) 23 | ctx: Context object containing client information 24 | 25 | Returns: 26 | A string containing the list of semantic models or an error message. 27 | """ 28 | try: 29 | client = SemanticModelClient( 30 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 31 | ) 32 | 33 | models = await client.list_semantic_models( 34 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"] 35 | ) 36 | 37 | markdown = f"# Semantic Models in workspace '{workspace}'\n\n" 38 | markdown += "| ID | Name | Folder ID | Description |\n" 39 | markdown += "|-----|------|-----------|-------------|\n" 40 | 41 | for model in models: 42 | markdown += f"| {model.get('id', 'N/A')} | {model.get('displayName', 'N/A')} | {model.get('folderId', 'N/A')} | {model.get('description', 'N/A')} |\n" 43 | 44 | return markdown 45 | 46 | except Exception as e: 47 | return f"Error listing semantic models: {str(e)}" 48 | 49 | 50 | @mcp.tool() 51 | async def get_semantic_model( 52 | workspace: Optional[str] = None, 53 | model_id: Optional[str] = None, 54 | ctx: Context = None, 55 | ) -> str: 56 | """Get a specific semantic model by ID. 57 | 58 | Args: 59 | workspace: Name or ID of the workspace (optional) 60 | model_id: ID of the semantic model (optional) 61 | ctx: Context object containing client information 62 | 63 | Returns: 64 | A string containing the details of the semantic model or an error message. 
65 | """ 66 | try: 67 | client = SemanticModelClient( 68 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 69 | ) 70 | 71 | model = await client.get_semantic_model( 72 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"], 73 | model_id if model_id else __ctx_cache[f"{ctx.client_id}_semantic_model"], 74 | ) 75 | 76 | return f"Semantic Model '{model['displayName']}' details:\n\n{model}" 77 | 78 | except Exception as e: 79 | return f"Error retrieving semantic model: {str(e)}" 80 | -------------------------------------------------------------------------------- /tools/warehouse.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | WarehouseClient, 7 | ) 8 | 9 | from typing import Optional 10 | 11 | 12 | @mcp.tool() 13 | async def set_warehouse(warehouse: str, ctx: Context) -> str: 14 | """Set the current warehouse for the session. 15 | 16 | Args: 17 | warehouse: Name or ID of the warehouse 18 | ctx: Context object containing client information 19 | 20 | Returns: 21 | A string confirming the warehouse has been set. 22 | """ 23 | __ctx_cache[f"{ctx.client_id}_warehouse"] = warehouse 24 | return f"Warehouse set to '{warehouse}'." 25 | 26 | 27 | @mcp.tool() 28 | async def list_warehouses(workspace: Optional[str] = None, ctx: Context = None) -> str: 29 | """List all warehouses in a Fabric workspace. 30 | 31 | Args: 32 | workspace: Name or ID of the workspace (optional) 33 | ctx: Context object containing client information 34 | 35 | Returns: 36 | A string containing the list of warehouses or an error message. 37 | """ 38 | try: 39 | client = WarehouseClient( 40 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 41 | ) 42 | 43 | warehouses = await client.list_warehouses( 44 | workspace if workspace else __ctx_cache[f"{ctx.client_id}_workspace"] 45 | ) 46 | 47 | return warehouses 48 | 49 | except Exception as e: 50 | return f"Error listing warehouses: {str(e)}" 51 | 52 | 53 | @mcp.tool() 54 | async def create_warehouse( 55 | name: str, 56 | workspace: Optional[str] = None, 57 | description: Optional[str] = None, 58 | ctx: Context = None, 59 | ) -> str: 60 | """Create a new warehouse in a Fabric workspace. 61 | 62 | Args: 63 | name: Name of the warehouse 64 | workspace: Name or ID of the workspace (optional) 65 | description: Description of the warehouse (optional) 66 | ctx: Context object containing client information 67 | Returns: 68 | A string confirming the warehouse has been created or an error message. 69 | """ 70 | try: 71 | client = WarehouseClient( 72 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 73 | ) 74 | 75 | response = await client.create_warehouse( 76 | name=name, 77 | workspace=workspace 78 | if workspace 79 | else __ctx_cache[f"{ctx.client_id}_workspace"], 80 | description=description, 81 | ) 82 | 83 | return f"Warehouse '{response['id']}' created successfully." 
84 | 85 | except Exception as e: 86 | return f"Error creating warehouse: {str(e)}" 87 | -------------------------------------------------------------------------------- /helpers/clients/notebook_client.py: -------------------------------------------------------------------------------- 1 | from helpers.utils import _is_valid_uuid 2 | from helpers.logging_config import get_logger 3 | from helpers.clients.fabric_client import FabricApiClient 4 | from typing import Dict, Any 5 | 6 | logger = get_logger(__name__) 7 | 8 | 9 | class NotebookClient: 10 | def __init__(self, client: FabricApiClient): 11 | self.client = client 12 | 13 | async def list_notebooks(self, workspace: str): 14 | """List all notebooks in a workspace.""" 15 | if not _is_valid_uuid(workspace): 16 | raise ValueError("Invalid workspace ID.") 17 | notebooks = await self.client.get_notebooks(workspace) 18 | 19 | if not notebooks: 20 | return f"No notebooks found in workspace '{workspace}'." 21 | 22 | markdown = f"# Notebooks in workspace '{workspace}'\n\n" 23 | markdown += "| ID | Name |\n" 24 | markdown += "|-----|------|\n" 25 | 26 | for nb in notebooks: 27 | markdown += f"| {nb['id']} | {nb['displayName']} |\n" 28 | 29 | return markdown 30 | 31 | async def get_notebook(self, workspace: str, notebook_id: str) -> Dict[str, Any]: 32 | """Get a specific notebook by ID.""" 33 | if not _is_valid_uuid(workspace): 34 | raise ValueError("Invalid workspace ID.") 35 | if not _is_valid_uuid(notebook_id): 36 | raise ValueError("Invalid notebook ID.") 37 | 38 | notebook = await self.client.get_notebook(workspace, notebook_id) 39 | 40 | if not notebook: 41 | return ( 42 | f"No notebook found with ID '{notebook_id}' in workspace '{workspace}'." 43 | ) 44 | 45 | return notebook 46 | 47 | async def create_notebook( 48 | self, workspace: str, notebook_name: str, content: str 49 | ) -> Dict[str, Any]: 50 | """Create a new notebook.""" 51 | try: 52 | workspace, workspace_id = await self.client.resolve_workspace_name_and_id( 53 | workspace 54 | ) 55 | if not workspace_id: 56 | raise ValueError("Invalid workspace ID.") 57 | 58 | logger.info(f"Creating notebook '{notebook_name}' in workspace '{workspace}' (ID: {workspace_id}).") 59 | 60 | try: 61 | response = await self.client.create_notebook( 62 | workspace_id=workspace_id, 63 | notebook_name=notebook_name, 64 | ipynb_name=notebook_name, 65 | content=content, 66 | ) 67 | except Exception as e: 68 | error_msg = f"Failed to create notebook '{notebook_name}' in workspace '{workspace}': {str(e)}" 69 | logger.error(error_msg) 70 | return error_msg 71 | 72 | 73 | logger.info(f"Successfully created notebook '{notebook_name}' with ID: {response['id']}") 74 | return response 75 | 76 | except Exception as e: 77 | error_msg = f"Error creating notebook '{notebook_name}': {str(e)}" 78 | logger.error(error_msg) 79 | return error_msg 80 | -------------------------------------------------------------------------------- /tools/load_data.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | LakehouseClient, 7 | WarehouseClient, 8 | get_sql_endpoint, 9 | ) 10 | from helpers.logging_config import get_logger 11 | import tempfile 12 | import os 13 | import requests 14 | from typing import Optional 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | @mcp.tool() 20 | 
async def load_data_from_url( 21 | url: str, 22 | destination_table: str, 23 | workspace: Optional[str] = None, 24 | lakehouse: Optional[str] = None, 25 | warehouse: Optional[str] = None, 26 | ctx: Context = None, 27 | ) -> str: 28 | """Load data from a URL into a table in a warehouse or lakehouse. 29 | 30 | Args: 31 | url: The URL to download data from (CSV or Parquet supported). 32 | destination_table: The name of the table to load data into. 33 | workspace: Name or ID of the workspace (optional). 34 | lakehouse: Name or ID of the lakehouse (optional). 35 | warehouse: Name or ID of the warehouse (optional). 36 | ctx: Context object containing client information. 37 | Returns: 38 | A string confirming the data load or an error message. 39 | """ 40 | try: 41 | # Download the file 42 | response = requests.get(url) 43 | if response.status_code != 200: 44 | return f"Failed to download file from URL: {url}" 45 | file_ext = url.split("?")[0].split(".")[-1].lower() 46 | if file_ext not in ("csv", "parquet"): 47 | return f"Unsupported file type: {file_ext}. Only CSV and Parquet are supported." 48 | with tempfile.NamedTemporaryFile( 49 | delete=False, suffix=f".{file_ext}" 50 | ) as tmp_file: 51 | tmp_file.write(response.content) 52 | tmp_path = tmp_file.name 53 | # Choose destination: lakehouse or warehouse 54 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 55 | resource_id = None 56 | resource_type = None 57 | if lakehouse: 58 | client = LakehouseClient(FabricApiClient(credential)) 59 | resource_id = lakehouse 60 | resource_type = "lakehouse" 61 | elif warehouse: 62 | client = WarehouseClient(FabricApiClient(credential)) 63 | resource_id = warehouse 64 | resource_type = "warehouse" 65 | else: 66 | return "Either lakehouse or warehouse must be specified." 67 | # Here you would call the appropriate method to upload/ingest the file into the table. 68 | # This is a placeholder for the actual implementation, which depends on the client API. 69 | # For now, just return a success message with file info. 70 | os.remove(tmp_path) 71 | return f"Data from {url} loaded into table '{destination_table}' in {resource_type} '{resource_id}'. (File type: {file_ext})" 72 | except Exception as e: 73 | return f"Error loading data: {str(e)}" 74 | 75 | 76 | # @mcp.resource( 77 | # uri="tables://{table_name}", 78 | # ) 79 | -------------------------------------------------------------------------------- /helpers/clients/table_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | from helpers.utils.table_tools import get_delta_schemas 4 | from azure.identity import DefaultAzureCredential 5 | from helpers.formatters.schema_formatter import format_schema_to_markdown 6 | from datetime import datetime 7 | 8 | logger = get_logger(__name__) 9 | 10 | 11 | class TableClient: 12 | def __init__(self, client: FabricApiClient): 13 | self.client = client 14 | 15 | async def list_tables( 16 | self, workspace_id: str, rsc_id: str, rsc_type: str = "lakehouse" 17 | ): 18 | """List all tables in a lakehouse.""" 19 | tables = await self.client.get_tables(workspace_id, rsc_id, rsc_type) 20 | 21 | if not tables: 22 | return f"No tables found in {rsc_type} '{rsc_id}'." 
23 | 24 | return tables 25 | 26 | async def get_table_schema( 27 | self, 28 | workspace: str, 29 | rsc_id: str, 30 | rsc_type: str, 31 | table_name: str, 32 | credential: DefaultAzureCredential, 33 | ): 34 | """Retrieve schema for a specific table.""" 35 | 36 | tables = await self.list_tables(workspace, rsc_id, rsc_type) 37 | 38 | # Find the specific table 39 | matching_tables = [t for t in tables if t["name"].lower() == table_name.lower()] 40 | 41 | if not matching_tables: 42 | return f"No table found with name '{table_name}' in {rsc_type} '{rsc_id}'." 43 | 44 | table = matching_tables[0] 45 | 46 | # Check that it is a Delta table 47 | if table["format"].lower() != "delta": 48 | return f"The table '{table_name}' is not a Delta table (format: {table['format']})." 49 | 50 | # Get schema 51 | delta_tables = await get_delta_schemas([table], credential) 52 | 53 | if not delta_tables: 54 | return f"Could not retrieve schema for table '{table['name']}'." 55 | 56 | # Format result as markdown 57 | table_info, schema, metadata = delta_tables[0] 58 | markdown = format_schema_to_markdown(table_info, schema, metadata) 59 | 60 | return markdown 61 | 62 | async def get_all_schemas( 63 | self, 64 | workspace: str, 65 | rsc_id: str, 66 | rsc_type: str, 67 | credential: DefaultAzureCredential, 68 | ): 69 | """Get schemas for all Delta tables in a Fabric lakehouse.""" 70 | # Get all tables 71 | tables = await self.list_tables(workspace, rsc_id, rsc_type) 72 | 73 | if isinstance(tables, str): 74 | return tables 75 | 76 | if not tables: 77 | return f"No tables found in {rsc_type} '{rsc_id}'." 78 | 79 | # Filter to only Delta tables 80 | delta_format_tables = [t for t in tables if t["format"].lower() == "delta"] 81 | 82 | if not delta_format_tables: 83 | return f"No Delta tables found in {rsc_type} '{rsc_id}'." 84 | 85 | # Get schema for all tables 86 | delta_tables = await get_delta_schemas(delta_format_tables, credential) 87 | 88 | if not delta_tables: 89 | return "Could not retrieve schemas for any tables." 
90 | 91 | # Format the result as markdown 92 | markdown = "# Delta Table Schemas\n\n" 93 | markdown += f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" 94 | markdown += f"Workspace: {workspace}\n" 95 | markdown += f"Lakehouse: {rsc_id}\n\n" 96 | 97 | for table_info, schema, metadata in delta_tables: 98 | markdown += format_schema_to_markdown(table_info, schema, metadata) 99 | 100 | return markdown 101 | -------------------------------------------------------------------------------- /test_notebook_creation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test script to validate the notebook creation fixes 4 | """ 5 | import asyncio 6 | import sys 7 | import json 8 | from helpers.clients.fabric_client import FabricApiClient 9 | from helpers.clients.notebook_client import NotebookClient 10 | from helpers.utils.authentication import get_azure_credentials 11 | from helpers.logging_config import get_logger 12 | 13 | logger = get_logger(__name__) 14 | 15 | async def test_notebook_creation(): 16 | """Test notebook creation with improved error handling""" 17 | try: 18 | # Initialize clients 19 | credentials = get_azure_credentials("test-client-id", {}) 20 | fabric_client = FabricApiClient(credentials) 21 | notebook_client = NotebookClient(fabric_client) 22 | 23 | # Test workspace - using "My workspace" 24 | workspace_id = "645f0acc-fd1e-42fe-ae6e-e919b6c63322" 25 | notebook_name = "Test Debug Notebook" 26 | 27 | # Create a simple notebook content 28 | notebook_json = { 29 | "nbformat": 4, 30 | "nbformat_minor": 5, 31 | "cells": [ 32 | { 33 | "cell_type": "code", 34 | "source": ["print('Hello, Fabric!')\n"], 35 | "execution_count": None, 36 | "outputs": [], 37 | "metadata": {}, 38 | } 39 | ], 40 | "metadata": {"language_info": {"name": "python"}}, 41 | } 42 | notebook_content = json.dumps(notebook_json) 43 | 44 | print(f"Testing notebook creation in workspace: {workspace_id}") 45 | print(f"Notebook name: {notebook_name}") 46 | 47 | # Test the notebook creation 48 | result = await notebook_client.create_notebook( 49 | workspace=workspace_id, 50 | notebook_name=notebook_name, 51 | content=notebook_content 52 | ) 53 | 54 | print(f"Result: {result}") 55 | 56 | if isinstance(result, dict) and result.get("id"): 57 | print(f"✅ SUCCESS: Created notebook with ID: {result['id']}") 58 | return True 59 | else: 60 | print(f"❌ FAILED: {result}") 61 | return False 62 | 63 | except Exception as e: 64 | print(f"❌ ERROR: {str(e)}") 65 | logger.error(f"Test failed: {str(e)}", exc_info=True) 66 | return False 67 | 68 | async def test_workspace_resolution(): 69 | """Test workspace name resolution""" 70 | try: 71 | credentials = get_azure_credentials("test-client-id", {}) 72 | fabric_client = FabricApiClient(credentials) 73 | 74 | # Test workspace resolution 75 | workspace_name, workspace_id = await fabric_client.resolve_workspace_name_and_id("My workspace") 76 | print(f"✅ Workspace resolution: '{workspace_name}' -> {workspace_id}") 77 | return True 78 | 79 | except Exception as e: 80 | print(f"❌ Workspace resolution failed: {str(e)}") 81 | return False 82 | 83 | if __name__ == "__main__": 84 | print("=" * 50) 85 | print("Testing Fabric MCP Notebook Creation Fixes") 86 | print("=" * 50) 87 | 88 | # Test workspace resolution first 89 | print("\n1. Testing workspace resolution...") 90 | success1 = asyncio.run(test_workspace_resolution()) 91 | 92 | # Test notebook creation 93 | print("\n2. 
Testing notebook creation...") 94 | success2 = asyncio.run(test_notebook_creation()) 95 | 96 | print("\n" + "=" * 50) 97 | if success1 and success2: 98 | print("✅ ALL TESTS PASSED") 99 | sys.exit(0) 100 | else: 101 | print("❌ SOME TESTS FAILED") 102 | sys.exit(1) 103 | -------------------------------------------------------------------------------- /helpers/clients/semanticModel_client.py: -------------------------------------------------------------------------------- 1 | from helpers.logging_config import get_logger 2 | from helpers.clients.fabric_client import FabricApiClient 3 | 4 | logger = get_logger(__name__) 5 | 6 | 7 | class SemanticModelClient: 8 | def __init__(self, client: FabricApiClient): 9 | self.client = client 10 | 11 | async def list_semantic_models(self, workspace_id: str): 12 | """List all semantic models in a workspace.""" 13 | models = await self.client.get_semantic_models(workspace_id) 14 | 15 | if not models: 16 | return f"No semantic models found in workspace '{workspace_id}'." 17 | 18 | return models 19 | 20 | async def get_semantic_model(self, workspace_id: str, model_id: str): 21 | """Get a specific semantic model by ID.""" 22 | model = await self.client.get_semantic_model(workspace_id, model_id) 23 | 24 | if not model: 25 | return f"No semantic model found with ID '{model_id}' in workspace '{workspace_id}'." 26 | 27 | return model 28 | 29 | # async def get_model_schema( 30 | # self, 31 | # workspace: str, 32 | # rsc_id: str, 33 | # rsc_type: str, 34 | # table_name: str, 35 | # credential: DefaultAzureCredential, 36 | # ): 37 | # """Retrieve schema for a specific model.""" 38 | 39 | # models = await self.list_semantic_models(workspace) 40 | 41 | # # Find the specific table 42 | # matching_tables = [t for t in tables if t["name"].lower() == table_name.lower()] 43 | 44 | # if not matching_tables: 45 | # return f"No table found with name '{table_name}' in {rsc_type} '{rsc_id}'." 46 | 47 | # table = matching_tables[0] 48 | 49 | # # Check that it is a Delta table 50 | # if table["format"].lower() != "delta": 51 | # return f"The table '{table_name}' is not a Delta table (format: {table['format']})." 52 | 53 | # # Get schema 54 | # delta_tables = await get_delta_schemas([table], credential) 55 | 56 | # if not delta_tables: 57 | # return f"Could not retrieve schema for table '{table}'." 58 | 59 | # # Format result as markdown 60 | # table_info, schema, metadata = delta_tables[0] 61 | # markdown = format_schema_to_markdown(table_info, schema, metadata) 62 | 63 | # return markdown 64 | 65 | # async def get_all_schemas( 66 | # self, 67 | # workspace: str, 68 | # rsc_id: str, 69 | # rsc_type: str, 70 | # credential: DefaultAzureCredential, 71 | # ): 72 | # """Get schemas for all Delta tables in a Fabric lakehouse.""" 73 | # # Get all tables 74 | # tables = await self.list_tables(workspace, rsc_id, rsc_type) 75 | 76 | # if isinstance(tables, str): 77 | # return tables 78 | 79 | # if not tables: 80 | # return f"No tables found in {rsc_type} '{rsc_id}'." 81 | 82 | # # Filter to only Delta tables 83 | # delta_format_tables = [t for t in tables if t["format"].lower() == "delta"] 84 | 85 | # if not delta_format_tables: 86 | # return f"No Delta tables found in {rsc_type} '{rsc_id}'." 87 | 88 | # # Get schema for all tables 89 | # delta_tables = await get_delta_schemas(delta_format_tables, credential) 90 | 91 | # logger.debug(f"Delta Tables response: {tables}") 92 | # if not delta_tables: 93 | # return "Could not retrieve schemas for any tables." 
94 | 95 | # # Format the result as markdown 96 | # markdown = "# Delta Table Schemas\n\n" 97 | # markdown += f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n" 98 | # markdown += f"Workspace: {workspace}\n" 99 | # markdown += f"Lakehouse: {rsc_id}\n\n" 100 | 101 | # for table_info, schema, metadata in delta_tables: 102 | # markdown += format_schema_to_markdown(table_info, schema, metadata) 103 | 104 | # return markdown 105 | -------------------------------------------------------------------------------- /tools/lakehouse.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | LakehouseClient, 7 | ) 8 | from helpers.logging_config import get_logger 9 | 10 | # import sempy_labs as labs 11 | # import sempy_labs.lakehouse as slh 12 | 13 | from typing import Optional 14 | 15 | logger = get_logger(__name__) 16 | 17 | 18 | @mcp.tool() 19 | async def set_lakehouse(lakehouse: str, ctx: Context) -> str: 20 | """Set the current lakehouse for the session. 21 | 22 | Args: 23 | lakehouse: Name or ID of the lakehouse 24 | ctx: Context object containing client information 25 | 26 | Returns: 27 | A string confirming the lakehouse has been set. 28 | """ 29 | __ctx_cache[f"{ctx.client_id}_lakehouse"] = lakehouse 30 | return f"Lakehouse set to '{lakehouse}'." 31 | 32 | 33 | @mcp.tool() 34 | async def list_lakehouses(workspace: Optional[str] = None, ctx: Context = None) -> str: 35 | """List all lakehouses in a Fabric workspace. 36 | 37 | Args: 38 | workspace: Name or ID of the workspace (optional) 39 | ctx: Context object containing client information 40 | 41 | Returns: 42 | A string containing the list of lakehouses or an error message. 43 | """ 44 | try: 45 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 46 | fabric_client = FabricApiClient(credential=credential) 47 | lakehouse_client = LakehouseClient(client=fabric_client) 48 | ws = workspace or __ctx_cache.get(f"{ctx.client_id}_workspace") 49 | if not ws: 50 | return "Workspace not set. Please set a workspace using the 'set_workspace' command." 51 | return await lakehouse_client.list_lakehouses(workspace=ws) 52 | except Exception as e: 53 | logger.error(f"Error listing lakehouses: {e}") 54 | return f"Error listing lakehouses: {e}" 55 | 56 | 57 | # @mcp.tool() 58 | # async def list_lakehouses_semantic_link(workspace: Optional[str] = None, ctx: Context = None) -> str: 59 | # """List all lakehouses in a Fabric workspace using semantic-link-labs.""" 60 | # try: 61 | # manager = LakehouseManager() 62 | # lakehouses = manager.list_lakehouses(workspace_id=workspace or __ctx_cache.get(f"{ctx.client_id}_workspace")) 63 | # markdown = f"# Lakehouses (semantic-link-labs) in workspace '{workspace}'\n\n" 64 | # markdown += "| ID | Name |\n" 65 | # markdown += "|-----|------|\n" 66 | # for lh in lakehouses: 67 | # markdown += f"| {lh.get('id', 'N/A')} | {lh.get('displayName', 'N/A')} |\n" 68 | # return markdown 69 | # except Exception as e: 70 | # return f"Error listing lakehouses with semantic-link-labs: {str(e)}" 71 | 72 | 73 | @mcp.tool() 74 | async def create_lakehouse( 75 | name: str, 76 | workspace: Optional[str] = None, 77 | description: Optional[str] = None, 78 | ctx: Context = None, 79 | ) -> str: 80 | """Create a new lakehouse in a Fabric workspace. 
81 | 82 | Args: 83 | name: Name of the lakehouse 84 | workspace: Name or ID of the workspace (optional) 85 | description: Description of the lakehouse (optional) 86 | ctx: Context object containing client information 87 | Returns: 88 | A string confirming the lakehouse has been created or an error message. 89 | """ 90 | try: 91 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 92 | fabric_client = FabricApiClient(credential=credential) 93 | lakehouse_client = LakehouseClient(client=fabric_client) 94 | ws = workspace or __ctx_cache.get(f"{ctx.client_id}_workspace") 95 | if not ws: 96 | return "Workspace not set. Please set a workspace using the 'set_workspace' command." 97 | return await lakehouse_client.create_lakehouse( 98 | name=name, workspace=ws, description=description 99 | ) 100 | except Exception as e: 101 | logger.error(f"Error creating lakehouse: {e}") 102 | return f"Error creating lakehouse: {e}" 103 | -------------------------------------------------------------------------------- /helpers/clients/sql_client.py: -------------------------------------------------------------------------------- 1 | import polars as pl 2 | from sqlalchemy import create_engine, Engine 3 | from itertools import chain, repeat 4 | import urllib 5 | import struct 6 | from typing import Optional 7 | from azure.identity import DefaultAzureCredential 8 | from helpers.clients import FabricApiClient, LakehouseClient, WarehouseClient 9 | 10 | 11 | # prepare connection string 12 | sql_endpoint = "lkxke5qat5vu7fpnluz5o7cnme-qlbrb7caj77uthvfhqdxwd5v54.datawarehouse.fabric.microsoft.com" 13 | database = "EDR_WAREHOUSE" 14 | DRIVER = "{{ODBC Driver 18 for SQL Server}}" 15 | 16 | 17 | def get_sqlalchemy_connection_string(driver: str, server: str, database: str) -> Engine: 18 | """ 19 | Constructs a SQLAlchemy connection string based on the provided parameters. 20 | 21 | Args: 22 | driver (str): The database driver (e.g., 'mssql+pyodbc'). 23 | server (str): The server address. 24 | database (str): The database name. 25 | 26 | Returns: 27 | Engine: A SQLAlchemy engine object. 28 | """ 29 | connection_string = f"Driver={{ODBC Driver 18 for SQL Server}};Server={sql_endpoint},1433;Database={database};Encrypt=Yes;TrustServerCertificate=No" 30 | params = urllib.parse.quote(connection_string) 31 | # authentication 32 | resource_url = "https://database.windows.net/.default" 33 | azure_credentials = DefaultAzureCredential() 34 | token_object = azure_credentials.get_token(resource_url) 35 | # Retrieve an access token 36 | token_as_bytes = bytes( 37 | token_object.token, "UTF-8" 38 | ) # Convert the token to a UTF-8 byte string 39 | encoded_bytes = bytes( 40 | chain.from_iterable(zip(token_as_bytes, repeat(0))) 41 | ) # Encode the bytes to a Windows byte string 42 | token_bytes = ( 43 | struct.pack(" tuple: 63 | """ 64 | Retrieve the SQL endpoint for a specified lakehouse or warehouse. 65 | 66 | Args: 67 | lakehouse: Name or ID of the lakehouse (optional). 68 | warehouse: Name or ID of the warehouse (optional). 69 | type: Type of resource ('lakehouse' or 'warehouse'). 70 | workspace: Name or ID of the workspace (optional). 71 | Returns: 72 | A tuple (database, sql_endpoint) or (None, error_message) in case of error. 
73 | """ 74 | try: 75 | credential = DefaultAzureCredential() 76 | fabClient = FabricApiClient(credential) 77 | resource_name = None 78 | endpoint = None 79 | workspace_name, workspace_id = await fabClient.resolve_workspace_name_and_id( 80 | workspace 81 | ) 82 | if type and type.lower() == "lakehouse": 83 | client = LakehouseClient(fabClient) 84 | resource_name, resource_id = await fabClient.resolve_item_name_and_id( 85 | workspace=workspace_id, item=lakehouse, type="Lakehouse" 86 | ) 87 | lakehouse_obj = await client.get_lakehouse( 88 | workspace=workspace, lakehouse=resource_id 89 | ) 90 | endpoint = ( 91 | lakehouse_obj.get("properties", {}) 92 | .get("sqlEndpointProperties", {}) 93 | .get("connectionString") 94 | ) 95 | elif type and type.lower() == "warehouse": 96 | client = WarehouseClient(fabClient) 97 | resource_name, resource_id = await fabClient.resolve_item_name_and_id( 98 | workspace=workspace_id, item=warehouse, type="Warehouse" 99 | ) 100 | warehouse_obj = await client.get_warehouse( 101 | workspace=workspace, warehouse=resource_id 102 | ) 103 | endpoint = warehouse_obj.get("properties", {}).get("connectionString") 104 | if resource_name and endpoint: 105 | return resource_name, endpoint 106 | else: 107 | return ( 108 | None, 109 | f"No SQL endpoint found for {type} '{lakehouse or warehouse}' in workspace '{workspace}'.", 110 | ) 111 | except Exception as e: 112 | return None, f"Error retrieving SQL endpoint: {str(e)}" 113 | 114 | 115 | class SQLClient: 116 | def __init__(self, sql_endpoint: str, database: str): 117 | self.engine = get_sqlalchemy_connection_string(DRIVER, sql_endpoint, database) 118 | 119 | def run_query(self, query: str) -> pl.DataFrame: 120 | return pl.read_database(query, connection=self.engine) 121 | 122 | def load_data(self, df: pl.DataFrame, table_name: str, if_exists: str = "append"): 123 | pdf = df.to_pandas() 124 | pdf.to_sql(table_name, con=self.engine, if_exists=if_exists, index=False) 125 | -------------------------------------------------------------------------------- /docs/architecture.md: -------------------------------------------------------------------------------- 1 | # Microsoft Fabric MCP Architecture with LLM Integration 2 | 3 | ## Complete Architecture Diagram 4 | 5 | ```mermaid 6 | graph TB 7 | subgraph "Developer Environment" 8 | IDE[IDE/VSCode] 9 | DEV[Developer] 10 | PROJ[Project Files] 11 | end 12 | 13 | subgraph "AI Layer" 14 | LLM[Large Language Model
Claude/GPT/etc.] 15 | CONTEXT[Conversation Context] 16 | REASONING[AI Reasoning Engine] 17 | end 18 | 19 | subgraph "MCP Layer" 20 | MCP[MCP Server] 21 | TOOLS[PySpark Tools] 22 | HELPERS[PySpark Helpers] 23 | TEMPLATES[Template Manager] 24 | VALIDATORS[Code Validators] 25 | GENERATORS[Code Generators] 26 | end 27 | 28 | subgraph "Microsoft Fabric" 29 | API[Fabric API] 30 | WS[Workspace] 31 | LH[Lakehouse] 32 | NB[Notebooks] 33 | TABLES[Delta Tables] 34 | SPARK[Spark Clusters] 35 | end 36 | 37 | subgraph "Operations Flow" 38 | CREATE[Create Notebooks] 39 | VALIDATE[Validate Code] 40 | GENERATE[Generate Code] 41 | ANALYZE[Analyze Performance] 42 | DEPLOY[Deploy to Fabric] 43 | end 44 | 45 | %% Developer interactions 46 | DEV --> IDE 47 | IDE --> PROJ 48 | 49 | %% LLM interactions 50 | IDE <--> LLM 51 | LLM <--> CONTEXT 52 | LLM --> REASONING 53 | 54 | %% MCP interactions 55 | LLM <--> MCP 56 | MCP --> TOOLS 57 | TOOLS --> HELPERS 58 | TOOLS --> TEMPLATES 59 | TOOLS --> VALIDATORS 60 | TOOLS --> GENERATORS 61 | 62 | %% Fabric interactions 63 | MCP <--> API 64 | API --> WS 65 | WS --> LH 66 | WS --> NB 67 | LH --> TABLES 68 | NB --> SPARK 69 | 70 | %% Operation flows 71 | TOOLS --> CREATE 72 | TOOLS --> VALIDATE 73 | TOOLS --> GENERATE 74 | TOOLS --> ANALYZE 75 | CREATE --> DEPLOY 76 | 77 | %% Data flow arrows 78 | REASONING -.->|"Intelligent Decisions"| TOOLS 79 | CONTEXT -.->|"Project Awareness"| VALIDATORS 80 | 81 | %% Styling 82 | classDef devEnv fill:#e1f5fe 83 | classDef aiLayer fill:#fff9c4 84 | classDef mcpLayer fill:#f3e5f5 85 | classDef fabricLayer fill:#e8f5e8 86 | classDef operations fill:#fff3e0 87 | 88 | class IDE,DEV,PROJ devEnv 89 | class LLM,CONTEXT,REASONING aiLayer 90 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer 91 | class API,WS,LH,NB,TABLES,SPARK fabricLayer 92 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations 93 | ``` 94 | 95 | ## Architecture Components 96 | 97 | ### **1. Developer Environment** 98 | - **IDE/VSCode**: Primary development interface with MCP integration 99 | - **Developer**: Data engineer/scientist working on PySpark projects 100 | - **Project Files**: Local project structure and configuration 101 | 102 | ### **2. AI Layer** 103 | - **Large Language Model**: Claude, GPT, or other LLM providing intelligent assistance 104 | - **Conversation Context**: Maintains project context and conversation history 105 | - **AI Reasoning Engine**: Makes intelligent decisions about code generation and optimization 106 | 107 | ### **3. MCP Layer (This Server)** 108 | - **MCP Server**: Core server handling tool requests from the LLM 109 | - **PySpark Tools**: 11 specialized tools for notebook operations 110 | - **PySpark Helpers**: Template management and code generation 111 | - **Template Manager**: Pre-built notebook templates for different scenarios 112 | - **Code Validators**: Syntax, best practices, and Fabric compatibility checks 113 | - **Code Generators**: Intelligent PySpark code generation 114 | 115 | ### **4. Microsoft Fabric** 116 | - **Fabric API**: REST API for all Fabric operations 117 | - **Workspace**: Fabric workspace containing resources 118 | - **Lakehouse**: Data storage with Delta Lake tables 119 | - **Notebooks**: PySpark notebooks for data processing 120 | - **Delta Tables**: Structured data storage 121 | - **Spark Clusters**: Compute resources for PySpark execution 122 | 123 | ### **5. 
Operations Flow** 124 | - **Create Notebooks**: Generate notebooks from templates 125 | - **Validate Code**: Check syntax, performance, and compatibility 126 | - **Generate Code**: Create PySpark snippets for common operations 127 | - **Analyze Performance**: Evaluate and optimize notebook performance 128 | - **Deploy to Fabric**: Push notebooks and execute in Fabric environment 129 | 130 | ## Enhanced Interaction Flow with LLM 131 | 132 | 1. **Developer requests PySpark assistance in IDE** 133 | 2. **IDE communicates with LLM (Claude/GPT)** 134 | 3. **LLM analyzes request using conversation context and reasoning** 135 | 4. **LLM calls MCP server tools based on intelligent analysis** 136 | 5. **MCP server processes request using specialized tools** 137 | 6. **Tools utilize helpers, templates, and validators** 138 | 7. **MCP server calls Fabric API for operations** 139 | 8. **Results flow back through MCP to LLM** 140 | 9. **LLM processes and formats results intelligently** 141 | 10. **Developer receives contextual, intelligent responses in IDE** 142 | 143 | ## Key Benefits of LLM Integration 144 | 145 | ### **Intelligent Decision Making** 146 | - LLM analyzes developer intent and context 147 | - Chooses appropriate tools and templates automatically 148 | - Provides contextual recommendations based on project history 149 | 150 | ### **Natural Language Interface** 151 | - Developers can request features in natural language 152 | - LLM translates requests to appropriate MCP tool calls 153 | - Reduces need to remember specific tool names and parameters 154 | 155 | ### **Context Awareness** 156 | - LLM maintains conversation history and project context 157 | - Provides consistent recommendations across sessions 158 | - Learns from previous interactions and code patterns 159 | 160 | ### **Enhanced Code Generation** 161 | - LLM combines multiple tool outputs intelligently 162 | - Provides explanations and documentation with generated code 163 | - Adapts to developer's coding style and preferences 164 | 165 | ## Example LLM-Enhanced Workflows 166 | 167 | ### **Scenario 1: Natural Language Request** 168 | ``` 169 | Developer: "Help me create a PySpark notebook that reads sales data from our lakehouse, 170 | cleans it, and creates a summary table with performance optimization." 171 | 172 | LLM Process: 173 | 1. Analyzes intent: notebook creation + data processing + optimization 174 | 2. Calls create_fabric_notebook() with ETL template 175 | 3. Calls generate_fabric_code() for lakehouse reading 176 | 4. Calls validate_fabric_code() for optimization checks 177 | 5. Provides complete solution with explanations 178 | ``` 179 | 180 | ### **Scenario 2: Performance Optimization** 181 | ``` 182 | Developer: "My PySpark notebook is running slowly. Can you help optimize it?" 183 | 184 | LLM Process: 185 | 1. Calls analyze_notebook_performance() on current notebook 186 | 2. Calls validate_fabric_code() for anti-pattern detection 187 | 3. Calls generate_fabric_code() for optimized alternatives 188 | 4. Provides detailed optimization report with before/after comparisons 189 | ``` 190 | 191 | ### **Scenario 3: Best Practices Guidance** 192 | ``` 193 | Developer: "Is this PySpark code following Fabric best practices?" 194 | 195 | LLM Process: 196 | 1. Calls validate_fabric_code() for compatibility checks 197 | 2. Analyzes results with reasoning engine 198 | 3. Provides detailed feedback with specific recommendations 199 | 4. 
Suggests alternative approaches using generate_fabric_code() 200 | ``` 201 | 202 | This architecture leverages the power of LLMs to provide intelligent, context-aware assistance while utilizing specialized MCP tools for precise Fabric operations! 203 | -------------------------------------------------------------------------------- /tools/table.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | TableClient, 7 | SQLClient, 8 | get_sql_endpoint, 9 | ) 10 | 11 | from typing import Optional 12 | from helpers.logging_config import get_logger 13 | 14 | logger = get_logger(__name__) 15 | 16 | 17 | @mcp.tool() 18 | async def set_table(table_name: str, ctx: Context) -> str: 19 | """Set the current table for the session. 20 | 21 | Args: 22 | table_name: Name of the table to set 23 | ctx: Context object containing client information 24 | 25 | Returns: 26 | A string confirming the table has been set. 27 | """ 28 | __ctx_cache[f"{ctx.client_id}_table"] = table_name 29 | return f"Table set to '{table_name}'." 30 | 31 | 32 | @mcp.tool() 33 | async def list_tables( 34 | workspace: Optional[str] = None, 35 | lakehouse: Optional[str] = None, 36 | ctx: Context = None, 37 | ) -> str: 38 | """List all tables in a Fabric workspace. 39 | 40 | Args: 41 | workspace: Name or ID of the workspace (optional) 42 | lakehouse: Name or ID of the lakehouse (optional) 43 | ctx: Context object containing client information 44 | 45 | Returns: 46 | A string containing the list of tables or an error message. 47 | """ 48 | try: 49 | client = TableClient( 50 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 51 | ) 52 | 53 | tables = await client.list_tables( 54 | workspace_id=workspace 55 | if workspace 56 | else __ctx_cache[f"{ctx.client_id}_workspace"], 57 | rsc_id=lakehouse 58 | if lakehouse 59 | else __ctx_cache[f"{ctx.client_id}_lakehouse"], 60 | ) 61 | 62 | return tables 63 | 64 | except Exception as e: 65 | return f"Error listing tables: {str(e)}" 66 | 67 | 68 | @mcp.tool() 69 | async def get_lakehouse_table_schema( 70 | workspace: Optional[str], 71 | lakehouse: Optional[str], 72 | table_name: str = None, 73 | ctx: Context = None, 74 | ) -> str: 75 | """Get schema for a specific table in a Fabric lakehouse. 76 | 77 | Args: 78 | workspace: Name or ID of the workspace 79 | lakehouse: Name or ID of the lakehouse 80 | table_name: Name of the table to retrieve 81 | ctx: Context object containing client information 82 | 83 | Returns: 84 | A string containing the schema of the specified table or an error message. 85 | """ 86 | try: 87 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 88 | client = TableClient(FabricApiClient(credential)) 89 | 90 | if table_name is None: 91 | return "Table name must be specified." 92 | if lakehouse is None: 93 | if f"{ctx.client_id}_lakehouse" in __ctx_cache: 94 | lakehouse = __ctx_cache[f"{ctx.client_id}_lakehouse"] 95 | else: 96 | return "Lakehouse must be specified or set in the context." 97 | 98 | if workspace is None: 99 | if f"{ctx.client_id}_workspace" in __ctx_cache: 100 | workspace = __ctx_cache[f"{ctx.client_id}_workspace"] 101 | else: 102 | return "Workspace must be specified or set in the context." 
103 | 104 | schema = await client.get_table_schema( 105 | workspace, lakehouse, "lakehouse", table_name, credential 106 | ) 107 | 108 | return schema 109 | 110 | except Exception as e: 111 | return f"Error retrieving table schema: {str(e)}" 112 | 113 | 114 | @mcp.tool() 115 | async def get_all_lakehouse_schemas( 116 | lakehouse: Optional[str], workspace: Optional[str] = None, ctx: Context = None 117 | ) -> str: 118 | """Get schemas for all Delta tables in a Fabric lakehouse. 119 | 120 | Args: 121 | workspace: Name or ID of the workspace 122 | lakehouse: Name or ID of the lakehouse 123 | ctx: Context object containing client information 124 | 125 | Returns: 126 | A string containing the schemas of all Delta tables or an error message. 127 | """ 128 | try: 129 | credential = get_azure_credentials(ctx.client_id, __ctx_cache) 130 | client = TableClient(FabricApiClient(credential)) 131 | 132 | if workspace is None: 133 | if f"{ctx.client_id}_workspace" in __ctx_cache: 134 | workspace = __ctx_cache[f"{ctx.client_id}_workspace"] 135 | else: 136 | return "Workspace must be specified or set in the context." 137 | if lakehouse is None: 138 | if f"{ctx.client_id}_lakehouse" in __ctx_cache: 139 | lakehouse = __ctx_cache[f"{ctx.client_id}_lakehouse"] 140 | else: 141 | return "Lakehouse must be specified or set in the context." 142 | schemas = await client.get_all_schemas( 143 | workspace, lakehouse, "lakehouse", credential 144 | ) 145 | 146 | return schemas 147 | 148 | except Exception as e: 149 | return f"Error retrieving table schemas: {str(e)}" 150 | 151 | 152 | @mcp.tool() 153 | async def run_query( 154 | workspace: Optional[str] = None, 155 | lakehouse: Optional[str] = None, 156 | warehouse: Optional[str] = None, 157 | query: str = None, 158 | type: Optional[str] = None, # Add type hint for 'type' 159 | ctx: Context = None, 160 | ) -> str: 161 | """Read data from a table in a warehouse or lakehouse. 162 | 163 | Args: 164 | workspace: Name or ID of the workspace (optional). 165 | lakehouse: Name or ID of the lakehouse (optional). 166 | warehouse: Name or ID of the warehouse (optional). 167 | query: The SQL query to execute. 168 | type: Type of resource ('lakehouse' or 'warehouse'). If not provided, it will be inferred. 169 | ctx: Context object containing client information. 170 | Returns: 171 | A string confirming the data read or an error message. 172 | """ 173 | try: 174 | if ctx is None: 175 | raise ValueError("Context (ctx) must be provided.") 176 | if query is None: 177 | raise ValueError("Query must be specified.") 178 | # Always resolve the SQL endpoint and database name 179 | database, sql_endpoint = await get_sql_endpoint( 180 | workspace=workspace, 181 | lakehouse=lakehouse, 182 | warehouse=warehouse, 183 | type=type, 184 | ) 185 | if ( 186 | not database 187 | or not sql_endpoint 188 | or sql_endpoint.startswith("Error") 189 | or sql_endpoint.startswith("No SQL endpoint") 190 | ): 191 | return f"Failed to resolve SQL endpoint: {sql_endpoint}" 192 | logger.info(f"Running query '{query}' on SQL endpoint {sql_endpoint}") 193 | client = SQLClient(sql_endpoint=sql_endpoint, database=database) 194 | df = client.run_query(query) 195 | if df.is_empty(): 196 | return f"No data found for query '{query}'." 
197 | 198 | # Convert to markdown for user-friendly display 199 | 200 | # markdown = f"### Query: {query} (shape: {df.shape})\n\n" 201 | # with pl.Config() as cfg: 202 | # cfg.set_tbl_formatting('ASCII_MARKDOWN') 203 | # display(Markdown(repr(df))) 204 | # markdown += f"\n\n### Data Preview:\n\n" 205 | # markdown += df.head(10).to_pandas().to_markdown(index=False) 206 | # markdown += f"\n\nColumns: {', '.join(df.columns)}" 207 | return df.to_dict() # Return the DataFrame as a dictionary for easier handling 208 | except Exception as e: 209 | logger.error(f"Error reading data: {str(e)}") 210 | return f"Error reading data: {str(e)}" 211 | -------------------------------------------------------------------------------- /test_security.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test script for the secure MCP server. 3 | Validates authentication, authorization, and security features. 4 | """ 5 | 6 | import requests 7 | import json 8 | import time 9 | import subprocess 10 | import threading 11 | from typing import Optional 12 | 13 | class SecurityTester: 14 | """Test suite for MCP server security.""" 15 | 16 | def __init__(self, base_url: str = "http://localhost:8081"): 17 | self.base_url = base_url.rstrip('/') 18 | self.session = requests.Session() 19 | self.session.verify = False # For self-signed certificates 20 | 21 | def test_health_check(self) -> bool: 22 | """Test health check endpoint.""" 23 | try: 24 | response = self.session.get(f"{self.base_url}/health", timeout=5) 25 | response.raise_for_status() 26 | health_data = response.json() 27 | print(f"✅ Health check passed: {health_data}") 28 | return True 29 | except Exception as e: 30 | print(f"❌ Health check failed: {e}") 31 | return False 32 | 33 | def test_unauthenticated_access(self) -> bool: 34 | """Test that unauthenticated requests are rejected.""" 35 | try: 36 | mcp_request = { 37 | "jsonrpc": "2.0", 38 | "id": 1, 39 | "method": "tools/list" 40 | } 41 | 42 | response = self.session.post( 43 | f"{self.base_url}/mcp", 44 | json=mcp_request, 45 | timeout=5 46 | ) 47 | 48 | if response.status_code == 401: 49 | print("✅ Unauthenticated access properly rejected") 50 | return True 51 | else: 52 | print(f"❌ Unauthenticated access allowed (status: {response.status_code})") 53 | return False 54 | 55 | except Exception as e: 56 | print(f"❌ Error testing unauthenticated access: {e}") 57 | return False 58 | 59 | def test_authentication(self, username: str = "admin", password: str = "changeme") -> Optional[str]: 60 | """Test username/password authentication.""" 61 | try: 62 | response = self.session.post( 63 | f"{self.base_url}/auth/login", 64 | json={"username": username, "password": password}, 65 | timeout=5 66 | ) 67 | 68 | if response.status_code == 200: 69 | token_data = response.json() 70 | token = token_data.get("access_token") 71 | print(f"✅ Authentication successful: {username}") 72 | return token 73 | else: 74 | print(f"❌ Authentication failed: {response.status_code} - {response.text}") 75 | return None 76 | 77 | except Exception as e: 78 | print(f"❌ Authentication error: {e}") 79 | return None 80 | 81 | def test_authenticated_access(self, token: str) -> bool: 82 | """Test authenticated MCP tool access.""" 83 | try: 84 | headers = {"Authorization": f"Bearer {token}"} 85 | mcp_request = { 86 | "jsonrpc": "2.0", 87 | "id": 1, 88 | "method": "tools/list" 89 | } 90 | 91 | response = self.session.post( 92 | f"{self.base_url}/mcp", 93 | json=mcp_request, 94 | headers=headers, 95 | timeout=5 
96 | ) 97 | 98 | if response.status_code == 200: 99 | result = response.json() 100 | print(f"✅ Authenticated access successful") 101 | if 'result' in result and 'tools' in result['result']: 102 | tools = result['result']['tools'] 103 | print(f" Available tools: {[t['name'] for t in tools]}") 104 | return True 105 | else: 106 | print(f"❌ Authenticated access failed: {response.status_code}") 107 | return False 108 | 109 | except Exception as e: 110 | print(f"❌ Authenticated access error: {e}") 111 | return False 112 | 113 | def test_token_verification(self, token: str) -> bool: 114 | """Test token verification endpoint.""" 115 | try: 116 | headers = {"Authorization": f"Bearer {token}"} 117 | response = self.session.get( 118 | f"{self.base_url}/auth/verify", 119 | headers=headers, 120 | timeout=5 121 | ) 122 | 123 | if response.status_code == 200: 124 | verify_data = response.json() 125 | print(f"✅ Token verification passed: {verify_data}") 126 | return True 127 | else: 128 | print(f"❌ Token verification failed: {response.status_code}") 129 | return False 130 | 131 | except Exception as e: 132 | print(f"❌ Token verification error: {e}") 133 | return False 134 | 135 | def test_invalid_credentials(self) -> bool: 136 | """Test that invalid credentials are rejected.""" 137 | try: 138 | response = self.session.post( 139 | f"{self.base_url}/auth/login", 140 | json={"username": "invalid", "password": "invalid"}, 141 | timeout=5 142 | ) 143 | 144 | if response.status_code == 401: 145 | print("✅ Invalid credentials properly rejected") 146 | return True 147 | else: 148 | print(f"❌ Invalid credentials accepted (status: {response.status_code})") 149 | return False 150 | 151 | except Exception as e: 152 | print(f"❌ Error testing invalid credentials: {e}") 153 | return False 154 | 155 | def test_rate_limiting(self, token: str) -> bool: 156 | """Test rate limiting functionality.""" 157 | try: 158 | headers = {"Authorization": f"Bearer {token}"} 159 | 160 | # Make multiple rapid requests 161 | success_count = 0 162 | rate_limited = False 163 | 164 | for i in range(10): 165 | response = self.session.get( 166 | f"{self.base_url}/health", 167 | headers=headers, 168 | timeout=5 169 | ) 170 | 171 | if response.status_code == 200: 172 | success_count += 1 173 | elif response.status_code == 429: # Too Many Requests 174 | rate_limited = True 175 | break 176 | 177 | time.sleep(0.1) # Small delay between requests 178 | 179 | if success_count > 0: 180 | print(f"✅ Rate limiting configured (processed {success_count} requests)") 181 | if rate_limited: 182 | print(" Rate limit triggered as expected") 183 | return True 184 | else: 185 | print("❌ No requests succeeded") 186 | return False 187 | 188 | except Exception as e: 189 | print(f"❌ Rate limiting test error: {e}") 190 | return False 191 | 192 | def test_security_headers(self) -> bool: 193 | """Test that security headers are present.""" 194 | try: 195 | response = self.session.get(f"{self.base_url}/", timeout=5) 196 | headers = response.headers 197 | 198 | security_headers = { 199 | 'X-Content-Type-Options': 'nosniff', 200 | 'X-Frame-Options': 'DENY', 201 | 'X-XSS-Protection': '1; mode=block', 202 | 'Content-Security-Policy': "default-src 'self'" 203 | } 204 | 205 | missing_headers = [] 206 | for header, expected_value in security_headers.items(): 207 | if header not in headers: 208 | missing_headers.append(header) 209 | elif headers[header] != expected_value: 210 | print(f"⚠️ Security header {header} has unexpected value: {headers[header]}") 211 | 212 | if 
missing_headers: 213 | print(f"❌ Missing security headers: {missing_headers}") 214 | return False 215 | else: 216 | print("✅ All security headers present") 217 | return True 218 | 219 | except Exception as e: 220 | print(f"❌ Security headers test error: {e}") 221 | return False 222 | 223 | def run_all_tests(self) -> bool: 224 | """Run all security tests.""" 225 | print("🔒 Starting MCP Server Security Tests") 226 | print("=" * 50) 227 | 228 | test_results = [] 229 | 230 | # Test health check 231 | test_results.append(self.test_health_check()) 232 | 233 | # Test security headers 234 | test_results.append(self.test_security_headers()) 235 | 236 | # Test unauthenticated access 237 | test_results.append(self.test_unauthenticated_access()) 238 | 239 | # Test invalid credentials 240 | test_results.append(self.test_invalid_credentials()) 241 | 242 | # Test authentication 243 | token = self.test_authentication() 244 | if token: 245 | test_results.append(True) 246 | 247 | # Test authenticated access 248 | test_results.append(self.test_authenticated_access(token)) 249 | 250 | # Test token verification 251 | test_results.append(self.test_token_verification(token)) 252 | 253 | # Test rate limiting 254 | test_results.append(self.test_rate_limiting(token)) 255 | else: 256 | test_results.extend([False, False, False, False]) 257 | 258 | # Results summary 259 | passed = sum(test_results) 260 | total = len(test_results) 261 | 262 | print("\n" + "=" * 50) 263 | print(f"Test Results: {passed}/{total} passed") 264 | 265 | if passed == total: 266 | print("🎉 All security tests passed!") 267 | return True 268 | else: 269 | print("⚠️ Some security tests failed. Please review the output above.") 270 | return False 271 | 272 | def main(): 273 | """Main test runner.""" 274 | import argparse 275 | 276 | parser = argparse.ArgumentParser(description="Test MCP server security") 277 | parser.add_argument("--url", default="http://localhost:8081", help="Server URL") 278 | parser.add_argument("--start-server", action="store_true", help="Start secure server before testing") 279 | parser.add_argument("--server-args", default="", help="Additional server arguments") 280 | args = parser.parse_args() 281 | 282 | server_process = None 283 | 284 | if args.start_server: 285 | print("🚀 Starting secure MCP server...") 286 | server_cmd = f"python secure_fabric_mcp.py {args.server_args}" 287 | server_process = subprocess.Popen( 288 | server_cmd.split(), 289 | stdout=subprocess.PIPE, 290 | stderr=subprocess.PIPE 291 | ) 292 | 293 | # Wait for server to start 294 | print("⏳ Waiting for server to start...") 295 | time.sleep(5) 296 | 297 | try: 298 | # Run tests 299 | tester = SecurityTester(args.url) 300 | success = tester.run_all_tests() 301 | 302 | if success: 303 | print("\n✅ Security validation completed successfully!") 304 | else: 305 | print("\n❌ Security validation failed!") 306 | 307 | finally: 308 | if server_process: 309 | print("\n🛑 Stopping server...") 310 | server_process.terminate() 311 | server_process.wait() 312 | 313 | if __name__ == "__main__": 314 | main() 315 | -------------------------------------------------------------------------------- /docs/pyspark_guide.md: -------------------------------------------------------------------------------- 1 | # PySpark Development Guide for Microsoft Fabric MCP 2 | 3 | This guide explains how to use the enhanced PySpark capabilities in the Microsoft Fabric MCP server for developing, testing, and optimizing PySpark notebooks. 
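
Before diving into the tool reference, the sketch below shows how one of these tools can be invoked programmatically from a Python MCP client over stdio. The client-side calls (`stdio_client`, `ClientSession`) come from the standard `mcp` SDK; the launch command and the sample PySpark snippet are illustrative assumptions, not something prescribed by this repository.

```python
# Minimal sketch (stdio transport): validate a PySpark snippet through the MCP server.
# Assumes the `mcp` SDK is installed and the server is started from the repo root.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # Launching with plain `python fabric_mcp.py` is an assumption; adjust to your setup.
    params = StdioServerParameters(command="python", args=["fabric_mcp.py"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Call the validation tool documented below with a small code sample.
            result = await session.call_tool(
                "validate_pyspark_code",
                arguments={"code": "df = spark.table('my_table')\ndf.show()"},
            )
            print(result.content)


asyncio.run(main())
```

The same pattern applies to every tool listed in this guide; only the tool name and the `arguments` dictionary change.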
4 | 5 | ## Overview 6 | 7 | The MCP server now provides comprehensive PySpark development support with: 8 | - 📓 **Advanced notebook templates** for different use cases 9 | - 🔧 **Code generation** for common PySpark operations 10 | - ✅ **Code validation** and best practices checking 11 | - 🎯 **Fabric-specific optimizations** 12 | - 📊 **Performance analysis** tools 13 | - 🚀 **Execution monitoring** capabilities 14 | 15 | ## Architecture Diagram 16 | 17 | ```mermaid 18 | graph TB 19 | subgraph "Developer Environment" 20 | IDE[IDE/VSCode] 21 | DEV[Developer] 22 | PROJ[Project Files] 23 | end 24 | 25 | subgraph "MCP Layer" 26 | MCP[MCP Server] 27 | TOOLS[PySpark Tools] 28 | HELPERS[PySpark Helpers] 29 | TEMPLATES[Template Manager] 30 | VALIDATORS[Code Validators] 31 | GENERATORS[Code Generators] 32 | end 33 | 34 | subgraph "Microsoft Fabric" 35 | API[Fabric API] 36 | WS[Workspace] 37 | LH[Lakehouse] 38 | NB[Notebooks] 39 | TABLES[Delta Tables] 40 | SPARK[Spark Clusters] 41 | end 42 | 43 | subgraph "Operations Flow" 44 | CREATE[Create Notebooks] 45 | VALIDATE[Validate Code] 46 | GENERATE[Generate Code] 47 | ANALYZE[Analyze Performance] 48 | DEPLOY[Deploy to Fabric] 49 | end 50 | 51 | %% Developer interactions 52 | DEV --> IDE 53 | IDE --> PROJ 54 | 55 | %% MCP interactions 56 | IDE <--> MCP 57 | MCP --> TOOLS 58 | TOOLS --> HELPERS 59 | TOOLS --> TEMPLATES 60 | TOOLS --> VALIDATORS 61 | TOOLS --> GENERATORS 62 | 63 | %% Fabric interactions 64 | MCP <--> API 65 | API --> WS 66 | WS --> LH 67 | WS --> NB 68 | LH --> TABLES 69 | NB --> SPARK 70 | 71 | %% Operation flows 72 | TOOLS --> CREATE 73 | TOOLS --> VALIDATE 74 | TOOLS --> GENERATE 75 | TOOLS --> ANALYZE 76 | CREATE --> DEPLOY 77 | 78 | %% Styling 79 | classDef devEnv fill:#e1f5fe 80 | classDef mcpLayer fill:#f3e5f5 81 | classDef fabricLayer fill:#e8f5e8 82 | classDef operations fill:#fff3e0 83 | 84 | class IDE,DEV,PROJ devEnv 85 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer 86 | class API,WS,LH,NB,TABLES,SPARK fabricLayer 87 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations 88 | ``` 89 | 90 | ### Architecture Components 91 | 92 | #### **1. Developer Environment** 93 | - **IDE/VSCode**: Primary development interface with MCP integration 94 | - **Developer**: Data engineer/scientist working on PySpark projects 95 | - **Project Files**: Local project structure and configuration 96 | 97 | #### **2. MCP Layer (This Server)** 98 | - **MCP Server**: Core server handling tool requests 99 | - **PySpark Tools**: 11 specialized tools for notebook operations 100 | - **PySpark Helpers**: Template management and code generation 101 | - **Template Manager**: Pre-built notebook templates for different scenarios 102 | - **Code Validators**: Syntax, best practices, and Fabric compatibility checks 103 | - **Code Generators**: Intelligent PySpark code generation 104 | 105 | #### **3. Microsoft Fabric** 106 | - **Fabric API**: REST API for all Fabric operations 107 | - **Workspace**: Fabric workspace containing resources 108 | - **Lakehouse**: Data storage with Delta Lake tables 109 | - **Notebooks**: PySpark notebooks for data processing 110 | - **Delta Tables**: Structured data storage 111 | - **Spark Clusters**: Compute resources for PySpark execution 112 | 113 | #### **4. 
Operations Flow** 114 | - **Create Notebooks**: Generate notebooks from templates 115 | - **Validate Code**: Check syntax, performance, and compatibility 116 | - **Generate Code**: Create PySpark snippets for common operations 117 | - **Analyze Performance**: Evaluate and optimize notebook performance 118 | - **Deploy to Fabric**: Push notebooks and execute in Fabric environment 119 | 120 | ### Interaction Flow 121 | 122 | 1. **Developer writes/requests PySpark code in IDE** 123 | 2. **IDE communicates with MCP server via protocol** 124 | 3. **MCP server processes request using specialized tools** 125 | 4. **Tools utilize helpers, templates, and validators** 126 | 5. **MCP server calls Fabric API for operations** 127 | 6. **Results flow back through MCP to IDE** 128 | 7. **Developer receives generated code, validation results, or analysis** 129 | 130 | ### Benefits of This Architecture 131 | 132 | - **Seamless Integration**: Work directly from your IDE without switching contexts 133 | - **Intelligent Assistance**: AI-powered code generation and validation 134 | - **Fabric Optimization**: Specialized tools for Microsoft Fabric environment 135 | - **Performance Focus**: Built-in performance analysis and optimization 136 | - **Template-Driven**: Quick start with proven patterns 137 | - **Real-time Feedback**: Immediate validation and suggestions 138 | 139 | ## Available Tools 140 | 141 | ### 1. Notebook Management 142 | 143 | #### `list_notebooks` 144 | List all notebooks in a workspace. 145 | ``` 146 | Usage: list_notebooks(workspace="my_workspace") 147 | ``` 148 | 149 | #### `get_notebook_content` 150 | Retrieve the content of a specific notebook. 151 | ``` 152 | Usage: get_notebook_content(workspace="my_workspace", notebook_id="notebook_id") 153 | ``` 154 | 155 | #### `create_pyspark_notebook` 156 | Create a notebook from built-in PySpark templates. 157 | ``` 158 | Usage: create_pyspark_notebook( 159 | workspace="my_workspace", 160 | notebook_name="my_pyspark_notebook", 161 | template_type="basic" # Options: basic, etl, analytics, ml 162 | ) 163 | ``` 164 | 165 | #### `create_fabric_notebook` 166 | Create a notebook optimized for Microsoft Fabric with advanced templates. 167 | ``` 168 | Usage: create_fabric_notebook( 169 | workspace="my_workspace", 170 | notebook_name="fabric_optimized_notebook", 171 | template_type="fabric_integration" # Options: fabric_integration, streaming 172 | ) 173 | ``` 174 | 175 | ### 2. Code Generation 176 | 177 | #### `generate_pyspark_code` 178 | Generate PySpark code for common operations. 179 | ``` 180 | Usage: generate_pyspark_code( 181 | operation="read_table", 182 | source_table="lakehouse.my_table", 183 | columns="id,name,age" 184 | ) 185 | 186 | Available operations: 187 | - read_table: Read data from tables 188 | - write_table: Write data to tables 189 | - transform: Data transformations 190 | - join: Table joins 191 | - aggregate: Data aggregations 192 | - schema_inference: Schema analysis 193 | - data_quality: Data quality checks 194 | - performance_optimization: Performance tuning 195 | ``` 196 | 197 | #### `generate_fabric_code` 198 | Generate Fabric-specific PySpark code. 
199 | ``` 200 | Usage: generate_fabric_code( 201 | operation="read_lakehouse", 202 | lakehouse_name="my_lakehouse", 203 | table_name="my_table" 204 | ) 205 | 206 | Available operations: 207 | - read_lakehouse: Read from Fabric Lakehouse 208 | - write_lakehouse: Write to Fabric Lakehouse 209 | - merge_delta: Delta Lake merge operations 210 | - performance_monitor: Performance monitoring 211 | ``` 212 | 213 | ### 3. Code Validation 214 | 215 | #### `validate_pyspark_code` 216 | Validate PySpark code for syntax and best practices. 217 | ``` 218 | Usage: validate_pyspark_code(code="df = spark.table('my_table')") 219 | ``` 220 | 221 | #### `validate_fabric_code` 222 | Validate code specifically for Microsoft Fabric compatibility. 223 | ``` 224 | Usage: validate_fabric_code(code="df = spark.table('my_table')") 225 | ``` 226 | 227 | ### 4. Performance Analysis 228 | 229 | #### `analyze_notebook_performance` 230 | Analyze a notebook's performance and provide optimization recommendations. 231 | ``` 232 | Usage: analyze_notebook_performance( 233 | workspace="my_workspace", 234 | notebook_id="notebook_id" 235 | ) 236 | ``` 237 | 238 | ### 5. Notebook Editing 239 | 240 | #### `update_notebook_cell` 241 | Update a specific cell in a notebook. 242 | ``` 243 | Usage: update_notebook_cell( 244 | workspace="my_workspace", 245 | notebook_id="notebook_id", 246 | cell_index=0, 247 | cell_content="print('Hello, Fabric!')", 248 | cell_type="code" 249 | ) 250 | ``` 251 | 252 | ## Template Types 253 | 254 | ### Basic Templates (`create_pyspark_notebook`) 255 | 256 | 1. **basic**: Fundamental PySpark operations 257 | - Spark session initialization 258 | - Basic DataFrame operations 259 | - Sample data creation 260 | 261 | 2. **etl**: ETL pipeline template 262 | - Extract, Transform, Load patterns 263 | - Data cleaning and processing 264 | - Delta Lake integration 265 | 266 | 3. **analytics**: Data analytics template 267 | - Aggregations and window functions 268 | - Advanced analytics patterns 269 | - Statistical operations 270 | 271 | 4. **ml**: Machine learning template 272 | - MLlib pipeline creation 273 | - Feature engineering 274 | - Model training and evaluation 275 | 276 | ### Advanced Templates (`create_fabric_notebook`) 277 | 278 | 1. **fabric_integration**: Microsoft Fabric integration 279 | - Lakehouse connectivity 280 | - Delta Lake operations 281 | - Fabric-specific utilities 282 | 283 | 2. **streaming**: Structured Streaming template 284 | - Real-time data processing 285 | - Stream-to-Delta operations 286 | - Windowed aggregations 287 | 288 | ## Best Practices 289 | 290 | ### 1. Fabric-Specific Optimizations 291 | 292 | ✅ **Use managed tables:** 293 | ```python 294 | df = spark.table("lakehouse.my_table") # Preferred 295 | # instead of direct file paths 296 | ``` 297 | 298 | ✅ **Use Delta Lake format:** 299 | ```python 300 | df.write.format("delta").mode("overwrite").saveAsTable("my_table") 301 | ``` 302 | 303 | ✅ **Leverage notebookutils:** 304 | ```python 305 | import notebookutils as nbu 306 | workspace_id = nbu.runtime.context.workspaceId 307 | ``` 308 | 309 | ### 2. 
Performance Optimizations 310 | 311 | ✅ **Cache frequently used DataFrames:** 312 | ```python 313 | df.cache() # Cache before multiple actions 314 | ``` 315 | 316 | ✅ **Use broadcast for small tables:** 317 | ```python 318 | from pyspark.sql.functions import broadcast 319 | result = large_df.join(broadcast(small_df), "key") 320 | ``` 321 | 322 | ✅ **Partition large datasets:** 323 | ```python 324 | df.write.partitionBy("year", "month").saveAsTable("partitioned_table") 325 | ``` 326 | 327 | ### 3. Code Quality 328 | 329 | ✅ **Define explicit schemas:** 330 | ```python 331 | schema = StructType([ 332 | StructField("id", IntegerType(), True), 333 | StructField("name", StringType(), True) 334 | ]) 335 | df = spark.createDataFrame(data, schema) 336 | ``` 337 | 338 | ✅ **Handle null values:** 339 | ```python 340 | df.filter(col("column").isNotNull()) 341 | ``` 342 | 343 | ❌ **Avoid these anti-patterns:** 344 | ```python 345 | # Don't collect large datasets 346 | for row in df.collect(): # Avoid this 347 | process(row) 348 | 349 | # Don't use .toPandas() on large data 350 | pandas_df = large_df.toPandas() # Risk of OOM 351 | ``` 352 | 353 | ## Workflow Examples 354 | 355 | ### 1. Creating and Optimizing a PySpark Notebook 356 | 357 | ```python 358 | # 1. Create a new notebook from template 359 | create_fabric_notebook( 360 | workspace="analytics_workspace", 361 | notebook_name="sales_analysis", 362 | template_type="fabric_integration" 363 | ) 364 | 365 | # 2. Generate code for specific operations 366 | generate_fabric_code( 367 | operation="read_lakehouse", 368 | lakehouse_name="sales_lakehouse", 369 | table_name="transactions" 370 | ) 371 | 372 | # 3. Validate the generated code 373 | validate_fabric_code(code="df = spark.table('sales_lakehouse.transactions')") 374 | 375 | # 4. Analyze performance 376 | analyze_notebook_performance( 377 | workspace="analytics_workspace", 378 | notebook_id="sales_analysis_notebook_id" 379 | ) 380 | ``` 381 | 382 | ### 2. ETL Pipeline Development 383 | 384 | ```python 385 | # 1. Create ETL notebook 386 | create_pyspark_notebook( 387 | workspace="etl_workspace", 388 | notebook_name="daily_etl", 389 | template_type="etl" 390 | ) 391 | 392 | # 2. Generate transformation code 393 | generate_pyspark_code( 394 | operation="transform", 395 | columns="customer_id,product_id,amount", 396 | filter_condition="amount > 0" 397 | ) 398 | 399 | # 3. Generate Delta merge code 400 | generate_fabric_code( 401 | operation="merge_delta", 402 | target_table="sales_summary" 403 | ) 404 | ``` 405 | 406 | ### 3. Performance Monitoring 407 | 408 | ```python 409 | # 1. Generate performance monitoring code 410 | generate_fabric_code(operation="performance_monitor") 411 | 412 | # 2. Validate for performance issues 413 | validate_fabric_code(code=""" 414 | df1 = spark.table("large_table") 415 | df2 = spark.table("small_table") 416 | result = df1.join(df2, "key") 417 | result.collect() # This will be flagged 418 | """) 419 | 420 | # 3. 
Analyze existing notebook 421 | analyze_notebook_performance( 422 | workspace="my_workspace", 423 | notebook_id="existing_notebook_id" 424 | ) 425 | ``` 426 | 427 | ## Error Handling 428 | 429 | The MCP tools provide comprehensive error handling: 430 | 431 | - **Syntax validation**: Checks Python syntax before execution 432 | - **Fabric compatibility**: Ensures code works in Fabric environment 433 | - **Performance warnings**: Identifies potential performance issues 434 | - **Best practice suggestions**: Recommends improvements 435 | 436 | ## Integration with IDE 437 | 438 | When using the MCP in your IDE: 439 | 440 | 1. **Autocomplete**: The MCP provides intelligent code generation 441 | 2. **Validation**: Real-time code validation and suggestions 442 | 3. **Templates**: Quick notebook creation from templates 443 | 4. **Performance insights**: Analyze and optimize existing notebooks 444 | 445 | ## Troubleshooting 446 | 447 | ### Common Issues 448 | 449 | 1. **Context not provided**: Ensure `ctx` parameter is passed to all functions 450 | 2. **Invalid workspace**: Verify workspace name or ID exists 451 | 3. **Notebook not found**: Check notebook ID or name spelling 452 | 4. **Template not found**: Use valid template types listed above 453 | 454 | ### Getting Help 455 | 456 | Use the validation tools to identify issues: 457 | - `validate_pyspark_code()` for general PySpark validation 458 | - `validate_fabric_code()` for Fabric-specific validation 459 | - `analyze_notebook_performance()` for performance insights 460 | 461 | ## Advanced Features 462 | 463 | ### Custom Templates 464 | 465 | The helper module supports extending templates. You can create custom templates by modifying the `PySparkTemplateManager` class in `helpers/pyspark_helpers.py`. 466 | 467 | ### Code Generation Extensions 468 | 469 | Add new code generation patterns by extending the `PySparkCodeGenerator` class with additional methods for specific use cases. 470 | 471 | ### Performance Metrics 472 | 473 | The performance analysis tool provides: 474 | - Operation counts per cell 475 | - Performance issue detection 476 | - Optimization opportunity identification 477 | - Scoring system (0-100) 478 | 479 | This comprehensive PySpark development environment helps you write, test, and optimize PySpark notebooks efficiently in Microsoft Fabric! 480 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Microsoft Fabric MCP Server 2 | 3 | A comprehensive Python-based MCP (Model Context Protocol) server for interacting with Microsoft Fabric APIs, featuring advanced PySpark notebook development, testing, and optimization capabilities with LLM integration. 
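
For a first smoke test, here is a minimal, illustrative sketch of talking to the server from a Python MCP client over stdio. The `mcp` SDK calls are standard, while the launch command is an assumption; see the Usage section below for the supported STDIO and HTTP configurations.

```python
# Illustrative quick check: connect over stdio, list the server's tools, call one of them.
# Assumes `uv sync` has been run and the `mcp` SDK is importable in this environment.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    # The launch command is an assumption; adjust it to your environment (see Usage).
    params = StdioServerParameters(command="python", args=["fabric_mcp.py"])
    async with stdio_client(params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print("Available tools:", [tool.name for tool in tools.tools])
            # Call one of the core Fabric tools documented in the reference below.
            workspaces = await session.call_tool("list_workspaces", arguments={})
            print(workspaces.content)


asyncio.run(main())
```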
4 | 5 | ## 🚀 Features 6 | 7 | ### **Core Fabric Operations** 8 | - ✅ Workspace, lakehouse, warehouse, and table management 9 | - ✅ Delta table schemas and metadata retrieval 10 | - ✅ SQL query execution and data loading 11 | - ✅ Report and semantic model operations 12 | 13 | ### **Advanced PySpark Development** 14 | - 📓 **Intelligent notebook creation** with 6 specialized templates 15 | - 🔧 **Smart code generation** for common PySpark operations 16 | - ✅ **Comprehensive validation** with syntax and best practices checking 17 | - 🎯 **Fabric-specific optimizations** and compatibility checks 18 | - 📊 **Performance analysis** with scoring and optimization recommendations 19 | - 🚀 **Real-time monitoring** and execution insights 20 | 21 | ### **LLM Integration** 22 | - 🤖 **Natural language interface** for PySpark development 23 | - 🧠 **Context-aware assistance** with conversation memory 24 | - 🎨 **Intelligent code formatting** and explanations 25 | - 📈 **Smart optimization suggestions** based on project patterns 26 | 27 | ## 🏗️ Architecture 28 | 29 | ```mermaid 30 | graph TB 31 | subgraph "Developer Environment" 32 | IDE[IDE/VSCode] 33 | DEV[Developer] 34 | PROJ[Project Files] 35 | end 36 | 37 | subgraph "AI Layer" 38 | LLM[Large Language Model
Claude/GPT/etc.] 39 | CONTEXT[Conversation Context] 40 | REASONING[AI Reasoning Engine] 41 | end 42 | 43 | subgraph "MCP Layer" 44 | MCP[MCP Server] 45 | TOOLS[PySpark Tools] 46 | HELPERS[PySpark Helpers] 47 | TEMPLATES[Template Manager] 48 | VALIDATORS[Code Validators] 49 | GENERATORS[Code Generators] 50 | end 51 | 52 | subgraph "Microsoft Fabric" 53 | API[Fabric API] 54 | WS[Workspace] 55 | LH[Lakehouse] 56 | NB[Notebooks] 57 | TABLES[Delta Tables] 58 | SPARK[Spark Clusters] 59 | end 60 | 61 | subgraph "Operations Flow" 62 | CREATE[Create Notebooks] 63 | VALIDATE[Validate Code] 64 | GENERATE[Generate Code] 65 | ANALYZE[Analyze Performance] 66 | DEPLOY[Deploy to Fabric] 67 | end 68 | 69 | %% Developer interactions 70 | DEV --> IDE 71 | IDE --> PROJ 72 | 73 | %% LLM interactions 74 | IDE <--> LLM 75 | LLM <--> CONTEXT 76 | LLM --> REASONING 77 | 78 | %% MCP interactions 79 | LLM <--> MCP 80 | MCP --> TOOLS 81 | TOOLS --> HELPERS 82 | TOOLS --> TEMPLATES 83 | TOOLS --> VALIDATORS 84 | TOOLS --> GENERATORS 85 | 86 | %% Fabric interactions 87 | MCP <--> API 88 | API --> WS 89 | WS --> LH 90 | WS --> NB 91 | LH --> TABLES 92 | NB --> SPARK 93 | 94 | %% Operation flows 95 | TOOLS --> CREATE 96 | TOOLS --> VALIDATE 97 | TOOLS --> GENERATE 98 | TOOLS --> ANALYZE 99 | CREATE --> DEPLOY 100 | 101 | %% Data flow arrows 102 | REASONING -.->|"Intelligent Decisions"| TOOLS 103 | CONTEXT -.->|"Project Awareness"| VALIDATORS 104 | 105 | %% Styling 106 | classDef devEnv fill:#e1f5fe 107 | classDef aiLayer fill:#fff9c4 108 | classDef mcpLayer fill:#f3e5f5 109 | classDef fabricLayer fill:#e8f5e8 110 | classDef operations fill:#fff3e0 111 | 112 | class IDE,DEV,PROJ devEnv 113 | class LLM,CONTEXT,REASONING aiLayer 114 | class MCP,TOOLS,HELPERS,TEMPLATES,VALIDATORS,GENERATORS mcpLayer 115 | class API,WS,LH,NB,TABLES,SPARK fabricLayer 116 | class CREATE,VALIDATE,GENERATE,ANALYZE,DEPLOY operations 117 | ``` 118 | 119 | ### **Interaction Flow** 120 | 1. **Developer requests assistance in IDE** 121 | 2. **IDE communicates with LLM (Claude/GPT)** 122 | 3. **LLM analyzes using context and reasoning** 123 | 4. **LLM calls MCP server tools intelligently** 124 | 5. **MCP tools interact with Fabric API** 125 | 6. **Results flow back through LLM with intelligent formatting** 126 | 7. **Developer receives contextual, smart responses** 127 | 128 | ## 📋 Requirements 129 | 130 | - **Python 3.12+** 131 | - **Azure credentials** for authentication 132 | - **uv** (from astral): [Installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installing-uv) 133 | - **Azure CLI**: [Installation instructions](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) 134 | - **Optional: Node.js** for MCP inspector: [Installation instructions](https://nodejs.org/en/download) 135 | 136 | ## 🔧 Installation 137 | 138 | 1. **Clone the repository:** 139 | ```bash 140 | git clone https://github.com/your-repo/fabric-mcp.git 141 | cd fabric-mcp 142 | ``` 143 | 144 | 2. **Set up virtual environment:** 145 | ```bash 146 | uv sync 147 | ``` 148 | 149 | 3. **Install dependencies:** 150 | ```bash 151 | pip install -r requirements.txt 152 | ``` 153 | 154 | ## 🚀 Usage 155 | 156 | 1. 
**Using STDIO** 157 | 158 | ### **Connect to Microsoft Fabric** 159 | 160 | ```bash 161 | az login --scope https://api.fabric.microsoft.com/.default 162 | ``` 163 | 164 | ### **Running with MCP Inspector** 165 | 166 | ```bash 167 | uv run --with mcp mcp dev fabric_mcp.py 168 | ``` 169 | This starts the server with inspector at `http://localhost:6274`. 170 | 171 | ### **VSCode Integration** 172 | 173 | Add to your `launch.json`: 174 | ```json 175 | { 176 | "mcp": { 177 | "servers": { 178 | "ms-fabric-mcp": { 179 | "type": "stdio", 180 | "command": "\\.venv\\Scripts\\python.exe", 181 | "args": ["\\fabric_mcp.py"] 182 | } 183 | } 184 | } 185 | } 186 | ``` 187 | 188 | 2. **Using HTTP** 189 | ### **Start the MCP Server** 190 | ```bash 191 | uv run python .\fabric_mcp.py --port 8081 192 | ``` 193 | 194 | ### **VSCode Integration** 195 | 196 | Add to your `launch.json`: 197 | ```json 198 | { 199 | "mcp": { 200 | "servers": { 201 | "ms-fabric-mcp": { 202 | "type": "http", 203 | "url": "http://:8081/mcp/", 204 | "headers": { 205 | "Accept": "application/json,text/event-stream", 206 | } 207 | } 208 | } 209 | } 210 | } 211 | ``` 212 | 213 | ## 🛠️ Complete Tool Reference 214 | 215 | ### **1. Workspace Management** 216 | 217 | #### `list_workspaces` 218 | List all available Fabric workspaces. 219 | ```python 220 | # Usage in LLM: "List all my Fabric workspaces" 221 | ``` 222 | 223 | #### `set_workspace` 224 | Set the current workspace context for the session. 225 | ```python 226 | set_workspace(workspace="Analytics-Workspace") 227 | ``` 228 | 229 | ### **2. Lakehouse Operations** 230 | 231 | #### `list_lakehouses` 232 | List all lakehouses in a workspace. 233 | ```python 234 | list_lakehouses(workspace="Analytics-Workspace") 235 | ``` 236 | 237 | #### `create_lakehouse` 238 | Create a new lakehouse. 239 | ```python 240 | create_lakehouse( 241 | name="Sales-Data-Lake", 242 | workspace="Analytics-Workspace", 243 | description="Sales data lakehouse" 244 | ) 245 | ``` 246 | 247 | #### `set_lakehouse` 248 | Set current lakehouse context. 249 | ```python 250 | set_lakehouse(lakehouse="Sales-Data-Lake") 251 | ``` 252 | 253 | ### **3. Warehouse Operations** 254 | 255 | #### `list_warehouses` 256 | List all warehouses in a workspace. 257 | ```python 258 | list_warehouses(workspace="Analytics-Workspace") 259 | ``` 260 | 261 | #### `create_warehouse` 262 | Create a new warehouse. 263 | ```python 264 | create_warehouse( 265 | name="Sales-DW", 266 | workspace="Analytics-Workspace", 267 | description="Sales data warehouse" 268 | ) 269 | ``` 270 | 271 | #### `set_warehouse` 272 | Set current warehouse context. 273 | ```python 274 | set_warehouse(warehouse="Sales-DW") 275 | ``` 276 | 277 | ### **4. Table Operations** 278 | 279 | #### `list_tables` 280 | List all tables in a lakehouse. 281 | ```python 282 | list_tables(workspace="Analytics-Workspace", lakehouse="Sales-Data-Lake") 283 | ``` 284 | 285 | #### `get_lakehouse_table_schema` 286 | Get schema for a specific table. 287 | ```python 288 | get_lakehouse_table_schema( 289 | workspace="Analytics-Workspace", 290 | lakehouse="Sales-Data-Lake", 291 | table_name="transactions" 292 | ) 293 | ``` 294 | 295 | #### `get_all_lakehouse_schemas` 296 | Get schemas for all tables in a lakehouse. 297 | ```python 298 | get_all_lakehouse_schemas( 299 | workspace="Analytics-Workspace", 300 | lakehouse="Sales-Data-Lake" 301 | ) 302 | ``` 303 | 304 | #### `set_table` 305 | Set current table context. 
306 | ```python 307 | set_table(table_name="transactions") 308 | ``` 309 | 310 | ### **5. SQL Operations** 311 | 312 | #### `get_sql_endpoint` 313 | Get SQL endpoint for lakehouse or warehouse. 314 | ```python 315 | get_sql_endpoint( 316 | workspace="Analytics-Workspace", 317 | lakehouse="Sales-Data-Lake", 318 | type="lakehouse" 319 | ) 320 | ``` 321 | 322 | #### `run_query` 323 | Execute SQL queries. 324 | ```python 325 | run_query( 326 | workspace="Analytics-Workspace", 327 | lakehouse="Sales-Data-Lake", 328 | query="SELECT COUNT(*) FROM transactions", 329 | type="lakehouse" 330 | ) 331 | ``` 332 | 333 | ### **6. Data Loading** 334 | 335 | #### `load_data_from_url` 336 | Load data from URL into tables. 337 | ```python 338 | load_data_from_url( 339 | url="https://example.com/data.csv", 340 | destination_table="new_data", 341 | workspace="Analytics-Workspace", 342 | lakehouse="Sales-Data-Lake" 343 | ) 344 | ``` 345 | 346 | ### **7. Reports & Models** 347 | 348 | #### `list_reports` 349 | List all reports in a workspace. 350 | ```python 351 | list_reports(workspace="Analytics-Workspace") 352 | ``` 353 | 354 | #### `get_report` 355 | Get specific report details. 356 | ```python 357 | get_report(workspace="Analytics-Workspace", report_id="report-id") 358 | ``` 359 | 360 | #### `list_semantic_models` 361 | List semantic models in workspace. 362 | ```python 363 | list_semantic_models(workspace="Analytics-Workspace") 364 | ``` 365 | 366 | #### `get_semantic_model` 367 | Get specific semantic model. 368 | ```python 369 | get_semantic_model(workspace="Analytics-Workspace", model_id="model-id") 370 | ``` 371 | 372 | ### **8. Basic Notebook Operations** 373 | 374 | #### `list_notebooks` 375 | List all notebooks in a workspace. 376 | ```python 377 | list_notebooks(workspace="Analytics-Workspace") 378 | ``` 379 | 380 | #### `get_notebook_content` 381 | Retrieve notebook content. 382 | ```python 383 | get_notebook_content( 384 | workspace="Analytics-Workspace", 385 | notebook_id="notebook-id" 386 | ) 387 | ``` 388 | 389 | #### `update_notebook_cell` 390 | Update specific notebook cells. 391 | ```python 392 | update_notebook_cell( 393 | workspace="Analytics-Workspace", 394 | notebook_id="notebook-id", 395 | cell_index=0, 396 | cell_content="print('Hello, Fabric!')", 397 | cell_type="code" 398 | ) 399 | ``` 400 | 401 | ### **9. Advanced PySpark Notebook Creation** 402 | 403 | #### `create_pyspark_notebook` 404 | Create notebooks from basic templates. 405 | ```python 406 | create_pyspark_notebook( 407 | workspace="Analytics-Workspace", 408 | notebook_name="Data-Analysis", 409 | template_type="analytics" # Options: basic, etl, analytics, ml 410 | ) 411 | ``` 412 | 413 | #### `create_fabric_notebook` 414 | Create Fabric-optimized notebooks. 415 | ```python 416 | create_fabric_notebook( 417 | workspace="Analytics-Workspace", 418 | notebook_name="Fabric-Pipeline", 419 | template_type="fabric_integration" # Options: fabric_integration, streaming 420 | ) 421 | ``` 422 | 423 | ### **10. PySpark Code Generation** 424 | 425 | #### `generate_pyspark_code` 426 | Generate code for common operations. 427 | ```python 428 | generate_pyspark_code( 429 | operation="read_table", 430 | source_table="sales.transactions", 431 | columns="id,amount,date" 432 | ) 433 | 434 | # Available operations: 435 | # - read_table, write_table, transform, join, aggregate 436 | # - schema_inference, data_quality, performance_optimization 437 | ``` 438 | 439 | #### `generate_fabric_code` 440 | Generate Fabric-specific code. 
441 | ```python 442 | generate_fabric_code( 443 | operation="read_lakehouse", 444 | lakehouse_name="Sales-Data-Lake", 445 | table_name="transactions" 446 | ) 447 | 448 | # Available operations: 449 | # - read_lakehouse, write_lakehouse, merge_delta, performance_monitor 450 | ``` 451 | 452 | ### **11. Code Validation & Analysis** 453 | 454 | #### `validate_pyspark_code` 455 | Validate PySpark code syntax and best practices. 456 | ```python 457 | validate_pyspark_code(code=""" 458 | df = spark.table('transactions') 459 | df.show() 460 | """) 461 | ``` 462 | 463 | #### `validate_fabric_code` 464 | Validate Fabric compatibility. 465 | ```python 466 | validate_fabric_code(code=""" 467 | df = spark.table('lakehouse.transactions') 468 | df.write.format('delta').saveAsTable('summary') 469 | """) 470 | ``` 471 | 472 | #### `analyze_notebook_performance` 473 | Comprehensive performance analysis. 474 | ```python 475 | analyze_notebook_performance( 476 | workspace="Analytics-Workspace", 477 | notebook_id="notebook-id" 478 | ) 479 | ``` 480 | 481 | ### **12. Context Management** 482 | 483 | #### `clear_context` 484 | Clear current session context. 485 | ```python 486 | clear_context() 487 | ``` 488 | 489 | ## 📊 PySpark Templates 490 | 491 | ### **Basic Templates** 492 | 1. **basic**: Fundamental PySpark operations and DataFrame usage 493 | 2. **etl**: Complete ETL pipeline with data cleaning and Delta Lake 494 | 3. **analytics**: Advanced analytics with aggregations and window functions 495 | 4. **ml**: Machine learning pipeline with MLlib and feature engineering 496 | 497 | ### **Advanced Templates** 498 | 1. **fabric_integration**: Lakehouse connectivity and Fabric-specific utilities 499 | 2. **streaming**: Real-time processing with Structured Streaming 500 | 501 | ## 🎯 Best Practices 502 | 503 | ### **Fabric Optimization** 504 | ```python 505 | # ✅ Use managed tables 506 | df = spark.table("lakehouse.my_table") 507 | 508 | # ✅ Use Delta Lake format 509 | df.write.format("delta").mode("overwrite").saveAsTable("my_table") 510 | 511 | # ✅ Leverage notebookutils 512 | import notebookutils as nbu 513 | workspace_id = nbu.runtime.context.workspaceId 514 | ``` 515 | 516 | ### **Performance Optimization** 517 | ```python 518 | # ✅ Cache frequently used DataFrames 519 | df.cache() 520 | 521 | # ✅ Use broadcast for small tables 522 | from pyspark.sql.functions import broadcast 523 | result = large_df.join(broadcast(small_df), "key") 524 | 525 | # ✅ Partition large datasets 526 | df.write.partitionBy("year", "month").saveAsTable("partitioned_table") 527 | ``` 528 | 529 | ### **Code Quality** 530 | ```python 531 | # ✅ Define explicit schemas 532 | schema = StructType([ 533 | StructField("id", IntegerType(), True), 534 | StructField("name", StringType(), True) 535 | ]) 536 | 537 | # ✅ Handle null values 538 | df.filter(col("column").isNotNull()) 539 | ``` 540 | 541 | ## 🔄 Example LLM-Enhanced Workflows 542 | 543 | ### **Natural Language Requests** 544 | ``` 545 | Human: "Create a PySpark notebook that reads sales data, cleans it, and optimizes performance" 546 | 547 | LLM Response: 548 | 1. Creates Fabric-optimized notebook with ETL template 549 | 2. Generates lakehouse reading code 550 | 3. Adds data cleaning transformations 551 | 4. Includes performance optimization patterns 552 | 5. Validates code for best practices 553 | ``` 554 | 555 | ### **Performance Analysis** 556 | ``` 557 | Human: "My PySpark notebook is slow. Help me optimize it." 558 | 559 | LLM Response: 560 | 1. 
Analyzes notebook performance (scoring 0-100) 561 | 2. Identifies anti-patterns and bottlenecks 562 | 3. Suggests specific optimizations 563 | 4. Generates optimized code alternatives 564 | 5. Provides before/after comparisons 565 | ``` 566 | 567 | ## 🔍 Troubleshooting 568 | 569 | ### **Common Issues** 570 | - **Authentication**: Ensure `az login` with correct scope 571 | - **Context**: Use `clear_context()` to reset session state 572 | - **Workspace**: Verify workspace names and permissions 573 | - **Templates**: Check available template types in documentation 574 | 575 | ### **Getting Help** 576 | - Use validation tools for code issues 577 | - Check performance analysis for optimization opportunities 578 | - Leverage LLM natural language interface for guidance 579 | 580 | ## 📈 Performance Metrics 581 | 582 | The analysis tools provide: 583 | - **Operation counts** per notebook cell 584 | - **Performance issues** detection and flagging 585 | - **Optimization opportunities** identification 586 | - **Scoring system** (0-100) for code quality 587 | - **Fabric compatibility** assessment 588 | 589 | ## 🤝 Contributing 590 | 591 | This project welcomes contributions! Please see our contributing guidelines for details. 592 | 593 | ## 📄 License 594 | 595 | This project is licensed under the MIT License. See the LICENSE file for details. 596 | 597 | ## 🙏 Acknowledgments 598 | 599 | Inspired by: https://github.com/Augustab/microsoft_fabric_mcp/tree/main 600 | 601 | --- 602 | 603 | **Ready to supercharge your Microsoft Fabric development with intelligent PySpark assistance!** 🚀 604 | -------------------------------------------------------------------------------- /helpers/pyspark_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | PySpark helper utilities for Microsoft Fabric MCP Server. 3 | This module provides templates, code generation, and execution helpers for PySpark notebooks. 
4 | """ 5 | 6 | import json 7 | from typing import Dict, List, Any, Optional 8 | from helpers.logging_config import get_logger 9 | 10 | logger = get_logger(__name__) 11 | 12 | class PySparkTemplateManager: 13 | """Manages PySpark notebook templates and code generation.""" 14 | 15 | @staticmethod 16 | def get_fabric_integration_template() -> Dict[str, Any]: 17 | """Template for Fabric-specific PySpark operations.""" 18 | return { 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "source": [ 23 | "# Microsoft Fabric PySpark Integration\n", 24 | "\n", 25 | "This notebook demonstrates integration with Microsoft Fabric resources using PySpark.\n" 26 | ], 27 | "metadata": {} 28 | }, 29 | { 30 | "cell_type": "code", 31 | "source": [ 32 | "# Initialize Fabric integration\n", 33 | "from pyspark.sql import SparkSession\n", 34 | "from pyspark.sql.functions import *\n", 35 | "from pyspark.sql.types import *\n", 36 | "from delta.tables import DeltaTable\n", 37 | "import notebookutils as nbu\n", 38 | "\n", 39 | "# Spark session is pre-configured in Fabric\n", 40 | "print(f\"Spark version: {spark.version}\")\n", 41 | "print(f\"Available cores: {spark.sparkContext.defaultParallelism}\")\n", 42 | "\n", 43 | "# Get current workspace and lakehouse context\n", 44 | "print(f\"Current workspace: {nbu.runtime.context.workspaceId}\")\n" 45 | ], 46 | "execution_count": None, 47 | "outputs": [], 48 | "metadata": {} 49 | }, 50 | { 51 | "cell_type": "code", 52 | "source": [ 53 | "# Connect to Fabric Lakehouse\n", 54 | "# List available tables in the default lakehouse\n", 55 | "try:\n", 56 | " tables = spark.sql(\"SHOW TABLES\").collect()\n", 57 | " print(\"Available tables in current lakehouse:\")\n", 58 | " for table in tables:\n", 59 | " print(f\" - {table.database}.{table.tableName}\")\n", 60 | "except Exception as e:\n", 61 | " print(f\"No default lakehouse attached or no tables found: {e}\")\n" 62 | ], 63 | "execution_count": None, 64 | "outputs": [], 65 | "metadata": {} 66 | }, 67 | { 68 | "cell_type": "code", 69 | "source": [ 70 | "# Read from Fabric Lakehouse table\n", 71 | "# Replace 'your_table_name' with actual table name\n", 72 | "# df = spark.table(\"your_table_name\")\n", 73 | "\n", 74 | "# Alternative: Read from files in Lakehouse\n", 75 | "# df = spark.read.format(\"delta\").load(\"Tables/your_table_name\")\n", 76 | "\n", 77 | "# For demo, create sample data\n", 78 | "sample_data = [\n", 79 | " (1, \"Product A\", 100.0, \"2024-01-01\"),\n", 80 | " (2, \"Product B\", 150.0, \"2024-01-02\"),\n", 81 | " (3, \"Product C\", 200.0, \"2024-01-03\")\n", 82 | "]\n", 83 | "\n", 84 | "schema = StructType([\n", 85 | " StructField(\"id\", IntegerType(), True),\n", 86 | " StructField(\"product_name\", StringType(), True),\n", 87 | " StructField(\"price\", DoubleType(), True),\n", 88 | " StructField(\"date_created\", StringType(), True)\n", 89 | "])\n", 90 | "\n", 91 | "df = spark.createDataFrame(sample_data, schema)\n", 92 | "df = df.withColumn(\"date_created\", to_date(col(\"date_created\"), \"yyyy-MM-dd\"))\n", 93 | "df.show()\n" 94 | ], 95 | "execution_count": None, 96 | "outputs": [], 97 | "metadata": {} 98 | }, 99 | { 100 | "cell_type": "code", 101 | "source": [ 102 | "# Write to Fabric Lakehouse as Delta table\n", 103 | "table_name = \"fabric_demo_products\"\n", 104 | "\n", 105 | "# Option 1: Write as managed table\n", 106 | "df.write \\\n", 107 | " .format(\"delta\") \\\n", 108 | " .mode(\"overwrite\") \\\n", 109 | " .option(\"overwriteSchema\", \"true\") \\\n", 110 | " .saveAsTable(table_name)\n", 
111 | "\n", 112 | "print(f\"Successfully wrote {df.count()} records to table '{table_name}'\")\n", 113 | "\n", 114 | "# Verify the table was created\n", 115 | "result = spark.table(table_name)\n", 116 | "print(\"\\nTable verification:\")\n", 117 | "result.show()\n" 118 | ], 119 | "execution_count": None, 120 | "outputs": [], 121 | "metadata": {} 122 | }, 123 | { 124 | "cell_type": "code", 125 | "source": [ 126 | "# Advanced Delta Lake operations in Fabric\n", 127 | "from delta.tables import DeltaTable\n", 128 | "\n", 129 | "# Create DeltaTable reference\n", 130 | "delta_table = DeltaTable.forName(spark, table_name)\n", 131 | "\n", 132 | "# Show table history\n", 133 | "print(\"Table history:\")\n", 134 | "delta_table.history().show(truncate=False)\n", 135 | "\n", 136 | "# Perform merge operation (upsert)\n", 137 | "new_data = [\n", 138 | " (1, \"Product A Updated\", 110.0, \"2024-01-01\"), # Update existing\n", 139 | " (4, \"Product D\", 250.0, \"2024-01-04\") # Insert new\n", 140 | "]\n", 141 | "\n", 142 | "new_df = spark.createDataFrame(new_data, schema)\n", 143 | "new_df = new_df.withColumn(\"date_created\", to_date(col(\"date_created\"), \"yyyy-MM-dd\"))\n", 144 | "\n", 145 | "# Merge operation\n", 146 | "delta_table.alias(\"target\") \\\n", 147 | " .merge(\n", 148 | " new_df.alias(\"source\"),\n", 149 | " \"target.id = source.id\"\n", 150 | " ) \\\n", 151 | " .whenMatchedUpdateAll() \\\n", 152 | " .whenNotMatchedInsertAll() \\\n", 153 | " .execute()\n", 154 | "\n", 155 | "print(\"\\nAfter merge operation:\")\n", 156 | "spark.table(table_name).show()\n" 157 | ], 158 | "execution_count": None, 159 | "outputs": [], 160 | "metadata": {} 161 | } 162 | ] 163 | } 164 | 165 | @staticmethod 166 | def get_streaming_template() -> Dict[str, Any]: 167 | """Template for PySpark Structured Streaming in Fabric.""" 168 | return { 169 | "cells": [ 170 | { 171 | "cell_type": "markdown", 172 | "source": [ 173 | "# PySpark Structured Streaming in Fabric\n", 174 | "\n", 175 | "This notebook demonstrates real-time data processing using PySpark Structured Streaming.\n" 176 | ], 177 | "metadata": {} 178 | }, 179 | { 180 | "cell_type": "code", 181 | "source": [ 182 | "# Import streaming libraries\n", 183 | "from pyspark.sql import SparkSession\n", 184 | "from pyspark.sql.functions import *\n", 185 | "from pyspark.sql.types import *\n", 186 | "import time\n", 187 | "\n", 188 | "print(f\"Spark version: {spark.version}\")\n", 189 | "print(\"Structured Streaming capabilities enabled\")\n" 190 | ], 191 | "execution_count": None, 192 | "outputs": [], 193 | "metadata": {} 194 | }, 195 | { 196 | "cell_type": "code", 197 | "source": [ 198 | "# Define schema for streaming data\n", 199 | "streaming_schema = StructType([\n", 200 | " StructField(\"timestamp\", TimestampType(), True),\n", 201 | " StructField(\"user_id\", StringType(), True),\n", 202 | " StructField(\"event_type\", StringType(), True),\n", 203 | " StructField(\"value\", DoubleType(), True)\n", 204 | "])\n", 205 | "\n", 206 | "# Create a streaming DataFrame (example with rate source for demo)\n", 207 | "streaming_df = spark \\\n", 208 | " .readStream \\\n", 209 | " .format(\"rate\") \\\n", 210 | " .option(\"rowsPerSecond\", 10) \\\n", 211 | " .load()\n", 212 | "\n", 213 | "# Transform the rate stream to simulate real events\n", 214 | "events_df = streaming_df \\\n", 215 | " .withColumn(\"user_id\", concat(lit(\"user_\"), (col(\"value\") % 100).cast(\"string\"))) \\\n", 216 | " .withColumn(\"event_type\", \n", 217 | " when(col(\"value\") % 3 == 0, 
\"purchase\")\n", 218 | " .when(col(\"value\") % 3 == 1, \"view\")\n", 219 | " .otherwise(\"click\")\n", 220 | " ) \\\n", 221 | " .withColumn(\"event_value\", (col(\"value\") % 1000).cast(\"double\")) \\\n", 222 | " .select(\"timestamp\", \"user_id\", \"event_type\", \"event_value\")\n", 223 | "\n", 224 | "print(\"Streaming DataFrame created\")\n", 225 | "print(f\"Schema: {events_df.schema}\")\n" 226 | ], 227 | "execution_count": None, 228 | "outputs": [], 229 | "metadata": {} 230 | }, 231 | { 232 | "cell_type": "code", 233 | "source": [ 234 | "# Streaming aggregations\n", 235 | "# Count events by type in 30-second windows\n", 236 | "windowed_counts = events_df \\\n", 237 | " .withWatermark(\"timestamp\", \"30 seconds\") \\\n", 238 | " .groupBy(\n", 239 | " window(col(\"timestamp\"), \"30 seconds\"),\n", 240 | " col(\"event_type\")\n", 241 | " ) \\\n", 242 | " .count() \\\n", 243 | " .orderBy(\"window\")\n", 244 | "\n", 245 | "# Start streaming query (console output)\n", 246 | "query = windowed_counts \\\n", 247 | " .writeStream \\\n", 248 | " .outputMode(\"complete\") \\\n", 249 | " .format(\"console\") \\\n", 250 | " .option(\"truncate\", False) \\\n", 251 | " .trigger(processingTime=\"10 seconds\") \\\n", 252 | " .start()\n", 253 | "\n", 254 | "print(\"Streaming query started. Check output below...\")\n", 255 | "print(f\"Query ID: {query.id}\")\n" 256 | ], 257 | "execution_count": None, 258 | "outputs": [], 259 | "metadata": {} 260 | }, 261 | { 262 | "cell_type": "code", 263 | "source": [ 264 | "# Let the stream run for a short time\n", 265 | "import time\n", 266 | "time.sleep(30) # Run for 30 seconds\n", 267 | "\n", 268 | "# Stop the query\n", 269 | "query.stop()\n", 270 | "print(\"Streaming query stopped\")\n", 271 | "\n", 272 | "# Show query progress\n", 273 | "print(\"\\nQuery progress:\")\n", 274 | "print(query.lastProgress)\n" 275 | ], 276 | "execution_count": None, 277 | "outputs": [], 278 | "metadata": {} 279 | }, 280 | { 281 | "cell_type": "code", 282 | "source": [ 283 | "# Stream to Delta Lake table\n", 284 | "streaming_table = \"streaming_events\"\n", 285 | "\n", 286 | "# Create another streaming query that writes to Delta\n", 287 | "delta_query = events_df \\\n", 288 | " .writeStream \\\n", 289 | " .format(\"delta\") \\\n", 290 | " .outputMode(\"append\") \\\n", 291 | " .option(\"checkpointLocation\", \"/tmp/checkpoint/streaming_events\") \\\n", 292 | " .table(streaming_table)\n", 293 | "\n", 294 | "print(f\"Started streaming to Delta table: {streaming_table}\")\n", 295 | "print(f\"Query ID: {delta_query.id}\")\n", 296 | "\n", 297 | "# Let it run briefly\n", 298 | "time.sleep(20)\n", 299 | "\n", 300 | "# Stop and check results\n", 301 | "delta_query.stop()\n", 302 | "\n", 303 | "# Read from the Delta table\n", 304 | "result_df = spark.table(streaming_table)\n", 305 | "print(f\"\\nTotal records in Delta table: {result_df.count()}\")\n", 306 | "result_df.show(20)\n" 307 | ], 308 | "execution_count": None, 309 | "outputs": [], 310 | "metadata": {} 311 | } 312 | ] 313 | } 314 | 315 | class PySparkCodeGenerator: 316 | """Generates PySpark code snippets for common operations.""" 317 | 318 | @staticmethod 319 | def generate_fabric_lakehouse_reader(lakehouse_name: str, table_name: str) -> str: 320 | """Generate code to read from a Fabric Lakehouse table.""" 321 | return f"""# Read from Fabric Lakehouse table 322 | df = spark.table("{lakehouse_name}.{table_name}") 323 | 324 | # Alternative: Read from Delta files directly 325 | # df = 
spark.read.format("delta").load("Tables/{table_name}") 326 | 327 | # Show basic info 328 | print(f"Records: {{df.count()}}") 329 | print(f"Columns: {{len(df.columns)}}") 330 | df.printSchema() 331 | df.show(10)""" 332 | 333 | @staticmethod 334 | def generate_fabric_lakehouse_writer(table_name: str, mode: str = "overwrite") -> str: 335 | """Generate code to write to a Fabric Lakehouse table.""" 336 | return f"""# Write to Fabric Lakehouse table 337 | df.write \\ 338 | .format("delta") \\ 339 | .mode("{mode}") \\ 340 | .option("overwriteSchema", "true") \\ 341 | .saveAsTable("{table_name}") 342 | 343 | print(f"Successfully wrote {{df.count()}} records to table '{table_name}'") 344 | 345 | # Verify the write 346 | verification_df = spark.table("{table_name}") 347 | print(f"Verification - Table now has {{verification_df.count()}} records")""" 348 | 349 | @staticmethod 350 | def generate_delta_merge_operation(target_table: str, source_df_name: str, join_condition: str) -> str: 351 | """Generate code for Delta Lake merge operations.""" 352 | return f"""# Delta Lake merge operation (UPSERT) 353 | from delta.tables import DeltaTable 354 | 355 | # Create DeltaTable reference 356 | target_table = DeltaTable.forName(spark, "{target_table}") 357 | 358 | # Perform merge operation 359 | target_table.alias("target") \\ 360 | .merge( 361 | {source_df_name}.alias("source"), 362 | "{join_condition}" 363 | ) \\ 364 | .whenMatchedUpdateAll() \\ 365 | .whenNotMatchedInsertAll() \\ 366 | .execute() 367 | 368 | print("Merge operation completed successfully") 369 | print(f"Table now has {{spark.table('{target_table}').count()}} records")""" 370 | 371 | @staticmethod 372 | def generate_performance_monitoring() -> str: 373 | """Generate code for monitoring PySpark performance.""" 374 | return """# PySpark Performance Monitoring 375 | 376 | # 1. Check Spark configuration 377 | print("=== Spark Configuration ===") 378 | for key, value in spark.sparkContext.getConf().getAll(): 379 | if 'spark.sql' in key or 'spark.serializer' in key: 380 | print(f"{key}: {value}") 381 | 382 | # 2. Monitor DataFrame operations 383 | from pyspark.sql.utils import AnalysisException 384 | import time 385 | 386 | def monitor_operation(df, operation_name): 387 | start_time = time.time() 388 | try: 389 | count = df.count() 390 | end_time = time.time() 391 | duration = end_time - start_time 392 | print(f"{operation_name}: {count} records in {duration:.2f} seconds") 393 | return count, duration 394 | except Exception as e: 395 | print(f"Error in {operation_name}: {e}") 396 | return 0, 0 397 | 398 | # Example usage: 399 | # count, duration = monitor_operation(df, "DataFrame Count") 400 | 401 | # 3. Show execution plan 402 | print("\\n=== Execution Plan ===") 403 | df.explain(True) 404 | 405 | # 4. 
Cache analysis 406 | print("\\n=== Storage Levels ===") 407 | print(f"DataFrame cached: {df.is_cached}") 408 | if df.is_cached: 409 | print(f"Storage level: {df.storageLevel}")""" 410 | 411 | class PySparkValidator: 412 | """Validates PySpark code and suggests optimizations.""" 413 | 414 | @staticmethod 415 | def validate_fabric_compatibility(code: str) -> Dict[str, List[str]]: 416 | """Check if code is compatible with Microsoft Fabric.""" 417 | issues = [] 418 | suggestions = [] 419 | 420 | # Check for Fabric-specific patterns 421 | if 'SparkSession.builder' in code: 422 | issues.append("❌ Don't create SparkSession in Fabric - use pre-configured 'spark' variable") 423 | 424 | if 'notebookutils' not in code and any(pattern in code for pattern in ['lakehouse', 'workspace']): 425 | suggestions.append("💡 Consider using 'notebookutils' for Fabric integration") 426 | 427 | if '.saveAsTable(' in code and 'format("delta")' not in code: 428 | suggestions.append("💡 Specify Delta format explicitly when saving tables in Fabric") 429 | 430 | if 'jdbc' in code.lower(): 431 | suggestions.append("💡 Consider using Fabric's built-in connectors instead of JDBC") 432 | 433 | return { 434 | "issues": issues, 435 | "suggestions": suggestions 436 | } 437 | 438 | @staticmethod 439 | def check_performance_patterns(code: str) -> Dict[str, List[str]]: 440 | """Check for performance anti-patterns and optimizations.""" 441 | warnings = [] 442 | optimizations = [] 443 | 444 | # Performance anti-patterns 445 | if '.collect()' in code: 446 | warnings.append("⚠️ .collect() can cause OOM on large datasets") 447 | 448 | if 'rdd.' in code and 'parallelize' not in code: 449 | warnings.append("⚠️ RDD operations are less optimized than DataFrame operations") 450 | 451 | if code.count('spark.read') > 3 and '.cache()' not in code: 452 | optimizations.append("💡 Consider caching frequently accessed DataFrames") 453 | 454 | if '.join(' in code and 'broadcast' not in code: 455 | optimizations.append("💡 Consider broadcast joins for small dimension tables") 456 | 457 | if '.write.' in code and 'partitionBy' not in code: 458 | optimizations.append("💡 Consider partitioning large datasets for better performance") 459 | 460 | return { 461 | "warnings": warnings, 462 | "optimizations": optimizations 463 | } 464 | 465 | def create_notebook_from_template(template_name: str, custom_params: Optional[Dict] = None) -> Dict[str, Any]: 466 | """Create a complete notebook from a template.""" 467 | template_manager = PySparkTemplateManager() 468 | 469 | templates = { 470 | "fabric_integration": template_manager.get_fabric_integration_template(), 471 | "streaming": template_manager.get_streaming_template(), 472 | } 473 | 474 | if template_name not in templates: 475 | raise ValueError(f"Unknown template: {template_name}. 
Available: {list(templates.keys())}") 476 | 477 | template = templates[template_name] 478 | 479 | # Create notebook structure 480 | notebook = { 481 | "nbformat": 4, 482 | "nbformat_minor": 5, 483 | "cells": template["cells"], 484 | "metadata": { 485 | "language_info": {"name": "python"}, 486 | "kernel_info": {"name": "synapse_pyspark"}, 487 | "description": f"PySpark notebook created from {template_name} template" 488 | } 489 | } 490 | 491 | return notebook 492 | -------------------------------------------------------------------------------- /helpers/clients/fabric_client.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Dict, Any, List, Optional, Tuple, Union 3 | import base64 4 | from urllib.parse import quote 5 | from functools import lru_cache 6 | import requests 7 | from azure.identity import DefaultAzureCredential 8 | from helpers.logging_config import get_logger 9 | from helpers.utils import _is_valid_uuid 10 | import json 11 | from uuid import UUID 12 | 13 | logger = get_logger(__name__) 14 | # from sempy_labs._helper_functions import create_item 15 | 16 | 17 | 18 | class FabricApiConfig(BaseModel): 19 | """Configuration for Fabric API""" 20 | 21 | base_url: str = "https://api.fabric.microsoft.com/v1" 22 | max_results: int = 100 23 | 24 | 25 | class FabricApiClient: 26 | """Client for communicating with the Fabric API""" 27 | 28 | def __init__(self, credential=None, config=None): 29 | self.credential = credential or DefaultAzureCredential() 30 | self.config = config or FabricApiConfig() 31 | # Initialize cached methods 32 | self._cached_resolve_workspace = lru_cache(maxsize=128)(self._resolve_workspace) 33 | self._cached_resolve_lakehouse = lru_cache(maxsize=128)(self._resolve_lakehouse) 34 | 35 | def _get_headers(self) -> Dict[str, str]: 36 | """Get headers for Fabric API calls""" 37 | return { 38 | "Authorization": f"Bearer {self.credential.get_token('https://api.fabric.microsoft.com/.default').token}" 39 | } 40 | 41 | def _build_url( 42 | self, endpoint: str, continuation_token: Optional[str] = None 43 | ) -> str: 44 | # If the endpoint starts with http, use it as-is. 45 | url = ( 46 | endpoint 47 | if endpoint.startswith("http") 48 | else f"{self.config.base_url}/{endpoint.lstrip('/')}" 49 | ) 50 | if continuation_token: 51 | separator = "&" if "?" in url else "?" 52 | encoded_token = quote(continuation_token) 53 | url += f"{separator}continuationToken={encoded_token}" 54 | return url 55 | 56 | async def _make_request( 57 | self, 58 | endpoint: str, 59 | params: Optional[Dict] = None, 60 | method: str = "GET", 61 | use_pagination: bool = False, 62 | data_key: str = "value", 63 | lro: bool = False, 64 | lro_poll_interval: int = 2, # seconds between polls 65 | lro_timeout: int = 300, # max seconds to wait 66 | ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: 67 | """ 68 | Make an asynchronous call to the Fabric API. 69 | 70 | If use_pagination is True, it will automatically handle paginated responses. 71 | 72 | If lro is True, will poll for long-running operation completion. 
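
        Example (a minimal sketch; ``workspace_id`` and the payload below are
        placeholders, and the calls are made from inside this class):

            # Paginated GET that accumulates every page of "value" entries
            items = await self._make_request(
                f"workspaces/{workspace_id}/items",
                use_pagination=True,
            )

            # POST that returns 202 + Operation-Location and is polled to completion
            created = await self._make_request(
                f"workspaces/{workspace_id}/notebooks",
                method="POST",
                params={"displayName": "demo_notebook"},
                lro=True,
            )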
73 | """ 74 | import time 75 | 76 | params = params or {} 77 | 78 | if not use_pagination: 79 | url = self._build_url(endpoint=endpoint) 80 | try: 81 | if method.upper() == "POST": 82 | # logger.debug(f"Authorization header: {self._get_headers()}") 83 | # logger.debug(f"Request URL: {url}") 84 | # logger.debug(f"Request parameters: {params}") 85 | response = requests.post( 86 | url, 87 | headers=self._get_headers(), 88 | json=params, 89 | timeout=120, 90 | ) 91 | else: 92 | if "maxResults" not in params: 93 | params["maxResults"] = self.config.max_results 94 | response = requests.request( 95 | method=method.upper(), 96 | url=url, 97 | headers=self._get_headers(), 98 | params=params, 99 | timeout=120, 100 | ) 101 | 102 | # LRO support: check for 202 and Operation-Location 103 | if lro and response.status_code == 202: 104 | op_url = response.headers.get( 105 | "Operation-Location" 106 | ) or response.headers.get("operation-location") 107 | if not op_url: 108 | logger.error("LRO: No Operation-Location header found.") 109 | return None 110 | logger.info(f"LRO: Polling {op_url} for operation status...") 111 | start_time = time.time() 112 | while True: 113 | poll_resp = requests.get( 114 | op_url, headers=self._get_headers(), timeout=60 115 | ) 116 | if poll_resp.status_code not in (200, 201, 202): 117 | logger.error( 118 | f"LRO: Poll failed with status {poll_resp.status_code}" 119 | ) 120 | return None 121 | poll_data = poll_resp.json() 122 | status = poll_data.get("status") or poll_data.get( 123 | "operationStatus" 124 | ) 125 | if status in ( 126 | "Succeeded", 127 | "succeeded", 128 | "Completed", 129 | "completed", 130 | ): 131 | logger.info("LRO: Operation succeeded.") 132 | return poll_data 133 | if status in ("Failed", "failed", "Canceled", "canceled"): 134 | logger.error( 135 | f"LRO: Operation failed or canceled. Status: {status}" 136 | ) 137 | return poll_data 138 | if time.time() - start_time > lro_timeout: 139 | logger.error("LRO: Polling timed out.") 140 | return None 141 | logger.debug( 142 | f"LRO: Status {status}, waiting {lro_poll_interval}s..." 143 | ) 144 | time.sleep(lro_poll_interval) 145 | response.raise_for_status() 146 | return response.json() 147 | except requests.RequestException as e: 148 | logger.error(f"API call failed: {str(e)}") 149 | if e.response is not None: 150 | logger.error(f"Response content: {e.response.text}") 151 | return None 152 | else: 153 | results = [] 154 | continuation_token = None 155 | while True: 156 | url = self._build_url( 157 | endpoint=endpoint, continuation_token=continuation_token 158 | ) 159 | request_params = params.copy() 160 | # Remove any existing continuationToken in parameters to avoid conflict. 
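                # The list endpoints return a 'continuationToken' in the response body;
                # _build_url re-encodes it onto the URL as a query parameter for the next page.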
161 | request_params.pop("continuationToken", None) 162 | try: 163 | if method.upper() == "POST": 164 | response = requests.post( 165 | url, 166 | headers=self._get_headers(), 167 | json=request_params, 168 | timeout=120, 169 | ) 170 | else: 171 | if "maxResults" not in request_params: 172 | request_params["maxResults"] = self.config.max_results 173 | response = requests.request( 174 | method=method.upper(), 175 | url=url, 176 | headers=self._get_headers(), 177 | params=request_params, 178 | timeout=120, 179 | ) 180 | response.raise_for_status() 181 | data = response.json() 182 | except requests.RequestException as e: 183 | logger.error(f"API call failed: {str(e)}") 184 | if e.response is not None: 185 | logger.error(f"Response content: {e.response.text}") 186 | return results if results else None 187 | 188 | if not isinstance(data, dict) or data_key not in data: 189 | raise ValueError(f"Unexpected response format: {data}") 190 | 191 | results.extend(data[data_key]) 192 | continuation_token = data.get("continuationToken") 193 | if not continuation_token: 194 | break 195 | return results 196 | 197 | async def get_workspaces(self) -> List[Dict]: 198 | """Get all available workspaces""" 199 | return await self._make_request("workspaces", use_pagination=True) 200 | 201 | async def get_lakehouses(self, workspace_id: str) -> List[Dict]: 202 | """Get all lakehouses in a workspace""" 203 | return await self.get_items(workspace_id=workspace_id, item_type="Lakehouse") 204 | 205 | async def get_warehouses(self, workspace_id: str) -> List[Dict]: 206 | """Get all warehouses in a workspace 207 | Args: 208 | workspace_id: ID of the workspace 209 | Returns: 210 | A list of dictionaries containing warehouse details or an error message. 211 | """ 212 | return await self.get_items(workspace_id=workspace_id, item_type="Warehouse") 213 | 214 | async def get_tables(self, workspace_id: str, rsc_id: str, type: str) -> List[Dict]: 215 | """Get all tables in a lakehouse 216 | Args: 217 | workspace_id: ID of the workspace 218 | rsc_id: ID of the lakehouse 219 | type: Type of the resource (e.g., "Lakehouse" or "Warehouse") 220 | Returns: 221 | A list of dictionaries containing table details or an error message. 222 | """ 223 | return await self._make_request( 224 | f"workspaces/{workspace_id}/{type}s/{rsc_id}/tables", 225 | use_pagination=True, 226 | data_key="data", 227 | ) 228 | 229 | async def get_reports(self, workspace_id: str) -> List[Dict]: 230 | """Get all reports in a lakehouse 231 | Args: 232 | workspace_id: ID of the workspace 233 | Returns: 234 | A list of dictionaries containing report details or an error message. 235 | """ 236 | return await self._make_request( 237 | f"workspaces/{workspace_id}/reports", 238 | use_pagination=True, 239 | data_key="value", 240 | ) 241 | 242 | async def get_report(self, workspace_id: str, report_id: str) -> Dict: 243 | """Get a specific report by ID 244 | 245 | Args: 246 | workspace_id: ID of the workspace 247 | report_id: ID of the report 248 | 249 | Returns: 250 | A dictionary containing the report details or an error message. 
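
        Example:
            A sketch only; ``client`` is a ``FabricApiClient`` instance and the
            workspace name is a placeholder:

                workspace_id = await client.resolve_workspace("Sales Analytics")
                report = await client.get_report(workspace_id, report_id)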
251 | """ 252 | return await self._make_request( 253 | f"workspaces/{workspace_id}/reports/{report_id}" 254 | ) 255 | 256 | async def get_semantic_models(self, workspace_id: str) -> List[Dict]: 257 | """Get all semantic models in a lakehouse""" 258 | return await self._make_request( 259 | f"workspaces/{workspace_id}/semanticModels", 260 | use_pagination=True, 261 | data_key="value", 262 | ) 263 | 264 | async def get_semantic_model(self, workspace_id: str, model_id: str) -> Dict: 265 | """Get a specific semantic model by ID""" 266 | return await self._make_request( 267 | f"workspaces/{workspace_id}/semanticModels/{model_id}" 268 | ) 269 | 270 | async def resolve_workspace(self, workspace: str) -> str: 271 | """Convert workspace name or ID to workspace ID with caching""" 272 | return await self._cached_resolve_workspace(workspace) 273 | 274 | async def _resolve_workspace(self, workspace: str) -> str: 275 | """Internal method to convert workspace name or ID to workspace ID""" 276 | if _is_valid_uuid(workspace): 277 | return workspace 278 | 279 | workspaces = await self.get_workspaces() 280 | matching_workspaces = [ 281 | w for w in workspaces if w["displayName"].lower() == workspace.lower() 282 | ] 283 | 284 | if not matching_workspaces: 285 | raise ValueError(f"No workspaces found with name: {workspace}") 286 | if len(matching_workspaces) > 1: 287 | raise ValueError(f"Multiple workspaces found with name: {workspace}") 288 | 289 | return matching_workspaces[0]["id"] 290 | 291 | async def resolve_lakehouse(self, workspace_id: str, lakehouse: str) -> str: 292 | """Convert lakehouse name or ID to lakehouse ID with caching""" 293 | return await self._cached_resolve_lakehouse(workspace_id, lakehouse) 294 | 295 | async def _resolve_lakehouse(self, workspace_id: str, lakehouse: str) -> str: 296 | """Internal method to convert lakehouse name or ID to lakehouse ID""" 297 | if _is_valid_uuid(lakehouse): 298 | return lakehouse 299 | 300 | lakehouses = await self.get_lakehouses(workspace_id) 301 | matching_lakehouses = [ 302 | lh for lh in lakehouses if lh["displayName"].lower() == lakehouse.lower() 303 | ] 304 | 305 | if not matching_lakehouses: 306 | raise ValueError(f"No lakehouse found with name: {lakehouse}") 307 | if len(matching_lakehouses) > 1: 308 | raise ValueError(f"Multiple lakehouses found with name: {lakehouse}") 309 | 310 | return matching_lakehouses[0]["id"] 311 | 312 | async def get_items( 313 | self, 314 | workspace_id: str, 315 | item_type: Optional[str] = None, 316 | params: Optional[Dict] = None, 317 | ) -> List[Dict]: 318 | """Get all items in a workspace""" 319 | if not _is_valid_uuid(workspace_id): 320 | raise ValueError("Invalid workspace ID.") 321 | if item_type: 322 | params = params or {} 323 | params["type"] = item_type 324 | return await self._make_request( 325 | f"workspaces/{workspace_id}/items", params=params, use_pagination=True 326 | ) 327 | 328 | async def get_item( 329 | self, 330 | item_id: str, 331 | workspace_id: str, 332 | item_type: Optional[str] = None, 333 | ) -> Dict: 334 | """Get a specific item by ID""" 335 | 336 | if not _is_valid_uuid(item_id): 337 | item_name, item_id = await self.resolve_item_name_and_id(item_id) 338 | if not _is_valid_uuid(workspace_id): 339 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id( 340 | workspace_id 341 | ) 342 | return await self._make_request( 343 | f"workspaces/{workspace_id}/{item_type}s/{item_id}" 344 | ) 345 | 346 | async def create_item( 347 | self, 348 | name: str, 349 | type: str, 350 | 
description: Optional[str] = None, 351 | definition: Optional[dict] = None, 352 | workspace: Optional[str | UUID] = None, 353 | lro: Optional[bool] = False, 354 | ): 355 | """ 356 | Creates an item in a Fabric workspace. 357 | 358 | Parameters 359 | ---------- 360 | name : str 361 | The name of the item to be created. 362 | type : str 363 | The type of the item to be created. 364 | description : str, default=None 365 | A description of the item to be created. 366 | definition : dict, default=None 367 | The definition of the item to be created. 368 | workspace : str | uuid.UUID, default=None 369 | The Fabric workspace name or ID. 370 | Defaults to None which resolves to the workspace of the attached lakehouse 371 | or if no lakehouse attached, resolves to the workspace of the notebook. 372 | """ 373 | from sempy_labs._utils import item_types 374 | 375 | if _is_valid_uuid(workspace): 376 | workspace_id = workspace 377 | else: 378 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id( 379 | workspace 380 | ) 381 | item_type = item_types.get(type)[0].lower() 382 | 383 | payload = { 384 | "displayName": name, 385 | } 386 | if description: 387 | payload["description"] = description 388 | if definition: 389 | payload["definition"] = definition 390 | 391 | try: 392 | response = await self._make_request( 393 | endpoint=f"workspaces/{workspace_id}/{item_type}s", 394 | method="post", 395 | params=payload, 396 | lro=lro, 397 | lro_poll_interval=0.5, 398 | ) 399 | except requests.RequestException as e: 400 | logger.error(f"API call failed: {str(e)}") 401 | if e.response is not None: 402 | logger.error(f"Response content: {e.response.text}") 403 | raise ValueError( 404 | f"Failed to create item '{name}' of type '{item_type}' in the '{workspace_id}' workspace." 
405 | ) 406 | 407 | # Check if response contains an error 408 | if isinstance(response, dict): 409 | if "error" in response: 410 | error_msg = response.get("error", {}).get("message", "Unknown error") 411 | logger.error(f"API error creating item: {error_msg}") 412 | raise ValueError(f"Failed to create item '{name}': {error_msg}") 413 | 414 | # Check if item was created successfully 415 | if "id" in response: 416 | logger.info(f"Successfully created item '{name}' with ID: {response['id']}") 417 | return response 418 | 419 | # If no ID and no error, log the full response for debugging 420 | logger.warning(f"Unexpected response format: {response}") 421 | 422 | # Legacy check - may not be reliable for all item types 423 | if hasattr(response, 'get') and response.get("displayName") and response.get("displayName") != name: 424 | logger.warning(f"Response displayName '{response.get('displayName')}' doesn't match requested name '{name}', but this may be normal") 425 | 426 | return response 427 | 428 | async def resolve_item_name_and_id( 429 | self, 430 | item: str | UUID, 431 | type: Optional[str] = None, 432 | workspace: Optional[str | UUID] = None, 433 | ) -> Tuple[str, UUID]: 434 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id( 435 | workspace 436 | ) 437 | item_id = await self.resolve_item_id( 438 | item=item, type=type, workspace=workspace_id 439 | ) 440 | item_data = await self._make_request( 441 | f"workspaces/{workspace_id}/items/{item_id}" 442 | ) 443 | item_name = item_data.get("displayName") 444 | return item_name, item_id 445 | 446 | async def resolve_item_id( 447 | self, 448 | item: str | UUID, 449 | type: Optional[str] = None, 450 | workspace: Optional[str | UUID] = None, 451 | ) -> UUID: 452 | (workspace_name, workspace_id) = await self.resolve_workspace_name_and_id( 453 | workspace 454 | ) 455 | item_id = None 456 | 457 | if _is_valid_uuid(item): 458 | # Optionally verify that the item actually exists in the workspace 459 | item_id = item 460 | try: 461 | await self._make_request( 462 | endpoint=f"workspaces/{workspace_id}/items/{item_id}" 463 | ) 464 | except requests.RequestException: 465 | raise ValueError( 466 | f"The '{item_id}' item was not found in the '{workspace_name}' workspace." 467 | ) 468 | else: 469 | if type is None: 470 | raise ValueError( 471 | "The 'type' parameter is required if specifying an item name." 472 | ) 473 | responses = await self._make_request( 474 | endpoint=f"workspaces/{workspace_id}/items?type={type}", 475 | use_pagination=True, 476 | ) 477 | for v in responses: 478 | display_name = v["displayName"] 479 | if display_name == item: 480 | item_id = v.get("id") 481 | break 482 | 483 | if item_id is None: 484 | raise ValueError( 485 | f"There's no item '{item}' of type '{type}' in the '{workspace_name}' workspace." 486 | ) 487 | 488 | return item_id 489 | 490 | async def resolve_workspace_name_and_id( 491 | self, 492 | workspace: Optional[str | UUID] = None, 493 | ) -> Tuple[str, UUID]: 494 | """ 495 | Obtains the name and ID of the Fabric workspace. 496 | 497 | Parameters 498 | ---------- 499 | workspace : str | uuid.UUID, default=None 500 | The Fabric workspace name or ID. 501 | Although the parameter defaults to None, a workspace must be provided; 502 | passing None raises a ValueError. 503 | 504 | Returns 505 | ------- 506 | str, uuid.UUID 507 | The name and ID of the Fabric workspace.
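
        Examples
        --------
        A sketch only; ``client`` is a ``FabricApiClient`` instance and the
        display name is a placeholder. A name and a UUID resolve to the same pair:

            name, ws_id = await client.resolve_workspace_name_and_id("Sales Analytics")
            name, ws_id = await client.resolve_workspace_name_and_id(ws_id)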
508 | """ 509 | logger.debug(f"Resolving workspace name and ID for: {workspace}") 510 | if workspace is None: 511 | raise ValueError("Workspace must be specified.") 512 | elif _is_valid_uuid(workspace): 513 | workspace_id = workspace 514 | workspace_name = await self.resolve_workspace_name(workspace_id) 515 | return workspace_name, workspace_id 516 | else: 517 | responses = await self._make_request( 518 | endpoint="workspaces", use_pagination=True 519 | ) 520 | workspace_id = None 521 | workspace_name = None 522 | for r in responses: 523 | display_name = r.get("displayName") 524 | if display_name == workspace: 525 | workspace_name = workspace 526 | workspace_id = r.get("id") 527 | return workspace_name, workspace_id 528 | 529 | if workspace_name is None or workspace_id is None: 530 | raise ValueError("Workspace not found") 531 | 532 | return workspace_name, workspace_id 533 | 534 | async def resolve_workspace_name(self, workspace_id: Optional[UUID] = None) -> str: 535 | try: 536 | response = await self._make_request(endpoint=f"workspaces/{workspace_id}") 537 | if not response or "displayName" not in response: 538 | raise ValueError( 539 | f"Workspace '{workspace_id}' not found or API response invalid: {response}" 540 | ) 541 | except requests.RequestException: 542 | raise ValueError(f"The '{workspace_id}' workspace was not found.") 543 | 544 | return response.get("displayName") 545 | 546 | async def get_notebooks(self, workspace_id: str) -> List[Dict]: 547 | """Get all notebooks in a workspace""" 548 | return await self.get_items(workspace_id=workspace_id, item_type="Notebook") 549 | 550 | async def get_notebook(self, workspace_id: str, notebook_id: str) -> Dict: 551 | """Get a specific notebook by ID""" 552 | return await self.get_item( 553 | item_id=notebook_id, workspace_id=workspace_id, item_type="notebook" 554 | ) 555 | 556 | async def create_notebook( 557 | self, workspace_id: str, notebook_name: str, ipynb_name: str, content: str 558 | ) -> Dict: 559 | """Create a new notebook.""" 560 | if not _is_valid_uuid(workspace_id): 561 | raise ValueError("Invalid workspace ID.") 562 | 563 | # Define the notebook definition 564 | logger.debug( 565 | f"Defining notebook '{notebook_name}' in workspace '{workspace_id}'." 566 | ) 567 | definition = { 568 | "format": "ipynb", 569 | "parts": [ 570 | { 571 | "path": f"{ipynb_name}.ipynb", 572 | "payload": base64.b64encode( 573 | content 574 | if isinstance(content, bytes) 575 | else content.encode("utf-8") 576 | ).decode("utf-8"), 577 | "payloadType": "InlineBase64", 578 | }, 579 | # { 580 | # "path": ".platform", 581 | # "payload": base64.b64encode("dotPlatformBase64String".encode("utf-8")).decode("utf-8"), 582 | # "payloadType": "InlineBase64", 583 | # }, 584 | ], 585 | } 586 | logger.info( 587 | f"-------Creating notebook '{notebook_name}' in workspace '{workspace_id}'." 
588 | ) 589 | return await self.create_item( 590 | workspace=workspace_id, 591 | type="Notebook", 592 | name=notebook_name, 593 | definition=definition, 594 | ) 595 | -------------------------------------------------------------------------------- /tools/notebook.py: -------------------------------------------------------------------------------- 1 | from helpers.utils.context import mcp, __ctx_cache 2 | from mcp.server.fastmcp import Context 3 | from helpers.utils.authentication import get_azure_credentials 4 | from helpers.clients import ( 5 | FabricApiClient, 6 | NotebookClient, 7 | ) 8 | import json 9 | from helpers.logging_config import get_logger 10 | 11 | 12 | from typing import Optional, Dict, List, Any 13 | import base64 14 | import re 15 | 16 | logger = get_logger(__name__) 17 | 18 | 19 | @mcp.tool() 20 | async def list_notebooks(workspace: Optional[str] = None, ctx: Context = None) -> str: 21 | """List all notebooks in a Fabric workspace. 22 | 23 | Args: 24 | workspace: Name or ID of the workspace (optional) 25 | ctx: Context object containing client information 26 | Returns: 27 | A string containing the list of notebooks or an error message. 28 | """ 29 | 30 | try: 31 | if ctx is None: 32 | raise ValueError("Context (ctx) must be provided.") 33 | 34 | notebook_client = NotebookClient( 35 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 36 | ) 37 | return await notebook_client.list_notebooks(workspace) 38 | except Exception as e: 39 | logger.error(f"Error listing notebooks: {str(e)}") 40 | return f"Error listing notebooks: {str(e)}" 41 | 42 | 43 | @mcp.tool() 44 | async def create_notebook( 45 | workspace: str, 46 | # notebook_name: str, 47 | # content: str, 48 | ctx: Context = None, 49 | ) -> str: 50 | """Create a new notebook in a Fabric workspace. 51 | 52 | Args: 53 | workspace: Name or ID of the workspace 54 | notebook_name: Name of the new notebook 55 | content: Content of the notebook (in JSON format) 56 | ctx: Context object containing client information 57 | Returns: 58 | A string containing the ID of the created notebook or an error message. 59 | """ 60 | notebook_json = { 61 | "nbformat": 4, 62 | "nbformat_minor": 5, 63 | "cells": [ 64 | { 65 | "cell_type": "code", 66 | "source": ["print('Hello, Fabric!')\n"], 67 | "execution_count": None, 68 | "outputs": [], 69 | "metadata": {}, 70 | } 71 | ], 72 | "metadata": {"language_info": {"name": "python"}}, 73 | } 74 | notebook_content = json.dumps(notebook_json) 75 | try: 76 | if ctx is None: 77 | raise ValueError("Context (ctx) must be provided.") 78 | 79 | notebook_client = NotebookClient( 80 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 81 | ) 82 | response = await notebook_client.create_notebook( 83 | workspace, "test_notebook_2", notebook_content 84 | ) 85 | return response.get("id", "") # Return the notebook ID or an empty string 86 | except Exception as e: 87 | logger.error(f"Error creating notebook: {str(e)}") 88 | return f"Error creating notebook: {str(e)}" 89 | 90 | 91 | @mcp.tool() 92 | async def get_notebook_content( 93 | workspace: str, 94 | notebook_id: str, 95 | ctx: Context = None 96 | ) -> str: 97 | """Get the content of a specific notebook in a Fabric workspace. 98 | 99 | Args: 100 | workspace: Name or ID of the workspace 101 | notebook_id: ID or name of the notebook 102 | ctx: Context object containing client information 103 | Returns: 104 | A string containing the notebook content in JSON format or an error message. 
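
    Example:
        The returned string is the decoded ``.ipynb`` JSON, so it can be parsed
        directly (a sketch; the workspace name and notebook ID are placeholders):

            content = await get_notebook_content("Sales Analytics", "<notebook-id>", ctx=ctx)
            if not content.startswith("Error"):
                cells = json.loads(content).get("cells", [])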
105 | """ 106 | try: 107 | if ctx is None: 108 | raise ValueError("Context (ctx) must be provided.") 109 | 110 | notebook_client = NotebookClient( 111 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 112 | ) 113 | 114 | # Get the notebook details 115 | notebook = await notebook_client.get_notebook(workspace, notebook_id) 116 | 117 | if isinstance(notebook, str): # Error message 118 | return notebook 119 | 120 | # Extract and decode the notebook content 121 | definition = notebook.get("definition", {}) 122 | parts = definition.get("parts", []) 123 | 124 | for part in parts: 125 | if part.get("path", "").endswith(".ipynb"): 126 | payload = part.get("payload", "") 127 | if payload: 128 | # Decode base64 content 129 | decoded_content = base64.b64decode(payload).decode("utf-8") 130 | return decoded_content 131 | 132 | return "No notebook content found in the definition." 133 | 134 | except Exception as e: 135 | logger.error(f"Error getting notebook content: {str(e)}") 136 | return f"Error getting notebook content: {str(e)}" 137 | 138 | 139 | @mcp.tool() 140 | async def create_pyspark_notebook( 141 | workspace: str, 142 | notebook_name: str, 143 | template_type: str = "basic", 144 | ctx: Context = None, 145 | ) -> str: 146 | """Create a new PySpark notebook from a template in a Fabric workspace. 147 | 148 | Args: 149 | workspace: Name or ID of the workspace 150 | notebook_name: Name of the new notebook 151 | template_type: Type of PySpark template ('basic', 'etl', 'analytics', 'ml') 152 | ctx: Context object containing client information 153 | Returns: 154 | A string containing the ID of the created notebook or an error message. 155 | """ 156 | try: 157 | if ctx is None: 158 | raise ValueError("Context (ctx) must be provided.") 159 | 160 | # Define PySpark templates 161 | templates = { 162 | "basic": { 163 | "cells": [ 164 | { 165 | "cell_type": "markdown", 166 | "source": [ 167 | "# PySpark Notebook\n", 168 | "\n", 169 | "This notebook demonstrates basic PySpark operations in Microsoft Fabric.\n" 170 | ], 171 | "metadata": {} 172 | }, 173 | { 174 | "cell_type": "code", 175 | "source": [ 176 | "# Initialize Spark session\n", 177 | "from pyspark.sql import SparkSession\n", 178 | "from pyspark.sql.functions import *\n", 179 | "from pyspark.sql.types import *\n", 180 | "\n", 181 | "# Spark session is already available as 'spark' in Fabric\n", 182 | "print(f\"Spark version: {spark.version}\")\n", 183 | "print(f\"Available cores: {spark.sparkContext.defaultParallelism}\")\n" 184 | ], 185 | "execution_count": None, 186 | "outputs": [], 187 | "metadata": {} 188 | }, 189 | { 190 | "cell_type": "code", 191 | "source": [ 192 | "# Sample data creation\n", 193 | "sample_data = [\n", 194 | " (1, \"John\", 25, \"Engineering\"),\n", 195 | " (2, \"Jane\", 30, \"Marketing\"),\n", 196 | " (3, \"Bob\", 35, \"Sales\"),\n", 197 | " (4, \"Alice\", 28, \"Engineering\")\n", 198 | "]\n", 199 | "\n", 200 | "schema = StructType([\n", 201 | " StructField(\"id\", IntegerType(), True),\n", 202 | " StructField(\"name\", StringType(), True),\n", 203 | " StructField(\"age\", IntegerType(), True),\n", 204 | " StructField(\"department\", StringType(), True)\n", 205 | "])\n", 206 | "\n", 207 | "df = spark.createDataFrame(sample_data, schema)\n", 208 | "df.show()\n" 209 | ], 210 | "execution_count": None, 211 | "outputs": [], 212 | "metadata": {} 213 | } 214 | ] 215 | }, 216 | "etl": { 217 | "cells": [ 218 | { 219 | "cell_type": "markdown", 220 | "source": [ 221 | "# PySpark ETL Pipeline\n", 222 | "\n", 223 | 
"This notebook demonstrates an ETL pipeline using PySpark in Microsoft Fabric.\n" 224 | ], 225 | "metadata": {} 226 | }, 227 | { 228 | "cell_type": "code", 229 | "source": [ 230 | "# Import necessary libraries\n", 231 | "from pyspark.sql import SparkSession\n", 232 | "from pyspark.sql.functions import *\n", 233 | "from pyspark.sql.types import *\n", 234 | "from delta.tables import DeltaTable\n", 235 | "\n", 236 | "print(f\"Spark version: {spark.version}\")\n" 237 | ], 238 | "execution_count": None, 239 | "outputs": [], 240 | "metadata": {} 241 | }, 242 | { 243 | "cell_type": "code", 244 | "source": [ 245 | "# Extract: Read data from source\n", 246 | "# Example: Reading from a lakehouse table\n", 247 | "# df_source = spark.table(\"lakehouse.table_name\")\n", 248 | "\n", 249 | "# For demo purposes, create sample data\n", 250 | "raw_data = [\n", 251 | " (\"2024-01-01\", \"Product A\", 100, 25.50),\n", 252 | " (\"2024-01-01\", \"Product B\", 150, 30.00),\n", 253 | " (\"2024-01-02\", \"Product A\", 120, 25.50),\n", 254 | " (\"2024-01-02\", \"Product C\", 80, 45.00)\n", 255 | "]\n", 256 | "\n", 257 | "schema = StructType([\n", 258 | " StructField(\"date\", StringType(), True),\n", 259 | " StructField(\"product\", StringType(), True),\n", 260 | " StructField(\"quantity\", IntegerType(), True),\n", 261 | " StructField(\"price\", DoubleType(), True)\n", 262 | "])\n", 263 | "\n", 264 | "df_raw = spark.createDataFrame(raw_data, schema)\n", 265 | "print(\"Raw data:\")\n", 266 | "df_raw.show()\n" 267 | ], 268 | "execution_count": None, 269 | "outputs": [], 270 | "metadata": {} 271 | }, 272 | { 273 | "cell_type": "code", 274 | "source": [ 275 | "# Transform: Clean and process data\n", 276 | "df_transformed = df_raw \\\n", 277 | " .withColumn(\"date\", to_date(col(\"date\"), \"yyyy-MM-dd\")) \\\n", 278 | " .withColumn(\"revenue\", col(\"quantity\") * col(\"price\")) \\\n", 279 | " .withColumn(\"year\", year(col(\"date\"))) \\\n", 280 | " .withColumn(\"month\", month(col(\"date\")))\n", 281 | "\n", 282 | "print(\"Transformed data:\")\n", 283 | "df_transformed.show()\n", 284 | "df_transformed.printSchema()\n" 285 | ], 286 | "execution_count": None, 287 | "outputs": [], 288 | "metadata": {} 289 | }, 290 | { 291 | "cell_type": "code", 292 | "source": [ 293 | "# Load: Write processed data to target\n", 294 | "# Example: Writing to a Delta table in lakehouse\n", 295 | "# df_transformed.write \\\n", 296 | "# .format(\"delta\") \\\n", 297 | "# .mode(\"overwrite\") \\\n", 298 | "# .saveAsTable(\"lakehouse.processed_sales\")\n", 299 | "\n", 300 | "print(\"ETL pipeline completed successfully!\")\n", 301 | "print(f\"Processed {df_transformed.count()} records\")\n" 302 | ], 303 | "execution_count": None, 304 | "outputs": [], 305 | "metadata": {} 306 | } 307 | ] 308 | }, 309 | "analytics": { 310 | "cells": [ 311 | { 312 | "cell_type": "markdown", 313 | "source": [ 314 | "# PySpark Data Analytics\n", 315 | "\n", 316 | "This notebook demonstrates data analytics using PySpark in Microsoft Fabric.\n" 317 | ], 318 | "metadata": {} 319 | }, 320 | { 321 | "cell_type": "code", 322 | "source": [ 323 | "# Import libraries for analytics\n", 324 | "from pyspark.sql import SparkSession\n", 325 | "from pyspark.sql.functions import *\n", 326 | "from pyspark.sql.types import *\n", 327 | "from pyspark.sql.window import Window\n", 328 | "\n", 329 | "print(f\"Spark version: {spark.version}\")\n" 330 | ], 331 | "execution_count": None, 332 | "outputs": [], 333 | "metadata": {} 334 | }, 335 | { 336 | "cell_type": "code", 337 | "source": [ 
338 | "# Create sample sales data for analytics\n", 339 | "sales_data = [\n", 340 | " (\"2024-01-01\", \"North\", \"Product A\", 1000, 100),\n", 341 | " (\"2024-01-01\", \"South\", \"Product A\", 800, 80),\n", 342 | " (\"2024-01-02\", \"North\", \"Product B\", 1200, 120),\n", 343 | " (\"2024-01-02\", \"South\", \"Product B\", 900, 90),\n", 344 | " (\"2024-01-03\", \"East\", \"Product A\", 1100, 110),\n", 345 | " (\"2024-01-03\", \"West\", \"Product C\", 700, 70)\n", 346 | "]\n", 347 | "\n", 348 | "schema = StructType([\n", 349 | " StructField(\"date\", StringType(), True),\n", 350 | " StructField(\"region\", StringType(), True),\n", 351 | " StructField(\"product\", StringType(), True),\n", 352 | " StructField(\"revenue\", IntegerType(), True),\n", 353 | " StructField(\"quantity\", IntegerType(), True)\n", 354 | "])\n", 355 | "\n", 356 | "df_sales = spark.createDataFrame(sales_data, schema)\n", 357 | "df_sales = df_sales.withColumn(\"date\", to_date(col(\"date\"), \"yyyy-MM-dd\"))\n", 358 | "df_sales.show()\n" 359 | ], 360 | "execution_count": None, 361 | "outputs": [], 362 | "metadata": {} 363 | }, 364 | { 365 | "cell_type": "code", 366 | "source": [ 367 | "# Aggregation analysis\n", 368 | "print(\"=== Revenue by Region ===\")\n", 369 | "df_sales.groupBy(\"region\") \\\n", 370 | " .agg(sum(\"revenue\").alias(\"total_revenue\"),\n", 371 | " sum(\"quantity\").alias(\"total_quantity\"),\n", 372 | " count(\"*\").alias(\"transaction_count\")) \\\n", 373 | " .orderBy(desc(\"total_revenue\")) \\\n", 374 | " .show()\n", 375 | "\n", 376 | "print(\"=== Revenue by Product ===\")\n", 377 | "df_sales.groupBy(\"product\") \\\n", 378 | " .agg(sum(\"revenue\").alias(\"total_revenue\"),\n", 379 | " avg(\"revenue\").alias(\"avg_revenue\")) \\\n", 380 | " .orderBy(desc(\"total_revenue\")) \\\n", 381 | " .show()\n" 382 | ], 383 | "execution_count": None, 384 | "outputs": [], 385 | "metadata": {} 386 | }, 387 | { 388 | "cell_type": "code", 389 | "source": [ 390 | "# Window functions for advanced analytics\n", 391 | "windowSpec = Window.partitionBy(\"region\").orderBy(\"date\")\n", 392 | "\n", 393 | "df_analytics = df_sales \\\n", 394 | " .withColumn(\"running_total\", sum(\"revenue\").over(windowSpec)) \\\n", 395 | " .withColumn(\"row_number\", row_number().over(windowSpec)) \\\n", 396 | " .withColumn(\"rank\", rank().over(windowSpec.orderBy(desc(\"revenue\"))))\n", 397 | "\n", 398 | "print(\"=== Advanced Analytics with Window Functions ===\")\n", 399 | "df_analytics.select(\"date\", \"region\", \"product\", \"revenue\", \n", 400 | " \"running_total\", \"row_number\", \"rank\") \\\n", 401 | " .orderBy(\"region\", \"date\") \\\n", 402 | " .show()\n" 403 | ], 404 | "execution_count": None, 405 | "outputs": [], 406 | "metadata": {} 407 | } 408 | ] 409 | }, 410 | "ml": { 411 | "cells": [ 412 | { 413 | "cell_type": "markdown", 414 | "source": [ 415 | "# PySpark Machine Learning\n", 416 | "\n", 417 | "This notebook demonstrates machine learning with PySpark MLlib in Microsoft Fabric.\n" 418 | ], 419 | "metadata": {} 420 | }, 421 | { 422 | "cell_type": "code", 423 | "source": [ 424 | "# Import ML libraries\n", 425 | "from pyspark.sql import SparkSession\n", 426 | "from pyspark.sql.functions import *\n", 427 | "from pyspark.sql.types import *\n", 428 | "from pyspark.ml.feature import VectorAssembler, StandardScaler\n", 429 | "from pyspark.ml.regression import LinearRegression\n", 430 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 431 | "from pyspark.ml import Pipeline\n", 432 | "\n", 433 | 
"print(f\"Spark version: {spark.version}\")\n" 434 | ], 435 | "execution_count": None, 436 | "outputs": [], 437 | "metadata": {} 438 | }, 439 | { 440 | "cell_type": "code", 441 | "source": [ 442 | "# Create sample dataset for regression\n", 443 | "ml_data = [\n", 444 | " (1, 2.0, 3.0, 4.0, 10.0),\n", 445 | " (2, 3.0, 4.0, 5.0, 15.0),\n", 446 | " (3, 4.0, 5.0, 6.0, 20.0),\n", 447 | " (4, 5.0, 6.0, 7.0, 25.0),\n", 448 | " (5, 6.0, 7.0, 8.0, 30.0),\n", 449 | " (6, 7.0, 8.0, 9.0, 35.0)\n", 450 | "]\n", 451 | "\n", 452 | "schema = StructType([\n", 453 | " StructField(\"id\", IntegerType(), True),\n", 454 | " StructField(\"feature1\", DoubleType(), True),\n", 455 | " StructField(\"feature2\", DoubleType(), True),\n", 456 | " StructField(\"feature3\", DoubleType(), True),\n", 457 | " StructField(\"label\", DoubleType(), True)\n", 458 | "])\n", 459 | "\n", 460 | "df_ml = spark.createDataFrame(ml_data, schema)\n", 461 | "print(\"Sample ML dataset:\")\n", 462 | "df_ml.show()\n" 463 | ], 464 | "execution_count": None, 465 | "outputs": [], 466 | "metadata": {} 467 | }, 468 | { 469 | "cell_type": "code", 470 | "source": [ 471 | "# Feature engineering pipeline\n", 472 | "feature_cols = [\"feature1\", \"feature2\", \"feature3\"]\n", 473 | "\n", 474 | "# Assemble features into a vector\n", 475 | "assembler = VectorAssembler(inputCols=feature_cols, outputCol=\"raw_features\")\n", 476 | "\n", 477 | "# Scale features\n", 478 | "scaler = StandardScaler(inputCol=\"raw_features\", outputCol=\"features\")\n", 479 | "\n", 480 | "# Linear regression model\n", 481 | "lr = LinearRegression(featuresCol=\"features\", labelCol=\"label\")\n", 482 | "\n", 483 | "# Create pipeline\n", 484 | "pipeline = Pipeline(stages=[assembler, scaler, lr])\n", 485 | "\n", 486 | "print(\"ML Pipeline created with stages: Feature Assembly -> Scaling -> Linear Regression\")\n" 487 | ], 488 | "execution_count": None, 489 | "outputs": [], 490 | "metadata": {} 491 | }, 492 | { 493 | "cell_type": "code", 494 | "source": [ 495 | "# Split data and train model\n", 496 | "train_data, test_data = df_ml.randomSplit([0.8, 0.2], seed=42)\n", 497 | "\n", 498 | "print(f\"Training data count: {train_data.count()}\")\n", 499 | "print(f\"Test data count: {test_data.count()}\")\n", 500 | "\n", 501 | "# Train the pipeline\n", 502 | "model = pipeline.fit(train_data)\n", 503 | "\n", 504 | "# Make predictions\n", 505 | "predictions = model.transform(test_data)\n", 506 | "\n", 507 | "print(\"\\nPredictions:\")\n", 508 | "predictions.select(\"id\", \"label\", \"prediction\").show()\n" 509 | ], 510 | "execution_count": None, 511 | "outputs": [], 512 | "metadata": {} 513 | }, 514 | { 515 | "cell_type": "code", 516 | "source": [ 517 | "# Evaluate model performance\n", 518 | "evaluator = RegressionEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"rmse\")\n", 519 | "rmse = evaluator.evaluate(predictions)\n", 520 | "\n", 521 | "evaluator_r2 = RegressionEvaluator(labelCol=\"label\", predictionCol=\"prediction\", metricName=\"r2\")\n", 522 | "r2 = evaluator_r2.evaluate(predictions)\n", 523 | "\n", 524 | "print(f\"Root Mean Square Error (RMSE): {rmse:.3f}\")\n", 525 | "print(f\"R-squared (R2): {r2:.3f}\")\n", 526 | "\n", 527 | "# Get model coefficients\n", 528 | "lr_model = model.stages[-1]\n", 529 | "print(f\"\\nModel coefficients: {lr_model.coefficients}\")\n", 530 | "print(f\"Model intercept: {lr_model.intercept:.3f}\")\n" 531 | ], 532 | "execution_count": None, 533 | "outputs": [], 534 | "metadata": {} 535 | } 536 | ] 537 | } 538 | } 539 | 540 | 
if template_type not in templates: 541 | return f"Invalid template type. Available templates: {', '.join(templates.keys())}" 542 | 543 | # Create notebook JSON structure 544 | notebook_json = { 545 | "nbformat": 4, 546 | "nbformat_minor": 5, 547 | "cells": templates[template_type]["cells"], 548 | "metadata": { 549 | "language_info": {"name": "python"}, 550 | "kernel_info": {"name": "synapse_pyspark"}, 551 | "description": f"PySpark notebook created from {template_type} template" 552 | }, 553 | } 554 | 555 | notebook_content = json.dumps(notebook_json, indent=2) 556 | 557 | notebook_client = NotebookClient( 558 | FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache)) 559 | ) 560 | response = await notebook_client.create_notebook( 561 | workspace, notebook_name, notebook_content 562 | ) 563 | 564 | if isinstance(response, dict) and response.get("id"): 565 | return f"Created PySpark notebook '{notebook_name}' with ID: {response['id']}" 566 | else: 567 | return f"Failed to create notebook: {response}" 568 | 569 | except Exception as e: 570 | logger.error(f"Error creating PySpark notebook: {str(e)}") 571 | return f"Error creating PySpark notebook: {str(e)}" 572 | 573 | @mcp.tool() 574 | async def generate_pyspark_code( 575 | operation: str, 576 | source_table: Optional[str] = None, 577 | target_table: Optional[str] = None, 578 | columns: Optional[str] = None, 579 | filter_condition: Optional[str] = None, 580 | ctx: Context = None, 581 | ) -> str: 582 | """Generate PySpark code for common operations. 583 | 584 | Args: 585 | operation: Type of operation ('read_table', 'write_table', 'transform', 'join', 'aggregate') 586 | source_table: Source table name (format: lakehouse.table_name) 587 | target_table: Target table name (format: lakehouse.table_name) 588 | columns: Comma-separated list of columns 589 | filter_condition: Filter condition for data 590 | ctx: Context object containing client information 591 | Returns: 592 | A string containing the generated PySpark code or an error message. 
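
    Example:
        A sketch of a typical call; the table name is a placeholder:

            code = await generate_pyspark_code(
                operation="read_table",
                source_table="sales_lakehouse.orders",
                ctx=ctx,
            )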
593 | """ 594 | try: 595 | code_templates = { 596 | "read_table": f"""# Read data from table 597 | df = spark.table("{source_table or 'lakehouse.table_name'}") 598 | df.show() 599 | df.printSchema()""", 600 | 601 | "write_table": f"""# Write data to table 602 | df.write \\ 603 | .format("delta") \\ 604 | .mode("overwrite") \\ 605 | .saveAsTable("{target_table or 'lakehouse.output_table'}") 606 | 607 | print(f"Successfully wrote {{df.count()}} records to {target_table or 'lakehouse.output_table'}")""", 608 | 609 | "transform": f"""# Data transformation 610 | from pyspark.sql.functions import * 611 | 612 | df_transformed = df \\ 613 | .select({columns or '*'}) \\ 614 | {f'.filter({filter_condition})' if filter_condition else ''} \\ 615 | .withColumn("processed_date", current_timestamp()) 616 | 617 | df_transformed.show()""", 618 | 619 | "join": f"""# Join tables 620 | df1 = spark.table("{source_table or 'lakehouse.table1'}") 621 | df2 = spark.table("{target_table or 'lakehouse.table2'}") 622 | 623 | # Inner join (modify join condition as needed) 624 | df_joined = df1.join(df2, df1.id == df2.id, "inner") 625 | 626 | df_joined.show()""", 627 | 628 | "aggregate": f"""# Data aggregation 629 | from pyspark.sql.functions import * 630 | 631 | df_agg = df \\ 632 | .groupBy({columns or '"column1"'}) \\ 633 | .agg( 634 | count("*").alias("count"), 635 | sum("amount").alias("total_amount"), 636 | avg("amount").alias("avg_amount"), 637 | max("date").alias("max_date") 638 | ) \\ 639 | .orderBy(desc("total_amount")) 640 | 641 | df_agg.show()""", 642 | 643 | "schema_inference": f"""# Schema inference and data profiling 644 | print("=== Schema Information ===") 645 | df.printSchema() 646 | 647 | print("\\n=== Data Profile ===") 648 | print(f"Record count: {{df.count()}}") 649 | print(f"Column count: {{len(df.columns)}}") 650 | 651 | print("\\n=== Column Statistics ===") 652 | df.describe().show() 653 | 654 | print("\\n=== Null Value Analysis ===") 655 | from pyspark.sql.functions import col, sum as spark_sum, isnan, when, count 656 | 657 | null_counts = df.select([ 658 | spark_sum(when(col(c).isNull() | isnan(col(c)), 1).otherwise(0)).alias(c) 659 | for c in df.columns 660 | ]) 661 | null_counts.show()""", 662 | 663 | "data_quality": f"""# Data quality checks 664 | from pyspark.sql.functions import * 665 | 666 | print("=== Data Quality Report ===") 667 | 668 | # Check for duplicates 669 | duplicate_count = df.count() - df.distinct().count() 670 | print(f"Duplicate rows: {{duplicate_count}}") 671 | 672 | # Check for null values 673 | total_rows = df.count() 674 | for column in df.columns: 675 | null_count = df.filter(col(column).isNull()).count() 676 | null_percentage = (null_count / total_rows) * 100 677 | print(f"{{column}}: {{null_count}} nulls ({{null_percentage:.2f}}%)") 678 | 679 | # Check data ranges (for numeric columns) 680 | numeric_columns = [field.name for field in df.schema.fields 681 | if field.dataType.simpleString() in ['int', 'double', 'float', 'bigint']] 682 | 683 | if numeric_columns: 684 | print("\\n=== Numeric Column Ranges ===") 685 | df.select([ 686 | min(col(c)).alias(f"{c}_min"), 687 | max(col(c)).alias(f"{c}_max") 688 | for c in numeric_columns 689 | ]).show()""", 690 | 691 | "performance_optimization": f"""# Performance optimization techniques 692 | 693 | # 1. Cache frequently used DataFrames 694 | df.cache() 695 | print(f"Cached DataFrame with {{df.count()}} records") 696 | 697 | # 2. 
Repartition for better parallelism 698 | optimal_partitions = spark.sparkContext.defaultParallelism * 2 699 | df_repartitioned = df.repartition(optimal_partitions) 700 | 701 | # 3. Use broadcast for small dimension tables (< 200MB) 702 | from pyspark.sql.functions import broadcast 703 | # df_joined = large_df.join(broadcast(small_df), "key") 704 | 705 | # 4. Optimize file formats - use Delta Lake 706 | df.write \\ 707 | .format("delta") \\ 708 | .mode("overwrite") \\ 709 | .option("optimizeWrite", "true") \\ 710 | .option("autoOptimize", "true") \\ 711 | .saveAsTable("{target_table or 'lakehouse.optimized_table'}") 712 | 713 | # 5. Show execution plan 714 | df.explain(True)""" 715 | } 716 | 717 | if operation not in code_templates: 718 | available_ops = ", ".join(code_templates.keys()) 719 | return f"Invalid operation. Available operations: {available_ops}" 720 | 721 | generated_code = code_templates[operation] 722 | 723 | return f"""```python 724 | {generated_code} 725 | ``` 726 | 727 | **Generated PySpark code for '{operation}' operation** 728 | 729 | This code can be copied into a notebook cell and executed. Remember to: 730 | - Replace placeholder table names with actual table names 731 | - Adjust column names and conditions as needed 732 | - Test with a small dataset first 733 | - Review the execution plan for performance optimization""" 734 | 735 | except Exception as e: 736 | logger.error(f"Error generating PySpark code: {str(e)}") 737 | return f"Error generating PySpark code: {str(e)}" 738 | 739 | @mcp.tool() 740 | async def validate_pyspark_code( 741 | code: str, 742 | ctx: Context = None, 743 | ) -> str: 744 | """Validate PySpark code for syntax and best practices. 745 | 746 | Args: 747 | code: PySpark code to validate 748 | ctx: Context object containing client information 749 | Returns: 750 | A string containing validation results and suggestions. 751 | """ 752 | try: 753 | validation_results = [] 754 | warnings = [] 755 | suggestions = [] 756 | 757 | # Basic syntax validation 758 | try: 759 | compile(code, '', 'exec') 760 | validation_results.append("✅ Syntax validation: PASSED") 761 | except SyntaxError as e: 762 | validation_results.append(f"❌ Syntax validation: FAILED - {e}") 763 | return "\n".join(validation_results) 764 | 765 | # PySpark best practices checks 766 | lines = code.split('\n') 767 | 768 | # Check for common imports 769 | has_spark_imports = any('from pyspark' in line or 'import pyspark' in line for line in lines) 770 | if not has_spark_imports: 771 | warnings.append("⚠️ No PySpark imports detected. Add: from pyspark.sql import SparkSession") 772 | 773 | # Check for DataFrame operations 774 | has_df_operations = any('df.' in line or '.show()' in line for line in lines) 775 | if has_df_operations: 776 | validation_results.append("✅ DataFrame operations detected") 777 | 778 | # Check for performance anti-patterns 779 | if '.collect()' in code: 780 | warnings.append("⚠️ .collect() detected - avoid on large datasets, use .show() or .take() instead") 781 | 782 | if '.toPandas()' in code: 783 | warnings.append("⚠️ .toPandas() detected - ensure dataset fits in driver memory") 784 | 785 | if 'for row in df.collect()' in code: 786 | warnings.append("❌ Anti-pattern: iterating over collected DataFrame. 
Use DataFrame operations instead") 787 | 788 | # Check for caching opportunities 789 | df_count = code.count('df.') 790 | if df_count > 3 and '.cache()' not in code and '.persist()' not in code: 791 | suggestions.append("💡 Consider caching DataFrame with .cache() for repeated operations") 792 | 793 | # Check for schema definition 794 | if 'createDataFrame' in code and 'StructType' not in code: 795 | suggestions.append("💡 Consider defining explicit schema when creating DataFrames") 796 | 797 | # Check for null handling 798 | if '.filter(' in code and 'isNull' not in code and 'isNotNull' not in code: 799 | suggestions.append("💡 Consider adding null value handling in filters") 800 | 801 | # Check for partitioning 802 | if '.write.' in code and 'partitionBy' not in code: 803 | suggestions.append("💡 Consider partitioning data when writing large datasets") 804 | 805 | # Check for Delta Lake usage 806 | if '.write.' in code and 'format("delta")' not in code: 807 | suggestions.append("💡 Consider using Delta Lake format for ACID transactions and time travel") 808 | 809 | # Compile results 810 | result = "# PySpark Code Validation Report\n\n" 811 | result += "## Validation Results\n" 812 | result += "\n".join(validation_results) + "\n\n" 813 | 814 | if warnings: 815 | result += "## Warnings\n" 816 | result += "\n".join(warnings) + "\n\n" 817 | 818 | if suggestions: 819 | result += "## Optimization Suggestions\n" 820 | result += "\n".join(suggestions) + "\n\n" 821 | 822 | if not warnings and not suggestions: 823 | result += "## Summary\n✅ Code looks good! No issues detected.\n" 824 | else: 825 | result += f"## Summary\n📊 Found {len(warnings)} warnings and {len(suggestions)} optimization opportunities.\n" 826 | 827 | return result 828 | 829 | except Exception as e: 830 | logger.error(f"Error validating PySpark code: {str(e)}") 831 | return f"Error validating PySpark code: {str(e)}" 832 | 833 | @mcp.tool() 834 | async def update_notebook_cell( 835 | workspace: str, 836 | notebook_id: str, 837 | cell_index: int, 838 | cell_content: str, 839 | cell_type: str = "code", 840 | ctx: Context = None, 841 | ) -> str: 842 | """Update a specific cell in a notebook. 843 | 844 | Args: 845 | workspace: Name or ID of the workspace 846 | notebook_id: ID or name of the notebook 847 | cell_index: Index of the cell to update (0-based) 848 | cell_content: New content for the cell 849 | cell_type: Type of cell ('code' or 'markdown') 850 | ctx: Context object containing client information 851 | Returns: 852 | A string confirming the update or an error message. 853 | """ 854 | try: 855 | if ctx is None: 856 | raise ValueError("Context (ctx) must be provided.") 857 | 858 | # Get current notebook content 859 | current_content = await get_notebook_content(workspace, notebook_id, ctx) 860 | 861 | if current_content.startswith("Error"): 862 | return current_content 863 | 864 | # Parse the notebook JSON 865 | notebook_data = json.loads(current_content) 866 | cells = notebook_data.get("cells", []) 867 | 868 | if cell_index >= len(cells): 869 | return f"Cell index {cell_index} is out of range. Notebook has {len(cells)} cells." 
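        # The replacement cell built below follows the Jupyter nbformat v4 cell layout
        # (cell_type, source, metadata, plus execution_count/outputs for code cells);
        # this assumes get_notebook_content returns the notebook in that same format.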
870 | 
871 |         # Update the cell
872 |         cells[cell_index] = {
873 |             "cell_type": cell_type,
874 |             "source": cell_content.splitlines(keepends=True) if isinstance(cell_content, str) else cell_content,
875 |             "execution_count": None,
876 |             "outputs": [],
877 |             "metadata": {}
878 |         }
879 | 
880 |         # Serialize the updated notebook (kept for when NotebookClient supports updates)
881 |         updated_content = json.dumps(notebook_data, indent=2)
882 | 
883 |         notebook_client = NotebookClient(
884 |             FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
885 |         )
886 | 
887 |         # NotebookClient does not expose an update method yet, so the change is only
888 |         # prepared locally and is not written back to the Fabric notebook.
889 |         return f"Prepared update for cell {cell_index} with {cell_type} content ({len(cell_content)} characters). Note: the notebook was not modified because NotebookClient does not yet support cell updates."
890 | 
891 |     except Exception as e:
892 |         logger.error(f"Error updating notebook cell: {str(e)}")
893 |         return f"Error updating notebook cell: {str(e)}"
894 | 
895 | @mcp.tool()
896 | async def create_fabric_notebook(
897 |     workspace: str,
898 |     notebook_name: str,
899 |     template_type: str = "fabric_integration",
900 |     ctx: Context = None,
901 | ) -> str:
902 |     """Create a new notebook optimized for Microsoft Fabric using advanced templates.
903 | 
904 |     Args:
905 |         workspace: Name or ID of the workspace
906 |         notebook_name: Name of the new notebook
907 |         template_type: Type of Fabric template ('fabric_integration', 'streaming')
908 |         ctx: Context object containing client information
909 |     Returns:
910 |         A string containing the ID of the created notebook or an error message.
911 |     """
912 |     try:
913 |         if ctx is None:
914 |             raise ValueError("Context (ctx) must be provided.")
915 | 
916 |         from helpers.pyspark_helpers import create_notebook_from_template
917 | 
918 |         # Create notebook from advanced template
919 |         notebook_data = create_notebook_from_template(template_type)
920 |         notebook_content = json.dumps(notebook_data, indent=2)
921 | 
922 |         notebook_client = NotebookClient(
923 |             FabricApiClient(get_azure_credentials(ctx.client_id, __ctx_cache))
924 |         )
925 |         response = await notebook_client.create_notebook(
926 |             workspace, notebook_name, notebook_content
927 |         )
928 | 
929 |         if isinstance(response, dict) and response.get("id"):
930 |             return f"Created Fabric-optimized notebook '{notebook_name}' with ID: {response['id']} using {template_type} template"
931 |         else:
932 |             return f"Failed to create notebook: {response}"
933 | 
934 |     except Exception as e:
935 |         logger.error(f"Error creating Fabric notebook: {str(e)}")
936 |         return f"Error creating Fabric notebook: {str(e)}"
937 | 
938 | @mcp.tool()
939 | async def generate_fabric_code(
940 |     operation: str,
941 |     lakehouse_name: Optional[str] = None,
942 |     table_name: Optional[str] = None,
943 |     target_table: Optional[str] = None,
944 |     ctx: Context = None,
945 | ) -> str:
946 |     """Generate Fabric-specific PySpark code for lakehouse operations.
947 | 
948 |     Args:
949 |         operation: Type of operation ('read_lakehouse', 'write_lakehouse', 'merge_delta', 'performance_monitor')
950 |         lakehouse_name: Name of the lakehouse
951 |         table_name: Name of the source table
952 |         target_table: Name of the target table (for write/merge operations)
953 |         ctx: Context object containing client information
954 |     Returns:
955 |         A string containing the generated Fabric-specific PySpark code.
956 | """ 957 | try: 958 | from helpers.pyspark_helpers import PySparkCodeGenerator 959 | 960 | generator = PySparkCodeGenerator() 961 | 962 | if operation == "read_lakehouse": 963 | if not lakehouse_name or not table_name: 964 | return "Error: lakehouse_name and table_name are required for read_lakehouse operation" 965 | code = generator.generate_fabric_lakehouse_reader(lakehouse_name, table_name) 966 | 967 | elif operation == "write_lakehouse": 968 | if not table_name: 969 | return "Error: table_name is required for write_lakehouse operation" 970 | code = generator.generate_fabric_lakehouse_writer(table_name) 971 | 972 | elif operation == "merge_delta": 973 | if not target_table: 974 | return "Error: target_table is required for merge_delta operation" 975 | source_df = "new_df" # Default source DataFrame name 976 | join_condition = "target.id = source.id" # Default join condition 977 | code = generator.generate_delta_merge_operation(target_table, source_df, join_condition) 978 | 979 | elif operation == "performance_monitor": 980 | code = generator.generate_performance_monitoring() 981 | 982 | else: 983 | available_ops = ["read_lakehouse", "write_lakehouse", "merge_delta", "performance_monitor"] 984 | return f"Invalid operation. Available operations: {', '.join(available_ops)}" 985 | 986 | return f"""```python 987 | {code} 988 | ``` 989 | 990 | **Generated Fabric-specific PySpark code for '{operation}' operation** 991 | 992 | This code is optimized for Microsoft Fabric and includes: 993 | - Proper Delta Lake integration 994 | - Fabric lakehouse connectivity 995 | - Performance monitoring capabilities 996 | - Best practices for Fabric environment""" 997 | 998 | except Exception as e: 999 | logger.error(f"Error generating Fabric code: {str(e)}") 1000 | return f"Error generating Fabric code: {str(e)}" 1001 | 1002 | @mcp.tool() 1003 | async def validate_fabric_code( 1004 | code: str, 1005 | ctx: Context = None, 1006 | ) -> str: 1007 | """Validate PySpark code for Microsoft Fabric compatibility and performance. 1008 | 1009 | Args: 1010 | code: PySpark code to validate for Fabric compatibility 1011 | ctx: Context object containing client information 1012 | Returns: 1013 | A string containing detailed validation results and Fabric-specific recommendations. 
1014 | """ 1015 | try: 1016 | from helpers.pyspark_helpers import PySparkValidator 1017 | 1018 | validator = PySparkValidator() 1019 | 1020 | # Basic syntax validation 1021 | validation_results = [] 1022 | try: 1023 | compile(code, '', 'exec') 1024 | validation_results.append("✅ Syntax validation: PASSED") 1025 | except SyntaxError as e: 1026 | validation_results.append(f"❌ Syntax validation: FAILED - {e}") 1027 | return "\n".join(validation_results) 1028 | 1029 | # Fabric compatibility checks 1030 | fabric_results = validator.validate_fabric_compatibility(code) 1031 | 1032 | # Performance pattern checks 1033 | performance_results = validator.check_performance_patterns(code) 1034 | 1035 | # Additional Fabric-specific checks 1036 | fabric_warnings = [] 1037 | fabric_suggestions = [] 1038 | 1039 | # Check for Fabric best practices 1040 | if 'spark.table(' in code: 1041 | validation_results.append("✅ Using Fabric managed tables") 1042 | 1043 | if 'notebookutils' in code: 1044 | validation_results.append("✅ Using Fabric notebook utilities") 1045 | 1046 | if 'format("delta")' in code: 1047 | validation_results.append("✅ Using Delta Lake format") 1048 | 1049 | # Check for potential issues 1050 | if 'spark.sql("USE' in code: 1051 | fabric_warnings.append("⚠️ Explicit USE statements may not be necessary in Fabric") 1052 | 1053 | if 'hdfs://' in code or 's3://' in code: 1054 | fabric_warnings.append("⚠️ Direct file system paths detected - consider using Fabric's managed storage") 1055 | 1056 | # Compile comprehensive report 1057 | result = "# Microsoft Fabric PySpark Code Validation Report\n\n" 1058 | 1059 | result += "## Basic Validation\n" 1060 | result += "\n".join(validation_results) + "\n\n" 1061 | 1062 | if fabric_results["issues"]: 1063 | result += "## Fabric Compatibility Issues\n" 1064 | result += "\n".join(fabric_results["issues"]) + "\n\n" 1065 | 1066 | all_warnings = fabric_warnings + performance_results["warnings"] 1067 | if all_warnings: 1068 | result += "## Warnings\n" 1069 | result += "\n".join(all_warnings) + "\n\n" 1070 | 1071 | all_suggestions = fabric_results["suggestions"] + fabric_suggestions + performance_results["optimizations"] 1072 | if all_suggestions: 1073 | result += "## Fabric Optimization Suggestions\n" 1074 | result += "\n".join(all_suggestions) + "\n\n" 1075 | 1076 | # Summary 1077 | total_issues = len(fabric_results["issues"]) 1078 | total_warnings = len(all_warnings) 1079 | total_suggestions = len(all_suggestions) 1080 | 1081 | result += "## Summary\n" 1082 | if total_issues == 0 and total_warnings == 0: 1083 | result += "✅ Code is Fabric-ready! 
No critical issues detected.\n" 1084 | else: 1085 | result += f"📊 Found {total_issues} critical issues, {total_warnings} warnings, and {total_suggestions} optimization opportunities.\n" 1086 | 1087 | result += "\n### Fabric-Specific Recommendations:\n" 1088 | result += "- Use `spark.table()` for managed tables in lakehouses\n" 1089 | result += "- Leverage `notebookutils` for Fabric integration\n" 1090 | result += "- Always use Delta Lake format for optimal performance\n" 1091 | result += "- Consider partitioning strategies for large datasets\n" 1092 | result += "- Use broadcast joins for dimension tables < 200MB\n" 1093 | 1094 | return result 1095 | 1096 | except Exception as e: 1097 | logger.error(f"Error validating Fabric code: {str(e)}") 1098 | return f"Error validating Fabric code: {str(e)}" 1099 | 1100 | @mcp.tool() 1101 | async def analyze_notebook_performance( 1102 | workspace: str, 1103 | notebook_id: str, 1104 | ctx: Context = None, 1105 | ) -> str: 1106 | """Analyze a notebook's code for performance optimization opportunities in Fabric. 1107 | 1108 | Args: 1109 | workspace: Name or ID of the workspace 1110 | notebook_id: ID or name of the notebook 1111 | ctx: Context object containing client information 1112 | Returns: 1113 | A string containing performance analysis and optimization recommendations. 1114 | """ 1115 | try: 1116 | if ctx is None: 1117 | raise ValueError("Context (ctx) must be provided.") 1118 | 1119 | # Get notebook content 1120 | notebook_content = await get_notebook_content(workspace, notebook_id, ctx) 1121 | 1122 | if notebook_content.startswith("Error"): 1123 | return notebook_content 1124 | 1125 | # Parse notebook and extract code cells 1126 | notebook_data = json.loads(notebook_content) 1127 | cells = notebook_data.get("cells", []) 1128 | 1129 | code_cells = [cell for cell in cells if cell.get("cell_type") == "code"] 1130 | 1131 | if not code_cells: 1132 | return "No code cells found in the notebook." 
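        # The per-cell analysis below relies on simple substring counts as heuristics;
        # it approximates reads, writes, transformations, and actions rather than parsing
        # the code, so the resulting performance score is a rough indicator only.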
1133 | 1134 | # Analyze each code cell 1135 | analysis_results = [] 1136 | total_operations = 0 1137 | performance_issues = [] 1138 | optimization_opportunities = [] 1139 | 1140 | from helpers.pyspark_helpers import PySparkValidator 1141 | validator = PySparkValidator() 1142 | 1143 | for i, cell in enumerate(code_cells): 1144 | cell_source = "\n".join(cell.get("source", [])) 1145 | 1146 | if not cell_source.strip(): 1147 | continue 1148 | 1149 | analysis_results.append(f"### Cell {i + 1}") 1150 | 1151 | # Count operations 1152 | operations = [ 1153 | ("DataFrame reads", cell_source.count("spark.read") + cell_source.count("spark.table")), 1154 | ("DataFrame writes", cell_source.count(".write.")), 1155 | ("Transformations", cell_source.count(".withColumn") + cell_source.count(".select") + cell_source.count(".filter")), 1156 | ("Actions", cell_source.count(".show()") + cell_source.count(".count()") + cell_source.count(".collect()")) 1157 | ] 1158 | 1159 | for op_name, count in operations: 1160 | if count > 0: 1161 | analysis_results.append(f"- {op_name}: {count}") 1162 | total_operations += count 1163 | 1164 | # Check for performance patterns 1165 | perf_results = validator.check_performance_patterns(cell_source) 1166 | performance_issues.extend(perf_results["warnings"]) 1167 | optimization_opportunities.extend(perf_results["optimizations"]) 1168 | 1169 | # Fabric-specific analysis 1170 | fabric_results = validator.validate_fabric_compatibility(cell_source) 1171 | optimization_opportunities.extend(fabric_results["suggestions"]) 1172 | 1173 | # Generate comprehensive report 1174 | report = f"# Notebook Performance Analysis Report\n\n" 1175 | report += f"**Notebook:** {notebook_id}\n" 1176 | report += f"**Total Code Cells:** {len(code_cells)}\n" 1177 | report += f"**Total Operations:** {total_operations}\n\n" 1178 | 1179 | if analysis_results: 1180 | report += "## Cell-by-Cell Analysis\n" 1181 | report += "\n".join(analysis_results) + "\n\n" 1182 | 1183 | if performance_issues: 1184 | report += "## Performance Issues Found\n" 1185 | for issue in set(performance_issues): # Remove duplicates 1186 | report += f"- {issue}\n" 1187 | report += "\n" 1188 | 1189 | if optimization_opportunities: 1190 | report += "## Optimization Opportunities\n" 1191 | for opportunity in set(optimization_opportunities): # Remove duplicates 1192 | report += f"- {opportunity}\n" 1193 | report += "\n" 1194 | 1195 | # Performance score calculation 1196 | score = 100 1197 | score -= len(set(performance_issues)) * 10 # -10 points per unique issue 1198 | score -= len(set(optimization_opportunities)) * 5 # -5 points per optimization opportunity 1199 | score = max(score, 0) # Ensure score doesn't go negative 1200 | 1201 | report += f"## Performance Score: {score}/100\n\n" 1202 | 1203 | if score >= 80: 1204 | report += "✅ **Excellent** - Your notebook is well-optimized for Fabric!\n" 1205 | elif score >= 60: 1206 | report += "⚠️ **Good** - Some optimization opportunities exist.\n" 1207 | elif score >= 40: 1208 | report += "🔧 **Needs Improvement** - Several performance issues should be addressed.\n" 1209 | else: 1210 | report += "❌ **Poor** - Significant performance optimization required.\n" 1211 | 1212 | return report 1213 | 1214 | except Exception as e: 1215 | logger.error(f"Error analyzing notebook performance: {str(e)}") 1216 | return f"Error analyzing notebook performance: {str(e)}" 1217 | --------------------------------------------------------------------------------