├── puppygraph ├── data │ ├── __init__.py │ ├── host │ │ ├── __init__.py │ │ └── host_config.py │ ├── mapping │ │ ├── __init__.py │ │ ├── database_params.py │ │ ├── graph_element_cache_config.py │ │ ├── catalog_config.py │ │ ├── datalake_params.py │ │ └── graph_mapping_config.py │ └── schema │ │ ├── __init__.py │ │ └── graph_schema.py ├── client │ ├── __init__.py │ └── client.py ├── common │ ├── __init__.py │ ├── dataclass_utils.py │ ├── conversion_utils.py │ └── test_conversion_utils.py ├── rag │ ├── __init__.py │ └── graph_agent.py └── __init__.py ├── apps ├── chatbot │ ├── .gitignore │ ├── requirements.txt │ ├── .env.example │ ├── run.sh │ ├── CLAUDE.md │ ├── test_query_limits.py │ ├── test_integration.py │ ├── README.md │ ├── mcp_server.py │ ├── rag_system.py │ ├── backend.py │ └── gradio_app.py ├── databricks_mining_site │ ├── README.md │ ├── run_agent.py │ └── set_graph_schema.py └── imdb │ └── run_agent.py ├── pyproject.toml ├── .gitignore ├── README.md └── LICENSE /puppygraph/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /puppygraph/client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /puppygraph/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /puppygraph/data/host/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /puppygraph/data/mapping/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/puppygraph/data/schema/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /puppygraph/rag/__init__.py: -------------------------------------------------------------------------------- 1 | from puppygraph.rag.graph_agent import PuppyGraphAgent 2 | 3 | __all__ = ["PuppyGraphAgent"] 4 | -------------------------------------------------------------------------------- /apps/chatbot/.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual environment 2 | venv/ 3 | 4 | # Environment variables 5 | .env 6 | 7 | # Python cache 8 | __pycache__/ -------------------------------------------------------------------------------- /puppygraph/__init__.py: -------------------------------------------------------------------------------- 1 | from puppygraph.client.client import PuppyGraphClient 2 | from puppygraph.data.host.host_config import PuppyGraphHostConfig 3 | from puppygraph.data.schema.graph_schema import PuppyGraphSchema 4 | 5 | __all__ = ["PuppyGraphClient", "PuppyGraphHostConfig", "PuppyGraphSchema"] 6 | -------------------------------------------------------------------------------- /apps/chatbot/requirements.txt: -------------------------------------------------------------------------------- 1 | gradio==5.45.0 2 | mcp==1.14.0 3 | neo4j==5.28.2 4 | requests==2.32.5 5 | sentence-transformers==5.1.0 6 | chromadb==1.0.21 7 | langchain==0.3.27 8 | langchain-community==0.3.29 9 | anthropic==0.67.0 10 | pydantic==2.11.7 11 | uvicorn==0.35.0 12 | fastapi==0.116.1 13 | python-dotenv==1.1.1 -------------------------------------------------------------------------------- /apps/chatbot/.env.example: -------------------------------------------------------------------------------- 1 | # Anthropic API Key for text-to-cypher generation 2 | ANTHROPIC_API_KEY=your_anthropic_api_key_here 3 | 4 | # PuppyGraph 
Configuration 5 | PUPPYGRAPH_BOLT_URI=bolt://localhost:7687 6 | PUPPYGRAPH_HTTP_URI=http://localhost:8081 7 | PUPPYGRAPH_USERNAME=puppygraph 8 | PUPPYGRAPH_PASSWORD=puppygraph123 9 | 10 | # Optional: Gradio Configuration 11 | GRADIO_SERVER_PORT=7860 12 | GRADIO_SERVER_NAME=0.0.0.0 -------------------------------------------------------------------------------- /puppygraph/data/mapping/database_params.py: -------------------------------------------------------------------------------- 1 | """Defines Database parameters for a catalog.""" 2 | 3 | from dataclasses import dataclass 4 | from typing import List, Optional, Union 5 | 6 | 7 | @dataclass(frozen=True) 8 | class JDBCParam: 9 | """JDBC connection parameters for a database.""" 10 | 11 | username: str 12 | password: str 13 | jdbc_uri: str 14 | driver_class: str 15 | driver_url: str 16 | 17 | 18 | @dataclass(frozen=True) 19 | class ElasticSearchParam: 20 | """ElasticSearch connection parameters for a database.""" 21 | 22 | hosts: List[str] 23 | username: Optional[str] = None 24 | password: Optional[str] = None 25 | 26 | 27 | DatabaseParams = Union[JDBCParam, ElasticSearchParam] 28 | -------------------------------------------------------------------------------- /apps/databricks_mining_site/README.md: -------------------------------------------------------------------------------- 1 | # PuppyGraphAgent for Mining Site (Databricks Blog) 2 | 3 | ## Overview 4 | This project demonstrates the use of PuppyGraphAgent for processing and analyzing data in a mining site environment using Databricks. 5 | 6 | ## Tables 7 | 8 | - **Work Orders**: `pg_databricks.gold.work_orders` 9 | - **Troubleshooting Guide**: `pg_databricks.silver.troubleshooting_guide` 10 | - **Assets**: `pg_databricks.silver.assets` 11 | - **Failure Type**: `pg_databricks.bronze.failure_type` 12 | 13 | ## How to Run 14 | 15 | 1. Set up the schema connection to PuppyGraph: 16 | ```bash 17 | python set_schema.py 18 | ``` 19 | 2. 
Run the graph agent: 20 | ```bash 21 | python run_agent.py 22 | ``` 23 | 24 | ## Requirements 25 | 26 | See [tool.poetry.group.app.dependencies] for the required dependencies. 27 | -------------------------------------------------------------------------------- /puppygraph/data/host/host_config.py: -------------------------------------------------------------------------------- 1 | """Config for PuppyGraph server.""" 2 | 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass(frozen=True) 7 | class PuppyGraphHostConfig: 8 | """PuppyGraph host configuration.""" 9 | 10 | # IP address of the PuppyGraph host 11 | ip: str 12 | 13 | # HTTP port of the PuppyGraph server 14 | http_port: int = 8081 15 | 16 | # Cypher query port of the PuppyGraph server 17 | cypher_port: int = 7687 18 | 19 | # Maximum connection lifetime for the Cypher query driver 20 | cypher_max_connection_lifetime: int = 30 21 | 22 | # Gremlin query port of the PuppyGraph server 23 | gremlin_port: int = 8182 24 | 25 | # Username to authenticate with the PuppyGraph server 26 | username: str = "puppygraph" 27 | 28 | # Password to authenticate with the PuppyGraph server 29 | password: str = "puppygraph123" 30 | -------------------------------------------------------------------------------- /puppygraph/data/mapping/graph_element_cache_config.py: -------------------------------------------------------------------------------- 1 | """PuppyGraph Element Cache config.""" 2 | 3 | from dataclasses import dataclass 4 | from enum import Enum 5 | from typing import Optional 6 | 7 | 8 | class TimeUnit(Enum): 9 | """Enum for time units in cache TTL.""" 10 | 11 | YEAR = 0 12 | MONTH = 1 13 | DAY = 2 14 | HOUR = 3 15 | MINUTE = 4 16 | PARTITION = 5 17 | 18 | 19 | @dataclass(frozen=True) 20 | class TimeToLive: 21 | """Time-to-live configuration for cache.""" 22 | 23 | amount: int 24 | unit: TimeUnit 25 | 26 | 27 | @dataclass(frozen=True) 28 | class RefreshStrategy: 29 | """Refresh strategy configuration for cache.""" 
30 | 31 | parallelism: int 32 | 33 | 34 | @dataclass(frozen=True) 35 | class GraphElementCacheConfig: 36 | """Cache configuration for elements.""" 37 | 38 | partition_key: str 39 | partition_ttl: Optional[TimeToLive] = None 40 | refresh_strategy: Optional[RefreshStrategy] = None 41 | -------------------------------------------------------------------------------- /puppygraph/common/dataclass_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for working with dataclasses.""" 2 | 3 | from dataclasses import asdict 4 | from enum import Enum 5 | 6 | 7 | def dataclass_to_camel_dict(instance) -> dict: 8 | """Convert a dataclass instance to a camelCase dictionary, omitting fields with None values.""" 9 | if not hasattr(instance, "__dataclass_fields__"): 10 | raise ValueError("The provided instance is not a dataclass.") 11 | 12 | def _snake_to_camel(snake_str: str) -> str: 13 | """Convert a snake_case string to camelCase.""" 14 | components = snake_str.split("_") 15 | return components[0] + "".join(x.title() for x in components[1:]) 16 | 17 | def _convert_value(value): 18 | """Convert Enum to its name, and keep other types as they are.""" 19 | if isinstance(value, Enum): 20 | return value.name 21 | return value 22 | 23 | return { 24 | _snake_to_camel(k): _convert_value(v) 25 | for k, v in asdict(instance).items() 26 | if v is not None 27 | } 28 | -------------------------------------------------------------------------------- /puppygraph/data/mapping/catalog_config.py: -------------------------------------------------------------------------------- 1 | """Catalog configuration dataclasses and enums. 2 | 3 | This module defines the access configurations for different types of data sources. 
4 | """ 5 | 6 | from dataclasses import dataclass 7 | from enum import Enum 8 | from typing import Union 9 | 10 | from puppygraph.data.mapping.database_params import DatabaseParams 11 | from puppygraph.data.mapping.datalake_params import DatalakeParams 12 | 13 | 14 | class CatalogType(Enum): 15 | """Defines the source type of a catalog.""" 16 | 17 | UNKNOWN = 0 18 | HIVE = 1 19 | ICEBERG = 2 20 | HUDI = 3 21 | DELTALAKE = 4 22 | MYSQL = 5 23 | POSTGRESQL = 6 24 | ELASTICSEARCH = 7 25 | REDSHIFT = 8 26 | DUCKDB = 9 27 | BIGQUERY = 10 28 | SNOWFLAKE = 11 29 | TRINO = 12 30 | VERTICA = 13 31 | SINGLESTORE = 14 32 | 33 | 34 | @dataclass(frozen=True) 35 | class CatalogConfig: 36 | """Defines the source of a catalog for PuppyGraph to construct the graph from.""" 37 | 38 | name: str 39 | type: CatalogType 40 | params: Union[DatalakeParams, DatabaseParams] 41 | -------------------------------------------------------------------------------- /puppygraph/data/schema/graph_schema.py: -------------------------------------------------------------------------------- 1 | """PuppyGraph schema definition.""" 2 | 3 | from dataclasses import dataclass, field 4 | from typing import List, Optional 5 | 6 | 7 | @dataclass(frozen=True) 8 | class AttributeSchema: 9 | """Attribute schema.""" 10 | 11 | type: str 12 | name: str 13 | description: Optional[str] = None 14 | 15 | 16 | @dataclass(frozen=True) 17 | class VertexSchema: 18 | """Vertex schema.""" 19 | 20 | label: str 21 | attributes: List[AttributeSchema] = field(default_factory=list) 22 | description: Optional[str] = None 23 | 24 | 25 | @dataclass(frozen=True) 26 | class EdgeSchema: 27 | """Edge schema.""" 28 | 29 | label: str 30 | from_vertex_label: str 31 | to_vertex_label: str 32 | attributes: List[AttributeSchema] = field(default_factory=list) 33 | description: Optional[str] = None 34 | 35 | 36 | @dataclass(frozen=True) 37 | class PuppyGraphSchema: 38 | """The PuppyGraph schema.""" 39 | 40 | vertex_schemas: List[VertexSchema] = 
field(default_factory=list) 41 | edge_schemas: List[EdgeSchema] = field(default_factory=list) 42 | description: Optional[str] = None 43 | -------------------------------------------------------------------------------- /puppygraph/data/mapping/datalake_params.py: -------------------------------------------------------------------------------- 1 | """Defines Datalake parameters for a catalog.""" 2 | 3 | from dataclasses import dataclass 4 | from typing import Optional, Union 5 | 6 | 7 | @dataclass(frozen=True) 8 | class UnityMetastoreParam: 9 | """Unity metastore parameters.""" 10 | 11 | token: Optional[str] = None 12 | host: Optional[str] = None 13 | unity_catalog_name: Optional[str] = None 14 | 15 | 16 | @dataclass(frozen=True) 17 | class S3StorageParam: 18 | """S3 storage parameters.""" 19 | 20 | use_instance_profile: Optional[str] = None 21 | region: Optional[str] = None 22 | access_key: Optional[str] = None 23 | secret_key: Optional[str] = None 24 | iam_role_arn: Optional[str] = None 25 | enable_ssl: Optional[str] = None 26 | endpoint: Optional[str] = None 27 | enable_path_style_access: Optional[str] = None 28 | 29 | 30 | MetastoreParam = Union[UnityMetastoreParam] 31 | StorageParam = Union[S3StorageParam] 32 | 33 | 34 | @dataclass(frozen=True) 35 | class DatalakeParams: 36 | """Datalake parameters for a catalog.""" 37 | 38 | metastore_param: Optional[MetastoreParam] = None 39 | storage_param: Optional[StorageParam] = None 40 | -------------------------------------------------------------------------------- /apps/chatbot/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # PuppyGraph RAG Chatbot Launcher Script 4 | 5 | set -e 6 | 7 | echo "🐶 PuppyGraph RAG Chatbot Demo" 8 | echo "================================" 9 | 10 | # Check if virtual environment exists 11 | if [ ! -d "venv" ]; then 12 | echo "📦 Creating virtual environment..." 
13 | python3 -m venv venv 14 | fi 15 | 16 | # Activate virtual environment 17 | echo "🔄 Activating virtual environment..." 18 | source venv/bin/activate 19 | 20 | # Install dependencies 21 | echo "📥 Installing dependencies..." 22 | pip install -r requirements.txt 23 | 24 | # Check if .env file exists 25 | if [ ! -f ".env" ]; then 26 | echo "⚠️ .env file not found. Copying from .env.example..." 27 | cp .env.example .env 28 | echo "🔧 Please edit .env with your configuration before running!" 29 | echo " Especially set your ANTHROPIC_API_KEY" 30 | fi 31 | 32 | # Run integration tests 33 | echo "🧪 Running integration tests..." 34 | python test_integration.py 35 | 36 | if [ $? -eq 0 ]; then 37 | echo "✅ Integration tests passed!" 38 | echo "" 39 | echo "🚀 Starting PuppyGraph RAG Chatbot..." 40 | echo " Access the UI at: http://localhost:7860" 41 | echo " Press Ctrl+C to stop" 42 | echo "" 43 | 44 | # Start the application 45 | python gradio_app.py 46 | else 47 | echo "❌ Integration tests failed. Please check the configuration." 48 | exit 1 49 | fi -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core>=1.0.0"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "puppygraph" 7 | version = "0.1.5" 8 | description = "The project contains the PuppyGraph client and PuppyGraph's Agentic Graph RAG libraries for Python. PuppyGraph client enables dynamic graph querying across multiple data sources with zero ETL, flexible schema management, and support for Cypher and Gremlin queries. PuppyGraph agentic graph RAG library makes graph-aware AI Agents." 
9 | authors = ["PuppyGraph "] 10 | license = "Apache-2.0" 11 | readme = "README.md" 12 | packages=[{include = "puppygraph"}] 13 | exclude = ["puppygraph/**/test*_.py"] 14 | 15 | 16 | [tool.poetry.dependencies] 17 | python = ">=3.9,<3.13" 18 | requests = "^2.28" 19 | gremlinpython = "^3.7.2" 20 | neo4j = "^5.7.0" 21 | dacite = "^1.7.0" 22 | async-timeout="^4.0.3" 23 | langchain-core = "^0.3.51" 24 | jinja2 = "^3.1.4" 25 | 26 | [tool.poetry.group.dev] 27 | optional = true 28 | 29 | [tool.poetry.group.dev.dependencies] 30 | pytest = "^7.4" 31 | pytest-cov = "^4.1" 32 | mypy = "^1.10" 33 | 34 | 35 | [tool.poetry.group.apps] 36 | optional = true 37 | 38 | [tool.poetry.group.apps.dependencies] 39 | langchain-openai = "^0.3.12" 40 | langchain-community = "^0.3.21" 41 | langchain-anthropic = "^0.2.4" 42 | gradio = "^4.42.0" 43 | 44 | 45 | [tool.mypy] 46 | ignore_missing_imports = true 47 | disallow_untyped_defs = true 48 | disallow_untyped_calls = true 49 | -------------------------------------------------------------------------------- /puppygraph/data/mapping/graph_mapping_config.py: -------------------------------------------------------------------------------- 1 | """PuppyGraph construction config.""" 2 | 3 | from dataclasses import dataclass, field 4 | from typing import List, Optional 5 | 6 | from puppygraph.data.mapping.catalog_config import CatalogConfig 7 | from puppygraph.data.mapping.graph_element_cache_config import ( 8 | GraphElementCacheConfig, 9 | ) 10 | 11 | 12 | @dataclass(frozen=True) 13 | class TableSource: 14 | """Table source.""" 15 | 16 | catalog_name: str 17 | schema_name: str 18 | table_name: str 19 | 20 | 21 | @dataclass(frozen=True) 22 | class MappedField: 23 | """MappedAttributes from the source table to the graph schema.""" 24 | 25 | name: str 26 | type: str 27 | from_field: str 28 | description: Optional[str] = None 29 | 30 | 31 | @dataclass(frozen=True) 32 | class GraphElementConfig: 33 | """Graph Element config. 
34 | 35 | A graph element is a vertex or an edge. 36 | """ 37 | 38 | table_source: TableSource 39 | 40 | label: str 41 | 42 | # List of attributes, each attribute is a field in the source table 43 | attributes: List[MappedField] 44 | 45 | # The ID can from single field or list of fields (composite key) 46 | id: List[MappedField] 47 | 48 | # Description of the element 49 | description: Optional[str] = None 50 | 51 | # Cache config 52 | cache_config: Optional[GraphElementCacheConfig] = None 53 | 54 | # ID of the from vertex, only applicable if the element is an edge 55 | from_id: Optional[List[MappedField]] = None 56 | 57 | # ID of the to vertex, only applicable if the element is an edge 58 | to_id: Optional[List[MappedField]] = None 59 | 60 | # Label of the from vertex, only applicable if the element is an edge 61 | from_label: Optional[str] = None 62 | 63 | # Label of the to vertex, only applicable if the element is an edge 64 | to_label: Optional[str] = None 65 | 66 | 67 | @dataclass(frozen=True) 68 | class PuppyGraphMappingConfig: 69 | """PuppyGraph Mapping config. 70 | 71 | This config defines how the graph is mapped from the source tables. 72 | """ 73 | 74 | # Catalogs to fetch tables from 75 | catalogs: List[CatalogConfig] = field(default_factory=list) 76 | 77 | vertices: List[GraphElementConfig] = field(default_factory=list) 78 | 79 | edges: List[GraphElementConfig] = field(default_factory=list) 80 | 81 | # description of the graph 82 | description: Optional[str] = None 83 | 84 | 85 | if __name__ == "__main__": 86 | pass 87 | # print(GraphElementConfig) 88 | # print(PuppyGraphConstruction 89 | -------------------------------------------------------------------------------- /apps/chatbot/CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
4 | 5 | ## Development Commands 6 | 7 | ### Quick Start 8 | ```bash 9 | # Run the complete application with setup 10 | ./run.sh 11 | 12 | # Or run individual components: 13 | python gradio_app.py # Start web UI at http://localhost:7860 14 | python mcp_server.py # Run MCP server standalone 15 | python test_integration.py # Run integration tests 16 | ``` 17 | 18 | ### Setup Commands 19 | ```bash 20 | # Install dependencies 21 | pip install -r requirements.txt 22 | 23 | # Create and activate virtual environment 24 | python3 -m venv venv 25 | source venv/bin/activate 26 | 27 | # Environment setup 28 | cp .env.example .env # Edit with your API keys 29 | ``` 30 | 31 | ### Testing 32 | ```bash 33 | # Integration tests (checks all components) 34 | python test_integration.py 35 | 36 | # Test individual components 37 | python -c "from backend import get_chatbot; chatbot = get_chatbot(); print(chatbot.get_graph_stats())" 38 | python -c "from rag_system import TextToCypherRAG; rag = TextToCypherRAG(); print('RAG system OK')" 39 | ``` 40 | 41 | ## Architecture Overview 42 | 43 | ### Core Components 44 | This is a RAG-powered chatbot that converts natural language queries to Cypher queries for PuppyGraph: 45 | 46 | 1. **gradio_app.py** - Main web interface (Gradio UI) 47 | 2. **backend.py** - Central coordinator (`PuppyGraphChatbot` class) 48 | 3. **rag_system.py** - Text-to-Cypher conversion using embeddings + Claude Sonnet 4.0 49 | 4. 
**mcp_server.py** - Model Context Protocol server for PuppyGraph operations 50 | 51 | ### Data Flow 52 | ``` 53 | User Question → Gradio UI → Backend → RAG System → Claude Sonnet 4.0 → Cypher Query → MCP Server → PuppyGraph → Results 54 | ``` 55 | 56 | ### Key Classes 57 | - `PuppyGraphChatbot` (backend.py) - Main orchestrator 58 | - `TextToCypherRAG` (rag_system.py) - Handles NL→Cypher conversion using ChromaDB + embeddings 59 | - `PuppyGraphMCPServer` (mcp_server.py) - MCP tools for schema, query execution, validation 60 | 61 | ## Configuration 62 | 63 | ### Required Environment Variables 64 | - `ANTHROPIC_API_KEY` - Required for Claude Sonnet 4.0 integration 65 | - `PUPPYGRAPH_BOLT_URI` - Default: `bolt://localhost:7687` 66 | - `PUPPYGRAPH_HTTP_URI` - Default: `http://localhost:8081` 67 | - `PUPPYGRAPH_USERNAME` - Default: `puppygraph` 68 | - `PUPPYGRAPH_PASSWORD` - Default: `puppygraph123` 69 | 70 | ### Dependencies 71 | - **Core**: gradio, anthropic, mcp, neo4j, requests 72 | - **RAG**: sentence-transformers, chromadb, langchain 73 | - **Server**: uvicorn, fastapi, python-dotenv 74 | 75 | ## Development Notes 76 | 77 | ### Multi-Round Query Execution 78 | The system automatically breaks complex questions into multiple Cypher queries. Each round uses results from previous queries to generate more specific follow-ups. 
79 | 80 | ### RAG System Details 81 | - Uses `all-MiniLM-L6-v2` for embeddings by default 82 | - ChromaDB stores question/Cypher examples 83 | - Claude Sonnet 4.0 generates queries using retrieved examples as context 84 | - Examples can be added via UI or programmatically 85 | 86 | ### MCP Integration 87 | The MCP server provides these tools: 88 | - `execute_cypher` - Run Cypher queries with result formatting 89 | - `get_schema_info` - Retrieve graph schema with optional node samples 90 | - `validate_cypher` - Validate query syntax before execution 91 | 92 | ### Error Handling 93 | - Connection failures to PuppyGraph are handled gracefully 94 | - RAG system falls back to basic query generation if examples are unavailable 95 | - All components have comprehensive logging -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
162 | .idea/ 163 | apps/private -------------------------------------------------------------------------------- /apps/chatbot/test_query_limits.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Test script to verify query limit functionality 4 | """ 5 | 6 | import logging 7 | import sys 8 | from rag_system import TextToCypherRAG, QueryStep 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger("test_query_limits") 12 | 13 | def test_query_limits(): 14 | """Test that the system properly handles query limits""" 15 | 16 | try: 17 | # Initialize RAG system 18 | rag_system = TextToCypherRAG() 19 | 20 | # Mock schema 21 | mock_schema = { 22 | "vertices": [{"label": "TestNode", "attributes": []}], 23 | "edges": [] 24 | } 25 | 26 | # Test with max_rounds = 3 (should force stop on 3rd round) 27 | max_rounds = 3 28 | question = "Test query with forced limits" 29 | previous_steps = [] 30 | 31 | # Simulate first round 32 | logger.info(f"=== Testing Round 1 (should continue) ===") 33 | cypher, explanation, should_stop, prompt, llm_response = rag_system.generate_next_query( 34 | question, mock_schema, previous_steps, max_rounds 35 | ) 36 | 37 | if should_stop: 38 | logger.error("❌ Round 1: System stopped unexpectedly") 39 | return False 40 | else: 41 | logger.info(f"✅ Round 1: Continuing as expected") 42 | logger.info(f"Generated query: {cypher}") 43 | 44 | # Add mock step for round 1 45 | step1 = QueryStep(1, "First query", cypher) 46 | step1.result = [{"test": "data1"}] 47 | previous_steps.append(step1) 48 | 49 | # Simulate second round 50 | logger.info(f"=== Testing Round 2 (should continue with warning) ===") 51 | cypher, explanation, should_stop, prompt, llm_response = rag_system.generate_next_query( 52 | question, mock_schema, previous_steps, max_rounds 53 | ) 54 | 55 | if should_stop: 56 | logger.error("❌ Round 2: System stopped unexpectedly") 57 | return False 58 | else: 
59 | logger.info(f"✅ Round 2: Continuing as expected") 60 | logger.info(f"Generated query: {cypher}") 61 | # Check if warning about final round is in prompt 62 | logger.info(f"Round 2 prompt content: {prompt}") 63 | if "only have" in prompt and "round left" in prompt: 64 | logger.info("✅ Round 2: Warning about approaching limit found in prompt") 65 | else: 66 | logger.warning("⚠️ Round 2: No warning about approaching limit found") 67 | 68 | # Add mock step for round 2 69 | step2 = QueryStep(2, "Second query", cypher) 70 | step2.result = [{"test": "data2"}] 71 | previous_steps.append(step2) 72 | 73 | # Simulate third round (should force stop) 74 | logger.info(f"=== Testing Round 3 (should force stop) ===") 75 | cypher, explanation, should_stop, prompt, llm_response = rag_system.generate_next_query( 76 | question, mock_schema, previous_steps, max_rounds 77 | ) 78 | 79 | logger.info(f"Round 3 prompt content: {prompt}") 80 | if "final round" in prompt and "must now STOP" in prompt: 81 | logger.info("✅ Round 3: Force stop message found in prompt") 82 | else: 83 | logger.warning("⚠️ Round 3: No force stop message found") 84 | 85 | if should_stop: 86 | logger.info(f"✅ Round 3: System stopped as expected") 87 | logger.info(f"Final answer: {explanation}") 88 | return True 89 | else: 90 | logger.error("❌ Round 3: System should have stopped but continued") 91 | return False 92 | 93 | except Exception as e: 94 | logger.error(f"❌ Test failed with exception: {e}") 95 | import traceback 96 | traceback.print_exc() 97 | return False 98 | 99 | if __name__ == "__main__": 100 | logger.info("🧪 Testing Query Limit functionality") 101 | logger.info("=" * 50) 102 | 103 | success = test_query_limits() 104 | 105 | logger.info("=" * 50) 106 | if success: 107 | logger.info("🎉 Query limit test PASSED!") 108 | sys.exit(0) 109 | else: 110 | logger.error("❌ Query limit test FAILED!") 111 | sys.exit(1) -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # PuppyGraph 2 | 3 | This repository contains the PuppyGraph client for Python as well 4 | as PuppyGraph's Agentic Graph RAG libraries. 5 | 6 | ## Key Features 7 | 8 | - **Zero ETL**: Query data directly from your lakes and databases without data duplication. 9 | - **Dynamic Schema Management**: Modify graph schemas on the fly, without needing to rebuild databases. 10 | - **Petabyte Scalability**: Auto-sharded, distributed computation for handling vast datasets. 11 | - **Support for Cypher and Gremlin**: Interoperable query support with robust performance. 12 | - **AI-Native**: Ideal for Graph-RAG applications, with ultra-fast response times. 13 | 14 | ## Installation 15 | 16 | You can install the latest version via pip: 17 | 18 | ```bash 19 | pip install puppygraph 20 | ``` 21 | 22 | ## Quick Example 23 | 24 | ### Setup the client 25 | 26 | ```python 27 | from puppygraph import PuppyGraphClient, PuppyGraphHostConfig 28 | client = PuppyGraphClient(PuppyGraphHostConfig("localhost")) 29 | ``` 30 | 31 | ### Query the graph 32 | ```python 33 | # Cypher Query 34 | client.cypher_query("MATCH (actor:Actor)-[:ACTED_IN]->(movie:Movie) WHERE actor.name = \"Tom Hanks\" 35 | "RETURN movie.title") 36 | 37 | # Gremlin Query 38 | client.gremlin_query("g.V().hasLabel('person').has('name', 'Tom Hanks').out('ACTED_IN').values('title')") 39 | ``` 40 | 41 | ### Set the schema 42 | ```python 43 | # A sample schema for the IMDb dataset 44 | client.set_schema( 45 | { 46 | "catalogs": [ 47 | { 48 | "name": "imdb_catalog", 49 | "type": "DELTALAKE", 50 | "params": { 51 | "metastore_param": { 52 | "token": "your_token", 53 | "host": "your_host", 54 | "unity_catalog_name": "imdb_catalog", 55 | }, 56 | "storage_param": { 57 | "use_instance_profile": "false", 58 | "region": "us-west-2", 59 | "access_key": "your_access_key", 60 | "secret_key": "your_secret_key", 61 | "enable_ssl": "true", 62 | "type": "S3", 63 | 
}, 64 | }, 65 | } 66 | ], 67 | "vertices": [ 68 | { 69 | "table_source": { 70 | "catalog_name": "imdb_catalog", 71 | "schema_name": "public", 72 | "table_name": "movies", 73 | }, 74 | "label": "Movie", 75 | "description": "A movie in the IMDb database", 76 | "attributes": [ 77 | { 78 | "name": "title", 79 | "from_field": "title", 80 | "type": "String", 81 | "description": "The title of the movie", 82 | }, 83 | { 84 | "name": "release_year", 85 | "from_field": "release_year", 86 | "type": "Integer", 87 | "description": "The year the movie was released", 88 | }, 89 | ], 90 | "id": [ 91 | {"name": "movie_id", "from_field": "movie_id", "type": "String"} 92 | ], 93 | }, 94 | { 95 | "table_source": { 96 | "catalog_name": "imdb_catalog", 97 | "schema_name": "public", 98 | "table_name": "actors", 99 | }, 100 | "label": "Actor", 101 | "description": "An actor who starred in movies", 102 | "attributes": [ 103 | { 104 | "name": "name", 105 | "from_field": "name", 106 | "type": "String", 107 | "description": "The name of the actor", 108 | } 109 | ], 110 | "id": [ 111 | {"name": "actor_id", "from_field": "actor_id", "type": "String"} 112 | ], 113 | }, 114 | ], 115 | "edges": [ 116 | { 117 | "table_source": { 118 | "catalog_name": "imdb_catalog", 119 | "schema_name": "public", 120 | "table_name": "acted_in", 121 | }, 122 | "label": "ACTED_IN", 123 | "from_label": "Actor", 124 | "to_label": "Movie", 125 | "description": "An actor acted in a movie", 126 | "attributes": [], 127 | "id": [ 128 | { 129 | "name": "acted_in_id", 130 | "from_field": "acted_in_id", 131 | "type": "String", 132 | } 133 | ], 134 | "from_id": [ 135 | {"name": "actor_id", "from_field": "actor_id", "type": "String"} 136 | ], 137 | "to_id": [ 138 | {"name": "movie_id", "from_field": "movie_id", "type": "String"} 139 | ], 140 | } 141 | ], 142 | } 143 | ) 144 | 145 | ``` 146 | 147 | ## About PuppyGraph 148 | 149 | [PuppyGraph](https://www.puppygraph.com) 150 | is a zero-ETL graph analytics engine enabling seamless 
graph querying across one or multiple data sources. 151 | Unlike traditional graph databases, PuppyGraph connects directly to your data warehouses and lakes without requiring complex ETL pipelines, making it both cost-efficient and scalable. 152 | -------------------------------------------------------------------------------- /apps/chatbot/test_integration.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import sys 5 | import time 6 | import logging 7 | from typing import Dict, Any 8 | 9 | # Setup logging 10 | logging.basicConfig(level=logging.INFO) 11 | logger = logging.getLogger("integration_test") 12 | 13 | def test_imports(): 14 | """Test that all required modules can be imported""" 15 | logger.info("Testing imports...") 16 | 17 | try: 18 | import gradio as gr 19 | logger.info("✅ Gradio imported successfully") 20 | except ImportError as e: 21 | logger.error(f"❌ Failed to import Gradio: {e}") 22 | return False 23 | 24 | try: 25 | from backend import PuppyGraphChatbot 26 | logger.info("✅ Backend imported successfully") 27 | except ImportError as e: 28 | logger.error(f"❌ Failed to import backend: {e}") 29 | return False 30 | 31 | try: 32 | from rag_system import TextToCypherRAG 33 | logger.info("✅ RAG system imported successfully") 34 | except ImportError as e: 35 | logger.error(f"❌ Failed to import RAG system: {e}") 36 | return False 37 | 38 | try: 39 | import mcp_server 40 | logger.info("✅ MCP server imported successfully") 41 | except ImportError as e: 42 | logger.error(f"❌ Failed to import MCP server: {e}") 43 | return False 44 | 45 | return True 46 | 47 | def test_rag_system(): 48 | """Test RAG system functionality""" 49 | logger.info("Testing RAG system...") 50 | 51 | try: 52 | from rag_system import TextToCypherRAG, QueryExample 53 | 54 | # Initialize RAG system (will use lightweight model for testing) 55 | rag = TextToCypherRAG() 56 | logger.info("✅ RAG system 
initialized") 57 | 58 | # Test adding an example 59 | example = QueryExample( 60 | question="Count all nodes", 61 | cypher="MATCH (n) RETURN count(n)", 62 | description="Counts all nodes in the graph" 63 | ) 64 | rag.add_example(example) 65 | logger.info("✅ Example added to RAG system") 66 | 67 | # Test finding similar examples 68 | similar = rag.find_similar_examples("How many nodes are there?") 69 | if similar: 70 | logger.info(f"✅ Found {len(similar)} similar examples") 71 | return True 72 | else: 73 | logger.warning("⚠️ No similar examples found (may be expected)") 74 | return True 75 | 76 | except Exception as e: 77 | logger.error(f"❌ RAG system test failed: {e}") 78 | return False 79 | 80 | def test_backend_basic(): 81 | """Test basic backend functionality""" 82 | logger.info("Testing backend basic functionality...") 83 | 84 | try: 85 | from backend import PuppyGraphChatbot 86 | 87 | # Initialize chatbot (this will fail if PuppyGraph is not running) 88 | try: 89 | chatbot = PuppyGraphChatbot() 90 | logger.info("✅ Backend initialized successfully") 91 | 92 | # Test schema retrieval 93 | schema = chatbot.get_schema() 94 | logger.info(f"✅ Schema retrieved: {len(schema.get('vertices', []))} vertices, {len(schema.get('edges', []))} edges") 95 | 96 | # Test stats (this might fail if no connection to PuppyGraph) 97 | stats = chatbot.get_graph_stats() 98 | if "error" in stats: 99 | logger.warning(f"⚠️ Graph stats returned error (PuppyGraph may not be running): {stats['error']}") 100 | else: 101 | logger.info(f"✅ Graph stats: {stats.get('node_count', 'unknown')} nodes, {stats.get('edge_count', 'unknown')} edges") 102 | 103 | return True 104 | 105 | except Exception as e: 106 | logger.warning(f"⚠️ Backend connection failed (PuppyGraph may not be running): {e}") 107 | return True # This is expected if PuppyGraph is not running 108 | 109 | except Exception as e: 110 | logger.error(f"❌ Backend test failed: {e}") 111 | return False 112 | 113 | def test_gradio_interface(): 
114 | """Test Gradio interface creation""" 115 | logger.info("Testing Gradio interface...") 116 | 117 | try: 118 | from gradio_app import create_interface 119 | 120 | # Create interface (don't launch) 121 | interface = create_interface() 122 | logger.info("✅ Gradio interface created successfully") 123 | return True 124 | 125 | except Exception as e: 126 | logger.error(f"❌ Gradio interface test failed: {e}") 127 | return False 128 | 129 | def test_environment_setup(): 130 | """Test environment and configuration""" 131 | logger.info("Testing environment setup...") 132 | 133 | # Check for .env file 134 | if os.path.exists('.env'): 135 | logger.info("✅ .env file found") 136 | else: 137 | logger.warning("⚠️ .env file not found (using .env.example)") 138 | 139 | # Check for Anthropic API key 140 | anthropic_key = os.getenv('ANTHROPIC_API_KEY') 141 | if anthropic_key: 142 | logger.info("✅ Anthropic API key configured") 143 | else: 144 | logger.warning("⚠️ Anthropic API key not found - RAG functionality may be limited") 145 | 146 | return True 147 | 148 | def run_all_tests(): 149 | """Run all integration tests""" 150 | logger.info("Starting PuppyGraph RAG Chatbot Integration Tests") 151 | logger.info("=" * 60) 152 | 153 | tests = [ 154 | ("Environment Setup", test_environment_setup), 155 | ("Module Imports", test_imports), 156 | ("RAG System", test_rag_system), 157 | ("Backend Basic", test_backend_basic), 158 | ("Gradio Interface", test_gradio_interface), 159 | ] 160 | 161 | results = {} 162 | 163 | for test_name, test_func in tests: 164 | logger.info(f"\n🧪 Running {test_name} test...") 165 | try: 166 | results[test_name] = test_func() 167 | except Exception as e: 168 | logger.error(f"❌ {test_name} test crashed: {e}") 169 | results[test_name] = False 170 | 171 | # Summary 172 | logger.info("\n" + "=" * 60) 173 | logger.info("TEST RESULTS SUMMARY") 174 | logger.info("=" * 60) 175 | 176 | passed = 0 177 | total = len(results) 178 | 179 | for test_name, result in 
results.items(): 180 | status = "✅ PASS" if result else "❌ FAIL" 181 | logger.info(f"{test_name:.<40} {status}") 182 | if result: 183 | passed += 1 184 | 185 | logger.info(f"\nOverall: {passed}/{total} tests passed") 186 | 187 | if passed == total: 188 | logger.info("🎉 All tests passed! The system is ready to use.") 189 | return True 190 | else: 191 | logger.warning("⚠️ Some tests failed. Check the logs above for details.") 192 | return False 193 | 194 | def main(): 195 | """Main test runner""" 196 | success = run_all_tests() 197 | sys.exit(0 if success else 1) 198 | 199 | if __name__ == "__main__": 200 | main() -------------------------------------------------------------------------------- /puppygraph/common/conversion_utils.py: -------------------------------------------------------------------------------- 1 | """Utility functions for conversion.""" 2 | 3 | from typing import Any, Dict, List, Union 4 | 5 | import dacite 6 | from puppygraph.common.dataclass_utils import dataclass_to_camel_dict 7 | from puppygraph.data.mapping.catalog_config import CatalogConfig, CatalogType 8 | from puppygraph.data.mapping.database_params import ( 9 | DatabaseParams, 10 | ElasticSearchParam, 11 | JDBCParam, 12 | ) 13 | from puppygraph.data.mapping.datalake_params import ( 14 | DatalakeParams, 15 | MetastoreParam, 16 | S3StorageParam, 17 | UnityMetastoreParam, 18 | ) 19 | from puppygraph.data.mapping.graph_element_cache_config import ( 20 | GraphElementCacheConfig, 21 | ) 22 | from puppygraph.data.mapping.graph_mapping_config import ( 23 | GraphElementConfig, 24 | MappedField, 25 | PuppyGraphMappingConfig, 26 | TableSource, 27 | ) 28 | 29 | 30 | def convert_mapping_config_to_host_json( 31 | config: Union[PuppyGraphMappingConfig, Dict], 32 | ) -> Dict[str, Any]: 33 | """Converts the PuppyGraph Mapping config to the host schema JSON. 34 | 35 | Args: 36 | config: The PuppyGraph Mapping config. 37 | 38 | Returns: 39 | The host schema JSON. 
40 | """ 41 | 42 | def _metastore_param_to_json(metastore_param: MetastoreParam) -> Dict[str, Any]: 43 | json = {} 44 | if isinstance(metastore_param, UnityMetastoreParam): 45 | json = dataclass_to_camel_dict(metastore_param) 46 | json["type"] = "unity" 47 | # Rename unityCatalogName to databricksCatalogName 48 | if "unityCatalogName" in json: 49 | json["databricksCatalogName"] = json.pop("unityCatalogName") 50 | return json 51 | 52 | def _storage_param_to_json(storage_param: S3StorageParam) -> Dict[str, Any]: 53 | json = {} 54 | if isinstance(storage_param, S3StorageParam): 55 | json = dataclass_to_camel_dict(storage_param) 56 | json["type"] = "S3" 57 | return json 58 | 59 | def _datalake_params_to_json(datalake_params: DatalakeParams) -> Dict[str, Any]: 60 | json = {} 61 | if datalake_params.metastore_param is not None: 62 | json["metastore"] = _metastore_param_to_json( 63 | datalake_params.metastore_param 64 | ) 65 | if datalake_params.storage_param is not None: 66 | json["storage"] = _storage_param_to_json(datalake_params.storage_param) 67 | return json 68 | 69 | def _database_params_to_json(database_params: DatabaseParams) -> Dict[str, Any]: 70 | key = "" 71 | if isinstance(database_params, JDBCParam): 72 | key = "jdbc" 73 | elif isinstance(database_params, ElasticSearchParam): 74 | key = "elasticSearch" 75 | return {key: dataclass_to_camel_dict(database_params)} 76 | 77 | def _catalog_config_to_json(catalog_config: CatalogConfig) -> Dict[str, Any]: 78 | """Maps the catalog config to the host catalog JSON. 79 | 80 | Args: 81 | catalog_config: The catalog configuration to map. 82 | 83 | Returns: 84 | The host catalog JSON. 
85 | """ 86 | catalog_json = { 87 | "name": catalog_config.name, 88 | "type": catalog_config.type.name.lower(), 89 | } 90 | 91 | if isinstance(catalog_config.params, DatalakeParams): 92 | catalog_json.update(_datalake_params_to_json(catalog_config.params)) 93 | elif isinstance(catalog_config.params, DatabaseParams): 94 | catalog_json.update(_database_params_to_json(catalog_config.params)) 95 | 96 | return catalog_json 97 | 98 | def _table_source_to_json(table_source: TableSource) -> Dict[str, Any]: 99 | return { 100 | "catalog": table_source.catalog_name, 101 | "schema": table_source.schema_name, 102 | "table": table_source.table_name, 103 | } 104 | 105 | def _mapped_id_to_json(mapped_id: List[MappedField]) -> Dict[str, Any]: 106 | return { 107 | "fields": [ 108 | { 109 | "field": mapped_field.from_field, 110 | "alias": mapped_field.name, 111 | "type": mapped_field.type, 112 | } 113 | for mapped_field in mapped_id 114 | ] 115 | } 116 | 117 | def _attribute_to_json(mapped_attribute: MappedField) -> Dict[str, Any]: 118 | return { 119 | "alias": mapped_attribute.name, 120 | "field": mapped_attribute.from_field, 121 | "type": mapped_attribute.type, 122 | } 123 | 124 | def _cache_config_to_json(cache_config: GraphElementCacheConfig) -> Dict[str, Any]: 125 | return dataclass_to_camel_dict(cache_config) 126 | 127 | def _vertex_config_to_json(vertex_config: GraphElementConfig) -> Dict[str, Any]: 128 | json = { 129 | "label": vertex_config.label, 130 | "oneToOne": { 131 | "tableSource": _table_source_to_json( 132 | table_source=vertex_config.table_source 133 | ), 134 | "id": _mapped_id_to_json(mapped_id=vertex_config.id), 135 | "attributes": [ 136 | _attribute_to_json( 137 | mapped_attribute=attribute, 138 | ) 139 | for attribute in vertex_config.attributes 140 | ], 141 | }, 142 | } 143 | if vertex_config.cache_config is not None: 144 | json["cache"] = _cache_config_to_json(vertex_config.cache_config) 145 | return json 146 | 147 | def _edge_config_to_json( 148 | 
edge_config: GraphElementConfig, 149 | ) -> Dict[str, Any]: 150 | 151 | json = { 152 | "label": edge_config.label, 153 | "fromVertex": edge_config.from_label, 154 | "toVertex": edge_config.to_label, 155 | "tableSource": _table_source_to_json(table_source=edge_config.table_source), 156 | "id": _mapped_id_to_json(mapped_id=edge_config.id), 157 | "fromId": _mapped_id_to_json(mapped_id=edge_config.from_id), 158 | "toId": _mapped_id_to_json(mapped_id=edge_config.to_id), 159 | "attributes": [ 160 | _attribute_to_json( 161 | mapped_attribute=attribute, 162 | ) 163 | for attribute in edge_config.attributes 164 | ], 165 | } 166 | 167 | if edge_config.cache_config is not None: 168 | json["cache"] = _cache_config_to_json(edge_config.cache_config) 169 | return json 170 | 171 | if isinstance(config, dict): 172 | config = dacite.from_dict( 173 | data_class=PuppyGraphMappingConfig, 174 | data=config, 175 | config=dacite.Config( 176 | type_hooks={ 177 | CatalogType: lambda x: CatalogType[x.upper()], 178 | } 179 | ), 180 | ) 181 | 182 | return { 183 | "catalogs": [ 184 | _catalog_config_to_json(catalog_config) 185 | for catalog_config in config.catalogs 186 | ], 187 | "graph": { 188 | "vertices": [ 189 | _vertex_config_to_json(vertex_config=vertex_config) 190 | for vertex_config in config.vertices 191 | ], 192 | "edges": [ 193 | _edge_config_to_json(edge_config=edge_config) 194 | for edge_config in config.edges 195 | ], 196 | }, 197 | } 198 | -------------------------------------------------------------------------------- /puppygraph/rag/graph_agent.py: -------------------------------------------------------------------------------- 1 | """PuppyGraphAgent.""" 2 | 3 | import logging 4 | from copy import deepcopy 5 | from typing import Iterable, List, Optional 6 | 7 | from langchain_core.language_models.chat_models import BaseChatModel 8 | from langchain_core.messages import ( 9 | AIMessage, 10 | BaseMessage, 11 | HumanMessage, 12 | ToolCall, 13 | ToolMessage, 14 | ) 15 | from 
langchain_core.prompts.chat import ChatPromptTemplate 16 | from langchain_core.pydantic_v1 import Field, create_model 17 | from langchain_core.runnables import RunnableSequence 18 | from langchain_core.tools import StructuredTool, Tool 19 | 20 | from puppygraph.client.client import PuppyGraphClient 21 | 22 | logger = logging.getLogger(__name__) 23 | logger.addHandler( 24 | logging.NullHandler() 25 | ) # Prevent "No handlers could be found" warnings 26 | 27 | 28 | class PuppyGraphAgent: 29 | """PuppyGraphAgent is the agent that interacts with the PuppyGraph via natural language. 30 | 31 | It enables the user to interact with the graph via natural language queries. 32 | """ 33 | 34 | def __init__( 35 | self, 36 | puppy_graph_client: PuppyGraphClient, 37 | llm: BaseChatModel, 38 | chat_prompt_template: ChatPromptTemplate, 39 | query_language: str = "cypher", 40 | additional_tools: Optional[List[Tool]] = None, 41 | ): 42 | """Initializes the PuppyGraphAgent. 43 | 44 | Args: 45 | puppy_graph_client: The PuppyGraph client 46 | llm: The language model 47 | chat_prompt_template: The chat prompt template 48 | query_language: The query language to use, either "cypher" or "gremlin" or "both" 49 | additional_tools: Additional tools for the agent to use 50 | """ 51 | 52 | self._puppy_graph_client = puppy_graph_client 53 | 54 | # Set up tools 55 | self._cypher_query_tool = _get_cypher_query_tool( 56 | puppy_graph_client=puppy_graph_client 57 | ) 58 | self._gremlin_query_tool = _get_gremlin_query_tool( 59 | puppy_graph_client=puppy_graph_client 60 | ) 61 | if query_language == "cypher": 62 | self._tools = [self._cypher_query_tool] 63 | elif query_language == "gremlin": 64 | self._tools = [self._gremlin_query_tool] 65 | elif query_language == "both": 66 | self._tools = [self._cypher_query_tool, self._gremlin_query_tool] 67 | 68 | if additional_tools is not None: 69 | self._tools.extend(additional_tools) 70 | 71 | self._tool_dict = {tool.name: tool for tool in self._tools} 72 | 73 
| # Set up llm chain 74 | self._chat_prompt_template = chat_prompt_template 75 | 76 | self._llm = llm.bind_tools(tools=self._tools) 77 | self._llm_chain: RunnableSequence = self._chat_prompt_template | self._llm 78 | 79 | self._llm_no_tool_output = llm.bind_tools(tools=[]) 80 | self._llm_no_tool_output_chain: RunnableSequence = ( 81 | self._chat_prompt_template | self._llm_no_tool_output 82 | ) 83 | 84 | # Set up other global variables 85 | self._message_history = [] 86 | 87 | def query(self, user_input: str, max_iters: int = 10) -> Iterable[BaseMessage]: 88 | """Query the graph using the given user input. 89 | 90 | Args: 91 | user_input: The user input 92 | max_iters: The maximum number of iterations to run 93 | 94 | Yields: 95 | BaseMessage, can be either AIMessage or ToolMessage 96 | """ 97 | # We have to copy the message history to avoid side effects 98 | # if query() is called multiple times 99 | message_history = deepcopy(self._message_history) 100 | 101 | new_messages = [HumanMessage(content=user_input)] 102 | 103 | iters = 0 104 | 105 | wait_for_user_input = False 106 | while iters < max_iters and not wait_for_user_input: 107 | # If we are at the last iteration, we don't want to call the tool 108 | # This is because we want to show the user the final message 109 | tool_call_allowed = True 110 | if iters + 1 == max_iters: 111 | tool_call_allowed = False 112 | 113 | # Predict 114 | ai_message = self._llm_predict( 115 | message_history=message_history + new_messages, 116 | tool_call_allowed=tool_call_allowed, 117 | ) 118 | 119 | # Add AI message to new messages 120 | new_messages.append(ai_message) 121 | logger.info("AI message: %s", ai_message.content) 122 | yield ai_message 123 | 124 | if ai_message.tool_calls: 125 | # Execute the actual tool 126 | tool_messages = self._execute_tool_calls(ai_message) 127 | 128 | # Add tool messages to new messages 129 | new_messages.extend(tool_messages) 130 | logger.info("Tool messages: %s", tool_messages) 131 | yield from 
tool_messages 132 | 133 | iters += 1 134 | 135 | # Check if we need to wait for user input 136 | if not ai_message.tool_calls: 137 | wait_for_user_input = True 138 | 139 | # Update the message history 140 | self._message_history += new_messages 141 | 142 | def reset_messages(self): 143 | """Reset the message history.""" 144 | self._message_history = [] 145 | 146 | def _llm_predict( 147 | self, 148 | message_history: List[BaseMessage], 149 | tool_call_allowed: bool, 150 | ) -> AIMessage: 151 | """Predict the AI message using llm. 152 | 153 | Args: 154 | message_history: The message history 155 | tool_call_allowed: Whether tool calls are allowed 156 | 157 | Returns: 158 | The predicted AI message 159 | """ 160 | input_dict = { 161 | "message_history": message_history, 162 | } 163 | 164 | if not tool_call_allowed: 165 | return self._llm_no_tool_output_chain.invoke(input=input_dict) 166 | 167 | return self._llm_chain.invoke(input=input_dict) 168 | 169 | def _execute_tool_calls(self, ai_message: AIMessage) -> List[ToolMessage]: 170 | """Execute the tool calls in the AI message. 171 | 172 | Args: 173 | ai_message: The AI message which might contain tool calls 174 | 175 | Returns: 176 | The tool messages 177 | """ 178 | tool_messages = [] 179 | for tool_call in ai_message.tool_calls: 180 | tool_messages.append(self._execute_tool_call(tool_call)) 181 | return tool_messages 182 | 183 | def _execute_tool_call(self, tool_call: ToolCall) -> ToolMessage: 184 | """Execute the given tool call. 
185 | 186 | Args: 187 | tool_call: The tool call to execute 188 | 189 | Returns: 190 | The tool message 191 | """ 192 | tool = self._tool_dict[tool_call["name"]] 193 | logger.info( 194 | "Calling tool: %s with args: %s", tool_call["name"], tool_call["args"] 195 | ) 196 | try: 197 | tool_output = str(tool.invoke(input=tool_call["args"])) 198 | except Exception as e: 199 | tool_output = f"While executing tool, an error occurred : {e}" 200 | 201 | return ToolMessage(tool_output, tool_call_id=tool_call["id"]) 202 | 203 | 204 | def _get_cypher_query_tool(puppy_graph_client: PuppyGraphClient): 205 | """Get the Cypher query tool.""" 206 | return StructuredTool.from_function( 207 | func=puppy_graph_client.cypher_query, 208 | name="query_graph_cypher", 209 | description="Query the graph database using Cypher.", 210 | args_schema=create_model( 211 | "", query=(str, Field(description="The Cypher query to run")) 212 | ), 213 | ) 214 | 215 | 216 | def _get_gremlin_query_tool(puppy_graph_client: PuppyGraphClient): 217 | """Get the Gremlin query tool.""" 218 | return StructuredTool.from_function( 219 | func=puppy_graph_client.gremlin_query, 220 | name="query_graph_gremlin", 221 | description="Query the graph database using Gremlin.", 222 | args_schema=create_model( 223 | "", query=(str, Field(description="The Gremlin query to run")) 224 | ), 225 | ) 226 | -------------------------------------------------------------------------------- /apps/chatbot/README.md: -------------------------------------------------------------------------------- 1 | # PuppyGraph RAG Chatbot Demo 2 | 3 | A conversational AI interface for PuppyGraph that converts natural language questions into Cypher queries using Retrieval-Augmented Generation (RAG). 
4 | 5 | ## Features 6 | 7 | - 🤖 **Natural Language to Cypher**: Ask questions in plain English, get intelligent query execution 8 | - 🔄 **Multi-Round Execution**: Automatically generates and executes multiple queries as needed 9 | - ⚡ **Real-time Streaming**: Watch each query step execute live as it happens 10 | - 🧠 **RAG-Powered**: Uses embeddings and similar examples to improve query generation 11 | - 🔌 **MCP Integration**: Custom Model Context Protocol server for PuppyGraph 12 | - 🧭 **Claude Sonnet 4.0**: Powered by Anthropic's latest language model with intelligent stopping 13 | - 📊 **Graph Exploration**: Built-in schema viewer and statistics 14 | - 🎯 **Interactive UI**: Clean Gradio interface with real-time updates 15 | - 📚 **Learning System**: Add your own examples to improve performance 16 | 17 | ## Architecture 18 | 19 | ``` 20 | ┌─────────────────┐ ┌──────────────────┐ ┌─────────────────┐ 21 | │ │ │ │ │ │ 22 | │ Gradio UI │◄──►│ Python Backend │◄──►│ PuppyGraph │ 23 | │ │ │ │ │ │ 24 | └─────────────────┘ └──────────────────┘ └─────────────────┘ 25 | │ 26 | ▼ 27 | ┌──────────────────┐ 28 | │ │ 29 | │ MCP Server │ 30 | │ │ 31 | └──────────────────┘ 32 | │ 33 | ▼ 34 | ┌──────────────────┐ 35 | │ │ 36 | │ RAG System │ 37 | │ (ChromaDB + │ 38 | │ Embeddings + │ 39 | │ Claude Sonnet) │ 40 | └──────────────────┘ 41 | ``` 42 | 43 | ## Installation 44 | 45 | 1. Create a virtual environment and activate it: 46 | ```bash 47 | python -m venv venv 48 | source venv/bin/activate 49 | ``` 50 | 51 | 2. **Install dependencies:** 52 | ```bash 53 | pip install -r requirements.txt 54 | ``` 55 | 56 | 3. **Set up environment variables:** 57 | ```bash 58 | cp .env.example .env 59 | # Edit .env with your Anthropic API key and PuppyGraph settings 60 | ``` 61 | 62 | 4. 
**Ensure PuppyGraph is running:** 63 | - Bolt protocol on port 7687 64 | - HTTP API on port 8081 65 | - Default credentials: puppygraph/puppygraph123 66 | 67 | ## Usage 68 | 69 | ### Quick Start 70 | 71 | ```bash 72 | python gradio_app.py 73 | ``` 74 | 75 | Then open http://localhost:7860 in your browser. 76 | 77 | ### Components 78 | 79 | #### 1. MCP Server (`mcp_server.py`) 80 | Standalone Model Context Protocol server that provides: 81 | - Cypher query execution 82 | - Schema introspection 83 | - Query validation 84 | - Graph statistics 85 | 86 | Run standalone: 87 | ```bash 88 | python mcp_server.py 89 | ``` 90 | 91 | #### 2. RAG System (`rag_system.py`) 92 | Handles text-to-Cypher conversion using: 93 | - Sentence embeddings for question similarity 94 | - ChromaDB for example storage 95 | - Claude Sonnet 4.0 for query generation 96 | - Confidence scoring 97 | 98 | #### 3. Backend (`backend.py`) 99 | Coordinates all components: 100 | - Manages MCP server process 101 | - Integrates RAG system 102 | - Handles conversation history 103 | - Provides unified API 104 | 105 | #### 4. Gradio UI (`gradio_app.py`) 106 | Interactive web interface with: 107 | - Chat interface for questions 108 | - Schema and statistics viewer 109 | - Example management system 110 | - Help documentation 111 | 112 | ## Example Queries 113 | 114 | ### Simple Queries (typically 1 round): 115 | - "Show me all nodes in the graph" 116 | - "Count the total number of relationships" 117 | - "What types of nodes exist?" 118 | - "Show me the graph schema" 119 | 120 | ### Complex Queries (typically 2-3 rounds): 121 | - "Which users have the most connections and what do they connect to?" 122 | - "What percentage of nodes have more than 5 relationships?" 123 | - "Find nodes that are connected to both X and Y type nodes" 124 | - "Show me the top 5 most connected entities and their relationship types" 125 | - "How many different paths exist between node A and node B?" 
126 | 127 | ## Adding Custom Examples 128 | 129 | Use the "Add Examples" tab to teach the system new patterns: 130 | 131 | 1. **Question**: "Find users who bought expensive products" 132 | 2. **Cypher**: `MATCH (u:User)-[:BOUGHT]->(p:Product) WHERE p.price > 100 RETURN u, p` 133 | 3. **Description**: "Finds users who purchased products over $100" 134 | 135 | ## Configuration 136 | 137 | ### Environment Variables 138 | 139 | - `ANTHROPIC_API_KEY`: Required for Claude Sonnet 4.0 integration 140 | - `PUPPYGRAPH_BOLT_URI`: PuppyGraph Bolt endpoint (default: bolt://localhost:7687) 141 | - `PUPPYGRAPH_HTTP_URI`: PuppyGraph HTTP API (default: http://localhost:8081) 142 | - `PUPPYGRAPH_USERNAME`: Database username (default: puppygraph) 143 | - `PUPPYGRAPH_PASSWORD`: Database password (default: puppygraph123) 144 | 145 | ### Customization 146 | 147 | #### RAG System 148 | - **Embedding Model**: Change in `rag_system.py` (default: all-MiniLM-L6-v2) 149 | - **Vector Database**: ChromaDB configuration 150 | - **LLM Model**: Claude model selection (default: claude-sonnet-4-20250514) 151 | 152 | #### UI Customization 153 | - **Port**: Modify in `gradio_app.py` (default: 7860) 154 | - **Styling**: Update CSS in the interface creation 155 | - **Tabs**: Add/remove functionality tabs 156 | 157 | ## API Reference 158 | 159 | ### Backend Methods 160 | 161 | ```python 162 | from backend import PuppyGraphChatbot 163 | 164 | chatbot = PuppyGraphChatbot() 165 | 166 | # Process natural language query 167 | result = chatbot.process_natural_language_query("Show all nodes") 168 | 169 | # Add custom example 170 | chatbot.add_query_example( 171 | question="Find connected users", 172 | cypher="MATCH (u1:User)-[:FRIEND]->(u2:User) RETURN u1, u2", 173 | description="Shows friendship connections" 174 | ) 175 | 176 | # Get graph statistics 177 | stats = chatbot.get_graph_stats() 178 | ``` 179 | 180 | ### MCP Server Tools 181 | 182 | When running as MCP server, provides these tools: 183 | - 
`execute_cypher`: Run Cypher queries 184 | - `get_schema_info`: Get schema with optional samples 185 | - `validate_cypher`: Validate query syntax 186 | 187 | ## Troubleshooting 188 | 189 | ### Common Issues 190 | 191 | 1. **Connection Error**: Ensure PuppyGraph is running and accessible 192 | 2. **Anthropic API Error**: Check your API key and credits 193 | 3. **Import Errors**: Install all requirements with `pip install -r requirements.txt` 194 | 4. **MCP Server Issues**: Check logs for connection problems 195 | 196 | ### Logs 197 | 198 | Enable debug logging: 199 | ```python 200 | import logging 201 | logging.basicConfig(level=logging.DEBUG) 202 | ``` 203 | 204 | ### Testing Connection 205 | 206 | Test PuppyGraph connectivity: 207 | ```python 208 | from backend import PuppyGraphChatbot 209 | chatbot = PuppyGraphChatbot() 210 | print(chatbot.get_graph_stats()) 211 | ``` 212 | 213 | ## Development 214 | 215 | ### Project Structure 216 | 217 | ``` 218 | rag-demo/ 219 | ├── gradio_app.py # Main Gradio UI application 220 | ├── backend.py # Backend coordinator 221 | ├── mcp_server.py # MCP server implementation 222 | ├── rag_system.py # RAG/text-to-cypher system 223 | ├── requirements.txt # Python dependencies 224 | ├── .env.example # Environment variables template 225 | └── README.md # This file 226 | ``` 227 | 228 | ### Adding Features 229 | 230 | 1. **New Query Types**: Add examples to `rag_system.py` 231 | 2. **UI Components**: Extend tabs in `gradio_app.py` 232 | 3. **MCP Tools**: Add tools in `mcp_server.py` 233 | 4. 
**Backend Logic**: Extend `backend.py` 234 | 235 | ### Testing 236 | 237 | ```bash 238 | # Test MCP server 239 | python mcp_server.py 240 | 241 | # Test RAG system 242 | python -c "from rag_system import TextToCypherRAG; rag = TextToCypherRAG(); print('RAG system OK')" 243 | 244 | # Test backend 245 | python -c "from backend import get_chatbot; chatbot = get_chatbot(); print(chatbot.get_graph_stats())" 246 | ``` 247 | 248 | ## Contributing 249 | 250 | 1. Fork the repository 251 | 2. Create a feature branch 252 | 3. Make your changes 253 | 4. Add tests if applicable 254 | 5. Submit a pull request 255 | 256 | ## License 257 | 258 | This project is part of the PuppyGraph ecosystem. See the main repository for license information. 259 | 260 | ## Support 261 | 262 | For issues and questions: 263 | - Check the troubleshooting section 264 | - Review PuppyGraph documentation 265 | - Open an issue in the main repository -------------------------------------------------------------------------------- /apps/databricks_mining_site/run_agent.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import List 4 | 5 | import yaml 6 | from langchain_core.prompts.chat import ChatPromptTemplate, MessagesPlaceholder 7 | from langchain_openai import ChatOpenAI 8 | 9 | from puppygraph import PuppyGraphClient, PuppyGraphHostConfig 10 | from puppygraph.rag import PuppyGraphAgent 11 | from langchain_core.messages import BaseMessage 12 | from typing import Iterable, List 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | 17 | def _get_graph_schema_prompt() -> str: 18 | return """ 19 | Nodes are the following: 20 | - failure_type: 21 | properties: 22 | - name: failure_type_name 23 | type: String 24 | description: The name of the failure type, only takes values from "Electrical", "Mechanical", "Software", "Pneumatic", "Hydraulic" 25 | - asset: 26 | description: The asset node represents a physical asset in 
the mining site 27 | properties: 28 | - name: asset_id 29 | type: String 30 | - name: asset_name 31 | type: String 32 | - name: asset_type 33 | type: String 34 | description: the type of an asset, only takes values from "Heavy Machinery", "Drilling", "Material Handling", "Transport", "Processing", "Safety" 35 | - name: location 36 | type: String 37 | - name: acquisition_date 38 | type: String 39 | description: the format is "YYYY-MM-DD" 40 | - name: status 41 | type: String 42 | description: The status of the asset, only takes values from "Active", "Inactive", "Under Maintenance" 43 | - work_order: 44 | description: The work order node represents a work order raised for an asset 45 | properties: 46 | - name: work_order_id 47 | type: String 48 | - name: date 49 | type: String 50 | - name: action_taken 51 | type: String 52 | - name: technician 53 | type: String 54 | - name: component_replaced_description 55 | type: String 56 | description: Description of the component replaced, only takes values from "Alternator", "Brake Assembly", "Control Panel", "Conveyor Belt", "Cooling Fan", "Engine Oil Filter", "Exhaust Manifold", "Fuel Injector", "Hydraulic Cylinder", "Hydraulic Hose", "Hydraulic Pump", "Pressure Sensor", "Swing Motor", "Track Chain", "Transmission". 57 | - name: component_replaced_material_num 58 | type: String 59 | - name: successful_fix 60 | type: Boolean 61 | description: Whether the work order successfully fixed the asset or not. 62 | Edges are the following: 63 | - can_have_failure: 64 | description: The potential failure mode of an asset 65 | from: asset 66 | to: failure_type 67 | properties: 68 | - name: steps_to_follow 69 | type: String 70 | description: The steps to follow to troubleshoot the failure. 
(remember, this property is on the EDGE, not the NODE) 71 | - name: reference_source 72 | type: String 73 | descritpion: The reference source of the troubleshooting steps, only takes values from "Documentum", "OEM", "Internal Manual" 74 | - name: recommended_actions 75 | type: String 76 | - worked_on: 77 | description: A work order is working on an asset 78 | from: work_order 79 | to: asset 80 | properties: NONE 81 | - related_to_failure: 82 | description: A work order identifies a failure type on a specific asset 83 | properties: NONE 84 | from: work_order 85 | to: failure_type 86 | The relationships are the following: 87 | (:asset)-[:can_have_failure]->(:failure_type), 88 | (:work_order)-[:worked_on]->(:asset), 89 | (:work_order)-[:related_to_failure]->(:failure_type) 90 | """ 91 | 92 | 93 | def _get_chat_prompt_template(graph_schema_prompt: str) -> ChatPromptTemplate: 94 | 95 | return ChatPromptTemplate.from_messages( 96 | [ 97 | ( 98 | "system", 99 | "You are a helpful assistant to help answer user questions about assets in a mining site." 100 | "You will need to use the information stored in the graph database to answer the user's questions." 
101 | "Here is some information about the graph database schema.\n" 102 | f"{graph_schema_prompt}", 103 | ), 104 | ( 105 | "system", 106 | "You must first output a PLAN, then you can use the PLAN to call the tools.\n" 107 | "Each STEP of the PLAN should be corresponding to one or more function calls (but not less), either simple or complex.\n" 108 | "Minimize the number of steps in the PLAN, but make sure the PLAN is workable.\n" 109 | "Remember, each step can be converted to a Cypher query, since Cypher query can handle quite complex queries," 110 | "each step can be complex as well as long as it can be converted to a Cypher query.", 111 | ), 112 | MessagesPlaceholder(variable_name="message_history"), 113 | ( 114 | "system", 115 | "For COUNT(), ONLY use COUNT(*) in your Cypher queries, as COUNT(something) is not supported yet.When calculating failures for a particular asset, also first find out the work orders that are related to the asset, then count the work orders that are related to the failure using related_to_failure. 
DO NOT USE can_have_failure for counting total number of failures, USE related_to_failure instead.", 116 | ), 117 | ( 118 | "system", 119 | "Always use the format {\n" 120 | "'THINKING': ," 121 | "'PLAN': ," 122 | "'CONCLUSION': }", 123 | ), 124 | ], 125 | template_format="jinja2", 126 | ) 127 | 128 | 129 | def _get_llm() -> ChatOpenAI: 130 | return ChatOpenAI( 131 | model="gpt-4o-2024-08-06", 132 | temperature=0, 133 | api_key=os.getenv("OPENAI_API_KEY"), 134 | ) 135 | 136 | 137 | def _get_puppy_graph_client(ip) -> PuppyGraphClient: 138 | return PuppyGraphClient(PuppyGraphHostConfig(ip=ip)) 139 | 140 | 141 | def _process_answer(answers: Iterable[BaseMessage]) -> str: 142 | reversed_answers = reversed(list(answers)) 143 | for answer in reversed_answers: 144 | text = answer.content 145 | try: 146 | text_dict = yaml.safe_load(text) 147 | if "CONCLUSION" in text_dict: 148 | return text_dict["CONCLUSION"] 149 | except: 150 | text_split = text.split("'CONCLUSION':") 151 | return text_split[-1].strip("\n}") 152 | return text 153 | 154 | 155 | def _run_queries(pg_agent: PuppyGraphAgent) -> List[str]: 156 | queries = [ 157 | "How do I check engine oil levels?", 158 | "How many mechanical failures has Excavator 3000 had?", 159 | "How many times did we need to replace pressure sensor on Haul Truck 400T?", 160 | "How many mechanical work orders were unsuccessful?", 161 | "When was a work order raised for Load-Haul-Dump Machine to update fuel injector?", 162 | "What are the Asset IDs of Heavy Machinery?", 163 | "How many assets are active in Site A?", 164 | "When should I replace Pressure Sensor?", 165 | "How many troubleshooting steps are from the Documentum?", 166 | "Which asset had the most work orders and how many of them?", 167 | "How do I safeguard against system errors in my Hydraulic Shovels?", 168 | "Was the transmission tested under load in WO008?", 169 | "Was the troubleshooting guide for Excavator 3000 followed for WO001 order?", 170 | "Where is Excavator 3000 
located?", 171 | "What are the previous failures type for Excavator 3000 from work order logs?", 172 | "Did we replace the Cooling fan on Crusher CR6000?", 173 | "What component have we replaced the most?", 174 | "How many Electrical failures has Crusher CR6000 had?", 175 | "How many Electrical failures have Heavy Machinery Had?", 176 | ] 177 | 178 | answers = [] 179 | for i, query in enumerate(queries): 180 | print(f"======{i}======") 181 | print(f"User: {query}") 182 | 183 | # We are doing single user query, not a conversation 184 | # so need to reset history for each turn 185 | pg_agent.reset_messages() 186 | answer = _process_answer(pg_agent.query(query)) 187 | answers.append(answer) 188 | 189 | print(f"System: {answer}") 190 | print(f"=====================") 191 | 192 | return answers 193 | 194 | 195 | def main(): 196 | """Main function to run the puppygraph agent. 197 | 198 | We first run a set of queries and then enter free chat mode. 199 | """ 200 | pg_agent = PuppyGraphAgent( 201 | puppy_graph_client=_get_puppy_graph_client("127.0.0.1"), 202 | llm=_get_llm(), 203 | chat_prompt_template=_get_chat_prompt_template( 204 | graph_schema_prompt=_get_graph_schema_prompt() 205 | ), 206 | ) 207 | 208 | _run_queries(pg_agent=pg_agent) 209 | 210 | print("\n=======Entering Free Chat Mode=======\n") 211 | pg_agent.reset_messages() 212 | while True: 213 | user_input = input("User: ") 214 | response = pg_agent.query(user_input=user_input) 215 | print(f"System: {_process_answer(response)}") 216 | 217 | 218 | if __name__ == "__main__": 219 | main() 220 | -------------------------------------------------------------------------------- /puppygraph/client/client.py: -------------------------------------------------------------------------------- 1 | """PuppyGraph client module.""" 2 | 3 | import logging 4 | import threading 5 | import time 6 | from typing import Any, Dict, List, Optional, Union 7 | 8 | import requests 9 | from gremlin_python.driver import client as 
GremlinClient 10 | from gremlin_python.driver.protocol import GremlinServerError 11 | from gremlin_python.driver.serializer import GraphBinarySerializersV1 12 | from neo4j import Driver as CypherDriver 13 | from neo4j import GraphDatabase, Query, Result 14 | from neo4j.exceptions import AuthError, CypherSyntaxError, ServiceUnavailable 15 | from neo4j.graph import Node, Relationship, Path 16 | from puppygraph.common.conversion_utils import convert_mapping_config_to_host_json 17 | from puppygraph.data.host.host_config import PuppyGraphHostConfig 18 | from puppygraph.data.mapping.graph_mapping_config import PuppyGraphMappingConfig 19 | 20 | logger = logging.getLogger(__name__) 21 | logger.addHandler( 22 | logging.NullHandler() 23 | ) # Prevent "No handlers could be found" warnings 24 | 25 | 26 | class PuppyGraphClient: 27 | """PuppyGraph client.""" 28 | 29 | def __init__(self, config: PuppyGraphHostConfig): 30 | """Initializes a PuppyGraph client.""" 31 | self._config = config 32 | self._cypher_query_driver = GraphDatabase.driver( 33 | f"bolt://{config.ip}:{config.cypher_port}/", 34 | max_connection_lifetime=config.cypher_max_connection_lifetime 35 | ) 36 | self._gremlin_query_client = GremlinClient.Client( 37 | f"ws://{config.ip}:{config.gremlin_port}/gremlin", 38 | traversal_source="g", 39 | username=config.username, 40 | password=config.password, 41 | message_serializer=GraphBinarySerializersV1(), 42 | ) 43 | 44 | def set_schema( 45 | self, mapping_config: Union[PuppyGraphMappingConfig, Dict] 46 | ) -> PuppyGraphMappingConfig: 47 | """Sets the graph mapping config in PuppyGraph server. 48 | 49 | Args: 50 | mapping_config: The graph mapping config to set, can be json or dataclass. 51 | 52 | Returns: 53 | The puppygraph graph mapping config that was set. 
54 | """ 55 | return _set_schema(host_config=self._config, mapping_config=mapping_config) 56 | 57 | def get_schema(self) -> str: 58 | """Returns the json schema of the PuppyGraph database.""" 59 | return _get_schema(config=self._config) 60 | 61 | def cypher_query( 62 | self, query: str, params: Optional[Dict[str, Any]] = None, timeout_ms=30000 63 | ) -> List[Dict[str, Any]]: 64 | """Executes a Cypher query on the puppy graph. 65 | 66 | Args: 67 | query: The Cypher query to execute. 68 | params: The parameters to pass to the Cypher query. 69 | timeout_ms: The timeout in milliseconds for the query, defaults to 30000. 70 | 71 | Returns: 72 | The result of the Cypher query in a list of dictionaries. 73 | """ 74 | return _run_cypher_query( 75 | cypher_driver=self._cypher_query_driver, 76 | query=query, 77 | params=params, 78 | timeout_ms=timeout_ms, 79 | ) 80 | 81 | def gremlin_query(self, query: str, timeout_ms=30000) -> List[Dict[str, Any]]: 82 | """Executes a Gremlin query on the puppy graph. 83 | 84 | Args: 85 | query: The Gremlin query to execute. 86 | timeout_ms: The timeout in milliseconds for the query, defaults to 30000. 87 | 88 | Returns: 89 | The result of the Gremlin query in a list of dictionaries. 90 | """ 91 | return _run_gremlin_query( 92 | gremlin_client=self._gremlin_query_client, 93 | query=query, 94 | timeout_ms=timeout_ms, 95 | ) 96 | 97 | 98 | def _set_schema( 99 | host_config: PuppyGraphHostConfig, 100 | mapping_config: Union[PuppyGraphMappingConfig, Dict], 101 | ) -> PuppyGraphMappingConfig: 102 | """Sets the graph schema in PuppyGraph server. 103 | 104 | Args: 105 | host_config: The host configuration of the PuppyGraph server. 106 | mapping_config: The graph mapping config that defines how the graph is constructed, can be json or dataclass. 107 | 108 | Returns: 109 | The puppygraph schema that was set. 
110 | """ 111 | 112 | schema_json = convert_mapping_config_to_host_json(config=mapping_config) 113 | logger.info( 114 | "Setting graph schema in PuppyGraph server...\n=============================\n%s\n=============================\n", 115 | schema_json, 116 | ) 117 | 118 | response = requests.post( 119 | f"http://{host_config.ip}:{host_config.http_port}/schema", 120 | auth=(host_config.username, host_config.password), 121 | json=schema_json, 122 | timeout=60, 123 | ) 124 | response.raise_for_status() 125 | logger.info("Successfully updated the schema in PuppyGraph!") 126 | 127 | # Check if PuppyGraph is ready to serve 128 | response = requests.get( 129 | f"http://{host_config.ip}:{host_config.http_port}/schemajson", 130 | auth=(host_config.username, host_config.password), 131 | timeout=60, 132 | ) 133 | while response.status_code != 200: 134 | time.sleep(10) 135 | logger.info("PuppyGraph is ready to serve the new schema!") 136 | return mapping_config 137 | 138 | 139 | def _get_schema(config: PuppyGraphHostConfig) -> str: 140 | """Returns the schema of the PuppyGraph database. 141 | 142 | Args: 143 | config: The PuppyGraph host configuration. 144 | 145 | Returns: 146 | The schema of the PuppyGraph database in string format. 
147 | """ 148 | response = requests.get( 149 | f"http://{config.ip}:{config.http_port}/schemajson", 150 | auth=(config.username, config.password), 151 | timeout=60, 152 | ) 153 | response.raise_for_status() 154 | return response.text 155 | 156 | 157 | class _QueryThread(threading.Thread): 158 | def __init__(self, target, *args, **kwargs): 159 | super().__init__() 160 | self._target = target 161 | self._args = args 162 | self._kwargs = kwargs 163 | self._result = None 164 | self._error = None 165 | 166 | def run(self): 167 | try: 168 | self._result = self._target(*self._args, **self._kwargs) 169 | except Exception as e: 170 | self._error = e 171 | 172 | def get_result(self): 173 | if self._error: 174 | raise self._error 175 | return self._result 176 | 177 | 178 | def _run_with_threading_timeout(fn, timeout, *args, **kwargs): 179 | thread = _QueryThread(fn, *args, **kwargs) 180 | thread.start() 181 | thread.join(timeout / 1e3) # Convert ms to seconds 182 | 183 | if thread.is_alive(): 184 | raise TimeoutError(f"Operation timed out after {timeout} ms!") 185 | return thread.get_result() 186 | 187 | 188 | def _cypher_query_fn( 189 | cypher_driver: CypherDriver, 190 | query: str, 191 | params: Optional[Dict[str, Any]], 192 | timeout_s: float, 193 | ): 194 | with cypher_driver.session() as session: 195 | neo4j_query = Query(text=query, timeout=timeout_s) 196 | try: 197 | res: Result = session.run(neo4j_query, params) 198 | json_data: List[Dict[str, Any]] = [_unpack_value(record) for record in res] 199 | return json_data 200 | except CypherSyntaxError as e: 201 | raise ValueError(f"`{query}` is not valid:\n{e}") from e 202 | except (AuthError, ServiceUnavailable, TimeoutError) as e: 203 | raise e 204 | 205 | def _unpack_value(value): 206 | if isinstance(value, Node): 207 | return { 208 | "type": "Node", 209 | "id": value.element_id, 210 | "labels": list(value.labels), 211 | "properties": dict(value.items()) 212 | } 213 | elif isinstance(value, Relationship): 214 | return { 
215 | "type": "Relationship", 216 | "id": value.element_id, 217 | "type_name": value.type, 218 | "start_id": value.start_node.element_id, 219 | "end_id": value.end_node.element_id, 220 | "properties": dict(value.items()) 221 | } 222 | elif isinstance(value, Path): 223 | return { 224 | "type": "Path", 225 | "nodes": [_unpack_value(n) for n in value.nodes], 226 | "relationships": [_unpack_value(r) for r in value.relationships] 227 | } 228 | elif isinstance(value, list): 229 | return [_unpack_value(v) for v in value] 230 | elif isinstance(value, dict): 231 | return {k: _unpack_value(v) for k, v in value.items()} 232 | else: 233 | return value # fall back to raw value (int, str, etc.) 234 | 235 | 236 | def _gremlin_query_fn(gremlin_client: GremlinClient.Client, query: str): 237 | try: 238 | result_set = gremlin_client.submit(query) 239 | results = result_set.all().result() 240 | return results 241 | except GremlinServerError as e: 242 | raise ValueError(f"Gremlin query error: {e}") from e 243 | except TimeoutError as e: 244 | raise TimeoutError(f"Timeout occurred: {e}") from e 245 | except Exception as e: 246 | raise e 247 | 248 | 249 | def _run_cypher_query( 250 | cypher_driver: CypherDriver, 251 | query: str, 252 | params: Optional[Dict[str, Any]] = None, 253 | timeout_ms: int = 300000, 254 | ) -> List[Dict[str, Any]]: 255 | return _run_with_threading_timeout( 256 | _cypher_query_fn, timeout_ms, cypher_driver, query, params, timeout_ms / 1e3 257 | ) 258 | 259 | 260 | def _run_gremlin_query( 261 | gremlin_client: GremlinClient.Client, 262 | query: str, 263 | timeout_ms: int = 300000, 264 | ) -> List[Dict[str, Any]]: 265 | return _run_with_threading_timeout( 266 | _gremlin_query_fn, timeout_ms, gremlin_client, query 267 | ) 268 | -------------------------------------------------------------------------------- /apps/chatbot/mcp_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import asyncio 4 
| import json 5 | import logging 6 | from typing import Any, Dict, List, Optional, Sequence 7 | 8 | from mcp.server.models import InitializationOptions 9 | from mcp.server import NotificationOptions, Server 10 | from mcp.server.stdio import stdio_server 11 | from mcp.types import ( 12 | Resource, 13 | Tool, 14 | TextContent, 15 | ImageContent, 16 | EmbeddedResource, 17 | LoggingLevel 18 | ) 19 | import mcp.types as types 20 | from neo4j import GraphDatabase, Driver 21 | import requests 22 | 23 | logging.basicConfig(level=logging.INFO) 24 | logger = logging.getLogger("puppygraph-mcp") 25 | 26 | class PuppyGraphMCPServer: 27 | def __init__(self, 28 | bolt_uri: str = "bolt://localhost:7687", 29 | http_uri: str = "http://localhost:8081", 30 | username: str = "puppygraph", 31 | password: str = "puppygraph123"): 32 | self.bolt_uri = bolt_uri 33 | self.http_uri = http_uri 34 | self.username = username 35 | self.password = password 36 | self.driver: Optional[Driver] = None 37 | self.schema_cache: Optional[Dict[str, Any]] = None 38 | 39 | # Initialize Neo4j driver 40 | try: 41 | self.driver = GraphDatabase.driver(bolt_uri, auth=(username, password)) 42 | # Test connection 43 | with self.driver.session() as session: 44 | session.run("RETURN 1") 45 | logger.info(f"Connected to PuppyGraph at {bolt_uri}") 46 | except Exception as e: 47 | logger.error(f"Failed to connect to PuppyGraph: {e}") 48 | raise 49 | 50 | def close(self): 51 | if self.driver: 52 | self.driver.close() 53 | 54 | def get_schema(self) -> Dict[str, Any]: 55 | """Fetch schema from PuppyGraph HTTP API""" 56 | if self.schema_cache: 57 | return self.schema_cache 58 | 59 | try: 60 | response = requests.get( 61 | f"{self.http_uri}/schemajson", 62 | auth=(self.username, self.password), 63 | timeout=10 64 | ) 65 | response.raise_for_status() 66 | self.schema_cache = response.json() 67 | return self.schema_cache 68 | except Exception as e: 69 | logger.error(f"Failed to fetch schema: {e}") 70 | return {"vertices": [], 
"edges": []} 71 | 72 | def execute_cypher(self, query: str, params: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]: 73 | """Execute Cypher query against PuppyGraph""" 74 | if not self.driver: 75 | raise RuntimeError("Not connected to PuppyGraph") 76 | 77 | try: 78 | with self.driver.session() as session: 79 | result = session.run(query, params or {}) 80 | return [record.data() for record in result] 81 | except Exception as e: 82 | logger.error(f"Cypher query failed: {e}") 83 | raise 84 | 85 | server = Server("puppygraph-mcp") 86 | puppygraph = PuppyGraphMCPServer() 87 | 88 | @server.list_resources() 89 | async def handle_list_resources() -> list[Resource]: 90 | """List available resources""" 91 | return [ 92 | Resource( 93 | uri="puppygraph://schema", 94 | name="PuppyGraph Schema", 95 | description="Current graph schema with vertex and edge definitions", 96 | mimeType="application/json", 97 | ), 98 | Resource( 99 | uri="puppygraph://stats", 100 | name="Graph Statistics", 101 | description="Basic statistics about the graph (node/edge counts)", 102 | mimeType="application/json", 103 | ) 104 | ] 105 | 106 | @server.read_resource() 107 | async def handle_read_resource(uri: str) -> str: 108 | """Read a resource by URI""" 109 | if uri == "puppygraph://schema": 110 | schema = puppygraph.get_schema() 111 | return json.dumps(schema, indent=2) 112 | 113 | elif uri == "puppygraph://stats": 114 | try: 115 | stats = puppygraph.execute_cypher(""" 116 | MATCH (n) 117 | WITH count(n) as node_count 118 | MATCH ()-[r]->() 119 | WITH node_count, count(r) as edge_count 120 | RETURN node_count, edge_count 121 | """) 122 | return json.dumps(stats[0] if stats else {"node_count": 0, "edge_count": 0}, indent=2) 123 | except Exception as e: 124 | return json.dumps({"error": str(e)}, indent=2) 125 | 126 | else: 127 | raise ValueError(f"Unknown resource: {uri}") 128 | 129 | @server.list_tools() 130 | async def handle_list_tools() -> list[Tool]: 131 | """List available tools""" 132 | 
return [ 133 | Tool( 134 | name="execute_cypher", 135 | description="Execute a Cypher query against PuppyGraph and return results", 136 | inputSchema={ 137 | "type": "object", 138 | "properties": { 139 | "query": { 140 | "type": "string", 141 | "description": "The Cypher query to execute", 142 | }, 143 | "parameters": { 144 | "type": "object", 145 | "description": "Optional parameters for the query", 146 | "additionalProperties": True, 147 | } 148 | }, 149 | "required": ["query"], 150 | }, 151 | ), 152 | Tool( 153 | name="get_schema_info", 154 | description="Get detailed schema information about vertices and edges", 155 | inputSchema={ 156 | "type": "object", 157 | "properties": { 158 | "include_samples": { 159 | "type": "boolean", 160 | "description": "Whether to include sample data", 161 | "default": False 162 | } 163 | } 164 | } 165 | ), 166 | Tool( 167 | name="validate_cypher", 168 | description="Validate a Cypher query without executing it", 169 | inputSchema={ 170 | "type": "object", 171 | "properties": { 172 | "query": { 173 | "type": "string", 174 | "description": "The Cypher query to validate", 175 | } 176 | }, 177 | "required": ["query"], 178 | }, 179 | ) 180 | ] 181 | 182 | @server.call_tool() 183 | async def handle_call_tool(name: str, arguments: dict) -> list[types.TextContent]: 184 | """Handle tool calls""" 185 | 186 | if name == "execute_cypher": 187 | query = arguments.get("query") 188 | parameters = arguments.get("parameters", {}) 189 | 190 | if not query: 191 | return [types.TextContent(type="text", text="Error: No query provided")] 192 | 193 | try: 194 | results = puppygraph.execute_cypher(query, parameters) 195 | return [types.TextContent( 196 | type="text", 197 | text=json.dumps(results, indent=2, default=str) 198 | )] 199 | except Exception as e: 200 | return [types.TextContent( 201 | type="text", 202 | text=f"Error executing query: {str(e)}" 203 | )] 204 | 205 | elif name == "get_schema_info": 206 | include_samples = 
arguments.get("include_samples", False) 207 | 208 | try: 209 | schema = puppygraph.get_schema() 210 | 211 | if include_samples: 212 | # Get sample data for each vertex and edge type 213 | samples = {} 214 | for vertex in schema.get("vertices", []): 215 | label = vertex["label"] 216 | try: 217 | sample_query = f"MATCH (n:{label}) RETURN n LIMIT 3" 218 | samples[f"vertex_{label}"] = puppygraph.execute_cypher(sample_query) 219 | except: 220 | samples[f"vertex_{label}"] = [] 221 | 222 | for edge in schema.get("edges", []): 223 | label = edge["label"] 224 | try: 225 | sample_query = f"MATCH ()-[r:{label}]->() RETURN r LIMIT 3" 226 | samples[f"edge_{label}"] = puppygraph.execute_cypher(sample_query) 227 | except: 228 | samples[f"edge_{label}"] = [] 229 | 230 | result = {"schema": schema, "samples": samples} 231 | else: 232 | result = {"schema": schema} 233 | 234 | return [types.TextContent( 235 | type="text", 236 | text=json.dumps(result, indent=2, default=str) 237 | )] 238 | except Exception as e: 239 | return [types.TextContent( 240 | type="text", 241 | text=f"Error getting schema: {str(e)}" 242 | )] 243 | 244 | elif name == "validate_cypher": 245 | query = arguments.get("query") 246 | 247 | if not query: 248 | return [types.TextContent(type="text", text="Error: No query provided")] 249 | 250 | try: 251 | # Try to explain the query to validate syntax 252 | explain_query = f"EXPLAIN {query}" 253 | puppygraph.execute_cypher(explain_query) 254 | return [types.TextContent( 255 | type="text", 256 | text="Query syntax is valid" 257 | )] 258 | except Exception as e: 259 | return [types.TextContent( 260 | type="text", 261 | text=f"Query validation failed: {str(e)}" 262 | )] 263 | 264 | else: 265 | return [types.TextContent( 266 | type="text", 267 | text=f"Unknown tool: {name}" 268 | )] 269 | 270 | async def main(): 271 | # Register cleanup handler 272 | import atexit 273 | atexit.register(puppygraph.close) 274 | 275 | async with stdio_server() as (read_stream, write_stream): 
276 | await server.run( 277 | read_stream, 278 | write_stream, 279 | InitializationOptions( 280 | server_name="puppygraph-mcp", 281 | server_version="1.0.0", 282 | capabilities=server.get_capabilities( 283 | notification_options=NotificationOptions(), 284 | experimental_capabilities={}, 285 | ), 286 | ), 287 | ) 288 | 289 | if __name__ == "__main__": 290 | asyncio.run(main()) -------------------------------------------------------------------------------- /apps/imdb/run_agent.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import time 4 | from functools import partial 5 | from typing import Iterable, List, Optional, Union 6 | 7 | import gradio as gr 8 | import yaml 9 | from langchain_community.tools.google_serper.tool import GoogleSerperRun 10 | from langchain_community.utilities import GoogleSerperAPIWrapper 11 | from langchain_core.messages import AIMessage, ToolMessage 12 | from langchain_core.prompts.chat import ChatPromptTemplate, MessagesPlaceholder 13 | from langchain_core.pydantic_v1 import Field, create_model 14 | from langchain_core.tools import StructuredTool 15 | from langchain_openai import ChatOpenAI 16 | 17 | from puppygraph import PuppyGraphClient, PuppyGraphHostConfig 18 | from puppygraph.rag import PuppyGraphAgent 19 | 20 | 21 | def _get_graph_schema_prompt(query_language: str) -> str: 22 | 23 | schema_prompt = """ 24 | Nodes are the following: 25 | - person: 26 | properties: 27 | - name: primaryName 28 | type: String 29 | description: The name of the person, as listed in the IMDb database. 30 | - name: birthYear 31 | type: Int 32 | description: The birth year of the person (if available). 33 | - name: deathYear 34 | type: Int 35 | description: The death year of the person (if available). 36 | 37 | - title: 38 | properties: 39 | - name: titleType 40 | type: String 41 | description: The type/format of the title (e.g., movie, short, tvseries, tvepisode, video, etc.). 
42 | - name: primaryTitle 43 | type: String 44 | description: The more popular title or the title used by filmmakers on promotional materials at the point of release. 45 | - name: originalTitle 46 | type: String 47 | description: The original title, in the original language. 48 | - name: isAdult 49 | type: Boolean 50 | description: Indicates whether the title is for adults (1: adult title, 0: non-adult title). 51 | - name: startYear 52 | type: Int 53 | description: Represents the release year of a title. For TV Series, this is the series start year. 54 | - name: endYear 55 | type: Int 56 | description: For TV Series, this is the series end year. '\\N' for all other title types. 57 | - name: runtimeMinutes 58 | type: Int 59 | description: The primary runtime of the title, in minutes. 60 | 61 | Edges are the following: 62 | - cast_and_crew: 63 | from: title 64 | to: person 65 | properties: 66 | - name: ordering 67 | type: Int 68 | description: A unique identifier for the row, used to determine the order of people associated with this title. 69 | - name: category 70 | type: String 71 | description: The category of job that the person was in (e.g., actor, director). 72 | - name: job 73 | type: String 74 | description: The specific job title if applicable, else '\\N'. 75 | - name: characters 76 | type: String 77 | description: The name of the character played if applicable, else '\\N'. 78 | """ 79 | if query_language == "cypher": 80 | additional_instructions = "" 81 | elif query_language == "gremlin": 82 | additional_instructions = """ 83 | The relationships are the following: 84 | g.V().hasLabel('title').out('cast_and_crew').hasLabel('person'), 85 | g.V().hasLabel('person').in('cast_and_crew').hasLabel('title'), 86 | 87 | if filter by category, you must use outE() or inE(), because the category is stored in the EDGE properties. 
88 | """ 89 | else: 90 | raise NotImplementedError(f"Query language {query_language} is not supported.") 91 | 92 | return schema_prompt + additional_instructions 93 | 94 | 95 | def _get_chat_prompt_template( 96 | graph_schema_prompt: str, search_tool_enabled: bool 97 | ) -> ChatPromptTemplate: 98 | 99 | if search_tool_enabled: 100 | additional_conclusion_prompt = ", please also cite the source [🌐] or [📈] indicating whether the information is from the internet or from the graph database if applicable" 101 | else: 102 | additional_conclusion_prompt = "" 103 | return ChatPromptTemplate.from_messages( 104 | [ 105 | ( 106 | "system", 107 | "You are a helpful assistant to help answer user questions about imdb." 108 | "You will need to use the information stored in the graph database to answer the user's questions." 109 | "Here is some information about the graph database schema.\n" 110 | f"{graph_schema_prompt}", 111 | ), 112 | ( 113 | "system", 114 | "You must first output a PLAN, then you can use the PLAN to call the tools.\n" 115 | "Each STEP of the PLAN should be corresponding to one or more function calls (but not less), either simple or complex.\n" 116 | "Minimize the number of steps in the PLAN, but make sure the PLAN is workable.\n" 117 | "Remember, each step can be converted to a Gremlin query, since Gremlin query can handle quite complex queries," 118 | "each step can be complex as well as long as it can be converted to a Gremlin query.", 119 | ), 120 | MessagesPlaceholder(variable_name="message_history"), 121 | ( 122 | "system", 123 | "Always use the JSON format {\n" 124 | "'THINKING': ," 125 | "'PLAN': ," 126 | f"'CONCLUSION': ", 127 | ), 128 | ], 129 | template_format="jinja2", 130 | ) 131 | 132 | 133 | def _get_llm() -> ChatOpenAI: 134 | return ChatOpenAI( 135 | model="gpt-4o-2024-08-06", 136 | temperature=0, 137 | api_key=os.getenv("OPENAI_API_KEY"), 138 | ) 139 | 140 | 141 | def _get_puppy_graph_client(ip) -> PuppyGraphClient: 142 | return 
PuppyGraphClient(PuppyGraphHostConfig(ip=ip)) 143 | 144 | 145 | def _display_ai_message_content( 146 | ai_message_content: str, is_last_message: bool 147 | ) -> Iterable[str]: 148 | if not is_last_message: 149 | conclusion_emoji = "📝" 150 | else: 151 | conclusion_emoji = "✅" 152 | 153 | try: 154 | text_dict = yaml.safe_load(ai_message_content) 155 | if "THINKING" in text_dict: 156 | yield f"📝 {text_dict['THINKING']}" 157 | 158 | if "PLAN" in text_dict: 159 | yield f"📝 {text_dict['PLAN']}" 160 | if "CONCLUSION" in text_dict: 161 | yield f"{conclusion_emoji} {text_dict['CONCLUSION']}" 162 | except Exception as _: 163 | text_split = ai_message_content.split("'CONCLUSION':") 164 | seps = "\n} " 165 | yield f"{conclusion_emoji} {text_split[-1].strip(seps)}" 166 | 167 | 168 | def _display_ai_message_tool_calls(tool_calls: List[str]) -> Iterable[str]: 169 | for tool_call in tool_calls: 170 | yield f"🔨 Calling {tool_call['name']} with args: {tool_call['args']}" 171 | 172 | 173 | def _display_tool_message(tool_message: ToolMessage) -> Iterable[str]: 174 | yield f"🔨 Response: {tool_message.content}" 175 | 176 | 177 | def _display_message( 178 | message: Optional[Union[AIMessage, ToolMessage]], is_last_message: bool = False 179 | ) -> Iterable[str]: 180 | if message is None: 181 | return 182 | 183 | if isinstance(message, AIMessage): 184 | yield from _display_ai_message_content(message.content, is_last_message) 185 | yield from _display_ai_message_tool_calls(message.tool_calls) 186 | elif isinstance(message, ToolMessage): 187 | yield from _display_tool_message(message) 188 | 189 | 190 | def _get_displayable_responses( 191 | pg_agent: PuppyGraphAgent, user_message: str 192 | ) -> Iterable[str]: 193 | response_iter = pg_agent.query(user_input=user_message, max_iters=20) 194 | previous_message = None 195 | while True: 196 | try: 197 | current_message = next(response_iter) 198 | for display_string in _display_message(previous_message): 199 | yield display_string 200 | 
def main():
    """Parse CLI options, assemble the PuppyGraphAgent, and launch the UI."""
    # Command-line configuration for host, query language and feature flags.
    parser = argparse.ArgumentParser(description="Configure PuppyGraphAgent settings.")
    parser.add_argument(
        "--ip",
        type=str,
        default="localhost",
        help="The IP address for the PuppyGraph.",
    )
    parser.add_argument(
        "--query_language",
        type=str,
        default="gremlin",
        help="The query language to be used (choose from 'gremlin' or 'cypher').",
    )
    parser.add_argument("--verbose", action="store_true", help="Enable verbose mode.")
    parser.add_argument(
        "--search", action="store_true", help="Enable search tool through internet."
    )
    args = parser.parse_args()

    # The internet-search tool is optional; without it the agent only has its
    # built-in graph-query tooling.
    if args.search:
        extra_tools = [
            StructuredTool.from_function(
                func=GoogleSerperAPIWrapper().run,
                name="google_serper",
                description="Query the internet.",
                args_schema=create_model("", query=(str, Field(description="query"))),
            )
        ]
    else:
        extra_tools = None

    # Wire the client, LLM and prompt template into the agent.
    agent = PuppyGraphAgent(
        puppy_graph_client=_get_puppy_graph_client(ip=args.ip),
        llm=_get_llm(),
        chat_prompt_template=_get_chat_prompt_template(
            graph_schema_prompt=_get_graph_schema_prompt(
                query_language=args.query_language
            ),
            search_tool_enabled=args.search,
        ),
        query_language=args.query_language,
        additional_tools=extra_tools,
    )

    # Serve the Gradio chat interface (blocking).
    _get_gradio_chatbot(pg_agent=agent, verbose_mode=args.verbose).launch()


if __name__ == "__main__":
    main()
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
"""Register the mining-site graph schema with a local PuppyGraph server.

Maps Databricks (Unity Catalog / Delta Lake) tables onto graph vertices
(failure_type, asset, work_order) and edges (can_have_failure, worked_on,
related_to_failure).  Credentials are supplied via ``${ENV:...}``
placeholders that PuppyGraph resolves from the environment at load time.
"""

import logging

from puppygraph import PuppyGraphClient, PuppyGraphHostConfig

logging.basicConfig(level=logging.INFO)


if __name__ == "__main__":
    # Fix: variable was misspelled "cilent".
    client = PuppyGraphClient(PuppyGraphHostConfig(ip="127.0.0.1"))
    client.set_schema(
        {
            "catalogs": [
                {
                    "name": "pg_databricks",
                    "type": "DELTALAKE",
                    "params": {
                        "metastore_param": {
                            "token": "${ENV:DATABRICKS_TOKEN}",
                            "host": "${ENV:DATABRICKS_HOST}",
                            "unity_catalog_name": "pg_databricks",
                        },
                        "storage_param": {
                            "use_instance_profile": "false",
                            "region": "us-east-1",
                            "access_key": "${ENV:AWS_ACCESS_KEY_ID}",
                            "secret_key": "${ENV:AWS_SECRET_ACCESS_KEY}",
                            "enable_ssl": "false",
                            "type": "S3",
                        },
                    },
                }
            ],
            "vertices": [
                {
                    "table_source": {
                        "catalog_name": "pg_databricks",
                        "schema_name": "bronze",
                        "table_name": "failure_type",
                    },
                    "label": "failure_type",
                    "description": "A type of failure",
                    "attributes": [
                        {
                            "name": "failure_type_name",
                            "from_field": "failure_type_name",
                            "type": "String",
                            "description": "The name of the failure type",
                        }
                    ],
                    "id": [
                        {
                            "name": "failure_type_id",
                            "from_field": "failure_type_id",
                            "type": "String",
                        }
                    ],
                },
                {
                    "table_source": {
                        "catalog_name": "pg_databricks",
                        "schema_name": "silver",
                        "table_name": "assets",
                    },
                    "label": "asset",
                    "description": "An asset in the system",
                    "attributes": [
                        {
                            "name": "asset_id",
                            "from_field": "asset_id",
                            "type": "String",
                            "description": "The ID of the asset",
                        },
                        {
                            "name": "asset_name",
                            "from_field": "asset_name",
                            "type": "String",
                            "description": "The name of the asset",
                        },
                        {
                            "name": "asset_type",
                            "from_field": "asset_type",
                            "type": "String",
                            "description": "The type of the asset",
                        },
                        {
                            "name": "location",
                            "from_field": "location",
                            "type": "String",
                            "description": "The location of the asset",
                        },
                        {
                            "name": "acquisition_date",
                            "from_field": "acquisition_date_formatted",
                            "type": "Date",
                            "description": "The acquisition date of the asset",
                        },
                        {
                            "name": "status",
                            "from_field": "status",
                            "type": "String",
                            "description": "The status of the asset",
                        },
                    ],
                    "id": [{"name": "id", "from_field": "asset_id", "type": "String"}],
                },
                {
                    "table_source": {
                        "catalog_name": "pg_databricks",
                        "schema_name": "gold",
                        "table_name": "work_orders",
                    },
                    "label": "work_order",
                    "description": "A work order in the system",
                    "attributes": [
                        {
                            "name": "work_order_id",
                            "from_field": "work_order_id",
                            "type": "String",
                            "description": "The ID of the work order",
                        },
                        {
                            "name": "date",
                            "from_field": "date",
                            "type": "Date",
                            "description": "The date of the work order",
                        },
                        {
                            "name": "action_taken",
                            "from_field": "action_taken",
                            "type": "String",
                            "description": "The action taken for the work order",
                        },
                        {
                            "name": "technician",
                            "from_field": "technician",
                            "type": "String",
                            "description": "The technician handling the work order",
                        },
                        {
                            "name": "component_replaced_description",
                            "from_field": "component_replaced_description",
                            "type": "String",
                            "description": "Description of the component replaced",
                        },
                        {
                            "name": "component_replaced_material_num",
                            "from_field": "component_replaced_material_num",
                            "type": "String",
                            "description": "Material number of the component replaced",
                        },
                        {
                            "name": "repeated_work_order",
                            "from_field": "repeated_work_order",
                            "type": "Boolean",
                            "description": "Whether the work order is a repeated one",
                        },
                        {
                            "name": "successful_fix",
                            "from_field": "successful_fix",
                            "type": "Boolean",
                            "description": "Whether the issue was successfully fixed",
                        },
                    ],
                    "id": [
                        {"name": "id", "from_field": "work_order_id", "type": "String"}
                    ],
                },
            ],
            "edges": [
                {
                    "table_source": {
                        "catalog_name": "pg_databricks",
                        "schema_name": "silver",
                        "table_name": "troubleshooting_guide",
                    },
                    "label": "can_have_failure",
                    "from_label": "asset",
                    "to_label": "failure_type",
                    "description": "An asset can have a failure type",
                    "attributes": [
                        {
                            "name": "steps_to_follow",
                            "from_field": "steps_to_follow",
                            "type": "String",
                            "description": "Steps to follow for the failure",
                        },
                        {
                            "name": "reference_source",
                            "from_field": "reference_source",
                            "type": "String",
                            "description": "The reference source for the failure",
                        },
                        {
                            "name": "recommended_actions",
                            "from_field": "recommended_actions",
                            "type": "String",
                            "description": "The recommended actions for the failure",
                        },
                    ],
                    "id": [
                        {
                            "name": "can_have_failure_id",
                            "from_field": "reference_id",
                            "type": "String",
                        }
                    ],
                    "from_id": [
                        {"name": "asset_id", "from_field": "asset_id", "type": "String"}
                    ],
                    "to_id": [
                        {
                            "name": "failure_type_id",
                            "from_field": "failure_type",
                            "type": "String",
                        }
                    ],
                },
                {
                    "table_source": {
                        "catalog_name": "pg_databricks",
                        "schema_name": "gold",
                        "table_name": "work_orders",
                    },
                    "label": "worked_on",
                    "from_label": "work_order",
                    "to_label": "asset",
                    "description": "A work order worked on an asset",
                    "attributes": [],
                    "id": [
                        {
                            "name": "worked_on_id",
                            "from_field": "work_order_id",
                            "type": "String",
                        }
                    ],
                    "from_id": [
                        {
                            "name": "work_order_id",
                            "from_field": "work_order_id",
                            "type": "String",
                        }
                    ],
                    "to_id": [
                        {"name": "asset_id", "from_field": "asset_id", "type": "String"}
                    ],
                },
                {
                    "table_source": {
                        "catalog_name": "pg_databricks",
                        "schema_name": "gold",
                        "table_name": "work_orders",
                    },
                    "label": "related_to_failure",
                    "from_label": "work_order",
                    "to_label": "failure_type",
                    "description": "A work order is related to a failure type",
                    "attributes": [],
                    "id": [
                        {
                            "name": "related_to_failure_id",
                            "from_field": "work_order_id",
                            "type": "String",
                        }
                    ],
                    "from_id": [
                        {
                            "name": "work_order_id",
                            "from_field": "work_order_id",
                            "type": "String",
                        }
                    ],
                    "to_id": [
                        {
                            "name": "failure_type_id",
                            "from_field": "llm_failure_type",
                            "type": "String",
                        }
                    ],
                },
            ],
        }
    )
| "from_field": "location_long", 72 | }, 73 | ], 74 | }, 75 | ], 76 | "edges": [ 77 | { 78 | "label": "LivesIn", 79 | "from_label": "Person", 80 | "to_label": "Location", 81 | "table_source": { 82 | "catalog_name": "my_catalog", 83 | "schema_name": "my_schema", 84 | "table_name": "lives_in_table", 85 | }, 86 | "id": [{"name": "id", "type": "String", "from_field": "id"}], 87 | "from_id": [ 88 | { 89 | "name": "from_id", 90 | "type": "String", 91 | "from_field": "from_id", 92 | } 93 | ], 94 | "to_id": [ 95 | {"name": "to_id", "type": "String", "from_field": "to_id"} 96 | ], 97 | "attributes": [ 98 | { 99 | "name": "since", 100 | "type": "Date", 101 | "from_field": "since_date", 102 | } 103 | ], 104 | }, 105 | { 106 | "label": "Likes", 107 | "from_label": "Person", 108 | "to_label": "Person", 109 | "table_source": { 110 | "catalog_name": "my_catalog", 111 | "schema_name": "my_schema", 112 | "table_name": "likes_table", 113 | }, 114 | "id": [{"name": "id", "type": "String", "from_field": "id"}], 115 | "from_id": [ 116 | { 117 | "name": "from_id", 118 | "type": "String", 119 | "from_field": "from_id", 120 | } 121 | ], 122 | "to_id": [ 123 | {"name": "to_id", "type": "String", "from_field": "to_id"} 124 | ], 125 | "attributes": [], 126 | }, 127 | ], 128 | }, 129 | { 130 | "catalogs": [ 131 | { 132 | "name": "my_catalog", 133 | "type": "deltalake", 134 | "metastore": { 135 | "type": "unity", 136 | "token": "my_token", 137 | "host": "my_host", 138 | "databricksCatalogName": "my_catalog_name", 139 | }, 140 | "storage": { 141 | "useInstanceProfile": "false", 142 | "region": "my_region", 143 | "accessKey": "my_access_key", 144 | "secretKey": "my_secret_key", 145 | "enableSsl": "true", 146 | "type": "S3", 147 | }, 148 | }, 149 | ], 150 | "graph": { 151 | "vertices": [ 152 | { 153 | "label": "Person", 154 | "oneToOne": { 155 | "tableSource": { 156 | "catalog": "my_catalog", 157 | "schema": "my_schema", 158 | "table": "person_table", 159 | }, 160 | "id": { 161 | "fields": [ 162 
| {"field": "id", "type": "String", "alias": "id"} 163 | ] 164 | }, 165 | "attributes": [ 166 | { 167 | "alias": "name", 168 | "field": "person_name", 169 | "type": "String", 170 | }, 171 | { 172 | "alias": "age", 173 | "field": "person_age", 174 | "type": "Int", 175 | }, 176 | ], 177 | }, 178 | }, 179 | { 180 | "label": "Location", 181 | "oneToOne": { 182 | "tableSource": { 183 | "catalog": "my_catalog", 184 | "schema": "my_schema", 185 | "table": "location_table", 186 | }, 187 | "id": { 188 | "fields": [ 189 | {"field": "id", "type": "String", "alias": "id"} 190 | ] 191 | }, 192 | "attributes": [ 193 | { 194 | "alias": "name", 195 | "field": "location_name", 196 | "type": "String", 197 | }, 198 | { 199 | "alias": "latitude", 200 | "field": "location_lat", 201 | "type": "Float", 202 | }, 203 | { 204 | "alias": "longitude", 205 | "field": "location_long", 206 | "type": "Float", 207 | }, 208 | ], 209 | }, 210 | }, 211 | ], 212 | "edges": [ 213 | { 214 | "label": "LivesIn", 215 | "fromVertex": "Person", 216 | "toVertex": "Location", 217 | "tableSource": { 218 | "catalog": "my_catalog", 219 | "schema": "my_schema", 220 | "table": "lives_in_table", 221 | }, 222 | "id": { 223 | "fields": [ 224 | {"field": "id", "type": "String", "alias": "id"} 225 | ] 226 | }, 227 | "fromId": { 228 | "fields": [ 229 | { 230 | "field": "from_id", 231 | "type": "String", 232 | "alias": "from_id", 233 | } 234 | ] 235 | }, 236 | "toId": { 237 | "fields": [ 238 | { 239 | "field": "to_id", 240 | "type": "String", 241 | "alias": "to_id", 242 | } 243 | ] 244 | }, 245 | "attributes": [ 246 | { 247 | "alias": "since", 248 | "field": "since_date", 249 | "type": "Date", 250 | } 251 | ], 252 | }, 253 | { 254 | "label": "Likes", 255 | "fromVertex": "Person", 256 | "toVertex": "Person", 257 | "tableSource": { 258 | "catalog": "my_catalog", 259 | "schema": "my_schema", 260 | "table": "likes_table", 261 | }, 262 | "id": { 263 | "fields": [ 264 | {"field": "id", "type": "String", "alias": "id"} 265 | ] 
@dataclass
class QueryExample:
    """Example query pairing a natural-language question with its Cypher.

    Instances seed the vector store and are retrieved as few-shot context
    when generating new queries.
    """
    # Natural-language question; this is the text that gets embedded.
    question: str
    # The Cypher query that answers the question.
    cypher: str
    # Short human-readable explanation of what the query does.
    description: str
    # Optional schema snippet the example assumes (stored as metadata).
    schema_context: Optional[str] = None
35 | - Use proper node and relationship patterns: (n)-[r]->(m) 36 | - Always list required properties in the query, do not return nodes or relationships without the required properties 37 | - Only return one aggregate value per query, including collect(), count(), size(), type(), etc.""" 38 | 39 | output_format_instruction: str = """OUTPUT FORMAT: 40 | Use the generate_cypher_query tool to create a Cypher query that answers this question. 41 | Provide: 42 | 1. A complete, valid Cypher query 43 | 2. A clear explanation of what the query does 44 | 3. Step-by-step reasoning (optional but helpful)""" 45 | 46 | 47 | @dataclass 48 | class QueryStep: 49 | """A single step in a multi-query execution plan""" 50 | step_number: int 51 | description: str 52 | cypher: str 53 | result: Optional[List[Dict[str, Any]]] = None 54 | error: Optional[str] = None 55 | prompt: Optional[str] = None 56 | llm_response: Optional[str] = None 57 | 58 | 59 | class TextToCypherRAG: 60 | """RAG system for converting natural language to Cypher queries""" 61 | 62 | def __init__(self, 63 | embedding_model: str = "all-MiniLM-L6-v2", 64 | collection_name: str = "cypher_examples", 65 | anthropic_api_key: Optional[str] = None, 66 | prompt_config: Optional[PromptConfig] = None): 67 | 68 | # Initialize embedding model 69 | self.embedding_model = SentenceTransformer(embedding_model) 70 | 71 | # Initialize ChromaDB 72 | self.chroma_client = chromadb.Client() 73 | self.collection = self.chroma_client.get_or_create_collection( 74 | name=collection_name, 75 | metadata={"hnsw:space": "cosine"} 76 | ) 77 | 78 | # Initialize Anthropic client 79 | self.anthropic_client = Anthropic(api_key=anthropic_api_key or os.getenv("ANTHROPIC_API_KEY")) 80 | 81 | # Initialize prompt configuration 82 | self.prompt_config = prompt_config or PromptConfig() 83 | 84 | # Initialize conversation history - maintain context across questions 85 | self.conversation_messages = [] 86 | self.current_schema = None 87 | 88 | # Initialize with 
some default examples 89 | self._initialize_examples() 90 | 91 | def _initialize_examples(self): 92 | """Initialize the RAG system with some common query examples""" 93 | examples = [ 94 | QueryExample( 95 | question="Show all nodes in the graph", 96 | cypher="MATCH (n) RETURN n LIMIT 100", 97 | description="Returns all nodes with a limit" 98 | ), 99 | QueryExample( 100 | question="Count all nodes", 101 | cypher="MATCH (n) RETURN count(n) as node_count", 102 | description="Counts total number of nodes" 103 | ), 104 | QueryExample( 105 | question="Count all relationships", 106 | cypher="MATCH ()-[r]->() RETURN count(r) as relationship_count", 107 | description="Counts total number of relationships" 108 | ), 109 | QueryExample( 110 | question="Show graph statistics", 111 | cypher="MATCH (n) WITH count(n) as node_count MATCH ()-[r]->() RETURN node_count, count(r) as edge_count", 112 | description="Shows basic graph statistics" 113 | ), 114 | QueryExample( 115 | question="Find nodes with specific property", 116 | cypher="MATCH (n) WHERE n.name IS NOT NULL RETURN n.name LIMIT 10", 117 | description="Finds nodes that have a name property" 118 | ), 119 | QueryExample( 120 | question="Show all relationship types", 121 | cypher="MATCH ()-[r]->() RETURN DISTINCT type(r) as relationship_type", 122 | description="Returns all unique relationship types in the graph" 123 | ), 124 | QueryExample( 125 | question="Show all node labels", 126 | cypher="MATCH (n) RETURN DISTINCT labels(n) as node_labels", 127 | description="Returns all unique node labels in the graph" 128 | ), 129 | QueryExample( 130 | question="Find connected nodes", 131 | cypher="MATCH (n)-[r]-(m) RETURN n, type(r) as relationship, m LIMIT 20", 132 | description="Shows connected nodes with their relationships" 133 | ), 134 | QueryExample( 135 | question="Find shortest path between nodes", 136 | cypher="MATCH p = shortestPath((start)-[*]-(end)) WHERE id(start) = $start_id AND id(end) = $end_id RETURN p", 137 | 
description="Finds shortest path between two specific nodes" 138 | ), 139 | QueryExample( 140 | question="Find nodes by degree", 141 | cypher="MATCH (n) WITH n, size((n)--()) as degree WHERE degree > 5 RETURN n, degree ORDER BY degree DESC", 142 | description="Finds nodes with high connectivity (degree > 5)" 143 | ) 144 | ] 145 | 146 | self._add_examples_to_collection(examples) 147 | 148 | def _add_examples_to_collection(self, examples: List[QueryExample]): 149 | """Add examples to the ChromaDB collection""" 150 | if not examples: 151 | return 152 | 153 | # Check if examples already exist 154 | existing_count = self.collection.count() 155 | if existing_count >= len(examples): 156 | logger.info(f"Examples already exist in collection ({existing_count} items)") 157 | return 158 | 159 | # Prepare data for ChromaDB 160 | questions = [ex.question for ex in examples] 161 | embeddings = self.embedding_model.encode(questions).tolist() 162 | 163 | ids = [f"example_{i}" for i in range(len(examples))] 164 | documents = questions 165 | metadatas = [ 166 | { 167 | "cypher": ex.cypher, 168 | "description": ex.description, 169 | "schema_context": ex.schema_context or "" 170 | } 171 | for ex in examples 172 | ] 173 | 174 | # Add to collection 175 | self.collection.add( 176 | ids=ids, 177 | embeddings=embeddings, 178 | documents=documents, 179 | metadatas=metadatas 180 | ) 181 | 182 | logger.info(f"Added {len(examples)} examples to the collection") 183 | 184 | def add_example(self, example: QueryExample): 185 | """Add a single example to the RAG system""" 186 | self._add_examples_to_collection([example]) 187 | 188 | def update_prompt_config(self, prompt_config: PromptConfig): 189 | """Update the prompt configuration""" 190 | self.prompt_config = prompt_config 191 | logger.info("Prompt configuration updated") 192 | 193 | def get_prompt_config(self) -> PromptConfig: 194 | """Get the current prompt configuration""" 195 | return self.prompt_config 196 | 197 | def 
def _build_system_prompt(self, schema: Dict[str, Any]) -> str:
    """Build the system prompt with static information that doesn't change during conversation"""

    # Format schema information
    schema_info = self._format_schema_for_prompt(schema)

    # Get similar examples for general context
    # (top-5 nearest neighbors to a generic probe query, not the user's question)
    examples_text = "\n".join([
        f"Q: {ex['question']}\nCypher: {ex['cypher']}\nDescription: {ex['description']}\n"
        for ex in self.find_similar_examples("graph query examples", k=5)
    ])

    system_prompt = f"""{self.prompt_config.role_definition}

{self.prompt_config.plan_generation_instruction}

GRAPH SCHEMA:
{schema_info}

EXAMPLE QUERY PATTERNS:
{examples_text}

{self.prompt_config.puppygraph_differences}

RULES:
1. Always use proper Cypher syntax
2. Include appropriate LIMIT clauses for large result sets
3. Use parameterized queries when possible
4. Consider the graph schema when writing queries
5. Return only valid, executable Cypher
6. Be conservative with result sizes (use LIMIT 100 or less by default)
7. IMPORTANT: You have a limited number of query rounds. When approaching the limit, prioritize gathering the most essential information.
8. CRITICAL: When you reach the maximum number of rounds, you MUST stop and provide a comprehensive summary based on all gathered data.

{self.prompt_config.output_format_instruction}

You will be asked to help answer questions by generating Cypher queries step by step. Use the tools provided to generate queries or make decisions about when to stop and provide a final answer. Consider previous conversation context when making decisions."""

    return system_prompt


def _update_schema_if_needed(self, schema: Dict[str, Any]):
    """Update the schema and rebuild system prompt if schema has changed"""
    if self.current_schema != schema:
        self.current_schema = schema
        # Only reset conversation if this is truly a new schema (not just the first time)
        if hasattr(self, 'conversation_messages') and self.conversation_messages:
            logger.warning("Schema changed mid-conversation, resetting conversation history")
            self.conversation_messages = []
        logger.info("Schema updated")


def clear_conversation(self):
    """Clear the conversation history (but keep the current schema/system prompt)"""
    self.conversation_messages = []


def get_system_prompt(self) -> str:
    """Get the current system prompt"""
    # NOTE: rebuilt on every call (includes an embedding lookup via
    # find_similar_examples); returns "" until a schema has been set.
    if self.current_schema:
        return self._build_system_prompt(self.current_schema)
    return ""


def find_similar_examples(self, question: str, k: int = 3) -> List[Dict[str, Any]]:
    """Find similar examples for a given question"""
    # Generate embedding for the question
    question_embedding = self.embedding_model.encode([question]).tolist()[0]

    # Search in ChromaDB
    results = self.collection.query(
        query_embeddings=[question_embedding],
        n_results=k,
        include=["documents", "metadatas", "distances"]
    )

    # Format results
    similar_examples = []
    if results["ids"]:
        for i in range(len(results["ids"][0])):
            similar_examples.append({
                "question": results["documents"][0][i],
                "cypher": results["metadatas"][0][i]["cypher"],
                "description": results["metadatas"][0][i]["description"],
                "similarity": 1 - results["distances"][0][i]  # Convert distance to similarity
            })

    return similar_examples
Dict[str, Any]) -> str: 283 | """Format schema information for the LLM prompt""" 284 | schema_text = "VERTICES:\n" 285 | 286 | for vertex in schema.get("vertices", []): 287 | label = vertex.get("label", "Unknown") 288 | attributes = vertex.get("attributes", []) 289 | description = vertex.get("description", "").strip() 290 | 291 | # Label on its own line 292 | schema_text += f"- {label}\n" 293 | 294 | # Description section (if available) 295 | if description: 296 | schema_text += f" Description: {description}\n" 297 | 298 | # Attributes 299 | if attributes: 300 | attr_text = ", ".join([f"{attr['name']}:{attr['type']}" for attr in attributes]) 301 | schema_text += f" Attributes: {attr_text}\n" 302 | else: 303 | schema_text += f" Attributes: (none)\n" 304 | 305 | schema_text += "\n" 306 | 307 | schema_text += "EDGES:\n" 308 | for edge in schema.get("edges", []): 309 | label = edge.get("label", "Unknown") 310 | from_vertex = edge.get("from", "Unknown") 311 | to_vertex = edge.get("to", "Unknown") 312 | attributes = edge.get("attributes", []) 313 | description = edge.get("description", "").strip() 314 | 315 | # Edge pattern on its own line 316 | schema_text += f"- (:{from_vertex})-[:{label}]->(:{to_vertex})\n" 317 | 318 | # Description section (if available) 319 | if description: 320 | schema_text += f" Description: {description}\n" 321 | 322 | # Attributes 323 | if attributes: 324 | attr_text = ", ".join([f"{attr['name']}:{attr['type']}" for attr in attributes]) 325 | schema_text += f" Attributes: {attr_text}\n" 326 | else: 327 | schema_text += f" Attributes: (none)\n" 328 | 329 | schema_text += "\n" 330 | 331 | return schema_text 332 | 333 | def generate_next_query(self, 334 | question: str, 335 | schema: Dict[str, Any], 336 | previous_steps: List[QueryStep], 337 | max_rounds: int = 5) -> Tuple[str, str, bool, str, str]: 338 | """Generate the next query in a multi-round execution, or decide to stop""" 339 | 340 | # Ensure we have the schema set up in our conversation 
def generate_next_query(self,
                        question: str,
                        schema: Dict[str, Any],
                        previous_steps: List[QueryStep],
                        max_rounds: int = 5) -> Tuple[str, str, bool, str, str]:
    """Generate the next query in a multi-round execution, or decide to stop.

    Returns a 5-tuple: (cypher_query, explanation_or_final_answer,
    should_stop, display_prompt, raw_llm_response).  When should_stop is
    True the cypher_query is "" and the second element is the final answer.
    """

    # Ensure we have the schema set up in our conversation
    self._update_schema_if_needed(schema)

    # If this is the start of a new question, add it to conversation
    if not previous_steps:
        self.conversation_messages.append({
            "role": "user",
            "content": f"Please help me answer this question: {question}"
        })

    # Debug: Print conversation history at each round
    logger.info(f"=== Round {len(previous_steps) + 1} - Conversation History ({len(self.conversation_messages)} messages) ===")
    for i, msg in enumerate(self.conversation_messages):
        logger.info(f"  {i+1}. {msg['role']}: {msg['content'][:100]}...")
    logger.info("=== End Conversation History ===")

    # Define the tool for multi-round query generation
    multi_round_tool = {
        "name": "multi_round_query_decision",
        "description": "Decide whether to generate another query or provide final answer in multi-round execution",
        "input_schema": {
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    "enum": ["CONTINUE", "STOP"],
                    "description": "Whether to continue with another query or stop and provide final answer. You MUST choose STOP if you're at the maximum number of rounds."
                },
                "cypher_query": {
                    "type": "string",
                    "description": "The next Cypher query to execute (only if action is CONTINUE)"
                },
                "explanation": {
                    "type": "string",
                    "description": "Explanation of what this query will do (if CONTINUE) or the final answer to the original question (if STOP)"
                },
                "final_answer": {
                    "type": "string",
                    "description": "The final answer to the original question (if STOP). This should be a comprehensive summary based on all gathered data. Do not include query details or reasoning - focus on answering the user's original question directly."
                },
                "reasoning": {
                    "type": "string",
                    "description": "Reasoning for why you chose to continue or stop"
                }
            },
            "required": ["action", "explanation"]
        }
    }

    # Add the current request to ask what to do next.
    # The wording escalates as we approach max_rounds so the model stops in time.
    current_round = len(previous_steps) + 1
    remaining_rounds = max_rounds - len(previous_steps)

    if previous_steps:
        if current_round >= max_rounds:
            # Force summarization on last round
            current_request = f"I have executed {len(previous_steps)} queries and this is my final round (round {max_rounds}). I must now STOP and provide a comprehensive final answer to the original question '{question}' based on all the information I've gathered."
        elif current_round == max_rounds - 1:
            # Warn about approaching limit
            current_request = f"I have executed {len(previous_steps)} queries so far and only have {remaining_rounds} round left. Based on the results from our conversation, should I continue with one final query or do I have enough information to answer the original question '{question}'? If I continue, the next round will be my last."
        else:
            current_request = f"I have executed {len(previous_steps)} queries so far ({remaining_rounds} rounds remaining). Based on the results from our conversation, should I continue with another query or do I have enough information to answer the original question '{question}'?"
    else:
        current_request = f"I need to answer the question: '{question}'. Should I execute a query to gather information or do I already have what I need? I have up to {max_rounds} rounds available."

    # Create a copy of messages for this request (don't permanently add the request)
    current_messages = self.conversation_messages + [{"role": "user", "content": current_request}]

    # Build simple prompt that will be shown in UI (the real context is in conversation history)
    display_prompt = f"Question: {question}\nCurrent request: {current_request}"

    try:
        # Forced tool choice: the model must answer via the decision tool.
        response = self.anthropic_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            temperature=0,
            system=self.get_system_prompt(),
            tools=[multi_round_tool],
            tool_choice={"type": "tool", "name": "multi_round_query_decision"},
            messages=current_messages
        )

        # Extract tool use result
        if response.content and response.content[0].type == "tool_use":
            tool_input = response.content[0].input
            action = tool_input.get("action", "STOP")
            explanation = tool_input.get("explanation", "")
            reasoning = tool_input.get("reasoning", "")

            # Format the raw response for display
            raw_response = f"Multi-round tool response:\n{json.dumps(tool_input, indent=2)}"

            if action == "STOP" or current_round >= max_rounds:
                # Force stop if we've reached max rounds, regardless of LLM decision
                if current_round >= max_rounds and action == "CONTINUE":
                    logger.warning(f"LLM tried to continue past max_rounds ({max_rounds}), forcing stop")
                    action = "STOP"
                    explanation = "Reached maximum number of query rounds - providing summary based on gathered data."

                # Include reasoning in the final answer if provided
                final_answer = tool_input.get("final_answer", "")
                if not final_answer:
                    final_answer = explanation
                    if reasoning:
                        final_answer = f"{explanation}\n\nReasoning: {reasoning}"

                # Add the final response to conversation history
                self.conversation_messages.append({
                    "role": "assistant",
                    "content": f"I have enough information to answer. Final answer: {final_answer}"
                })

                return "", final_answer, True, display_prompt, raw_response
            else:
                # Continue with next query
                cypher_query = tool_input.get("cypher_query", "")
                if not cypher_query:
                    # Defensive fallback: harmless count query when the model
                    # chose CONTINUE but supplied no query.
                    logger.warning("No cypher query provided for CONTINUE action")
                    return "MATCH (n) RETURN count(n) as count", "No query provided", False, display_prompt, raw_response

                # Include reasoning in explanation if provided
                full_explanation = explanation
                if reasoning:
                    full_explanation = f"{explanation} (Reasoning: {reasoning})"

                # Add the decision to conversation history so Claude can see its own reasoning
                self.conversation_messages.append({
                    "role": "assistant",
                    "content": f"I decided to continue with another query: {cypher_query}\nExplanation: {full_explanation}"
                })

                return cypher_query, full_explanation, False, display_prompt, raw_response
        else:
            logger.warning("No tool use found in multi-round response")
            return "MATCH (n) RETURN count(n) as count", "Error: No valid tool response received", True, display_prompt, "No tool use found"

    except Exception as e:
        logger.error(f"Error generating next query: {e}")
        return "MATCH (n) RETURN count(n) as count", f"Error: {str(e)}", True, "", f"Error: {str(e)}"
def generate_final_answer_from_steps(self, question: str, executed_steps: List["QueryStep"]) -> str:
    """Ask the LLM to summarize the conversation history into a final answer.

    Returns a synthesized answer based on every executed query round, or a
    fallback message when no queries ran, the model reply is unusable, or
    the API call fails.
    """
    if not executed_steps:
        return "I wasn't able to execute any queries to answer your question."

    # Ask Claude to summarize based on the conversation history
    summary_request = f"Based on our conversation and the {len(executed_steps)} queries I've executed, please provide a comprehensive answer to the original question: '{question}'. Summarize what we learned and provide the best answer you can based on the data we gathered."
    history = self.conversation_messages + [{"role": "user", "content": summary_request}]

    try:
        # Extended-thinking request; temperature must be 1 when thinking is on.
        reply = self.anthropic_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            temperature=1,
            thinking={
                "type": "enabled",
                "budget_tokens": 1600
            },
            system=self.get_system_prompt(),
            messages=history
        )

        first = reply.content[0] if reply.content else None
        if first is not None and first.type == "text":
            return first.text
        # Fallback to basic summary
        return f"I executed {len(executed_steps)} queries to gather information, but couldn't generate a proper summary."

    except Exception as exc:
        logger.error(f"Error generating final answer: {exc}")
        return f"I executed {len(executed_steps)} queries but encountered an error when summarizing: {str(exc)}"
class PuppyGraphChatbot:
    """Main chatbot backend that coordinates MCP server, RAG system, and PuppyGraph"""

    def __init__(self,
                 puppygraph_bolt_uri: str = "bolt://localhost:7687",
                 puppygraph_http_uri: str = "http://localhost:8081",
                 puppygraph_username: str = "puppygraph",
                 puppygraph_password: str = "puppygraph123",
                 prompt_config: Optional[PromptConfig] = None):

        # Connection settings: Bolt endpoint for Cypher, HTTP endpoint for schema.
        # NOTE(review): defaults include hard-coded local-dev credentials;
        # override in any non-local deployment.
        self.puppygraph_bolt_uri = puppygraph_bolt_uri
        self.puppygraph_http_uri = puppygraph_http_uri
        self.puppygraph_username = puppygraph_username
        self.puppygraph_password = puppygraph_password

        # Initialize RAG system with optional prompt configuration
        self.rag_system = TextToCypherRAG(prompt_config=prompt_config)

        # MCP server process (None until start_mcp_server() is called)
        self.mcp_process = None

        # Cache for schema and frequently used data
        self.schema_cache = None
        self.schema_cache_time = 0
        self.cache_duration = 300  # 5 minutes

        # Conversation history
        self.conversation_history: List[Dict[str, Any]] = []
async def start_mcp_server(self):
    """Start the MCP server subprocess with PuppyGraph connection settings.

    Returns:
        bool: True if the process is still alive ~2 seconds after spawning,
        False if it exited early or could not be started.
    """
    import os  # local import keeps this fix self-contained in the function

    try:
        # BUG FIX: previously only the four PUPPYGRAPH_* variables were passed
        # as the child's ENTIRE environment, dropping PATH/PYTHONPATH etc.,
        # which can make the bare "python" executable unresolvable and breaks
        # any library relying on standard env vars. Extend a copy of the
        # parent environment instead of replacing it.
        env = {
            **os.environ,
            "PUPPYGRAPH_BOLT_URI": self.puppygraph_bolt_uri,
            "PUPPYGRAPH_HTTP_URI": self.puppygraph_http_uri,
            "PUPPYGRAPH_USERNAME": self.puppygraph_username,
            "PUPPYGRAPH_PASSWORD": self.puppygraph_password
        }

        self.mcp_process = subprocess.Popen(
            ["python", "mcp_server.py"],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            env=env
        )

        # Give it a moment to start
        await asyncio.sleep(2)

        if self.mcp_process.poll() is None:
            logger.info("MCP server started successfully")
            return True
        logger.error("MCP server failed to start")
        return False

    except Exception as e:
        logger.error(f"Error starting MCP server: {e}")
        return False


def stop_mcp_server(self):
    """Terminate the MCP server process if it is still running."""
    if self.mcp_process and self.mcp_process.poll() is None:
        self.mcp_process.terminate()
        self.mcp_process.wait()
        logger.info("MCP server stopped")


def get_schema(self) -> Dict[str, Any]:
    """Fetch the graph schema from PuppyGraph's HTTP endpoint, with caching.

    Serves a cached copy while it is fresher than ``cache_duration`` seconds;
    on fetch failure falls back to the stale cache, then to an empty schema.
    """
    current_time = time.time()

    # Return cached schema if still valid
    if (self.schema_cache and
            current_time - self.schema_cache_time < self.cache_duration):
        return self.schema_cache

    try:
        response = requests.get(
            f"{self.puppygraph_http_uri}/schemajson",
            auth=(self.puppygraph_username, self.puppygraph_password),
            timeout=10
        )
        response.raise_for_status()

        raw_schema = response.json()

        # Convert PuppyGraph schema format to our expected format
        converted_schema = self._convert_puppygraph_schema(raw_schema)

        self.schema_cache = converted_schema
        self.schema_cache_time = current_time

        return self.schema_cache

    except Exception as e:
        logger.error(f"Error fetching schema: {e}")
        # Return cached schema if available, otherwise empty schema
        return self.schema_cache or {"vertices": [], "edges": []}
schema 115 | return self.schema_cache or {"vertices": [], "edges": []} 116 | 117 | def _convert_puppygraph_schema(self, raw_schema: Dict[str, Any]) -> Dict[str, Any]: 118 | """Convert PuppyGraph schema format to our expected format based on graph_schema.proto""" 119 | 120 | try: 121 | # Extract graph definition (non-deprecated format) 122 | graph_def = raw_schema.get("graph", {}) 123 | 124 | # Convert vertices from Graph.VertexSchema format 125 | vertices = [] 126 | for vertex in graph_def.get("vertices", []): 127 | converted_vertex = { 128 | "label": vertex.get("label", "Unknown"), 129 | "attributes": [], 130 | "description": vertex.get("description", "") 131 | } 132 | 133 | # Handle OneToOne mapping (most common) 134 | one_to_one = vertex.get("oneToOne", {}) 135 | if one_to_one: 136 | # Extract attributes from MappedField format 137 | attributes = one_to_one.get("attributes", []) 138 | for attr in attributes: 139 | converted_vertex["attributes"].append({ 140 | "name": attr.get("alias", attr.get("field", "unknown")), 141 | "type": self._map_puppygraph_type(attr.get("type", "String")) 142 | }) 143 | 144 | # Handle ManyToOne mapping if present 145 | many_to_one = vertex.get("manyToOne", {}) 146 | if many_to_one: 147 | # For ManyToOne, we'll just show it has complex mapping 148 | converted_vertex["attributes"].append({ 149 | "name": "complex_mapping", 150 | "type": "ManyToOne" 151 | }) 152 | 153 | vertices.append(converted_vertex) 154 | 155 | # Convert edges from Graph.EdgeSchema format 156 | edges = [] 157 | for edge in graph_def.get("edges", []): 158 | converted_edge = { 159 | "label": edge.get("label", "Unknown"), 160 | "from": edge.get("fromVertex", "Unknown"), 161 | "to": edge.get("toVertex", "Unknown"), 162 | "attributes": [], 163 | "description": edge.get("description", "") 164 | } 165 | 166 | # Extract attributes from MappedField format 167 | attributes = edge.get("attributes", []) 168 | for attr in attributes: 169 | converted_edge["attributes"].append({ 170 
| "name": attr.get("alias", attr.get("field", "unknown")), 171 | "type": self._map_puppygraph_type(attr.get("type", "String")) 172 | }) 173 | 174 | edges.append(converted_edge) 175 | 176 | return { 177 | "vertices": vertices, 178 | "edges": edges 179 | } 180 | 181 | except Exception as e: 182 | logger.error(f"Error converting PuppyGraph schema: {e}") 183 | return {"vertices": [], "edges": []} 184 | 185 | def _map_puppygraph_type(self, puppygraph_type: str) -> str: 186 | """Map PuppyGraph types to standard types""" 187 | type_mapping = { 188 | "String": "String", 189 | "Int": "Integer", 190 | "Double": "Double", 191 | "Boolean": "Boolean", 192 | "Long": "Long", 193 | "Float": "Float" 194 | } 195 | return type_mapping.get(puppygraph_type, "String") 196 | 197 | def execute_cypher_direct(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: 198 | """Execute Cypher query directly against PuppyGraph""" 199 | from neo4j import GraphDatabase 200 | from neo4j.exceptions import ServiceUnavailable, AuthError 201 | 202 | try: 203 | driver = GraphDatabase.driver( 204 | self.puppygraph_bolt_uri, 205 | auth=(self.puppygraph_username, self.puppygraph_password) 206 | ) 207 | 208 | with driver.session() as session: 209 | result = session.run(query, params or {}) 210 | records = [record.data() for record in result] 211 | 212 | driver.close() 213 | 214 | return { 215 | "success": True, 216 | "data": records, 217 | "query": query, 218 | "record_count": len(records) 219 | } 220 | 221 | except ServiceUnavailable as e: 222 | error_msg = f"PuppyGraph server not available at {self.puppygraph_bolt_uri}. Please ensure PuppyGraph is running." 223 | logger.error(error_msg) 224 | return { 225 | "success": False, 226 | "error": error_msg, 227 | "query": query 228 | } 229 | except AuthError as e: 230 | error_msg = f"Authentication failed. Check PuppyGraph credentials." 
def process_natural_language_query_streaming(self, question: str):
    """Process a natural language question with streaming progress updates.

    Generator: yields formatted markdown progress strings for the UI.
    Runs up to 5 LLM-driven query rounds; stops early when the RAG system
    decides it has enough information, otherwise forces a summary after
    the last round.  Side effects: appends to self.conversation_history
    and to the RAG system's conversation_messages.
    """

    try:
        # Get current schema
        schema = self.get_schema()

        # Initialize progress
        executed_steps = []
        max_rounds = 5

        # Yield initial status
        yield self._format_streaming_update("🤖 Starting multi-round query execution...", executed_steps, None, question)

        for round_num in range(1, max_rounds + 1):
            # Generate next query or decision to stop
            yield self._format_streaming_update(f"🔄 Round {round_num}: Analyzing question and generating query...", executed_steps, None, question)

            cypher, explanation, should_stop, prompt, llm_response = self.rag_system.generate_next_query(
                question, schema, executed_steps, max_rounds
            )

            if should_stop:
                # Final answer ready; `explanation` holds the final answer here
                final_result = {
                    "question": question,
                    "executed_steps": executed_steps,
                    "final_answer": explanation,
                    "total_rounds": round_num - 1,
                    "success": True,
                    "stopped_reason": "LLM determined sufficient information gathered"
                }
                self.conversation_history.append(final_result)
                yield self._format_streaming_update("✅ Analysis complete!", executed_steps, explanation, question, final=True)
                return

            # Show generated query
            yield self._format_streaming_update(f"📝 Generated query for round {round_num}", executed_steps, None, question, current_query=cypher, current_description=explanation, current_prompt=prompt, current_llm_response=llm_response)

            # Create query step
            from rag_system import QueryStep
            query_step = QueryStep(
                step_number=round_num,
                description=explanation,
                cypher=cypher,
                prompt=prompt,
                llm_response=llm_response
            )

            # Execute the query
            yield self._format_streaming_update(f"⚡ Executing query {round_num}...", executed_steps, None, question, current_query=cypher)

            try:
                execution_result = self.execute_cypher_direct(cypher)
                if execution_result.get("success", False):
                    query_step.result = execution_result.get("data", [])
                    result_summary = f"✅ Query {round_num} completed: {len(query_step.result)} records returned"
                else:
                    query_step.error = execution_result.get("error", "Unknown error")
                    result_summary = f"❌ Query {round_num} failed: {query_step.error}"
            except Exception as e:
                query_step.error = str(e)
                result_summary = f"❌ Query {round_num} error: {str(e)}"

            executed_steps.append(query_step)

            # Add query result to RAG system conversation history
            # (truncated to 5 sample records to keep the LLM context small)
            if query_step.result is not None:
                total_records = len(query_step.result)
                rag_result_summary = f"Query executed successfully. Found {total_records} records."
                if query_step.result:
                    sample = query_step.result[:5]  # Show first 5 records as sample
                    rag_result_summary += f" Sample data (showing {len(sample)} of {total_records}): {json.dumps(sample, default=str)}"
                    if total_records > 5:
                        rag_result_summary += f" (Note: {total_records - 5} additional records omitted)"
            elif query_step.error:
                rag_result_summary = f"Query failed with error: {query_step.error}"
            else:
                rag_result_summary = "Query executed with no result data."

            self.rag_system.conversation_messages.append({
                "role": "assistant",
                "content": f"I executed this query: {query_step.cypher}\nResult: {rag_result_summary}"
            })

            # Show execution result
            yield self._format_streaming_update(result_summary, executed_steps, None, question)

        # If we reach max rounds, force stop
        final_answer = self.rag_system.generate_final_answer_from_steps(question, executed_steps)
        final_result = {
            "question": question,
            "executed_steps": executed_steps,
            "final_answer": final_answer,
            "total_rounds": max_rounds,
            "success": True,
            "stopped_reason": f"Reached maximum rounds ({max_rounds})"
        }
        self.conversation_history.append(final_result)
        yield self._format_streaming_update("🛑 Maximum rounds reached", executed_steps, final_answer, question, final=True)

    except Exception as e:
        logger.error(f"Error in streaming processing: {e}")
        yield f"❌ Error: {str(e)}"
def _format_streaming_update(self, status: str, executed_steps, final_answer: str = None, question: str = "", final: bool = False, current_query: str = None, current_description: str = None, current_prompt: str = None, current_llm_response: str = None) -> str:
    """Format a streaming progress update in chronological order.

    Two layouts: when ``final`` with a ``final_answer``, the answer is shown
    first and all processing detail is collapsed; otherwise a running view of
    completed steps plus the in-progress step is rendered.
    NOTE(review): the <details>/<summary> HTML tags inside the f-strings were
    garbled in extraction and are reconstructed here - verify against the
    original file.
    """

    response = f"**Question:** {question}\n\n"

    # If we have a final answer, show it prominently first and collapse the details
    if final_answer and final:
        response += f"**🎯 Final Answer:**\n{final_answer}\n\n"
        response += f"---\n\n"

        # Put all processing details in a collapsible section
        details_content = f"**Status:** {status}\n\n"

        # System configuration in details
        system_prompt = self.rag_system.get_system_prompt()
        if system_prompt:
            details_content += f"**🔧 System Configuration:**\n"
            details_content += f"<details>\n<summary>View System Prompt (Schema, Rules, Examples)</summary>\n\n```\n{system_prompt}\n```\n</details>\n\n"

        # Processing steps in details
        if executed_steps:
            details_content += f"**Processing Steps ({len(executed_steps)}):**\n\n"
            for step in executed_steps:
                details_content += f"---\n**Step {step.step_number}:** {step.description}\n\n"

                # LLM request for this step
                if hasattr(step, 'prompt') and step.prompt:
                    details_content += f"**🤖 LLM Request:**\n```\n{step.prompt}\n```\n\n"

                # LLM response (tool use)
                if hasattr(step, 'llm_response') and step.llm_response:
                    details_content += f"**🤖 LLM Response:**\n```json\n{step.llm_response}\n```\n\n"

                # Generated Cypher query
                details_content += f"**🔗 Cypher Query:**\n```cypher\n{step.cypher}\n```\n\n"

                # Query execution results
                if step.result is not None:
                    details_content += f"**📊 Query Results:** ✅ {len(step.result)} records returned\n"
                    if step.result:
                        sample_size = min(5, len(step.result))
                        details_content += f"**Sample Data (showing {sample_size} of {len(step.result)} records):**\n"
                        details_content += f"```json\n{json.dumps(step.result[:sample_size], indent=2, default=str)}\n```\n"
                        if len(step.result) > sample_size:
                            details_content += f"... and {len(step.result) - sample_size} more records\n"
                    details_content += "\n"
                elif step.error:
                    details_content += f"**Query Error:** ❌ {step.error}\n\n"

                details_content += "\n"

        # Wrap all details in a collapsible section
        response += f"<details>\n<summary>📋 Click to view detailed processing steps ({len(executed_steps)} queries executed)</summary>\n\n{details_content}\n</details>\n"

        return response

    # If not final, show the regular streaming format
    response += f"**Status:** {status}\n\n"

    # 1. System prompt (always first)
    system_prompt = self.rag_system.get_system_prompt()
    if system_prompt:
        response += f"**🔧 System Configuration:**\n"
        response += f"<details>\n<summary>View System Prompt (Schema, Rules, Examples)</summary>\n\n```\n{system_prompt}\n```\n</details>\n\n"

    # 2. Initial user query (what started everything)
    response += f"**🧑 User Query:** {question}\n\n"

    # 3. Show completed steps in chronological order
    if executed_steps:
        response += f"**Processing Steps ({len(executed_steps)}):**\n\n"
        for step in executed_steps:
            response += f"---\n**Step {step.step_number}:** {step.description}\n\n"

            # 3a. LLM request for this step
            if hasattr(step, 'prompt') and step.prompt:
                response += f"**🤖 LLM Request:**\n```\n{step.prompt}\n```\n\n"

            # 3b. LLM response (tool use)
            if hasattr(step, 'llm_response') and step.llm_response:
                response += f"**🤖 LLM Response:**\n```json\n{step.llm_response}\n```\n\n"

            # 3c. Generated Cypher query
            response += f"**🔗 Cypher Query:**\n```cypher\n{step.cypher}\n```\n\n"

            # 3d. Query execution results
            if step.result is not None:
                response += f"**📊 Query Results:** ✅ {len(step.result)} records returned\n"
                if step.result:
                    sample_size = min(5, len(step.result))
                    response += f"**Sample Data (showing {sample_size} of {len(step.result)} records):**\n"
                    response += f"```json\n{json.dumps(step.result[:sample_size], indent=2, default=str)}\n```\n"
                    if len(step.result) > sample_size:
                        response += f"... and {len(step.result) - sample_size} more records\n"
                response += "\n"
            elif step.error:
                response += f"**Query Error:** ❌ {step.error}\n\n"

            response += "\n"

    # 4. Show current step in progress (if any)
    if current_query and not final:
        step_num = len(executed_steps) + 1
        response += f"---\n**Step {step_num} (In Progress):** {current_description or 'Processing...'}\n\n"

        # 4a. LLM request for current step
        if current_prompt:
            response += f"**🤖 LLM Request:**\n```\n{current_prompt}\n```\n\n"

        # 4b. LLM response for current step
        if current_llm_response:
            response += f"**🤖 LLM Response:**\n```json\n{current_llm_response}\n```\n\n"

        # 4c. Generated Cypher query
        response += f"**🔗 Cypher Query:**\n```cypher\n{current_query}\n```\n\n"
        response += f"**📊 Status:** Executing query...\n\n"

    return response
LLM response for current step 460 | if current_llm_response: 461 | response += f"**🤖 LLM Response:**\n```json\n{current_llm_response}\n```\n\n" 462 | 463 | # 4c. Generated Cypher query 464 | response += f"**🔗 Cypher Query:**\n```cypher\n{current_query}\n```\n\n" 465 | response += f"**📊 Status:** Executing query...\n\n" 466 | 467 | return response 468 | 469 | def add_query_example(self, question: str, cypher: str, description: str) -> bool: 470 | """Add a new query example to the RAG system""" 471 | try: 472 | example = QueryExample( 473 | question=question, 474 | cypher=cypher, 475 | description=description 476 | ) 477 | self.rag_system.add_example(example) 478 | return True 479 | except Exception as e: 480 | logger.error(f"Error adding query example: {e}") 481 | return False 482 | 483 | def get_conversation_history(self, limit: int = 10) -> List[Dict[str, Any]]: 484 | """Get recent conversation history""" 485 | return self.conversation_history[-limit:] if self.conversation_history else [] 486 | 487 | def clear_conversation_history(self): 488 | """Clear conversation history""" 489 | self.conversation_history.clear() 490 | # Also clear the RAG system's conversation history 491 | self.rag_system.clear_conversation() 492 | 493 | def update_prompt_config(self, prompt_config: PromptConfig): 494 | """Update the prompt configuration for the RAG system""" 495 | self.rag_system.update_prompt_config(prompt_config) 496 | logger.info("Chatbot prompt configuration updated") 497 | 498 | def get_prompt_config(self) -> PromptConfig: 499 | """Get the current prompt configuration""" 500 | return self.rag_system.get_prompt_config() 501 | 502 | def get_graph_stats(self) -> Dict[str, Any]: 503 | """Get basic graph statistics""" 504 | try: 505 | stats_query = """ 506 | MATCH (n) 507 | WITH count(n) as node_count 508 | MATCH ()-[r]->() 509 | RETURN node_count, count(r) as edge_count 510 | """ 511 | 512 | result = self.execute_cypher_direct(stats_query) 513 | 514 | if result["success"] 
and result["data"]: 515 | stats = result["data"][0] 516 | 517 | # Get node labels and relationship types separately 518 | node_labels = [] 519 | relationship_types = [] 520 | 521 | try: 522 | # Try to get distinct node labels 523 | labels_result = self.execute_cypher_direct("MATCH (n) RETURN DISTINCT labels(n) as node_labels LIMIT 20") 524 | if labels_result["success"]: 525 | node_labels = [item["node_labels"][0] for item in labels_result["data"] if item.get("node_labels")] 526 | except: 527 | pass 528 | 529 | try: 530 | # Try to get distinct relationship types 531 | types_result = self.execute_cypher_direct("MATCH ()-[r]->() RETURN DISTINCT type(r) as relationship_type LIMIT 20") 532 | if types_result["success"]: 533 | relationship_types = [item["relationship_type"] for item in types_result["data"]] 534 | except: 535 | pass 536 | 537 | return { 538 | "node_count": stats.get("node_count", 0), 539 | "edge_count": stats.get("edge_count", 0), 540 | "node_labels": node_labels, 541 | "relationship_types": relationship_types 542 | } 543 | else: 544 | return {"node_count": 0, "edge_count": 0, "node_labels": [], "relationship_types": []} 545 | 546 | except Exception as e: 547 | logger.error(f"Error getting graph stats: {e}") 548 | return {"error": str(e)} 549 | 550 | def cleanup(self): 551 | """Cleanup resources""" 552 | self.stop_mcp_server() 553 | 554 | 555 | # Global chatbot instance 556 | chatbot = None 557 | 558 | def get_chatbot() -> PuppyGraphChatbot: 559 | """Get or create the global chatbot instance""" 560 | global chatbot 561 | if chatbot is None: 562 | chatbot = PuppyGraphChatbot() 563 | return chatbot 564 | 565 | def shutdown_chatbot(): 566 | """Shutdown the global chatbot instance""" 567 | global chatbot 568 | if chatbot: 569 | chatbot.cleanup() 570 | chatbot = None -------------------------------------------------------------------------------- /apps/chatbot/gradio_app.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python3
"""Gradio front-end for the PuppyGraph RAG chatbot.

Wires the natural-language chat UI to the backend chatbot instance and
renders graph statistics / schema information as formatted text.
"""

import gradio as gr
import json
import time
from typing import List, Tuple, Dict, Any
import logging

from backend import get_chatbot, shutdown_chatbot
from rag_system import PromptConfig

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("gradio_app")

# Global variables for maintaining state
chatbot_instance = None


def initialize_chatbot():
    """Return the process-wide chatbot backend, creating it on first use.

    The instance is cached in the module-level ``chatbot_instance`` global so
    every UI callback shares one backend (and one conversation history).
    """
    global chatbot_instance
    if chatbot_instance is None:
        chatbot_instance = get_chatbot()
    return chatbot_instance


def process_message_streaming(message: str, history: List[Dict[str, str]]):
    """Process a user message, yielding incremental chat-history updates.

    Generator used as a streaming Gradio event handler: each ``yield`` of the
    mutated ``history`` list re-renders the Chatbot component.

    Args:
        message: Raw user input; blank/whitespace-only input is ignored.
        history: Chat history in Gradio "messages" format
            (dicts with ``role``/``content`` keys), mutated in place.
    """
    if not message.strip():
        return

    # Tracks whether the assistant placeholder was appended, so the error
    # path knows whether history[-1] is ours to overwrite.  Without this,
    # a failure inside initialize_chatbot() would clobber the previous
    # message in the conversation instead of reporting the error.
    placeholder_added = False
    try:
        chatbot = initialize_chatbot()

        # Add user message to history with empty response initially
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": ""})
        placeholder_added = True
        yield history

        # Stream the processing: each update is the full response so far,
        # so we replace (not append to) the assistant placeholder.
        full_response = ""
        for update in chatbot.process_natural_language_query_streaming(message):
            full_response = update
            # Update the last assistant message in history
            history[-1] = {"role": "assistant", "content": full_response}
            yield history

    except Exception as e:
        error_response = f"❌ Error processing your message: {str(e)}"
        if placeholder_added:
            history[-1] = {"role": "assistant", "content": error_response}
        else:
            # Failure happened before any message was appended for this turn;
            # append the error rather than overwrite an unrelated entry.
            history.append({"role": "assistant", "content": error_response})
        yield history


def get_graph_stats() -> str:
    """Fetch graph statistics from the backend and format them as markdown.

    Returns:
        A markdown string with node/edge counts, labels and relationship
        types, or an ``❌``-prefixed error message on failure.
    """
    try:
        chatbot = initialize_chatbot()
        stats = chatbot.get_graph_stats()

        # Backend signals failure via an "error" key rather than raising.
        if "error" in stats:
            return f"❌ Error getting stats: {stats['error']}"

        stats_text = f"""
📊 **Graph Statistics**

🔢 **Nodes:** {stats.get('node_count', 0)}
🔗 **Edges:** {stats.get('edge_count', 0)}

🏷️ **Node Labels:** {', '.join(stats.get('node_labels', []))}
⚡ **Relationship Types:** {', '.join(stats.get('relationship_types', []))}
"""

        return stats_text.strip()

    except Exception as e:
        return f"❌ Error: {str(e)}"


def get_schema_info() -> str:
    """Fetch the graph schema from the backend and format it as markdown.

    Renders vertices and edges (with their attributes) as bullet lists, and
    produces explanatory warnings when the schema is empty or unreachable.

    Returns:
        A markdown string describing the schema, or a diagnostic message.
    """
    try:
        chatbot = initialize_chatbot()
        schema = chatbot.get_schema()

        if not schema:
            return "📋 **Graph Schema**\n\n⚠️ No schema information available. Please ensure PuppyGraph is running and has a configured schema."

        schema_text = "📋 **Graph Schema**\n\n"

        # Format vertices
        vertices = schema.get("vertices", [])
        edges = schema.get("edges", [])

        if not vertices and not edges:
            schema_text += "⚠️ **No schema found**\n\n"
            schema_text += "This could mean:\n"
            schema_text += "• PuppyGraph is not running\n"
            schema_text += "• No schema has been configured in PuppyGraph\n"
            schema_text += "• Connection to PuppyGraph failed\n\n"
            schema_text += "Please check your PuppyGraph server status and configuration."
            return schema_text

        if vertices:
            schema_text += "🟢 **Vertices:**\n"
            for vertex in vertices:
                label = vertex.get("label", "Unknown")
                attributes = vertex.get("attributes", [])
                if attributes:
                    # Assumes each attribute dict has 'name' and 'type' keys —
                    # matches the backend schema shape; TODO confirm.
                    attr_text = ", ".join([f"{attr['name']}:{attr['type']}" for attr in attributes])
                    schema_text += f"  • **{label}**: {attr_text}\n"
                else:
                    schema_text += f"  • **{label}**: (no attributes)\n"
            schema_text += "\n"

        if edges:
            schema_text += "🔗 **Edges:**\n"
            for edge in edges:
                label = edge.get("label", "Unknown")
                from_vertex = edge.get("from", "Unknown")
                to_vertex = edge.get("to", "Unknown")
                attributes = edge.get("attributes", [])
                if attributes:
                    attr_text = ", ".join([f"{attr['name']}:{attr['type']}" for attr in attributes])
                    schema_text += f"  • **{from_vertex}** -[{label}]-> **{to_vertex}**: {attr_text}\n"
                else:
                    schema_text += f"  • **{from_vertex}** -[{label}]-> **{to_vertex}**: (no attributes)\n"

        # One-sided schemas are unusual enough to flag explicitly.
        if not vertices and edges:
            schema_text += "\n⚠️ **Note**: Found edge definitions but no vertex definitions."
        elif vertices and not edges:
            schema_text += "\n⚠️ **Note**: Found vertex definitions but no edge definitions."

        return schema_text

    except Exception as e:
        return f"📋 **Graph Schema**\n\n❌ Error getting schema: {str(e)}\n\nPlease check that PuppyGraph is running and accessible."
def start_new_session() -> list:
    """Start a new session by clearing chat and conversation history.

    Returns an empty list so it can be wired directly to the Chatbot
    component's output (clearing the rendered chat) — even on failure.
    """
    try:
        chatbot = initialize_chatbot()
        chatbot.clear_conversation_history()
        logger.info("New session started - conversation history cleared")
        return []
    except Exception as e:
        logger.error(f"Error starting new session: {e}")
        return []


def add_example_query(question: str, cypher: str, description: str) -> str:
    """Add a new example query to the RAG system.

    Args:
        question: Natural-language question (required, non-blank).
        cypher: Corresponding Cypher query (required, non-blank).
        description: Optional free-text description; a default is used if empty.

    Returns:
        A human-readable status string for display in the UI.
    """
    if not question.strip() or not cypher.strip():
        return "❌ Question and Cypher query are required"

    try:
        chatbot = initialize_chatbot()
        success = chatbot.add_query_example(question, cypher, description or "User-added example")

        if success:
            return "✅ Example added successfully to the knowledge base"
        else:
            return "❌ Failed to add example"

    except Exception as e:
        return f"❌ Error adding example: {str(e)}"


def get_current_prompt_config() -> Tuple[str, str, str, str]:
    """Get the current prompt configuration components.

    Returns the four PromptConfig fields in the order expected by the
    Prompt Config tab's textboxes; on error, the same error string is
    returned for all four slots so every field shows the problem.
    """
    try:
        chatbot = initialize_chatbot()
        config = chatbot.get_prompt_config()
        return (
            config.role_definition,
            config.plan_generation_instruction,
            config.puppygraph_differences,
            config.output_format_instruction
        )
    except Exception as e:
        error_msg = f"Error getting config: {str(e)}"
        return error_msg, error_msg, error_msg, error_msg


def update_prompt_config(role_def: str, plan_gen: str, puppygraph_diff: str, output_format: str) -> str:
    """Update the prompt configuration.

    Blank/whitespace-only fields fall back to the hard-coded defaults below
    (mining-site domain wording) rather than clearing the corresponding
    PromptConfig field.

    Returns:
        A human-readable status string for display in the UI.
    """
    try:
        chatbot = initialize_chatbot()

        # Create new config with updated values
        new_config = PromptConfig(
            role_definition=role_def.strip() or """You are a helpful assistant to help answer user questions about assets in a mining site.
You will need to use the information stored in the graph database to answer the user's questions.
Here is some information about the graph database schema.""",
            plan_generation_instruction=plan_gen.strip() or """You must first output a PLAN, then you can use the PLAN to call the tools.
Each STEP of the PLAN should be corresponding to one or more function calls (but not less), either simple or complex.
Minimize the number of steps in the PLAN, but make sure the PLAN is workable.
Remember, each step can be converted to a Cypher query, since Cypher query can handle quite complex queries,
each step can be complex as well as long as it can be converted to a Cypher query.

IMPORTANT RESULT HANDLING STRATEGY:
- If your query results are truncated (you see "[Results truncated...]"), you have several options:
1. Use a smaller LIMIT size to get a sample of results first for exploration
2. Add COUNT(*) queries to understand total result sizes before fetching data
3. For final comprehensive results, remove LIMIT clauses entirely to provide complete downloadable data
- When providing final conclusions to users, ensure the last query retrieves complete data (no LIMIT) for download
- Structure your approach: exploration -> understanding -> comprehensive final result""",
            puppygraph_differences=puppygraph_diff.strip() or """PUPPYGRAPH DIFFERENCES FROM STANDARD CYPHER:
When calculating failures for a particular asset, also first find out the work orders that are related to the asset,
then count the work orders that are related to the failure using related_to_failure.
DO NOT USE can_have_failure for counting total number of failures, USE related_to_failure instead.""",
            output_format_instruction=output_format.strip() or """OUTPUT FORMAT:
Always use the format {
'THINKING': ,
'PLAN': ,
'CONCLUSION': ,
'FINAL_DATA_AVAILABLE': ,
'QUERY_EXECUTION_SUMMARY': }

RESULT MANAGEMENT GUIDELINES:
- For exploratory queries, use appropriate LIMIT clauses (10-50 records)
- For final results intended for user download, use NO LIMIT to provide complete data
- Always inform users when data is available for download
- Include query execution summary to help users understand what was analyzed"""
        )

        chatbot.update_prompt_config(new_config)
        return "✅ Prompt configuration updated successfully! The new settings will be used for future queries."

    except Exception as e:
        return f"❌ Error updating prompt configuration: {str(e)}"


def reset_prompt_config() -> Tuple[str, str, str, str, str]:
    """Reset prompt configuration to defaults.

    Applies a fresh default PromptConfig to the backend and returns its four
    fields plus a status message, matching the five outputs wired to the
    reset button. On error, the four field slots come back empty.
    """
    try:
        chatbot = initialize_chatbot()
        default_config = PromptConfig()  # Create with defaults
        chatbot.update_prompt_config(default_config)

        return (
            default_config.role_definition,
            default_config.plan_generation_instruction,
            default_config.puppygraph_differences,
            default_config.output_format_instruction,
            "✅ Prompt configuration reset to defaults!"
        )
    except Exception as e:
        error_msg = f"❌ Error resetting config: {str(e)}"
        return "", "", "", "", error_msg


def create_interface() -> gr.Blocks:
    """Create the Gradio interface.

    Builds the full tabbed UI (Chat, Graph Info, Add Examples, Prompt Config,
    Debug/Prompts, Help) and wires every event handler. Component creation
    order matters: components are attached to the enclosing ``gr.Blocks``
    context as they are constructed.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` interface.
    """

    # Custom CSS for better styling
    css = """
    .gradio-container {
        max-width: 1200px !important;
    }
    .chat-message {
        font-family: 'Courier New', monospace;
    }
    .stats-display {
        background-color: #f0f0f0;
        padding: 10px;
        border-radius: 5px;
        font-family: monospace;
    }
    """

    with gr.Blocks(css=css, title="PuppyGraph RAG Chat") as interface:

        gr.Markdown("""
        # 🐶 PuppyGraph RAG Chatbot

        Ask questions about your graph in natural language! Watch in **real-time** as I analyze your question, generate and execute multiple Cypher queries, and build a comprehensive answer step by step.

        **🆕 Optimized Conversation System:** Now uses efficient context management:
        - **System prompt**: Schema, rules, and examples defined once per session
        - **Conversation history**: Maintains context across questions without repetition
        - **Full transparency**: Complete prompts, responses, queries, and results
        - **Efficient prompting**: Only dynamic content sent to LLM, reducing costs
        - **Multi-round execution**: Context-aware query generation

        **Examples to try:**
        - "Show me all nodes in the graph"
        - "Count all relationships"
        - "What are the different types of nodes?"
        - "Find highly connected nodes"
        - "Which users have the most connections and what do they connect to?"
        - "What percentage of nodes have more than 5 relationships?"
        """)

        with gr.Tab("💬 Chat"):
            gr.Markdown("""
            ### Chat with PuppyGraph 🔄
            **Full conversation transparency enabled** - See every prompt, response, query, and result in detail
            """)

            # type="messages" pairs with process_message_streaming's
            # role/content dict format; sanitize_html=False lets the
            # backend's markdown/HTML render as-is.
            chatbot_ui = gr.Chatbot(
                value=[],
                height=600,
                label="PuppyGraph Assistant (Full Conversation Details)",
                show_label=True,
                elem_classes=["chat-message"],
                type="messages",
                render_markdown=True,
                sanitize_html=False
            )

            msg = gr.Textbox(
                placeholder="Ask me anything about your graph...",
                label="Your Question",
                lines=2
            )

            with gr.Row():
                submit_btn = gr.Button("Send", variant="primary", size="sm")
                clear_btn = gr.Button("🔄 New Session", size="sm", variant="stop")

            # Event handlers for chat (always streaming); Enter key and the
            # Send button trigger the same generator handler.
            msg.submit(
                process_message_streaming,
                inputs=[msg, chatbot_ui],
                outputs=[chatbot_ui]
            ).then(
                lambda: "", outputs=[msg]  # Clear input after submission
            )

            submit_btn.click(
                process_message_streaming,
                inputs=[msg, chatbot_ui],
                outputs=[chatbot_ui]
            ).then(
                lambda: "", outputs=[msg]  # Clear input after submission
            )

            clear_btn.click(
                start_new_session,
                outputs=[chatbot_ui]
            )

        with gr.Tab("📊 Graph Info"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Graph Statistics")
                    stats_display = gr.Textbox(
                        label="Current Stats",
                        lines=8,
                        interactive=False,
                        elem_classes=["stats-display"]
                    )
                    stats_btn = gr.Button("Refresh Stats", variant="secondary")

                with gr.Column():
                    gr.Markdown("### Schema Information")
                    schema_display = gr.Textbox(
                        label="Graph Schema",
                        lines=8,
                        interactive=False,
                        elem_classes=["stats-display"]
                    )
                    schema_btn = gr.Button("Refresh Schema", variant="secondary")

            # Event handlers for info tab
            stats_btn.click(
                get_graph_stats,
                outputs=[stats_display]
            )

            schema_btn.click(
                get_schema_info,
                outputs=[schema_display]
            )

        with gr.Tab("➕ Add Examples"):
            gr.Markdown("""
            ### Add Query Examples
            Help improve the chatbot by adding your own question-to-Cypher examples!
            """)

            example_question = gr.Textbox(
                label="Natural Language Question",
                placeholder="e.g., 'Find all users who like movies'",
                lines=2
            )

            example_cypher = gr.Textbox(
                label="Corresponding Cypher Query",
                placeholder="e.g., 'MATCH (u:User)-[:LIKES]->(m:Movie) RETURN u, m LIMIT 20'",
                lines=3
            )

            example_description = gr.Textbox(
                label="Description (optional)",
                placeholder="Brief description of what this query does",
                lines=1
            )

            add_example_btn = gr.Button("Add Example", variant="primary")
            example_result = gr.Textbox(
                label="Result",
                interactive=False,
                lines=2
            )

            # Event handler for adding examples
            add_example_btn.click(
                add_example_query,
                inputs=[example_question, example_cypher, example_description],
                outputs=[example_result]
            )

        with gr.Tab("⚙️ Prompt Config"):
            gr.Markdown("""
            ### Configure System Prompts
            Customize how the AI assistant behaves by configuring the four key components of the system prompt.
            Changes take effect immediately for new queries.
            """)

            # Role Definition
            with gr.Group():
                gr.Markdown("#### 1️⃣ Role Definition")
                gr.Markdown("Define the AI's role and expertise level.")
                role_definition = gr.Textbox(
                    label="Role Definition",
                    lines=2,
                    placeholder="You are an expert at converting natural language questions to Cypher queries for graph databases."
                )

            # Plan Generation Instruction
            with gr.Group():
                gr.Markdown("#### 2️⃣ Plan Generation Instruction")
                gr.Markdown("Tell the AI to create a plan before generating queries.")
                plan_generation = gr.Textbox(
                    label="Plan Generation Instruction",
                    lines=2,
                    placeholder="First, analyze the question and create a step-by-step plan for generating the appropriate Cypher query."
                )

            # PuppyGraph Differences
            with gr.Group():
                gr.Markdown("#### 3️⃣ PuppyGraph vs Standard Cypher Differences")
                gr.Markdown("Explain how PuppyGraph differs from standard Cypher syntax.")
                puppygraph_differences = gr.Textbox(
                    label="PuppyGraph Differences",
                    lines=4,
                    placeholder="""PUPPYGRAPH DIFFERENCES FROM STANDARD CYPHER:
- PuppyGraph supports standard Cypher syntax
- Use proper node and relationship patterns: (n)-[r]->(m)
- Labels and properties follow Neo4j conventions
- Functions like count(), size(), type() work as expected"""
                )

            # Output Format Instruction
            with gr.Group():
                gr.Markdown("#### 4️⃣ Output Format Instruction")
                gr.Markdown("Define the expected output format and structure.")
                output_format = gr.Textbox(
                    label="Output Format Instruction",
                    lines=4,
                    placeholder="""OUTPUT FORMAT:
Use the generate_cypher_query tool to create a Cypher query that answers this question.
Provide:
1. A complete, valid Cypher query
2. A clear explanation of what the query does
3. Step-by-step reasoning (optional but helpful)"""
                )

            # Action buttons
            with gr.Row():
                load_current_btn = gr.Button("Load Current Config", variant="secondary")
                update_config_btn = gr.Button("Update Configuration", variant="primary")
                reset_config_btn = gr.Button("Reset to Defaults", variant="stop")

            # Result display
            config_result = gr.Textbox(
                label="Status",
                interactive=False,
                lines=2,
                placeholder="Click 'Load Current Config' to see the current settings."
            )

            # Event handlers for prompt config: load fills the four fields,
            # then a chained .then() writes the confirmation message.
            load_current_btn.click(
                get_current_prompt_config,
                outputs=[role_definition, plan_generation, puppygraph_differences, output_format]
            ).then(
                lambda: "✅ Current configuration loaded into form fields above.",
                outputs=[config_result]
            )

            update_config_btn.click(
                update_prompt_config,
                inputs=[role_definition, plan_generation, puppygraph_differences, output_format],
                outputs=[config_result]
            )

            reset_config_btn.click(
                reset_prompt_config,
                outputs=[role_definition, plan_generation, puppygraph_differences, output_format, config_result]
            )

        with gr.Tab("📝 Debug/Prompts"):
            gr.Markdown("### Prompt Debugging")
            gr.Markdown("""
            This tab shows the latest prompt used to generate Cypher queries. This is useful for understanding
            how the RAG system constructs prompts and for debugging query generation issues.
            """)

            with gr.Row():
                with gr.Column():
                    latest_prompt_display = gr.Textbox(
                        label="Latest Prompt Used",
                        lines=12,
                        interactive=False,
                        placeholder="No prompts captured yet. Run a query to see the prompt used."
                    )

                with gr.Column():
                    latest_response_display = gr.Textbox(
                        label="Latest LLM Response",
                        lines=12,
                        interactive=False,
                        placeholder="No LLM responses captured yet. Run a query to see the response."
                    )

            refresh_debug_btn = gr.Button("Refresh Latest Prompt & Response", variant="secondary")

            # Function to get the latest prompt and response from conversation history
            def get_latest_debug_info():
                """Return (prompt, llm_response) from the most recent step.

                Looks at the last conversation entry's executed_steps; step
                objects may lack prompt/llm_response, hence getattr with
                fallbacks. Returns matching placeholder/error text in both
                slots when nothing is available.
                """
                try:
                    chatbot = initialize_chatbot()
                    history = chatbot.get_conversation_history(1)
                    if history and "executed_steps" in history[0]:
                        steps = history[0]["executed_steps"]
                        if steps:
                            latest_step = steps[-1]
                            prompt = getattr(latest_step, 'prompt', None) or "No prompt available"
                            response = getattr(latest_step, 'llm_response', None) or "No LLM response available"
                            return prompt, response
                    return "No debug information available in recent conversation history.", "No debug information available in recent conversation history."
                except Exception as e:
                    error_msg = f"Error retrieving debug info: {str(e)}"
                    return error_msg, error_msg

            refresh_debug_btn.click(
                get_latest_debug_info,
                outputs=[latest_prompt_display, latest_response_display]
            )

        with gr.Tab("ℹ️ Help"):
            gr.Markdown("""
            ## How to Use PuppyGraph RAG Chatbot

            ### 🗣️ Chat Tab
            - Type natural language questions about your graph
            - **Real-time streaming**: Watch each query step execute live as it happens
            - **Multi-round execution**: generates and runs multiple Cypher queries as needed
            - Claude Sonnet 4.0 decides when it has enough information to provide a complete answer
            - **🆕 Full conversation transparency**: See complete details including:
              - Full prompts sent to Claude Sonnet 4.0 with schema and context
              - Complete LLM responses showing reasoning and decision-making
              - All generated Cypher queries with explanations
              - Full query results with detailed data samples
              - Step-by-step progression through the entire conversation

            ### 📊 Graph Info Tab
            - View basic statistics about your graph (node count, edge count, etc.)
            - Explore the schema to understand available node types and relationships

            ### ➕ Add Examples Tab
            - Teach the chatbot new patterns by adding question-query pairs
            - Your examples will be used to improve future query generation

            ### ⚙️ Prompt Config Tab
            - **Customize AI behavior**: Configure the 4 key components of system prompts
            - **Role Definition**: Set the AI's expertise level and domain knowledge
            - **Plan Generation**: Control whether the AI creates execution plans first
            - **PuppyGraph Differences**: Define how PuppyGraph differs from standard Cypher
            - **Output Format**: Specify the expected response structure
            - **Real-time updates**: Changes take effect immediately for new queries
            - **Reset to defaults**: Easily restore original prompt settings

            ### 📝 Debug/Prompts Tab
            - View the exact prompts sent to Claude Sonnet 4.0 for query generation
            - View the raw LLM responses received from Claude
            - Useful for understanding how the RAG system works and debugging issues
            - Prompts include schema info, similar examples, and conversation context

            ### 🔧 Technical Details
            - **Backend**: Python with FastAPI
            - **Graph DB**: PuppyGraph (Cypher queries via Bolt protocol)
            - **RAG System**: ChromaDB + SentenceTransformers + Claude Sonnet 4.0
            - **MCP Integration**: Custom Model Context Protocol server

            ### 💡 Tips
            - Be specific in your questions for better results
            - Check the confidence score - higher scores indicate more reliable queries
            - Use the schema information to understand what data is available
            - Add your own examples to improve performance for your specific use case
            """)

        # Load initial data when interface starts
        interface.load(
            fn=lambda: (get_graph_stats(), get_schema_info()),
            outputs=[stats_display, schema_display]
        )

    return interface


def main() -> None:
    """Main function to run the application.

    Initializes the backend, builds the UI, and launches the Gradio server
    on 0.0.0.0:7860; the backend is shut down in ``finally`` regardless of
    how the server exits.
    """
    try:
        # Initialize the chatbot
        logger.info("Initializing PuppyGraph RAG Chatbot...")
        initialize_chatbot()

        # Create and launch the interface
        interface = create_interface()

        logger.info("Starting Gradio interface...")
        interface.launch(
            server_name="0.0.0.0",
            server_port=7860,
            share=False,
            show_error=True,
            debug=True
        )

    except KeyboardInterrupt:
        logger.info("Shutting down...")
    except Exception as e:
        logger.error(f"Error running application: {e}")
    finally:
        # Cleanup
        shutdown_chatbot()
        logger.info("Application shutdown complete")


if __name__ == "__main__":
    main()