├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── cryo_mcp
│   ├── __init__.py
│   ├── server.py
│   └── sql.py
├── ethereum__blocks_22001067_to_22001067.json
├── ethereum__blocks__00001000_to_00001004.json
├── ethereum__blocks__22005903_to_22005907.json
├── pyproject.toml
├── tests
│   ├── __init__.py
│   ├── data
│   │   └── ethereum__blocks__00001000_to_00001004.parquet
│   ├── test_blocks.py
│   ├── test_contract_transactions.py
│   ├── test_cryo.py
│   ├── test_latest_block.py
│   ├── test_latest_functions.py
│   ├── test_mcp_functions.py
│   ├── test_sql.py
│   ├── test_transaction_by_hash.py
│   └── test_transactions.py
└── uv.lock

/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.so
6 | .Python
7 | build/
8 | develop-eggs/
9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | .pytest_cache/
23 | .coverage
24 | htmlcov/
25 | 
26 | # Environment
27 | .env
28 | .venv
29 | env/
30 | venv/
31 | ENV/
32 | .python-version
33 | 
34 | # IDE files
35 | .idea/
36 | .vscode/
37 | *.swp
38 | *.swo
39 | 
40 | # OS specific
41 | .DS_Store
42 | Thumbs.db
43 | 
44 | # Cryo specific
45 | .cryo/
46 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | ## 0.1.0 (Initial Release)
4 | 
5 | - Initial Cryo MCP implementation
6 | - Support for querying Ethereum blocks, transactions, and contracts
7 | - Integration with Claude Code
8 | - Command-line interface
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 z80
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Cryo MCP 🧊
2 | 
3 | A Model Context Protocol (MCP) server for the [Cryo](https://github.com/paradigmxyz/cryo) blockchain data extraction tool.
4 | 
5 | Cryo MCP allows you to access Cryo's powerful blockchain data extraction capabilities via an API server that implements MCP, making it easy to query blockchain data from any MCP-compatible client.
6 | 
7 | ## For LLM Users: SQL Query Workflow Guide
8 | 
9 | When using this MCP server to run SQL queries on blockchain data, follow this workflow:
10 | 
11 | 1. **Download data** with `query_dataset`:
12 | ```python
13 | result = query_dataset(
14 |     dataset="blocks",            # or "transactions", "logs", etc.
15 |     blocks="15000000:15001000",  # or use blocks_from_latest=100
16 |     output_format="parquet"      # important: use parquet for SQL
17 | )
18 | files = result.get("files", [])  # Get the returned file paths
19 | ```
20 | 
21 | 2. **Explore schema** with `get_sql_table_schema`:
22 | ```python
23 | # Check what columns are available in the file
24 | schema = get_sql_table_schema(files[0])
25 | # Now you can see all columns, data types, and sample data
26 | ```
27 | 
28 | 3. **Run SQL** with `query_sql`:
29 | ```python
30 | # Option 1: Simple table reference (DuckDB matches the table name to the file)
31 | sql_result = query_sql(
32 |     query="SELECT block_number, timestamp, gas_used FROM blocks",
33 |     files=files  # Pass the files from step 1
34 | )
35 | 
36 | # Option 2: Using read_parquet() with an explicit file path
37 | sql_result = query_sql(
38 |     query=f"SELECT block_number, timestamp, gas_used FROM read_parquet('{files[0]}')",
39 |     files=files  # Pass the files from step 1
40 | )
41 | ```
42 | 
43 | Alternatively, use the combined approach with `query_blockchain_sql`:
44 | ```python
45 | # Option 1: Simple table reference
46 | result = query_blockchain_sql(
47 |     sql_query="SELECT * FROM blocks",
48 |     dataset="blocks",
49 |     blocks_from_latest=100
50 | )
51 | 
52 | # Option 2: Using read_parquet()
53 | result = query_blockchain_sql(
54 |     sql_query="SELECT * FROM read_parquet('/path/to/file.parquet')",  # the path is replaced automatically
55 |     dataset="blocks",
56 |     blocks_from_latest=100
57 | )
58 | ```
59 | 
60 | For a complete working example, see [examples/sql_workflow_example.py](examples/sql_workflow_example.py).
61 | 
62 | ## Features
63 | 
64 | - **Full Cryo Dataset Access**: Query any Cryo dataset through an API server
65 | - **MCP Integration**: Works seamlessly with MCP clients
66 | - **Flexible Query Options**: Support for all major Cryo filtering and output options
67 | - **Block Range Options**: Query specific blocks, the latest block, or relative ranges
68 | - **Contract Filtering**: Filter data by contract address
69 | - **Latest Block Access**: Easy access to the latest Ethereum block data
70 | - **Multiple Output Formats**: JSON, CSV, and Parquet support
71 | - **Schema Information**: Get detailed dataset schemas and sample data
72 | - **SQL Queries**: Run SQL queries directly against downloaded blockchain data
73 | 
74 | ## Installation (Optional)
75 | 
76 | This step is not required if you run the tool directly with `uvx`.
77 | 
78 | ```bash
79 | # install with UV (recommended)
80 | uv tool install cryo-mcp
81 | ```
82 | 
83 | ## Requirements
84 | 
85 | - Python 3.8+
86 | - uv
87 | - A working installation of [Cryo](https://github.com/paradigmxyz/cryo)
88 | - Access to an Ethereum RPC endpoint
89 | - DuckDB (for SQL query functionality)
90 | 
91 | ## Quick Start
92 | 
93 | ### Usage with Claude Code
94 | 
95 | 1. Run `claude mcp add` for an interactive prompt.
96 | 2. Enter `uvx` as the command to run.
97 | 3. Enter `cryo-mcp --rpc-url <URL> [--data-dir <PATH>]` as the args.
98 | 4. Alternatively, provide `ETH_RPC_URL` and `CRYO_DATA_DIR` as environment variables instead.
99 | 
100 | New `claude` instances will now have access to cryo, configured to use your RPC endpoint and store data in the specified directory.
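For reference, a non-interactive setup might look like the following. This is a sketch with example values (the default RPC URL and data directory from this project); the exact `claude mcp add` syntax may vary between Claude Code versions:

```bash
# Register cryo-mcp with Claude Code, passing config as CLI args
claude mcp add cryo -- uvx cryo-mcp --rpc-url http://localhost:8545 --data-dir ~/.cryo-mcp/data

# Or rely on environment variables instead of CLI args
export ETH_RPC_URL=http://localhost:8545
export CRYO_DATA_DIR=~/.cryo-mcp/data
claude mcp add cryo -- uvx cryo-mcp
```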
101 | 102 | ## Available Tools 103 | 104 | Cryo MCP exposes the following MCP tools: 105 | 106 | ### `list_datasets()` 107 | 108 | Returns a list of all available Cryo datasets. 109 | 110 | Example: 111 | ```python 112 | client.list_datasets() 113 | ``` 114 | 115 | ### `query_dataset()` 116 | 117 | Query a Cryo dataset with various filtering options. 118 | 119 | Parameters: 120 | - `dataset` (str): The name of the dataset to query (e.g., 'blocks', 'transactions', 'logs') 121 | - `blocks` (str, optional): Block range specification (e.g., '1000:1010') 122 | - `start_block` (int, optional): Start block number (alternative to blocks) 123 | - `end_block` (int, optional): End block number (alternative to blocks) 124 | - `use_latest` (bool, optional): If True, query the latest block 125 | - `blocks_from_latest` (int, optional): Number of blocks from latest to include 126 | - `contract` (str, optional): Contract address to filter by 127 | - `output_format` (str, optional): Output format ('json', 'csv', 'parquet') 128 | - `include_columns` (list, optional): Columns to include alongside defaults 129 | - `exclude_columns` (list, optional): Columns to exclude from defaults 130 | 131 | Example: 132 | ```python 133 | # Get transactions from blocks 15M to 15.01M 134 | client.query_dataset('transactions', blocks='15M:15.01M') 135 | 136 | # Get logs for a specific contract from the latest 100 blocks 137 | client.query_dataset('logs', blocks_from_latest=100, contract='0x1234...') 138 | 139 | # Get just the latest block 140 | client.query_dataset('blocks', use_latest=True) 141 | ``` 142 | 143 | ### `lookup_dataset()` 144 | 145 | Get detailed information about a specific dataset, including schema and sample data. 146 | 147 | Parameters: 148 | - `name` (str): The name of the dataset to look up 149 | - `sample_start_block` (int, optional): Start block for sample data 150 | - `sample_end_block` (int, optional): End block for sample data 151 | - `use_latest_sample` (bool, optional): Use latest block for sample 152 | - `sample_blocks_from_latest` (int, optional): Number of blocks from latest for sample 153 | 154 | Example: 155 | ```python 156 | client.lookup_dataset('logs') 157 | ``` 158 | 159 | ### `get_latest_ethereum_block()` 160 | 161 | Returns information about the latest Ethereum block. 162 | 163 | Example: 164 | ```python 165 | client.get_latest_ethereum_block() 166 | ``` 167 | 168 | ### SQL Query Tools 169 | 170 | Cryo MCP includes several tools for running SQL queries against blockchain data: 171 | 172 | ### `query_sql()` 173 | 174 | Run a SQL query against downloaded blockchain data. 175 | 176 | Parameters: 177 | - `query` (str): SQL query to execute 178 | - `files` (list, optional): List of parquet file paths to query. If None, will use all files in the data directory. 179 | - `include_schema` (bool, optional): Whether to include schema information in the result 180 | 181 | Example: 182 | ```python 183 | # Run against all available files 184 | client.query_sql("SELECT * FROM read_parquet('/path/to/blocks.parquet') LIMIT 10") 185 | 186 | # Run against specific files 187 | client.query_sql( 188 | "SELECT * FROM read_parquet('/path/to/blocks.parquet') LIMIT 10", 189 | files=['/path/to/blocks.parquet'] 190 | ) 191 | ``` 192 | 193 | ### `query_blockchain_sql()` 194 | 195 | Query blockchain data using SQL, automatically downloading any required data. 
196 | 197 | Parameters: 198 | - `sql_query` (str): SQL query to execute 199 | - `dataset` (str, optional): The dataset to query (e.g., 'blocks', 'transactions') 200 | - `blocks` (str, optional): Block range specification 201 | - `start_block` (int, optional): Start block number 202 | - `end_block` (int, optional): End block number 203 | - `use_latest` (bool, optional): If True, query the latest block 204 | - `blocks_from_latest` (int, optional): Number of blocks before the latest to include 205 | - `contract` (str, optional): Contract address to filter by 206 | - `force_refresh` (bool, optional): Force download of new data even if it exists 207 | - `include_schema` (bool, optional): Include schema information in the result 208 | 209 | Example: 210 | ```python 211 | # Automatically downloads blocks data if needed, then runs the SQL query 212 | client.query_blockchain_sql( 213 | sql_query="SELECT block_number, gas_used, timestamp FROM blocks ORDER BY gas_used DESC LIMIT 10", 214 | dataset="blocks", 215 | blocks_from_latest=100 216 | ) 217 | ``` 218 | 219 | ### `list_available_sql_tables()` 220 | 221 | List all available tables that can be queried with SQL. 222 | 223 | Example: 224 | ```python 225 | client.list_available_sql_tables() 226 | ``` 227 | 228 | ### `get_sql_table_schema()` 229 | 230 | Get the schema for a specific parquet file. 231 | 232 | Parameters: 233 | - `file_path` (str): Path to the parquet file 234 | 235 | Example: 236 | ```python 237 | client.get_sql_table_schema("/path/to/blocks.parquet") 238 | ``` 239 | 240 | ### `get_sql_examples()` 241 | 242 | Get example SQL queries for different blockchain datasets. 243 | 244 | Example: 245 | ```python 246 | client.get_sql_examples() 247 | ``` 248 | 249 | ## Configuration Options 250 | 251 | When starting the Cryo MCP server, you can use these command-line options: 252 | 253 | - `--rpc-url URL`: Ethereum RPC URL (overrides ETH_RPC_URL environment variable) 254 | - `--data-dir PATH`: Directory to store downloaded data (overrides CRYO_DATA_DIR environment variable, defaults to ~/.cryo-mcp/data/) 255 | 256 | ## Environment Variables 257 | 258 | - `ETH_RPC_URL`: Default Ethereum RPC URL to use when not specified via command line 259 | - `CRYO_DATA_DIR`: Default directory to store downloaded data when not specified via command line 260 | 261 | ## Advanced Usage 262 | 263 | ### SQL Queries Against Blockchain Data 264 | 265 | Cryo MCP allows you to run powerful SQL queries against blockchain data, combining the flexibility of SQL with Cryo's data extraction capabilities: 266 | 267 | #### Two-Step SQL Query Flow 268 | 269 | You can split data extraction and querying into two separate steps: 270 | 271 | ```python 272 | # Step 1: Download data and get file paths 273 | download_result = client.query_dataset( 274 | dataset="transactions", 275 | blocks_from_latest=1000, 276 | output_format="parquet" 277 | ) 278 | 279 | # Step 2: Use the file paths to run SQL queries 280 | file_paths = download_result.get("files", []) 281 | client.query_sql( 282 | query=f""" 283 | SELECT 284 | to_address as contract_address, 285 | COUNT(*) as tx_count, 286 | SUM(gas_used) as total_gas, 287 | AVG(gas_used) as avg_gas 288 | FROM read_parquet('{file_paths[0]}') 289 | WHERE to_address IS NOT NULL 290 | GROUP BY to_address 291 | ORDER BY total_gas DESC 292 | LIMIT 20 293 | """, 294 | files=file_paths 295 | ) 296 | ``` 297 | 298 | #### Combined SQL Query Flow 299 | 300 | For convenience, you can also use the combined function that handles both steps: 301 | 302 | 
```python
303 | # Get top gas-consuming contracts
304 | client.query_blockchain_sql(
305 |     sql_query="""
306 |     SELECT
307 |         to_address as contract_address,
308 |         COUNT(*) as tx_count,
309 |         SUM(gas_used) as total_gas,
310 |         AVG(gas_used) as avg_gas
311 |     FROM read_parquet('/path/to/transactions.parquet')
312 |     WHERE to_address IS NOT NULL
313 |     GROUP BY to_address
314 |     ORDER BY total_gas DESC
315 |     LIMIT 20
316 |     """,
317 |     dataset="transactions",
318 |     blocks_from_latest=1000
319 | )
320 | 
321 | # Find blocks with the most transactions
322 | client.query_blockchain_sql(
323 |     sql_query="""
324 |     SELECT
325 |         block_number,
326 |         COUNT(*) as tx_count
327 |     FROM read_parquet('/path/to/transactions.parquet')
328 |     GROUP BY block_number
329 |     ORDER BY tx_count DESC
330 |     LIMIT 10
331 |     """,
332 |     dataset="transactions",
333 |     blocks="15M:16M"
334 | )
335 | 
336 | # Analyze event logs by topic
337 | client.query_blockchain_sql(
338 |     sql_query="""
339 |     SELECT
340 |         topic0,
341 |         COUNT(*) as event_count
342 |     FROM read_parquet('/path/to/logs.parquet')
343 |     GROUP BY topic0
344 |     ORDER BY event_count DESC
345 |     LIMIT 20
346 |     """,
347 |     dataset="logs",
348 |     blocks_from_latest=100
349 | )
350 | ```
351 | 
352 | **Note**: For SQL queries, always use `output_format="parquet"` when downloading data to ensure optimal performance with DuckDB. When using `query_blockchain_sql`, you can either use simple table names or refer to the file paths directly in your SQL with the `read_parquet()` function.
353 | 
354 | ### Querying with Block Ranges
355 | 
356 | Cryo MCP supports the full range of Cryo's block specification syntax:
357 | 
358 | ```python
359 | # Using block numbers
360 | client.query_dataset('transactions', blocks='15000000:15001000')
361 | 
362 | # Using K/M notation
363 | client.query_dataset('logs', blocks='15M:15.01M')
364 | 
365 | # Using offsets from latest
366 | client.query_dataset('blocks', blocks_from_latest=100)
367 | ```
368 | 
369 | ### Contract Filtering
370 | 
371 | Filter logs and other data by contract address:
372 | 
373 | ```python
374 | # Get all logs for USDC contract
375 | client.query_dataset('logs',
376 |                      blocks='16M:16.1M',
377 |                      contract='0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48')
378 | ```
379 | 
380 | ### Column Selection
381 | 
382 | Include only the columns you need:
383 | 
384 | ```python
385 | # Get just block numbers and timestamps
386 | client.query_dataset('blocks',
387 |                      blocks='16M:16.1M',
388 |                      include_columns=['number', 'timestamp'])
389 | ```
390 | 
391 | ## Development
392 | 
393 | ### Project Structure
394 | 
395 | ```
396 | cryo-mcp/
397 | ├── cryo_mcp/           # Main package directory
398 | │   ├── __init__.py     # Package initialization
399 | │   ├── server.py       # Main MCP server implementation
400 | │   └── sql.py          # SQL query functionality
401 | ├── tests/              # Test directory
402 | │   └── test_*.py       # Test files
403 | ├── pyproject.toml      # Project configuration
404 | └── README.md           # Project documentation
405 | ```
406 | 
407 | ### Run Tests
408 | 
409 | `uv run pytest`
410 | 
411 | ## License
412 | 
413 | MIT
414 | 
415 | ## Credits
416 | 
417 | - Built on top of the amazing [Cryo](https://github.com/paradigmxyz/cryo) tool by Paradigm
418 | - Uses the [Model Context Protocol](https://modelcontextprotocol.io) for API communication
419 | 
--------------------------------------------------------------------------------
/cryo_mcp/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Cryo MCP Server - Query Ethereum blockchain data using cryo and MCP
3 | """
4 | 
5 | __version__ = "0.1.4"
"0.1.4" 6 | -------------------------------------------------------------------------------- /cryo_mcp/server.py: -------------------------------------------------------------------------------- 1 | # cryo_mcp/server.py 2 | """ 3 | Cryo MCP - A Model Completion Protocol server for the Cryo blockchain data extraction tool. 4 | 5 | This module provides a server that exposes Cryo's functionality through the MCP protocol, 6 | allowing blockchain data querying through an API interface geared at usage by LLMs. 7 | """ 8 | import json 9 | import os 10 | import subprocess 11 | import requests 12 | import argparse 13 | import sys 14 | from pathlib import Path 15 | from typing import List, Optional, Dict, Any, Union 16 | from mcp.server.fastmcp import FastMCP 17 | 18 | # Get the default RPC URL from environment or use fallback 19 | DEFAULT_RPC_URL = "http://localhost:8545" 20 | 21 | # Default data directory for storing output 22 | DEFAULT_DATA_DIR = str(Path.home() / ".cryo-mcp" / "data") 23 | 24 | # Create an MCP server 25 | mcp = FastMCP("Cryo Data Server") 26 | 27 | def get_latest_block_number() -> Optional[int]: 28 | """Get the latest block number from the Ethereum node""" 29 | rpc_url = os.environ.get("ETH_RPC_URL", DEFAULT_RPC_URL) 30 | 31 | payload = { 32 | "jsonrpc": "2.0", 33 | "method": "eth_blockNumber", 34 | "params": [], 35 | "id": 1 36 | } 37 | 38 | try: 39 | response = requests.post(rpc_url, json=payload) 40 | response_data = response.json() 41 | 42 | if 'result' in response_data: 43 | # Convert hex to int 44 | latest_block = int(response_data['result'], 16) 45 | print(f"Latest block number: {latest_block}") 46 | return latest_block 47 | else: 48 | print(f"Error fetching latest block: {response_data.get('error', 'Unknown error')}") 49 | return None 50 | except Exception as e: 51 | print(f"Exception when fetching latest block: {e}") 52 | return None 53 | 54 | @mcp.tool() 55 | def list_datasets() -> List[str]: 56 | """Return a list of all available cryo datasets""" 57 | # Ensure we have the RPC URL 58 | rpc_url = os.environ.get("ETH_RPC_URL", DEFAULT_RPC_URL) 59 | 60 | result = subprocess.run( 61 | ["cryo", "help", "datasets", "-r", rpc_url], 62 | capture_output=True, 63 | text=True 64 | ) 65 | 66 | # Parse the output to extract dataset names 67 | lines = result.stdout.split('\n') 68 | datasets = [] 69 | 70 | for line in lines: 71 | if line.startswith('- ') and not line.startswith('- blocks_and_transactions:'): 72 | # Extract dataset name, removing any aliases 73 | dataset = line[2:].split(' (alias')[0].strip() 74 | datasets.append(dataset) 75 | if line == 'dataset group names': 76 | break 77 | 78 | return datasets 79 | 80 | @mcp.tool() 81 | def query_dataset( 82 | dataset: str, 83 | blocks: Optional[str] = None, 84 | start_block: Optional[int] = None, 85 | end_block: Optional[int] = None, 86 | use_latest: bool = False, 87 | blocks_from_latest: Optional[int] = None, 88 | contract: Optional[str] = None, 89 | output_format: str = "json", 90 | include_columns: Optional[List[str]] = None, 91 | exclude_columns: Optional[List[str]] = None 92 | ) -> Dict[str, Any]: 93 | """ 94 | Download blockchain data and return the file paths where the data is stored. 95 | 96 | IMPORTANT WORKFLOW NOTE: When running SQL queries, use this function first to download 97 | data, then use the returned file paths with query_sql() to execute SQL on those files. 98 | 99 | Example workflow for SQL: 100 | 1. 
First download data: result = query_dataset('transactions', blocks='1000:1010', output_format='parquet') 101 | 2. Get file paths: files = result.get('files', []) 102 | 3. Run SQL query: query_sql("SELECT * FROM read_parquet('/path/to/file.parquet')", files=files) 103 | 104 | DATASET-SPECIFIC PARAMETERS: 105 | For datasets that require specific address parameters (like 'balances', 'erc20_transfers', etc.), 106 | ALWAYS use the 'contract' parameter to pass ANY Ethereum address. For example: 107 | 108 | - For 'balances' dataset: Use contract parameter for the address you want balances for 109 | query_dataset('balances', blocks='1000:1010', contract='0x123...') 110 | 111 | - For 'logs' or 'erc20_transfers': Use contract parameter for contract address 112 | query_dataset('logs', blocks='1000:1010', contract='0x123...') 113 | 114 | To check what parameters a dataset requires, always use lookup_dataset() first: 115 | lookup_dataset('balances') # Will show required parameters 116 | 117 | Args: 118 | dataset: The name of the dataset to query (e.g., 'logs', 'transactions', 'balances') 119 | blocks: Block range specification as a string (e.g., '1000:1010') 120 | start_block: Start block number as integer (alternative to blocks) 121 | end_block: End block number as integer (alternative to blocks) 122 | use_latest: If True, query the latest block 123 | blocks_from_latest: Number of blocks before the latest to include (e.g., 10 = latest-10 to latest) 124 | contract: Contract address to filter by - IMPORTANT: Use this parameter for ALL address-based filtering 125 | regardless of the parameter name in the native cryo command (address, contract, etc.) 126 | output_format: Output format (json, csv, parquet) - use 'parquet' for SQL queries 127 | include_columns: Columns to include alongside the defaults 128 | exclude_columns: Columns to exclude from the defaults 129 | 130 | Returns: 131 | Dictionary containing file paths where the downloaded data is stored 132 | """ 133 | # Ensure we have the RPC URL 134 | rpc_url = os.environ.get("ETH_RPC_URL", DEFAULT_RPC_URL) 135 | 136 | # Build the cryo command 137 | cmd = ["cryo", dataset, "-r", rpc_url] 138 | 139 | # Handle block range (priority: blocks > use_latest > start/end_block > default) 140 | if blocks: 141 | # Use specified block range string directly 142 | cmd.extend(["-b", blocks]) 143 | elif use_latest or blocks_from_latest is not None: 144 | # Get the latest block number 145 | latest_block = get_latest_block_number() 146 | 147 | if latest_block is None: 148 | return {"error": "Failed to get the latest block number from the RPC endpoint"} 149 | 150 | if blocks_from_latest is not None: 151 | # Use a range of blocks up to the latest 152 | start = latest_block - blocks_from_latest 153 | block_range = f"{start}:{latest_block+1}" # +1 to make it inclusive 154 | else: 155 | # Just the latest block 156 | block_range = f"{latest_block}:{latest_block+1}" # +1 to make it inclusive 157 | 158 | print(f"Using latest block range: {block_range}") 159 | cmd.extend(["-b", block_range]) 160 | elif start_block is not None: 161 | # Convert integer block numbers to string range 162 | if end_block is not None: 163 | # Note: cryo uses [start:end) range (inclusive start, exclusive end) 164 | # Add 1 to end_block to include it in the range 165 | block_range = f"{start_block}:{end_block+1}" 166 | else: 167 | # If only start_block is provided, get 10 blocks starting from there 168 | block_range = f"{start_block}:{start_block+10}" 169 | 170 | print(f"Using block range: {block_range}") 
171 | cmd.extend(["-b", block_range]) 172 | else: 173 | # Default to a reasonable block range if none specified 174 | cmd.extend(["-b", "1000:1010"]) 175 | 176 | # Handle dataset-specific address parameters 177 | # For all address-based filters, we use the contract parameter 178 | # but map it to the correct flag based on the dataset 179 | if contract: 180 | # Check if this is a dataset that requires a different parameter name 181 | if dataset == 'balances': 182 | # For balances dataset, contract parameter maps to --address 183 | cmd.extend(["--address", contract]) 184 | else: 185 | # For other datasets like logs, transactions, etc. use --contract 186 | cmd.extend(["--contract", contract]) 187 | 188 | if output_format == "json": 189 | cmd.append("--json") 190 | elif output_format == "csv": 191 | cmd.append("--csv") 192 | 193 | if include_columns: 194 | cmd.append("--include-columns") 195 | cmd.extend(include_columns) 196 | 197 | if exclude_columns: 198 | cmd.append("--exclude-columns") 199 | cmd.extend(exclude_columns) 200 | 201 | # Get the base data directory 202 | data_dir = Path(os.environ.get("CRYO_DATA_DIR", DEFAULT_DATA_DIR)) 203 | 204 | # Choose output directory based on whether we're querying latest blocks 205 | if use_latest or blocks_from_latest is not None: 206 | output_dir = data_dir / "latest" 207 | output_dir.mkdir(parents=True, exist_ok=True) 208 | 209 | # Clean up the latest directory before new query 210 | print("Cleaning latest directory for current block query") 211 | existing_files = list(output_dir.glob(f"*{dataset}*.*")) 212 | for file in existing_files: 213 | try: 214 | file.unlink() 215 | print(f"Removed existing file: {file}") 216 | except Exception as e: 217 | print(f"Warning: Could not remove file {file}: {e}") 218 | else: 219 | # For historical queries, use the main data directory 220 | output_dir = data_dir 221 | output_dir.mkdir(parents=True, exist_ok=True) 222 | 223 | cmd.extend(["-o", str(output_dir)]) 224 | 225 | # Print the command for debugging 226 | print(f"Running query command: {' '.join(cmd)}") 227 | 228 | # Execute the command 229 | result = subprocess.run(cmd, capture_output=True, text=True) 230 | 231 | if result.returncode != 0: 232 | return { 233 | "error": result.stderr, 234 | "stdout": result.stdout, 235 | "command": " ".join(cmd) 236 | } 237 | 238 | # Try to find the report file which contains info about generated files 239 | report_dir = output_dir / ".cryo" / "reports" 240 | if report_dir.exists(): 241 | # Get the most recent report file (should be the one we just created) 242 | report_files = sorted(report_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True) 243 | if report_files: 244 | with open(report_files[0], 'r') as f: 245 | report_data = json.load(f) 246 | # Get the list of completed files from the report 247 | if "results" in report_data and "completed_paths" in report_data["results"]: 248 | completed_files = report_data["results"]["completed_paths"] 249 | print(f"Found {len(completed_files)} files in Cryo report: {completed_files}") 250 | 251 | # Return the list of files and their count 252 | return { 253 | "files": completed_files, 254 | "count": len(completed_files), 255 | "format": output_format 256 | } 257 | 258 | # Fallback to glob search if report file not found or doesn't contain the expected data 259 | output_files = list(output_dir.glob(f"*{dataset}*.{output_format}")) 260 | print(f"Output files found via glob: {output_files}") 261 | 262 | if not output_files: 263 | return {"error": "No output files generated", 
"command": " ".join(cmd)} 264 | 265 | # Convert Path objects to strings for JSON serialization 266 | file_paths = [str(file_path) for file_path in output_files] 267 | 268 | return { 269 | "files": file_paths, 270 | "count": len(file_paths), 271 | "format": output_format 272 | } 273 | 274 | @mcp.resource("dataset://{name}") 275 | def get_dataset_info(name: str) -> Dict[str, Any]: 276 | """Get information about a specific dataset""" 277 | # Ensure we have the RPC URL 278 | rpc_url = os.environ.get("ETH_RPC_URL", DEFAULT_RPC_URL) 279 | 280 | result = subprocess.run( 281 | ["cryo", "help", name, "-r", rpc_url], 282 | capture_output=True, 283 | text=True 284 | ) 285 | 286 | # Get the latest block number for examples 287 | latest_block = get_latest_block_number() 288 | latest_example = "" 289 | 290 | if latest_block: 291 | latest_example = f"query_dataset('{name}', blocks_from_latest=10) # Gets latest-10 to latest blocks" 292 | 293 | # Add special examples for datasets requiring address parameters 294 | address_example = "" 295 | if "address" in result.stdout.lower() and "required parameters: address" in result.stdout.lower(): 296 | address_example = f"query_dataset('{name}', blocks='1000:1010', contract='0x123...') # Use contract parameter for address" 297 | 298 | return { 299 | "name": name, 300 | "description": result.stdout, 301 | "example_queries": [ 302 | f"query_dataset('{name}', blocks='1000:1010')", 303 | f"query_dataset('{name}', start_block=1000, end_block=1009)", 304 | f"query_dataset('{name}', use_latest=True) # Gets just the latest block", 305 | latest_example, 306 | address_example 307 | ] if address_example else [ 308 | f"query_dataset('{name}', blocks='1000:1010')", 309 | f"query_dataset('{name}', start_block=1000, end_block=1009)", 310 | f"query_dataset('{name}', use_latest=True) # Gets just the latest block", 311 | latest_example 312 | ], 313 | "notes": [ 314 | "Block ranges are inclusive for start_block and end_block when using integer parameters.", 315 | "Use 'use_latest=True' to query only the latest block.", 316 | "Use 'blocks_from_latest=N' to query the latest N blocks.", 317 | "IMPORTANT: For datasets requiring an 'address' parameter (like 'balances'), use the 'contract' parameter.", 318 | "Always check the required parameters in the dataset description and use lookup_dataset() first." 319 | ] 320 | } 321 | 322 | @mcp.tool() 323 | def lookup_dataset( 324 | name: str, 325 | sample_start_block: Optional[int] = None, 326 | sample_end_block: Optional[int] = None, 327 | use_latest_sample: bool = False, 328 | sample_blocks_from_latest: Optional[int] = None 329 | ) -> Dict[str, Any]: 330 | """ 331 | Look up a specific dataset and return detailed information about it. IMPORTANT: Always use this 332 | function before querying a new dataset to understand its required parameters and schema. 333 | 334 | The returned information includes: 335 | 1. Required parameters for the dataset (IMPORTANT for datasets like 'balances' that need an address) 336 | 2. Schema details showing available columns and data types 337 | 3. Example queries for the dataset 338 | 339 | When the dataset requires specific parameters like 'address' (for 'balances'), 340 | ALWAYS use the 'contract' parameter in query_dataset() to pass these values. 341 | 342 | Example: 343 | For 'balances' dataset, lookup_dataset('balances') will show it requires an 'address' parameter. 
344 | You should then query it using: 345 | query_dataset('balances', blocks='1000:1010', contract='0x1234...') 346 | 347 | Args: 348 | name: The name of the dataset to look up 349 | sample_start_block: Optional start block for sample data (integer) 350 | sample_end_block: Optional end block for sample data (integer) 351 | use_latest_sample: If True, use the latest block for sample data 352 | sample_blocks_from_latest: Number of blocks before the latest to include in sample 353 | 354 | Returns: 355 | Detailed information about the dataset including schema and available fields 356 | """ 357 | # Get basic dataset info 358 | info = get_dataset_info(name) 359 | 360 | # Ensure we have the RPC URL 361 | rpc_url = os.environ.get("ETH_RPC_URL", DEFAULT_RPC_URL) 362 | 363 | # Get schema information by running the dataset with --dry-run 364 | schema_result = subprocess.run( 365 | ["cryo", name, "--dry-run", "-r", rpc_url], 366 | capture_output=True, 367 | text=True 368 | ) 369 | 370 | if schema_result.returncode == 0: 371 | info["schema"] = schema_result.stdout 372 | else: 373 | info["schema_error"] = schema_result.stderr 374 | 375 | # Try to get a sample of the dataset (first 5 records) 376 | try: 377 | data_dir = Path(os.environ.get("CRYO_DATA_DIR", DEFAULT_DATA_DIR)) 378 | 379 | # Determine block range for sample (priority: latest > specified blocks > default) 380 | if use_latest_sample or sample_blocks_from_latest is not None: 381 | # Get the latest block number 382 | latest_block = get_latest_block_number() 383 | 384 | if latest_block is None: 385 | info["sample_error"] = "Failed to get the latest block number from the RPC endpoint" 386 | return info 387 | 388 | if sample_blocks_from_latest is not None: 389 | # Use a range of blocks from latest-n to latest 390 | block_range = f"{latest_block - sample_blocks_from_latest}:{latest_block+1}" 391 | else: 392 | # Just the latest 5 blocks 393 | block_range = f"{latest_block-4}:{latest_block+1}" 394 | 395 | info["sample_block_range"] = block_range 396 | 397 | # Use the latest directory for latest block samples 398 | sample_dir = data_dir / "latest" 399 | sample_dir.mkdir(parents=True, exist_ok=True) 400 | 401 | # Clean up the latest directory before new query 402 | print("Cleaning latest directory for current sample") 403 | existing_files = list(sample_dir.glob(f"*{name}*.*")) 404 | for file in existing_files: 405 | try: 406 | file.unlink() 407 | print(f"Removed existing sample file: {file}") 408 | except Exception as e: 409 | print(f"Warning: Could not remove sample file {file}: {e}") 410 | else: 411 | # For historical blocks, get the start block and end block 412 | if sample_start_block is not None: 413 | if sample_end_block is not None: 414 | # Note: cryo uses [start:end) range (inclusive start, exclusive end) 415 | # Add 1 to end_block to include it in the range 416 | block_range = f"{sample_start_block}:{sample_end_block+1}" 417 | else: 418 | # Use start block and get 5 blocks 419 | block_range = f"{sample_start_block}:{sample_start_block+5}" 420 | else: 421 | # Default to a known good block range 422 | block_range = "1000:1005" 423 | 424 | # For historical samples, use the main data directory 425 | sample_dir = data_dir 426 | sample_dir.mkdir(parents=True, exist_ok=True) 427 | 428 | # Use the block range for the sample 429 | sample_cmd = [ 430 | "cryo", name, 431 | "-b", block_range, 432 | "-r", rpc_url, 433 | "--json", 434 | "-o", str(sample_dir) 435 | ] 436 | 437 | print(f"Running sample command: {' '.join(sample_cmd)}") 438 | sample_result = 
subprocess.run( 439 | sample_cmd, 440 | capture_output=True, 441 | text=True, 442 | timeout=30 # Add timeout to prevent hanging 443 | ) 444 | 445 | if sample_result.returncode == 0: 446 | # Try to find the report file which contains info about generated files 447 | report_dir = sample_dir / ".cryo" / "reports" 448 | if report_dir.exists(): 449 | # Get the most recent report file 450 | report_files = sorted(report_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True) 451 | if report_files: 452 | with open(report_files[0], 'r') as f: 453 | report_data = json.load(f) 454 | # Get the list of completed files from the report 455 | if "results" in report_data and "completed_paths" in report_data["results"]: 456 | completed_files = report_data["results"]["completed_paths"] 457 | print(f"Found {len(completed_files)} files in Cryo report: {completed_files}") 458 | info["sample_files"] = completed_files 459 | return info 460 | 461 | # Fallback to glob search if report file not found 462 | output_files = list(sample_dir.glob(f"*{name}*.json")) 463 | print(f"Output files found via glob: {output_files}") 464 | 465 | if output_files: 466 | # Convert Path objects to strings for JSON serialization 467 | file_paths = [str(file_path) for file_path in output_files] 468 | info["sample_files"] = file_paths 469 | else: 470 | info["sample_error"] = "No output files generated" 471 | else: 472 | info["sample_error"] = sample_result.stderr 473 | info["sample_stdout"] = sample_result.stdout # Include stdout for debugging 474 | except (subprocess.TimeoutExpired, Exception) as e: 475 | info["sample_error"] = str(e) 476 | 477 | return info 478 | 479 | @mcp.tool() 480 | def get_transaction_by_hash( 481 | tx_hash: str 482 | ) -> Dict[str, Any]: 483 | """ 484 | Get detailed information about a transaction by its hash 485 | 486 | Args: 487 | tx_hash: The transaction hash to look up 488 | 489 | Returns: 490 | Detailed information about the transaction 491 | """ 492 | # Ensure we have the RPC URL 493 | rpc_url = os.environ.get("ETH_RPC_URL", DEFAULT_RPC_URL) 494 | 495 | # Use RPC directly to get the transaction 496 | payload = { 497 | "jsonrpc": "2.0", 498 | "method": "eth_getTransactionByHash", 499 | "params": [tx_hash], 500 | "id": 1 501 | } 502 | 503 | try: 504 | response = requests.post(rpc_url, json=payload) 505 | response_data = response.json() 506 | 507 | if 'result' in response_data and response_data['result']: 508 | tx_data = response_data['result'] 509 | 510 | # Get the receipt as well for additional information (gas used, status) 511 | receipt_payload = { 512 | "jsonrpc": "2.0", 513 | "method": "eth_getTransactionReceipt", 514 | "params": [tx_hash], 515 | "id": 2 516 | } 517 | 518 | receipt_response = requests.post(rpc_url, json=receipt_payload) 519 | receipt_data = receipt_response.json() 520 | 521 | if 'result' in receipt_data and receipt_data['result']: 522 | receipt = receipt_data['result'] 523 | 524 | # Combine transaction and receipt data 525 | result = { 526 | "transaction_hash": tx_hash, 527 | "block_number": int(tx_data.get("blockNumber", "0x0"), 16), 528 | "block_hash": tx_data.get("blockHash"), 529 | "from_address": tx_data.get("from"), 530 | "to_address": tx_data.get("to"), 531 | "value": tx_data.get("value"), 532 | "value_decimal": int(tx_data.get("value", "0x0"), 16), 533 | "gas_limit": int(tx_data.get("gas", "0x0"), 16), 534 | "gas_price": int(tx_data.get("gasPrice", "0x0"), 16), 535 | "nonce": int(tx_data.get("nonce", "0x0"), 16), 536 | "input": tx_data.get("input"), 537 | 
"transaction_index": int(tx_data.get("transactionIndex", "0x0"), 16), 538 | "gas_used": int(receipt.get("gasUsed", "0x0"), 16), 539 | "status": int(receipt.get("status", "0x0"), 16), 540 | "logs_count": len(receipt.get("logs", [])), 541 | "contract_address": receipt.get("contractAddress") 542 | } 543 | 544 | # Handle EIP-1559 transactions 545 | if "maxFeePerGas" in tx_data: 546 | result["max_fee_per_gas"] = int(tx_data.get("maxFeePerGas", "0x0"), 16) 547 | result["max_priority_fee_per_gas"] = int(tx_data.get("maxPriorityFeePerGas", "0x0"), 16) 548 | result["transaction_type"] = int(tx_data.get("type", "0x0"), 16) 549 | 550 | return result 551 | else: 552 | # Return just the transaction data if receipt is not available 553 | return { 554 | "transaction_hash": tx_hash, 555 | "block_number": int(tx_data.get("blockNumber", "0x0"), 16), 556 | "block_hash": tx_data.get("blockHash"), 557 | "from_address": tx_data.get("from"), 558 | "to_address": tx_data.get("to"), 559 | "value": tx_data.get("value"), 560 | "value_decimal": int(tx_data.get("value", "0x0"), 16), 561 | "gas_limit": int(tx_data.get("gas", "0x0"), 16), 562 | "gas_price": int(tx_data.get("gasPrice", "0x0"), 16), 563 | "nonce": int(tx_data.get("nonce", "0x0"), 16), 564 | "input": tx_data.get("input"), 565 | "transaction_index": int(tx_data.get("transactionIndex", "0x0"), 16), 566 | "error": "Failed to retrieve transaction receipt" 567 | } 568 | else: 569 | return {"error": f"Transaction not found: {tx_hash}"} 570 | except Exception as e: 571 | return {"error": f"Exception when fetching transaction: {e}"} 572 | 573 | @mcp.tool() 574 | def get_latest_ethereum_block() -> Dict[str, Any]: 575 | """ 576 | Get information about the latest Ethereum block 577 | 578 | Returns: 579 | Information about the latest block including block number 580 | """ 581 | latest_block = get_latest_block_number() 582 | 583 | if latest_block is None: 584 | return {"error": "Failed to get the latest block number from the RPC endpoint"} 585 | 586 | # Get block data using cryo 587 | rpc_url = os.environ.get("ETH_RPC_URL", DEFAULT_RPC_URL) 588 | block_range = f"{latest_block}:{latest_block+1}" # +1 to make it inclusive 589 | 590 | data_dir = Path(os.environ.get("CRYO_DATA_DIR", DEFAULT_DATA_DIR)) 591 | latest_dir = data_dir / "latest" 592 | latest_dir.mkdir(parents=True, exist_ok=True) 593 | 594 | # Always clean up the latest directory for latest block 595 | print("Cleaning latest directory for current block") 596 | existing_files = list(latest_dir.glob("*blocks*.*")) 597 | for file in existing_files: 598 | try: 599 | file.unlink() 600 | print(f"Removed existing file: {file}") 601 | except Exception as e: 602 | print(f"Warning: Could not remove file {file}: {e}") 603 | 604 | cmd = [ 605 | "cryo", "blocks", 606 | "-b", block_range, 607 | "-r", rpc_url, 608 | "--json", 609 | "-o", str(latest_dir) 610 | ] 611 | 612 | result = subprocess.run(cmd, capture_output=True, text=True) 613 | 614 | if result.returncode != 0: 615 | return { 616 | "block_number": latest_block, 617 | "error": "Failed to get detailed block data", 618 | "stderr": result.stderr 619 | } 620 | 621 | # Try to find the report file which contains info about generated files 622 | report_dir = latest_dir / ".cryo" / "reports" 623 | if report_dir.exists(): 624 | # Get the most recent report file 625 | report_files = sorted(report_dir.glob("*.json"), key=lambda x: x.stat().st_mtime, reverse=True) 626 | if report_files: 627 | with open(report_files[0], 'r') as f: 628 | report_data = json.load(f) 629 | # Get the 
list of completed files from the report 630 | if "results" in report_data and "completed_paths" in report_data["results"]: 631 | completed_files = report_data["results"]["completed_paths"] 632 | print(f"Found {len(completed_files)} files in Cryo report: {completed_files}") 633 | 634 | return { 635 | "block_number": latest_block, 636 | "files": completed_files, 637 | "count": len(completed_files) 638 | } 639 | 640 | # Fallback to glob search if report file not found 641 | output_files = list(latest_dir.glob("*blocks*.json")) 642 | 643 | if not output_files: 644 | return { 645 | "block_number": latest_block, 646 | "error": "No output files generated" 647 | } 648 | 649 | # Convert Path objects to strings for JSON serialization 650 | file_paths = [str(file_path) for file_path in output_files] 651 | 652 | return { 653 | "block_number": latest_block, 654 | "files": file_paths, 655 | "count": len(file_paths) 656 | } 657 | 658 | @mcp.tool() 659 | def query_sql( 660 | query: str, 661 | files: Optional[List[str]] = None, 662 | include_schema: bool = True 663 | ) -> Dict[str, Any]: 664 | """ 665 | Run a SQL query against downloaded blockchain data files 666 | 667 | IMPORTANT WORKFLOW: This function should be used after calling query_dataset 668 | to download data. Use the file paths returned by query_dataset as input to this function. 669 | 670 | Workflow steps: 671 | 1. Download data: result = query_dataset('transactions', blocks='1000:1010', output_format='parquet') 672 | 2. Get file paths: files = result.get('files', []) 673 | 3. Execute SQL using either: 674 | - Direct table references: query_sql("SELECT * FROM transactions", files=files) 675 | - Or read_parquet(): query_sql("SELECT * FROM read_parquet('/path/to/file.parquet')", files=files) 676 | 677 | To see the schema of a file, use get_sql_table_schema(file_path) before writing your query. 678 | 679 | DuckDB supports both approaches: 680 | 1. Direct table references (simpler): "SELECT * FROM blocks" 681 | 2. read_parquet function (explicit): "SELECT * FROM read_parquet('/path/to/file.parquet')" 682 | 683 | Args: 684 | query: SQL query to execute - can use simple table names or read_parquet() 685 | files: List of parquet file paths to query (typically from query_dataset results) 686 | include_schema: Whether to include schema information in the result 687 | 688 | Returns: 689 | Query results and metadata 690 | """ 691 | from cryo_mcp.sql import execute_sql_query 692 | return execute_sql_query(query, files, include_schema) 693 | 694 | @mcp.tool() 695 | def list_available_sql_tables() -> List[Dict[str, Any]]: 696 | """ 697 | List all available parquet files that can be queried with SQL 698 | 699 | USAGE NOTES: 700 | - This function lists parquet files that have already been downloaded 701 | - Each file can be queried using read_parquet('/path/to/file.parquet') in your SQL 702 | - For each file, this returns the file path, dataset type, and other metadata 703 | - Use these file paths in your SQL queries with query_sql() 704 | 705 | Returns: 706 | List of available files and their metadata 707 | """ 708 | from cryo_mcp.sql import list_available_tables 709 | return list_available_tables() 710 | 711 | @mcp.tool() 712 | def get_sql_table_schema(file_path: str) -> Dict[str, Any]: 713 | """ 714 | Get the schema and sample data for a specific parquet file 715 | 716 | WORKFLOW NOTE: Use this function to explore the structure of parquet files 717 | before writing SQL queries against them. This will show you: 718 | 1. 
All available columns and their data types 719 | 2. Sample data from the file 720 | 3. Total row count 721 | 722 | Usage example: 723 | 1. Get list of files: files = list_available_sql_tables() 724 | 2. For a specific file: schema = get_sql_table_schema(files[0]['path']) 725 | 3. Use columns in your SQL: query_sql("SELECT column1, column2 FROM read_parquet('/path/to/file.parquet')") 726 | 727 | Args: 728 | file_path: Path to the parquet file (from list_available_sql_tables or query_dataset) 729 | 730 | Returns: 731 | Table schema information including columns, data types, and sample data 732 | """ 733 | from cryo_mcp.sql import get_table_schema 734 | return get_table_schema(file_path) 735 | 736 | @mcp.tool() 737 | def query_blockchain_sql( 738 | sql_query: str, 739 | dataset: Optional[str] = None, 740 | blocks: Optional[str] = None, 741 | start_block: Optional[int] = None, 742 | end_block: Optional[int] = None, 743 | use_latest: bool = False, 744 | blocks_from_latest: Optional[int] = None, 745 | contract: Optional[str] = None, 746 | force_refresh: bool = False, 747 | include_schema: bool = True 748 | ) -> Dict[str, Any]: 749 | """ 750 | Download blockchain data and run SQL query in a single step 751 | 752 | CONVENIENCE FUNCTION: This combines query_dataset and query_sql into one call. 753 | 754 | You can write SQL queries using either approach: 755 | 1. Simple table references: "SELECT * FROM blocks LIMIT 10" 756 | 2. Explicit read_parquet: "SELECT * FROM read_parquet('/path/to/file.parquet') LIMIT 10" 757 | 758 | DATASET-SPECIFIC PARAMETERS: 759 | For datasets that require specific address parameters (like 'balances', 'erc20_transfers', etc.), 760 | ALWAYS use the 'contract' parameter to pass ANY Ethereum address. For example: 761 | 762 | - For 'balances' dataset: Use contract parameter for the address you want balances for 763 | query_blockchain_sql( 764 | sql_query="SELECT * FROM balances", 765 | dataset="balances", 766 | blocks='1000:1010', 767 | contract='0x123...' # Address you want balances for 768 | ) 769 | 770 | Examples: 771 | ``` 772 | # Using simple table name 773 | query_blockchain_sql( 774 | sql_query="SELECT * FROM blocks LIMIT 10", 775 | dataset="blocks", 776 | blocks_from_latest=100 777 | ) 778 | 779 | # Using read_parquet() (the path will be automatically replaced) 780 | query_blockchain_sql( 781 | sql_query="SELECT * FROM read_parquet('/any/path.parquet') LIMIT 10", 782 | dataset="blocks", 783 | blocks_from_latest=100 784 | ) 785 | ``` 786 | 787 | ALTERNATIVE WORKFLOW (more control): 788 | If you need more control, you can separate the steps: 789 | 1. Download data: result = query_dataset('blocks', blocks_from_latest=100, output_format='parquet') 790 | 2. Inspect schema: schema = get_sql_table_schema(result['files'][0]) 791 | 3. 
Run SQL query: query_sql("SELECT * FROM blocks", files=result['files'])
792 | 
793 |     Args:
794 |         sql_query: SQL query to execute - using table names or read_parquet()
795 |         dataset: The specific dataset to query (e.g., 'transactions', 'logs', 'balances')
796 |                 If None, will be extracted from the SQL query
797 |         blocks: Block range specification as a string (e.g., '1000:1010')
798 |         start_block: Start block number (alternative to blocks)
799 |         end_block: End block number (alternative to blocks)
800 |         use_latest: If True, query the latest block
801 |         blocks_from_latest: Number of blocks before the latest to include
802 |         contract: Contract address to filter by - IMPORTANT: Use this parameter for ALL address-based filtering
803 |                 regardless of the parameter name in the native cryo command (address, contract, etc.)
804 |         force_refresh: Force download of new data even if it exists
805 |         include_schema: Include schema information in the result
806 | 
807 |     Returns:
808 |         SQL query results and metadata
809 |     """
810 |     from cryo_mcp.sql import execute_sql_query, extract_dataset_from_sql
811 | 
812 |     # Try to determine dataset if not provided
813 |     if dataset is None:
814 |         dataset = extract_dataset_from_sql(sql_query)
815 |         if dataset is None:
816 |             return {
817 |                 "success": False,
818 |                 "error": "Could not determine dataset from SQL query. Please specify dataset parameter."
819 |             }
820 | 
821 |     # First, ensure we have the data by running a query_dataset operation
822 |     # This will download the data and return the file paths
823 |     download_result = query_dataset(
824 |         dataset=dataset,
825 |         blocks=blocks,
826 |         start_block=start_block,
827 |         end_block=end_block,
828 |         use_latest=use_latest,
829 |         blocks_from_latest=blocks_from_latest,
830 |         contract=contract,
831 |         output_format="parquet"  # Use parquet for optimal SQL performance
832 |     )
833 | 
834 |     if "error" in download_result:
835 |         return {
836 |             "success": False,
837 |             "error": f"Failed to download data: {download_result['error']}",
838 |             "download_details": download_result
839 |         }
840 | 
841 |     # Get the file paths from the download result
842 |     files = download_result.get("files", [])
843 | 
844 |     # Check if we have any files
845 |     if not files:
846 |         return {
847 |             "success": False,
848 |             "error": "No data files were generated from the download operation"
849 |         }
850 | 
851 |     # Filter for parquet files only
852 |     parquet_files = [f for f in files if f.endswith('.parquet')]
853 |     if not parquet_files:
854 |         return {
855 |             "success": False,
856 |             "error": "No parquet files were generated. Check output_format parameter."
857 |         }
858 | 
859 |     # Now execute the SQL query directly against the downloaded parquet files
860 |     sql_result = execute_sql_query(sql_query, parquet_files, include_schema)
861 | 
862 |     # Include download info in result (conditionals are right-associative, so 'blocks' takes priority)
863 |     sql_result["data_source"] = {
864 |         "dataset": dataset,
865 |         "files": files,
866 |         "block_range": blocks if blocks else f"{start_block}:{end_block}" if start_block and end_block
867 |             else "latest blocks" if use_latest or blocks_from_latest else "default range"
868 |     }
869 | 
870 |     return sql_result
871 | 
872 | @mcp.tool()
873 | def get_sql_examples() -> Dict[str, List[str]]:
874 |     """
875 |     Get example SQL queries for different blockchain datasets with DuckDB
876 | 
877 |     SQL WORKFLOW TIPS:
878 |     1. First download data: result = query_dataset('dataset_name', blocks='...', output_format='parquet')
879 |     2. Inspect schema: schema = get_sql_table_schema(result['files'][0])
880 |     3. 
Run SQL: query_sql("SELECT * FROM read_parquet('/path/to/file.parquet')", files=result['files']) 881 | 882 | OR use the combined approach: 883 | - query_blockchain_sql(sql_query="SELECT * FROM read_parquet('...')", dataset='blocks', blocks='...') 884 | 885 | Returns: 886 | Dictionary of example queries categorized by dataset type and workflow patterns 887 | """ 888 | return { 889 | "basic_usage": [ 890 | "-- Option 1: Simple table names (recommended)", 891 | "SELECT * FROM blocks LIMIT 10", 892 | "SELECT * FROM transactions LIMIT 10", 893 | "SELECT * FROM logs LIMIT 10", 894 | 895 | "-- Option 2: Using read_parquet() with explicit file paths", 896 | "SELECT * FROM read_parquet('/path/to/blocks.parquet') LIMIT 10" 897 | ], 898 | "transactions": [ 899 | "-- Option 1: Simple table reference", 900 | "SELECT * FROM transactions LIMIT 10", 901 | "SELECT block_number, COUNT(*) as tx_count FROM transactions GROUP BY block_number ORDER BY tx_count DESC LIMIT 10", 902 | 903 | "-- Option 2: Using read_parquet()", 904 | "SELECT from_address, COUNT(*) as sent_count FROM read_parquet('/path/to/transactions.parquet') GROUP BY from_address ORDER BY sent_count DESC LIMIT 10", 905 | "SELECT to_address, SUM(value) as total_eth FROM read_parquet('/path/to/transactions.parquet') GROUP BY to_address ORDER BY total_eth DESC LIMIT 10" 906 | ], 907 | "blocks": [ 908 | "SELECT * FROM blocks LIMIT 10", 909 | "SELECT block_number, gas_used, transaction_count FROM blocks ORDER BY gas_used DESC LIMIT 10", 910 | "SELECT AVG(gas_used) as avg_gas, AVG(transaction_count) as avg_txs FROM blocks" 911 | ], 912 | "balances": [ 913 | "-- IMPORTANT: When querying the balances dataset, use the 'contract' parameter to specify the address", 914 | "-- First download the data:", 915 | "# result = query_dataset('balances', blocks='15M:15.01M', contract='0x1234...', output_format='parquet')", 916 | "-- Then query the data:", 917 | "SELECT block_number, address, balance_f64 FROM balances ORDER BY block_number", 918 | "SELECT block_number, balance_f64, balance_f64/1e18 as balance_eth FROM balances ORDER BY block_number" 919 | ], 920 | "logs": [ 921 | "SELECT * FROM logs LIMIT 10", 922 | "SELECT address, COUNT(*) as event_count FROM logs GROUP BY address ORDER BY event_count DESC LIMIT 10", 923 | "SELECT topic0, COUNT(*) as event_count FROM logs GROUP BY topic0 ORDER BY event_count DESC LIMIT 10" 924 | ], 925 | "joins": [ 926 | "-- Join with simple table references", 927 | "SELECT t.block_number, COUNT(*) as tx_count, b.gas_used FROM transactions t JOIN blocks b ON t.block_number = b.block_number GROUP BY t.block_number, b.gas_used ORDER BY tx_count DESC LIMIT 10", 928 | 929 | "-- Join with read_parquet (useful for complex joins)", 930 | "SELECT l.block_number, l.address, COUNT(*) as log_count FROM read_parquet('/path/to/logs.parquet') l GROUP BY l.block_number, l.address ORDER BY log_count DESC LIMIT 10" 931 | ], 932 | "workflow_examples": [ 933 | "-- Step 1: Download data with query_dataset", 934 | "# result = query_dataset(dataset='blocks', blocks='15000000:15000100', output_format='parquet')", 935 | "-- Step 2: Get schema info", 936 | "# schema = get_sql_table_schema(result['files'][0])", 937 | "-- Step 3: Run SQL query (simple table reference)", 938 | "# query_sql(query=\"SELECT * FROM blocks LIMIT 10\", files=result.get('files', []))", 939 | "", 940 | "-- Or use the combined function", 941 | "# query_blockchain_sql(sql_query=\"SELECT * FROM blocks LIMIT 10\", dataset='blocks', blocks='15000000:15000100')" 942 | ], 943 | 
"using_dataset_parameters": [ 944 | "-- IMPORTANT: How to check required parameters for datasets", 945 | "-- Step 1: Look up the dataset to see required parameters", 946 | "# dataset_info = lookup_dataset('balances')", 947 | "# This will show: 'required parameters: address'", 948 | "", 949 | "-- Step 2: Use the contract parameter for ANY address parameter", 950 | "# For balances dataset, query_dataset('balances', blocks='1M:1.1M', contract='0x1234...')", 951 | "# For erc20_transfers, query_dataset('erc20_transfers', blocks='1M:1.1M', contract='0x1234...')", 952 | "", 953 | "-- Step 3: Always check the dataset description and schema before querying new datasets", 954 | "# This helps ensure you're passing the correct parameters" 955 | ] 956 | } 957 | 958 | def parse_args(args=None): 959 | """Parse command line arguments""" 960 | parser = argparse.ArgumentParser(description="Cryo Data Server") 961 | parser.add_argument( 962 | "--rpc-url", 963 | type=str, 964 | help="Ethereum RPC URL to use for requests" 965 | ) 966 | parser.add_argument( 967 | "--data-dir", 968 | type=str, 969 | help="Directory to store downloaded data, defaults to ~/.cryo-mcp/data/" 970 | ) 971 | parser.add_argument( 972 | "--version", 973 | action="store_true", 974 | help="Show version information and exit" 975 | ) 976 | return parser.parse_args(args) 977 | 978 | def main(): 979 | """Main entry point for the command-line script""" 980 | args = parse_args() 981 | 982 | # Check if version was requested 983 | if args.version: 984 | from cryo_mcp import __version__ 985 | print(f"cryo-mcp version {__version__}") 986 | return 0 987 | 988 | # Set RPC URL with priority: command line > environment variable > default 989 | if args.rpc_url: 990 | rpc_url = args.rpc_url 991 | os.environ["ETH_RPC_URL"] = rpc_url 992 | print(f"Using RPC URL from command line: {rpc_url}") 993 | elif os.environ.get("ETH_RPC_URL"): 994 | rpc_url = os.environ["ETH_RPC_URL"] 995 | print(f"Using RPC URL from environment: {rpc_url}") 996 | else: 997 | rpc_url = DEFAULT_RPC_URL 998 | os.environ["ETH_RPC_URL"] = rpc_url 999 | print(f"Using default RPC URL: {rpc_url}") 1000 | 1001 | # Set data directory with priority: command line > environment variable > default 1002 | if args.data_dir: 1003 | data_dir = args.data_dir 1004 | os.environ["CRYO_DATA_DIR"] = data_dir 1005 | print(f"Using data directory from command line: {data_dir}") 1006 | elif os.environ.get("CRYO_DATA_DIR"): 1007 | data_dir = os.environ["CRYO_DATA_DIR"] 1008 | print(f"Using data directory from environment: {data_dir}") 1009 | else: 1010 | data_dir = DEFAULT_DATA_DIR 1011 | os.environ["CRYO_DATA_DIR"] = data_dir 1012 | print(f"Using default data directory: {data_dir}") 1013 | 1014 | # Ensure data directory exists 1015 | Path(data_dir).mkdir(parents=True, exist_ok=True) 1016 | 1017 | mcp.run() 1018 | 1019 | return 0 1020 | 1021 | if __name__ == "__main__": 1022 | sys.exit(main()) 1023 | -------------------------------------------------------------------------------- /cryo_mcp/sql.py: -------------------------------------------------------------------------------- 1 | """SQL query functionality for Cryo MCP using DuckDB.""" 2 | import os 3 | import re 4 | import json 5 | from pathlib import Path 6 | import duckdb 7 | from typing import Dict, Any, List, Optional, Union 8 | 9 | # Default SQL query timeout in seconds 10 | DEFAULT_QUERY_TIMEOUT = 30 11 | 12 | def get_data_directory() -> Path: 13 | """Get the data directory where Cryo files are stored.""" 14 | default_data_dir = str(Path.home() / 
".cryo-mcp" / "data") 15 | return Path(os.environ.get("CRYO_DATA_DIR", default_data_dir)) 16 | 17 | def create_connection(read_only: bool = False) -> duckdb.DuckDBPyConnection: 18 | """Create a DuckDB connection with appropriate settings.""" 19 | # In-memory database can't be read-only, so we always use read_only=False 20 | conn = duckdb.connect(database=":memory:", read_only=False) 21 | 22 | # Configure DuckDB settings for performance and safety 23 | conn.execute("SET memory_limit='4GB'") 24 | conn.execute("SET max_expression_depth=10000") 25 | 26 | # Note: query_timeout_ms setting might not be available in all DuckDB versions 27 | try: 28 | conn.execute(f"SET query_timeout_ms={DEFAULT_QUERY_TIMEOUT * 1000}") 29 | except Exception: 30 | pass # Ignore if setting doesn't exist 31 | 32 | return conn 33 | 34 | def list_available_tables() -> List[Dict[str, Any]]: 35 | """List all available tables from downloaded data files.""" 36 | data_dir = get_data_directory() 37 | 38 | # Find all parquet files in the data directory (including the latest subdirectory) 39 | parquet_files = list(data_dir.glob("**/*.parquet")) 40 | 41 | tables = [] 42 | for file_path in parquet_files: 43 | # Extract dataset name from filename 44 | name = file_path.stem.split("__")[0] 45 | if "__" in file_path.stem: 46 | name = file_path.stem.split("__")[0] 47 | else: 48 | # Try to extract from other naming patterns 49 | name_match = re.match(r'([a-z_]+)_', file_path.stem) 50 | if name_match: 51 | name = name_match.group(1) 52 | else: 53 | name = file_path.stem 54 | 55 | # Get file stats 56 | stats = file_path.stat() 57 | 58 | # Try to extract block range from filename 59 | block_range = "" 60 | blocks_match = re.search(r'blocks__(\d+)_to_(\d+)', str(file_path)) 61 | if blocks_match: 62 | block_range = f"{blocks_match.group(1)}:{blocks_match.group(2)}" 63 | 64 | tables.append({ 65 | "name": name, 66 | "path": str(file_path), 67 | "size_bytes": stats.st_size, 68 | "modified": stats.st_mtime, 69 | "block_range": block_range, 70 | "is_latest": "latest" in str(file_path) 71 | }) 72 | 73 | return tables 74 | 75 | def extract_dataset_from_sql(sql_query: str) -> Optional[str]: 76 | """ 77 | Try to extract the dataset name from an SQL query. 78 | 79 | This is a simple heuristic that looks for FROM clauses in the query. 80 | 81 | Args: 82 | sql_query: The SQL query to parse 83 | 84 | Returns: 85 | The extracted dataset name or None if it couldn't be determined 86 | """ 87 | # Simple regex to find table names after FROM or JOIN 88 | # This won't handle all SQL syntax but works for basic queries 89 | matches = re.findall(r'(?:FROM|JOIN)\s+([a-zA-Z_][a-zA-Z0-9_]*)', sql_query, re.IGNORECASE) 90 | 91 | if matches: 92 | # Return the first match that isn't a common SQL keyword 93 | for match in matches: 94 | if match.lower() not in ('where', 'select', 'group', 'order', 'having', 'limit', 'offset'): 95 | return match 96 | 97 | return None 98 | 99 | def execute_sql_query( 100 | query: str, 101 | files: Optional[List[str]] = None, 102 | include_schema: bool = True 103 | ) -> Dict[str, Any]: 104 | """ 105 | Execute a SQL query against specified parquet files. 106 | 107 | Args: 108 | query: SQL query to execute 109 | files: List of parquet file paths to query. If None, will use all files in the data directory. 
110 |         include_schema: Whether to include schema information in the result
111 | 
112 |     Returns:
113 |         Dictionary with query results and metadata
114 |     """
115 |     data_dir = get_data_directory()
116 |     conn = create_connection()
117 |     has_registered_views = False  # defined before the try block so the cleanup in `finally` never sees it unbound
118 |     try:
119 |         # Determine which parquet files to use
120 |         parquet_files = []
121 |         if files:
122 |             for file_path in files:
123 |                 path = Path(file_path)
124 |                 if path.exists() and path.suffix == '.parquet':
125 |                     parquet_files.append(path)
126 |                 else:
127 |                     print(f"Warning: File not found or not a parquet file: {file_path}")
128 |         else:
129 |             # If no files provided, use all parquet files in the data directory
130 |             parquet_files = list(data_dir.glob("**/*.parquet"))
131 | 
132 |         if not parquet_files:
133 |             return {
134 |                 "success": False,
135 |                 "error": "No parquet files available. Download data first with query_dataset."
136 |             }
137 | 
138 |         # Register temporary views for datasets if needed
139 |         # (has_registered_views was initialized above, before this try block)
140 |         try:
141 |             # Check if the query might be using direct table references without read_parquet()
142 |             potential_tables = extract_tables_from_sql(query)
143 | 
144 |             # Create views for potential table names that aren't using read_parquet
145 |             for table_name in potential_tables:
146 |                 if not ("read_parquet" in query.lower() and table_name.lower() in query.lower()):
147 |                     # Match files to table name more precisely
148 |                     # First, look for exact dataset name match (e.g., "blocks" in ethereum__blocks_*.parquet)
149 |                     dataset_pattern = f"__{table_name.lower()}__"
150 |                     exact_matches = [f for f in parquet_files if dataset_pattern in str(f).lower()]
151 | 
152 |                     # If no exact matches, try looser matching
153 |                     if not exact_matches:
154 |                         # Try matching at word boundaries to avoid partial matches
155 |                         matching_files = []
156 |                         for f in parquet_files:
157 |                             file_lower = str(f).lower()
158 |                             # Match dataset name patterns like ethereum__blocks_* or *_blocks_*
159 |                             if f"__{table_name.lower()}__" in file_lower or f"_{table_name.lower()}_" in file_lower:
160 |                                 matching_files.append(f)
161 |                             # Also match if it's just the table name at the start of the filename
162 |                             elif f"/{table_name.lower()}_" in file_lower or f"/{table_name.lower()}."
in file_lower: 163 | matching_files.append(f) 164 | else: 165 | matching_files = exact_matches 166 | 167 | if matching_files: 168 | # Create a combined view from all matching files 169 | conn.execute(f"DROP VIEW IF EXISTS {table_name}") 170 | 171 | if len(matching_files) == 1: 172 | # If only one file, create a simple view 173 | conn.execute(f"CREATE VIEW {table_name} AS SELECT * FROM '{matching_files[0]}'") 174 | print(f"Registered view '{table_name}' for file: {matching_files[0]}") 175 | else: 176 | # If multiple files, create a UNION ALL view to join all files 177 | union_query = " UNION ALL ".join([f"SELECT * FROM '{file}'" for file in matching_files]) 178 | conn.execute(f"CREATE VIEW {table_name} AS {union_query}") 179 | print(f"Registered view '{table_name}' for {len(matching_files)} files using UNION ALL") 180 | 181 | has_registered_views = True 182 | 183 | # Execute the query 184 | print(f"Executing SQL query: {query}") 185 | result = conn.execute(query).fetchdf() 186 | 187 | # Convert to records format for easier JSON serialization 188 | records = result.to_dict(orient="records") 189 | 190 | # Get schema information if requested 191 | schema_info = None 192 | if include_schema and not result.empty: 193 | schema_info = { 194 | "columns": list(result.columns), 195 | "dtypes": {col: str(dtype) for col, dtype in result.dtypes.items()} 196 | } 197 | 198 | # Track how the files were used 199 | file_usage = {} 200 | if has_registered_views: 201 | for table_name in extract_tables_from_sql(query): 202 | # Use the same matching logic as above 203 | dataset_pattern = f"__{table_name.lower()}__" 204 | exact_matches = [f for f in parquet_files if dataset_pattern in str(f).lower()] 205 | 206 | if not exact_matches: 207 | matching_files = [] 208 | for f in parquet_files: 209 | file_lower = str(f).lower() 210 | if f"__{table_name.lower()}__" in file_lower or f"_{table_name.lower()}_" in file_lower: 211 | matching_files.append(f) 212 | elif f"/{table_name.lower()}_" in file_lower or f"/{table_name.lower()}." 
in file_lower:
213 |                                 matching_files.append(f)
214 |                     else:
215 |                         matching_files = exact_matches
216 |                     if matching_files:
217 |                         file_usage[table_name] = {
218 |                             "files": [str(f) for f in matching_files],
219 |                             "combined": len(matching_files) > 1
220 |                         }
221 | 
222 |             return {
223 |                 "success": True,
224 |                 "result": records,
225 |                 "row_count": len(records),
226 |                 "schema": schema_info,
227 |                 "files_used": [str(f) for f in parquet_files],
228 |                 "used_direct_references": has_registered_views,
229 |                 "table_mappings": file_usage if file_usage else None
230 |             }
231 |         except Exception as e:
232 |             # Handle query-specific errors
233 |             error_msg = str(e)
234 |             print(f"SQL query error: {error_msg}")
235 |             return {
236 |                 "success": False,
237 |                 "error": error_msg,
238 |                 "files_available": [str(f) for f in parquet_files]
239 |             }
240 |     except Exception as e:
241 |         # Handle connection and setup errors
242 |         return {
243 |             "success": False,
244 |             "error": str(e)
245 |         }
246 |     finally:
247 |         # Clean up any registered views
248 |         if has_registered_views:
249 |             for table_name in extract_tables_from_sql(query):
250 |                 try:
251 |                     conn.execute(f"DROP VIEW IF EXISTS {table_name}")
252 |                 except Exception:
253 |                     pass
254 |         conn.close()
255 | 
256 | def extract_tables_from_sql(sql_query: str) -> List[str]:
257 |     """Extract table names from an SQL query that aren't using read_parquet."""
258 |     # This extends our extract_dataset_from_sql function for more general use;
259 |     # re is already imported at module level
260 | 
261 |     # Find potential table names after FROM or JOIN
262 |     matches = re.findall(r'(?:FROM|JOIN)\s+([a-zA-Z_][a-zA-Z0-9_]*)', sql_query, re.IGNORECASE)
263 | 
264 |     # Filter out common SQL keywords
265 |     sql_keywords = ('where', 'select', 'group', 'order', 'having', 'limit', 'offset')
266 |     return [match for match in matches if match.lower() not in sql_keywords]
267 | 
268 | def get_table_schema(file_path: str) -> Dict[str, Any]:
269 |     """
270 |     Get schema information for a parquet file.
271 | 272 | Args: 273 | file_path: Path to the parquet file 274 | 275 | Returns: 276 | Dictionary with schema information 277 | """ 278 | conn = create_connection() 279 | 280 | try: 281 | path = Path(file_path) 282 | if not path.exists() or path.suffix != '.parquet': 283 | return { 284 | "success": False, 285 | "error": f"File not found or not a parquet file: {file_path}" 286 | } 287 | 288 | # Register a temporary view for the file 289 | conn.execute(f"CREATE VIEW temp_view AS SELECT * FROM '{file_path}'") 290 | 291 | # Get schema info 292 | schema_result = conn.execute("SELECT column_name, data_type FROM information_schema.columns WHERE table_name='temp_view'").fetchdf() 293 | 294 | # Get sample data 295 | sample_data = conn.execute("SELECT * FROM temp_view LIMIT 5").fetchdf() 296 | 297 | # Get row count (might be expensive for large files) 298 | row_count = conn.execute("SELECT COUNT(*) as count FROM temp_view").fetchone()[0] 299 | 300 | return { 301 | "success": True, 302 | "file_path": file_path, 303 | "columns": schema_result.to_dict(orient="records"), 304 | "sample_data": sample_data.to_dict(orient="records"), 305 | "row_count": row_count 306 | } 307 | except Exception as e: 308 | return { 309 | "success": False, 310 | "error": str(e) 311 | } 312 | finally: 313 | conn.close() -------------------------------------------------------------------------------- /ethereum__blocks__00001000_to_00001004.json: -------------------------------------------------------------------------------- 1 | [{"block_hash":"0x5b4590a9905fa1c9cc273f32e6dc63b4c512f0ee14edc6fa41c26b416a7b5d58","author":"0xbb7b8287f3f0a933474a79eae42cbca977791171","block_number":1000,"gas_used":0,"extra_data":"0x476574682f4c5649562f76312e302e302f6c696e75782f676f312e342e32","timestamp":1438272138,"base_fee_per_gas":null,"chain_id":1},{"block_hash":"0x48acba3928780f40b61ca7f0614448847b2af9b35b985e60054f7bb41b36b1cd","author":"0xbb7b8287f3f0a933474a79eae42cbca977791171","block_number":1001,"gas_used":0,"extra_data":"0x476574682f4c5649562f76312e302e302f6c696e75782f676f312e342e32","timestamp":1438272139,"base_fee_per_gas":null,"chain_id":1},{"block_hash":"0x15b90b909a3c844b8e0ca76302027619ac56c4750dcddfc83cb78c8cbdba4b28","author":"0xbb7b8287f3f0a933474a79eae42cbca977791171","block_number":1002,"gas_used":0,"extra_data":"0x476574682f4c5649562f76312e302e302f6c696e75782f676f312e342e32","timestamp":1438272140,"base_fee_per_gas":null,"chain_id":1},{"block_hash":"0x8b83375cdc6a3490595b1cde985a810bea9bdb6df601c4f07719629a59ab520d","author":"0xbb7b8287f3f0a933474a79eae42cbca977791171","block_number":1003,"gas_used":0,"extra_data":"0x476574682f4c5649562f76312e302e302f6c696e75782f676f312e342e32","timestamp":1438272141,"base_fee_per_gas":null,"chain_id":1},{"block_hash":"0x3aa173397b610df7f96ad29f76a3868890b5d6ac09fdf139bd5a7a57360f89c2","author":"0xa1623430350c5df1b52b0b57483a5bb45d1796da","block_number":1004,"gas_used":0,"extra_data":"0x476574682f76312e302e302d30636463373634372f6c696e75782f676f312e34","timestamp":1438272142,"base_fee_per_gas":null,"chain_id":1}] -------------------------------------------------------------------------------- /ethereum__blocks__22005903_to_22005907.json: -------------------------------------------------------------------------------- 1 | 
[{"block_hash":"0x32a93425c3c7c4df6a2951b6b672de7adfb2ce4b194ac0fdfb701c6b90881f68","author":"0x95222290dd7278aa3ddd389cc1e1d165cc4bafe5","block_number":22005903,"gas_used":11416700,"extra_data":"0x6265617665726275696c642e6f7267","timestamp":1741482167,"base_fee_per_gas":548088168,"chain_id":1},{"block_hash":"0xe053107b091e750749a4fb35d8f62455359fa92cd956bb1b2ddc8ceffade1771","author":"0x4838b106fce9647bdf1e7877bf73ce8b0bad5f97","block_number":22005904,"gas_used":17920506,"extra_data":"0x546974616e2028746974616e6275696c6465722e78797a29","timestamp":1741482179,"base_fee_per_gas":523073542,"chain_id":1},{"block_hash":"0x52d2a60f9d815988acb9f84af41d71bd98f254544403de330c22355f62cd1d72","author":"0x95222290dd7278aa3ddd389cc1e1d165cc4bafe5","block_number":22005905,"gas_used":30175069,"extra_data":"0x6265617665726275696c642e6f7267","timestamp":1741482191,"base_fee_per_gas":522784911,"chain_id":1},{"block_hash":"0xf58e243788d10806928e2ce1893abe14a5d1a4c7a69f2dba0b608a0972b82e77","author":"0x95222290dd7278aa3ddd389cc1e1d165cc4bafe5","block_number":22005906,"gas_used":12427766,"extra_data":"0x6265617665726275696c642e6f7267","timestamp":1741482203,"base_fee_per_gas":567093198,"chain_id":1},{"block_hash":"0x8ebdea19b52e3585a3a476d4eea02033c1c702072eeb59461a8442d84bbb2ac8","author":"0x95222290dd7278aa3ddd389cc1e1d165cc4bafe5","block_number":22005907,"gas_used":28509755,"extra_data":"0x6265617665726275696c642e6f7267","timestamp":1741482215,"base_fee_per_gas":545149065,"chain_id":1}] -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "cryo-mcp" 3 | version = "0.1.4" 4 | description = "MCP server for querying Ethereum blockchain data using cryo" 5 | readme = "README.md" 6 | license = {file = "LICENSE"} 7 | requires-python = ">=3.10" 8 | authors = [ 9 | {name = "z80", email = "z80@ophy.xyz"} 10 | ] 11 | 12 | keywords = ["ethereum", "blockchain", "cryo", "mcp", "api", "server"] 13 | classifiers = [ 14 | "Development Status :: 4 - Beta", 15 | "Intended Audience :: Developers", 16 | "License :: OSI Approved :: MIT License", 17 | "Programming Language :: Python :: 3", 18 | "Programming Language :: Python :: 3.8", 19 | "Programming Language :: Python :: 3.9", 20 | "Programming Language :: Python :: 3.10", 21 | "Programming Language :: Python :: 3.11", 22 | "Programming Language :: Python :: 3.12", 23 | "Topic :: Software Development :: Libraries :: Python Modules", 24 | ] 25 | dependencies = [ 26 | "duckdb>=1.2.1", 27 | "mcp>=1.3.0", 28 | "numpy>=2.2.3", 29 | "pandas>=2.2.3", 30 | "pyarrow>=19.0.1", 31 | "requests>=2.28.0", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "pytest>=7.0.0", 37 | "black>=23.0.0", 38 | "isort>=5.10.0", 39 | "mypy>=1.0.0", 40 | ] 41 | 42 | [project.urls] 43 | "Homepage" = "https://github.com/z80dev/cryo-mcp" 44 | "Bug Tracker" = "https://github.com/z80dev/cryo-mcp/issues" 45 | 46 | [project.scripts] 47 | cryo-mcp = "cryo_mcp.server:main" 48 | 49 | 50 | [build-system] 51 | requires = ["hatchling"] 52 | build-backend = "hatchling.build" 53 | 54 | [tool.black] 55 | line-length = 88 56 | 57 | [tool.isort] 58 | profile = "black" 59 | line_length = 88 60 | 61 | [tool.mypy] 62 | python_version = "3.8" 63 | warn_return_any = true 64 | warn_unused_configs = true 65 | 66 | [dependency-groups] 67 | dev = [ 68 | "ipython>=8.34.0", 69 | "pytest>=8.3.5", 70 | ] 71 | 
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # tests for cryo-mcp -------------------------------------------------------------------------------- /tests/data/ethereum__blocks__00001000_to_00001004.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/z80dev/cryo-mcp/5c831d9af138d73d8345d2c994536fb4ce22c184/tests/data/ethereum__blocks__00001000_to_00001004.parquet -------------------------------------------------------------------------------- /tests/test_blocks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import subprocess 5 | from pathlib import Path 6 | 7 | # Set ETH_RPC_URL 8 | os.environ["ETH_RPC_URL"] = "http://10.0.0.48:8545" 9 | print(f"Using ETH_RPC_URL: {os.environ['ETH_RPC_URL']}") 10 | 11 | def test_integer_blocks(): 12 | """Test using integer blocks with cryo directly""" 13 | 14 | # Convert integer block range to string 15 | start_block = 1000 16 | end_block = 1005 17 | block_range = f"{start_block}:{end_block}" 18 | 19 | cmd = ["cryo", "blocks", "-b", block_range, "-r", "http://10.0.0.48:8545", "--json"] 20 | 21 | print(f"Running command: {' '.join(cmd)}") 22 | result = subprocess.run(cmd, capture_output=True, text=True) 23 | 24 | print(f"Return code: {result.returncode}") 25 | print(f"STDOUT: {result.stdout[:500]}...") 26 | 27 | # Now run the equivalent using our server's string conversion logic 28 | temp_dir = Path("/tmp/cryo_int_test") 29 | temp_dir.mkdir(exist_ok=True) 30 | 31 | cmd = ["cryo", "blocks", "-b", block_range, "-r", "http://10.0.0.48:8545", "--json", "-o", str(temp_dir)] 32 | 33 | print(f"\nRunning output command: {' '.join(cmd)}") 34 | result = subprocess.run(cmd, capture_output=True, text=True) 35 | 36 | print(f"Return code: {result.returncode}") 37 | print(f"STDOUT: {result.stdout[:500]}...") 38 | 39 | # Find and read the output file 40 | output_files = list(temp_dir.glob("*blocks*.json")) 41 | print(f"Output files: {output_files}") 42 | 43 | if output_files: 44 | with open(output_files[0], 'r') as f: 45 | data = json.load(f) 46 | print(f"Number of records: {len(data)}") 47 | print(f"First block number: {data[0]['block_number']}") 48 | print(f"Last block number: {data[-1]['block_number']}") 49 | 50 | # Verify that we got the block range we asked for 51 | # Note: cryo seems to use start:end as [start, end) (inclusive start, exclusive end) 52 | expected_blocks = list(range(start_block, end_block)) 53 | actual_blocks = [block["block_number"] for block in data] 54 | 55 | print(f"Expected blocks: {expected_blocks}") 56 | print(f"Actual blocks: {actual_blocks}") 57 | 58 | if sorted(actual_blocks) == sorted(expected_blocks): 59 | print("✅ Block ranges match!") 60 | else: 61 | print("❌ Block ranges do not match!") 62 | 63 | if __name__ == "__main__": 64 | test_integer_blocks() -------------------------------------------------------------------------------- /tests/test_contract_transactions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import subprocess 5 | from pathlib import Path 6 | 7 | # Set ETH_RPC_URL 8 | os.environ["ETH_RPC_URL"] = "http://10.0.0.48:8545" 9 | print(f"Using ETH_RPC_URL: {os.environ['ETH_RPC_URL']}") 10 | 11 | def 
test_contract_transactions(): 12 | """Test fetching transactions for a specific contract""" 13 | 14 | # Use a known block number 15 | block_num = 22001067 # You can replace this with any block number you want to test with 16 | block_range = f"{block_num}:{block_num+1}" 17 | 18 | # Use a known contract address (USDC for example) 19 | contract_address = "0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48" 20 | 21 | # Create a temp directory for output 22 | temp_dir = Path("/tmp/cryo_contract_tx_test") 23 | temp_dir.mkdir(exist_ok=True) 24 | 25 | cmd = [ 26 | "cryo", "transactions", 27 | "-b", block_range, 28 | "--contract", contract_address, 29 | "-r", os.environ["ETH_RPC_URL"], 30 | "--json", 31 | "-o", str(temp_dir) 32 | ] 33 | 34 | print(f"Running command: {' '.join(cmd)}") 35 | result = subprocess.run(cmd, capture_output=True, text=True) 36 | 37 | print(f"Return code: {result.returncode}") 38 | print(f"STDOUT: {result.stdout[:500]}...") 39 | 40 | # Find and read the output file 41 | output_files = list(temp_dir.glob("*transactions*.json")) 42 | print(f"Output files: {output_files}") 43 | 44 | if output_files: 45 | with open(output_files[0], 'r') as f: 46 | data = json.load(f) 47 | 48 | print(f"Number of contract transactions: {len(data)}") 49 | if data: 50 | print(f"First transaction hash: {data[0].get('transaction_hash')}") 51 | print(f"First transaction block number: {data[0].get('block_number')}") 52 | 53 | # Verify contract interactions 54 | for tx in data: 55 | if tx.get("to_address") == contract_address: 56 | print(f"Found transaction to contract: {tx.get('transaction_hash')}") 57 | elif tx.get("from_address") == contract_address: 58 | print(f"Found transaction from contract: {tx.get('transaction_hash')}") 59 | 60 | return data 61 | 62 | return None 63 | 64 | if __name__ == "__main__": 65 | test_contract_transactions() -------------------------------------------------------------------------------- /tests/test_cryo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import subprocess 5 | from pathlib import Path 6 | 7 | # Set ETH_RPC_URL 8 | os.environ["ETH_RPC_URL"] = "http://10.0.0.48:8545" 9 | print(f"Using ETH_RPC_URL: {os.environ['ETH_RPC_URL']}") 10 | 11 | def test_cryo_cli(): 12 | """Test direct CLI command to verify it works""" 13 | dataset = "blocks" 14 | cmd = ["cryo", dataset, "-b", "1000:1005", "-r", "http://10.0.0.48:8545", "--json"] 15 | 16 | print(f"Running command: {' '.join(cmd)}") 17 | result = subprocess.run(cmd, capture_output=True, text=True) 18 | 19 | print(f"Return code: {result.returncode}") 20 | print(f"STDOUT: {result.stdout[:500]}...") 21 | print(f"STDERR: {result.stderr[:500]}...") 22 | 23 | if result.returncode != 0: 24 | print("CLI command failed") 25 | return False 26 | 27 | return True 28 | 29 | def test_cryo_with_output(): 30 | """Test with output directory as we do in the server""" 31 | dataset = "blocks" 32 | temp_dir = Path("/tmp/cryo_test") 33 | temp_dir.mkdir(exist_ok=True) 34 | 35 | cmd = ["cryo", dataset, "-b", "1000:1005", "-r", "http://10.0.0.48:8545", "--json", "-o", str(temp_dir)] 36 | 37 | print(f"Running command: {' '.join(cmd)}") 38 | result = subprocess.run(cmd, capture_output=True, text=True) 39 | 40 | print(f"Return code: {result.returncode}") 41 | print(f"STDOUT: {result.stdout[:500]}...") 42 | print(f"STDERR: {result.stderr[:500]}...") 43 | 44 | if result.returncode != 0: 45 | print("CLI command with output failed") 46 | return False 47 | 48 | # 
Find the output file 49 | output_files = list(temp_dir.glob(f"*{dataset}*.json")) 50 | print(f"Output files: {output_files}") 51 | 52 | if not output_files: 53 | print("No output files found") 54 | return False 55 | 56 | # Read the first file 57 | with open(output_files[0], 'r') as f: 58 | data = json.load(f) 59 | print(f"Data sample: {json.dumps(data[:2], indent=2)}") 60 | 61 | return True 62 | 63 | if __name__ == "__main__": 64 | print("=== Testing direct CLI command ===") 65 | cli_result = test_cryo_cli() 66 | 67 | print("\n=== Testing CLI command with output directory ===") 68 | output_result = test_cryo_with_output() 69 | 70 | if cli_result and output_result: 71 | print("\n✅ All tests passed") 72 | else: 73 | print("\n❌ Tests failed") -------------------------------------------------------------------------------- /tests/test_latest_block.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import subprocess 5 | import requests 6 | from pathlib import Path 7 | 8 | # Set ETH_RPC_URL 9 | RPC_URL = "http://10.0.0.48:8545" 10 | os.environ["ETH_RPC_URL"] = RPC_URL 11 | print(f"Using ETH_RPC_URL: {os.environ['ETH_RPC_URL']}") 12 | 13 | def get_latest_block_number(): 14 | """Get the latest block number from the Ethereum node""" 15 | payload = { 16 | "jsonrpc": "2.0", 17 | "method": "eth_blockNumber", 18 | "params": [], 19 | "id": 1 20 | } 21 | 22 | try: 23 | response = requests.post(RPC_URL, json=payload) 24 | response_data = response.json() 25 | 26 | if 'result' in response_data: 27 | # Convert hex to int 28 | latest_block = int(response_data['result'], 16) 29 | print(f"Latest block number: {latest_block}") 30 | return latest_block 31 | else: 32 | print(f"Error: {response_data.get('error', 'Unknown error')}") 33 | return None 34 | except Exception as e: 35 | print(f"Exception when fetching latest block: {e}") 36 | return None 37 | 38 | def test_blocks_range(): 39 | """Test querying a specific block range""" 40 | # Use fixed block range for testing 41 | start_block = 22005903 42 | end_block = 22005908 43 | 44 | block_range = f"{start_block}:{end_block}" 45 | 46 | cmd = ["cryo", "blocks", "-b", block_range, "-r", RPC_URL, "--json"] 47 | 48 | print(f"Running command: {' '.join(cmd)}") 49 | result = subprocess.run(cmd, capture_output=True, text=True) 50 | 51 | print(f"Return code: {result.returncode}") 52 | print(f"STDOUT: {result.stdout[:500]}...") 53 | 54 | if result.returncode != 0: 55 | print(f"STDERR: {result.stderr}") 56 | assert False, "Command failed" 57 | 58 | assert True 59 | 60 | def test_latest_blocks(): 61 | """Test getting the latest blocks""" 62 | latest_block = get_latest_block_number() 63 | 64 | if latest_block is None: 65 | print("Failed to get the latest block number") 66 | assert False, "Failed to get the latest block number" 67 | 68 | # Test getting the latest 5 blocks 69 | start_block = latest_block - 5 70 | print(f"Fetching blocks from {start_block} to {latest_block}") 71 | 72 | # Direct implementation rather than calling test_blocks_range 73 | block_range = f"{start_block}:{latest_block+1}" # Add 1 to make it inclusive 74 | 75 | cmd = ["cryo", "blocks", "-b", block_range, "-r", RPC_URL, "--json"] 76 | 77 | print(f"Running command: {' '.join(cmd)}") 78 | result = subprocess.run(cmd, capture_output=True, text=True) 79 | 80 | print(f"Return code: {result.returncode}") 81 | print(f"STDOUT: {result.stdout[:500]}...") 82 | 83 | if result.returncode != 0: 84 | print(f"STDERR: 
{result.stderr}") 85 | assert False, "Failed to fetch the latest blocks" 86 | 87 | assert True 88 | 89 | if __name__ == "__main__": 90 | test_latest_blocks() -------------------------------------------------------------------------------- /tests/test_latest_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import subprocess 5 | import requests 6 | from pathlib import Path 7 | 8 | # Define the function directly in the test script 9 | def get_latest_block_number(): 10 | """Get the latest block number from the Ethereum node""" 11 | rpc_url = os.environ.get("ETH_RPC_URL", "http://10.0.0.48:8545") 12 | 13 | payload = { 14 | "jsonrpc": "2.0", 15 | "method": "eth_blockNumber", 16 | "params": [], 17 | "id": 1 18 | } 19 | 20 | try: 21 | response = requests.post(rpc_url, json=payload) 22 | response_data = response.json() 23 | 24 | if 'result' in response_data: 25 | # Convert hex to int 26 | latest_block = int(response_data['result'], 16) 27 | print(f"Latest block number: {latest_block}") 28 | return latest_block 29 | else: 30 | print(f"Error: {response_data.get('error', 'Unknown error')}") 31 | return None 32 | except Exception as e: 33 | print(f"Exception when fetching latest block: {e}") 34 | return None 35 | 36 | def test_latest_block_functions(): 37 | """Test the direct latest block functions""" 38 | 39 | print("=== Testing get_latest_block_number() ===") 40 | latest_block = get_latest_block_number() 41 | print(f"Latest block number: {latest_block}") 42 | 43 | print("\n=== Testing get_latest_ethereum_block with cryo ===") 44 | 45 | # Test getting the latest block using cryo directly 46 | if latest_block: 47 | rpc_url = os.environ.get("ETH_RPC_URL", "http://10.0.0.48:8545") 48 | block_range = f"{latest_block}:{latest_block+1}" 49 | 50 | temp_dir = Path("/tmp/cryo_latest_test") 51 | temp_dir.mkdir(exist_ok=True) 52 | 53 | cmd = ["cryo", "blocks", "-b", block_range, "-r", rpc_url, "--json", "-o", str(temp_dir)] 54 | 55 | print(f"Running command: {' '.join(cmd)}") 56 | result = subprocess.run(cmd, capture_output=True, text=True) 57 | 58 | if result.returncode != 0: 59 | print(f"Error: {result.stderr}") 60 | return False 61 | 62 | # Find the output file 63 | output_files = list(temp_dir.glob("*blocks*.json")) 64 | 65 | if not output_files: 66 | print("No output files found") 67 | return False 68 | 69 | # Read the block data 70 | with open(output_files[0], 'r') as f: 71 | data = json.load(f) 72 | if data and len(data) > 0: 73 | print(f"Block data: {json.dumps(data[0], indent=2)}") 74 | return True 75 | 76 | return False 77 | 78 | def test_query_latest_blocks(): 79 | """Test querying the latest blocks using subprocess""" 80 | 81 | # Get the latest block number 82 | latest_block = get_latest_block_number() 83 | if latest_block is None: 84 | print("Failed to get latest block number") 85 | return False 86 | 87 | # Test getting a range of latest blocks 88 | start_block = latest_block - 5 89 | end_block = latest_block 90 | 91 | # Create a block range string 92 | block_range = f"{start_block}:{end_block+1}" # Add 1 to make it inclusive 93 | 94 | # Use cryo directly 95 | rpc_url = os.environ.get("ETH_RPC_URL", "http://10.0.0.48:8545") 96 | temp_dir = Path("/tmp/cryo_test_latest") 97 | temp_dir.mkdir(exist_ok=True) 98 | 99 | cmd = [ 100 | "cryo", "blocks", 101 | "-b", block_range, 102 | "-r", rpc_url, 103 | "--json", 104 | "-o", str(temp_dir) 105 | ] 106 | 107 | print(f"Running command: {' '.join(cmd)}") 
108 | result = subprocess.run(cmd, capture_output=True, text=True) 109 | 110 | if result.returncode != 0: 111 | print(f"Error: {result.stderr}") 112 | return False 113 | 114 | # Find the output file 115 | output_files = list(temp_dir.glob("*blocks*.json")) 116 | 117 | if not output_files: 118 | print("No output files found") 119 | return False 120 | 121 | # Read the first file 122 | with open(output_files[0], 'r') as f: 123 | data = json.load(f) 124 | print(f"Found {len(data)} blocks") 125 | 126 | # Check if we got the range we expected 127 | block_numbers = [block["block_number"] for block in data] 128 | print(f"Block numbers: {block_numbers}") 129 | 130 | # Check the range covers what we requested (inclusive start to end) 131 | expected_blocks = list(range(start_block, end_block + 1)) 132 | actual_blocks = sorted(block_numbers) 133 | 134 | print(f"Expected blocks: {expected_blocks}") 135 | print(f"Actual blocks: {actual_blocks}") 136 | 137 | return set(expected_blocks) == set(actual_blocks) 138 | 139 | if __name__ == "__main__": 140 | print("Testing latest block functions") 141 | 142 | # Test direct functions 143 | functions_success = test_latest_block_functions() 144 | 145 | # Test querying latest blocks 146 | query_success = test_query_latest_blocks() 147 | 148 | if functions_success and query_success: 149 | print("\n✅ All tests passed!") 150 | else: 151 | print("\n❌ Tests failed") 152 | if not functions_success: 153 | print("- Latest block functions test failed") 154 | if not query_success: 155 | print("- Query latest blocks test failed") -------------------------------------------------------------------------------- /tests/test_mcp_functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | from cryo_mcp.server import ( 5 | get_latest_ethereum_block, 6 | list_datasets, 7 | query_dataset, 8 | lookup_dataset 9 | ) 10 | 11 | # Set ETH_RPC_URL 12 | os.environ["ETH_RPC_URL"] = "http://10.0.0.48:8545" 13 | print(f"Using ETH_RPC_URL: {os.environ['ETH_RPC_URL']}") 14 | 15 | def test_get_latest_block(): 16 | """Test the get_latest_ethereum_block function""" 17 | print("\n=== Testing get_latest_ethereum_block ===") 18 | 19 | block_info = get_latest_ethereum_block() 20 | print(f"Latest block: {block_info}") 21 | 22 | if "error" in block_info: 23 | print(f"❌ Error getting latest block: {block_info['error']}") 24 | return False 25 | 26 | print(f"✅ Successfully got latest block: {block_info['block_number']}") 27 | return True 28 | 29 | def test_list_datasets(): 30 | """Test the list_datasets function""" 31 | print("\n=== Testing list_datasets ===") 32 | 33 | datasets = list_datasets() 34 | print(f"Found {len(datasets)} datasets: {', '.join(datasets[:5])}...") 35 | 36 | # Check that we have some common datasets 37 | required_datasets = ["blocks", "transactions", "logs", "balances"] 38 | missing = [ds for ds in required_datasets if ds not in datasets] 39 | 40 | if missing: 41 | print(f"❌ Missing required datasets: {', '.join(missing)}") 42 | return False 43 | 44 | print(f"✅ Successfully listed {len(datasets)} datasets") 45 | return True 46 | 47 | def test_query_dataset(): 48 | """Test the query_dataset function""" 49 | print("\n=== Testing query_dataset ===") 50 | 51 | # Test transactions with latest block 52 | result = query_dataset( 53 | dataset="transactions", 54 | use_latest=True, 55 | output_format="json" 56 | ) 57 | 58 | if "error" in result: 59 | print(f"❌ Error querying transactions: 
{result['error']}") 60 | return False 61 | 62 | data = result.get("data", []) 63 | print(f"Got {len(data)} transactions from latest block") 64 | 65 | if not data: 66 | print("❌ No transactions returned") 67 | return False 68 | 69 | # Test transactions with block range and contract filter 70 | contract_address = "0xa0b86991c6218b36c1d19d4a2e9eb0ce3606eb48" # USDC 71 | result = query_dataset( 72 | dataset="transactions", 73 | blocks="22001067:22001068", 74 | contract=contract_address, 75 | output_format="json" 76 | ) 77 | 78 | if "error" in result: 79 | print(f"❌ Error querying contract transactions: {result['error']}") 80 | return False 81 | 82 | data = result.get("data", []) 83 | print(f"Got {len(data)} USDC transactions from block 22001067") 84 | 85 | contract_txs = [tx for tx in data if tx.get("to_address") == contract_address] 86 | print(f"Found {len(contract_txs)} transactions to USDC") 87 | 88 | print(f"✅ Successfully queried dataset with different parameters") 89 | return True 90 | 91 | def test_lookup_dataset(): 92 | """Test the lookup_dataset function""" 93 | print("\n=== Testing lookup_dataset ===") 94 | 95 | # Look up transactions dataset 96 | result = lookup_dataset( 97 | name="transactions", 98 | use_latest_sample=True 99 | ) 100 | 101 | if "schema_error" in result and "sample_error" not in result: 102 | print(f"❓ Schema error but sample OK: {result['schema_error']}") 103 | elif "sample_error" in result: 104 | print(f"❓ Sample error: {result['sample_error']}") 105 | 106 | print(f"Dataset info: {result['name']}") 107 | print(f"Example queries: {result['example_queries']}") 108 | 109 | # Check that we got some schema information 110 | if "schema" in result or "schema_error" in result: 111 | print("✅ Got schema information (or error)") 112 | else: 113 | print("❌ Missing schema information") 114 | return False 115 | 116 | print(f"✅ Successfully looked up dataset information") 117 | return True 118 | 119 | def main(): 120 | """Run all tests""" 121 | tests = [ 122 | test_get_latest_block, 123 | test_list_datasets, 124 | test_query_dataset, 125 | test_lookup_dataset 126 | ] 127 | 128 | results = [] 129 | for test in tests: 130 | results.append(test()) 131 | 132 | print("\n=== Test Summary ===") 133 | print(f"Passed: {results.count(True)}/{len(results)}") 134 | 135 | return 0 if all(results) else 1 136 | 137 | if __name__ == "__main__": 138 | sys.exit(main()) -------------------------------------------------------------------------------- /tests/test_sql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import json 4 | import tempfile 5 | from pathlib import Path 6 | import unittest 7 | import subprocess 8 | from unittest.mock import patch, MagicMock 9 | import shutil 10 | 11 | # Add parent directory to path to import modules 12 | import sys 13 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 14 | 15 | from cryo_mcp.sql import execute_sql_query, list_available_tables, create_connection, extract_dataset_from_sql 16 | from cryo_mcp.server import query_blockchain_sql, query_dataset 17 | 18 | # Constants for test data 19 | TEST_DATA_DIR = Path(__file__).parent / "data" 20 | TEST_BLOCK_RANGE = "1000:1005" # Use a small block range for testing 21 | 22 | class TestSQL(unittest.TestCase): 23 | """Test cases for SQL functionality""" 24 | 25 | @classmethod 26 | def setUpClass(cls): 27 | """Setup for all tests - download real blockchain data once""" 28 | # Create test data directory if it 
doesn't exist 29 | TEST_DATA_DIR.mkdir(exist_ok=True) 30 | 31 | # Setup ETH_RPC_URL environment variable if not set 32 | if not os.environ.get("ETH_RPC_URL"): 33 | os.environ["ETH_RPC_URL"] = "http://localhost:8545" 34 | 35 | # We don't need to download data here anymore as we've manually downloaded it 36 | # via direct shell command 37 | 38 | def setUp(self): 39 | """Setup for each test""" 40 | # Create a temporary directory for test data 41 | self.temp_dir = tempfile.TemporaryDirectory() 42 | self.data_dir = Path(self.temp_dir.name) 43 | 44 | # Set environment variable for data directory 45 | os.environ["CRYO_DATA_DIR"] = str(self.data_dir) 46 | 47 | # Create latest directory 48 | self.latest_dir = self.data_dir / "latest" 49 | self.latest_dir.mkdir(exist_ok=True) 50 | 51 | # Copy real parquet files from TEST_DATA_DIR to temp dir if they exist 52 | self.has_real_data = False 53 | parquet_files = list(TEST_DATA_DIR.glob("*.parquet")) 54 | if parquet_files: 55 | for file in parquet_files: 56 | shutil.copy(file, self.data_dir) 57 | self.has_real_data = True 58 | print(f"Using real blockchain data for tests: {[f.name for f in parquet_files]}") 59 | 60 | def tearDown(self): 61 | """Clean up temporary directory""" 62 | self.temp_dir.cleanup() 63 | 64 | def create_mock_parquet_file(self, dataset_name, is_latest=False): 65 | """Create a mock parquet file for testing""" 66 | # If we already have real data, don't need to create mock data 67 | if self.has_real_data and dataset_name == "blocks": 68 | return next(self.data_dir.glob("*blocks*.parquet")) 69 | 70 | # Determine the directory based on whether it's a latest file 71 | directory = self.latest_dir if is_latest else self.data_dir 72 | 73 | # Create a mock parquet file (doesn't need to be a real parquet file for our tests) 74 | file_path = directory / f"{dataset_name}__00001000_to_00001010.parquet" 75 | with open(file_path, 'w') as f: 76 | f.write("mock parquet data") 77 | 78 | return file_path 79 | 80 | @patch('cryo_mcp.sql.duckdb.connect') 81 | def test_extract_dataset_from_sql(self, mock_connect): 82 | """Test extracting dataset names from SQL queries""" 83 | test_cases = [ 84 | {"query": "SELECT * FROM blocks LIMIT 10", "expected": "blocks"}, 85 | {"query": "SELECT block_number FROM transactions WHERE value > 0", "expected": "transactions"}, 86 | {"query": "SELECT logs.address FROM logs", "expected": "logs"}, 87 | {"query": "SELECT t.hash FROM transactions t JOIN blocks b", "expected": "transactions"}, 88 | {"query": "SELECT * FROM WHERE x = 1", "expected": None}, # Invalid SQL 89 | {"query": "SELECT * FROM", "expected": None}, # Invalid SQL 90 | ] 91 | 92 | for case in test_cases: 93 | result = extract_dataset_from_sql(case["query"]) 94 | self.assertEqual(result, case["expected"], f"Failed for query: {case['query']}") 95 | 96 | def test_list_available_tables(self): 97 | """Test listing available tables with real data""" 98 | # If we don't have real data, we need to create mock files 99 | if not self.has_real_data: 100 | self.create_mock_parquet_file("blocks") 101 | self.create_mock_parquet_file("transactions", is_latest=True) 102 | else: 103 | # With real data, we should already have a blocks table 104 | pass 105 | 106 | # Get tables 107 | tables = list_available_tables() 108 | 109 | # Check that we have at least one table 110 | self.assertTrue(len(tables) > 0, "Should find at least one table") 111 | 112 | # With real data, verify that our known table is found 113 | if self.has_real_data: 114 | # There should be at least one table with 
'ethereum' in the name 115 | ethereum_tables = [table for table in tables if 'ethereum' in table["path"]] 116 | self.assertTrue(len(ethereum_tables) > 0, "Should find ethereum tables") 117 | 118 | @patch('cryo_mcp.server.query_dataset') 119 | @patch('cryo_mcp.sql.execute_sql_query') 120 | def test_query_blockchain_sql(self, mock_execute_sql, mock_query_dataset): 121 | """Test the combined blockchain SQL query function""" 122 | # Mock query_dataset to return a successful result 123 | mock_query_dataset.return_value = { 124 | "files": ["/path/to/blocks__1000_to_1010.parquet"], 125 | "count": 1, 126 | "format": "parquet" 127 | } 128 | 129 | # Mock execute_sql_query to return a successful result 130 | mock_execute_sql.return_value = { 131 | "success": True, 132 | "result": [{"block_number": 1000, "gas_used": 1000000}], 133 | "row_count": 1, 134 | "schema": {"columns": ["block_number", "gas_used"]}, 135 | "files_used": ["/path/to/blocks__1000_to_1010.parquet"] 136 | } 137 | 138 | # Call query_blockchain_sql 139 | result = query_blockchain_sql( 140 | sql_query="SELECT block_number, gas_used FROM '/path/to/blocks__1000_to_1010.parquet' LIMIT 1", 141 | dataset="blocks", 142 | blocks="1000:1010" 143 | ) 144 | 145 | # Check results 146 | self.assertTrue(result["success"], "Query should succeed") 147 | 148 | # Verify that query_dataset was called with correct parameters 149 | mock_query_dataset.assert_called_once_with( 150 | dataset="blocks", 151 | blocks="1000:1010", 152 | start_block=None, 153 | end_block=None, 154 | use_latest=False, 155 | blocks_from_latest=None, 156 | contract=None, 157 | output_format="parquet" 158 | ) 159 | 160 | # Verify that execute_sql_query was called with correct parameters 161 | mock_execute_sql.assert_called_once_with( 162 | "SELECT block_number, gas_used FROM '/path/to/blocks__1000_to_1010.parquet' LIMIT 1", 163 | ["/path/to/blocks__1000_to_1010.parquet"], # files parameter 164 | True # include_schema parameter 165 | ) 166 | 167 | def test_execute_sql_query_with_nonexistent_file(self): 168 | """Test executing SQL query with a nonexistent file""" 169 | # Call execute_sql_query with a file that doesn't exist 170 | result = execute_sql_query( 171 | "SELECT * FROM '/nonexistent/file.parquet' LIMIT 1", 172 | files=['/nonexistent/file.parquet'] 173 | ) 174 | 175 | # Print debug info 176 | print("Nonexistent file query result:", result) 177 | 178 | # Check results 179 | self.assertFalse(result["success"], "Query should fail with nonexistent file") 180 | self.assertIn("error", result, "Should return an error message") 181 | 182 | def test_execute_sql_query_with_real_data(self): 183 | """Test executing SQL query with real blockchain data""" 184 | # Skip this test if we don't have real data 185 | if not self.has_real_data: 186 | self.skipTest("No real blockchain data available") 187 | 188 | # Find parquet files to use for testing 189 | parquet_files = list(self.data_dir.glob("*.parquet")) 190 | if not parquet_files: 191 | self.skipTest("No parquet files found for testing") 192 | 193 | # Get file paths as strings for the test 194 | file_paths = [str(f) for f in parquet_files] 195 | 196 | # Part 1: Test direct file reference 197 | result = execute_sql_query( 198 | f"SELECT * FROM '{file_paths[0]}' LIMIT 3", 199 | files=file_paths 200 | ) 201 | 202 | # Print some debug info to see what's happening 203 | print("Result:", result) 204 | 205 | # Check if we have an error 206 | if not result.get("success", False) and "error" in result: 207 | print("SQL error:", result["error"]) 208 | 
209 | # Inspect the parquet file to make sure it's valid 210 | for file in parquet_files: 211 | print(f"Parquet file details: {file}") 212 | print(f"File size: {file.stat().st_size} bytes") 213 | 214 | try: 215 | # Try to read the parquet file directly 216 | from cryo_mcp.sql import create_connection 217 | conn = create_connection() 218 | conn.execute(f"SELECT * FROM '{file_paths[0]}' LIMIT 1") 219 | print("Direct parquet read test succeeded") 220 | except Exception as e: 221 | print(f"Direct parquet read test failed: {e}") 222 | 223 | # Check results 224 | self.assertTrue(result.get("success", False), "Query should succeed") 225 | self.assertEqual(result["row_count"], 3, "Should return 3 rows") 226 | self.assertEqual(len(result["files_used"]), len(file_paths), "Should track all files") 227 | 228 | # Verify that we got real data with expected columns 229 | self.assertIn("schema", result, "Should include schema") 230 | self.assertIn("columns", result["schema"], "Should include columns in schema") 231 | 232 | # Verify we can run a more complex query directly on the file 233 | complex_result = execute_sql_query( 234 | f""" 235 | SELECT 236 | MIN(block_number) as min_block, 237 | MAX(block_number) as max_block, 238 | AVG(gas_used) as avg_gas 239 | FROM '{file_paths[0]}' 240 | """, 241 | files=file_paths 242 | ) 243 | 244 | print("Complex result:", complex_result) 245 | self.assertTrue(complex_result["success"], "Complex query should succeed") 246 | self.assertEqual(complex_result["row_count"], 1, "Should return 1 summary row") 247 | self.assertIn("min_block", complex_result["result"][0], "Should have min_block column") 248 | self.assertIn("max_block", complex_result["result"][0], "Should have max_block column") 249 | 250 | # Part 2: Test table name with multiple files (if we have more than one file) 251 | if len(parquet_files) > 1: 252 | # Create a duplicate file to ensure we have multiple files 253 | duplicate_file = self.data_dir / f"{parquet_files[0].stem}_copy.parquet" 254 | shutil.copy(parquet_files[0], duplicate_file) 255 | 256 | # Update file paths list to include the duplicate 257 | file_paths.append(str(duplicate_file)) 258 | 259 | # Extract dataset name from filename for table reference 260 | # Example: ethereum__blocks__00001000_to_00001004.parquet -> blocks 261 | dataset_name = None 262 | file_name = parquet_files[0].stem 263 | if "__" in file_name: 264 | parts = file_name.split("__") 265 | if len(parts) > 1: 266 | dataset_name = parts[1] # e.g., blocks, transactions 267 | 268 | if not dataset_name: 269 | # Fallback - just use a simple name 270 | dataset_name = "blocks" 271 | 272 | # Run a query using table name (should combine files) 273 | multi_file_result = execute_sql_query( 274 | f"SELECT COUNT(*) as total_rows FROM {dataset_name}", 275 | files=file_paths 276 | ) 277 | 278 | print(f"Multi-file result for table '{dataset_name}':", multi_file_result) 279 | 280 | # Check that our query was successful 281 | self.assertTrue(multi_file_result["success"], "Multi-file query should succeed") 282 | 283 | # Verify table mappings show multiple files were used 284 | self.assertIsNotNone(multi_file_result.get("table_mappings"), "Should include table mappings") 285 | self.assertTrue( 286 | any(mapping["combined"] for mapping in multi_file_result.get("table_mappings", {}).values()), 287 | "Should indicate files were combined" 288 | ) 289 | 290 | @patch('duckdb.DuckDBPyConnection') 291 | @patch('cryo_mcp.sql.duckdb.connect') 292 | def test_execute_sql_query_with_mock_data(self, mock_connect, 
mock_connection): 293 | """Test executing SQL query with mock data (fallback if real data unavailable)""" 294 | # Skip if we have real data (we'll use the real data test instead) 295 | if self.has_real_data: 296 | self.skipTest("Using real data test instead") 297 | 298 | # Create mock parquet file 299 | file_path = self.create_mock_parquet_file("blocks") 300 | 301 | # Setup mock connection and cursor 302 | mock_fetchdf = MagicMock() 303 | mock_fetchdf.to_dict.return_value = [{"block_number": 1000, "gas_used": 1000000}] 304 | mock_fetchdf.empty = False 305 | mock_fetchdf.columns = ["block_number", "gas_used"] 306 | mock_fetchdf.dtypes = {"block_number": "int64", "gas_used": "int64"} 307 | 308 | mock_cursor = MagicMock() 309 | mock_cursor.fetchdf.return_value = mock_fetchdf 310 | 311 | mock_connection_instance = mock_connect.return_value 312 | mock_connection_instance.execute.return_value = mock_cursor 313 | 314 | # Call execute_sql_query with direct file reference 315 | result = execute_sql_query( 316 | f"SELECT * FROM '{file_path}'", 317 | files=[str(file_path)] 318 | ) 319 | 320 | # Check connection setup 321 | mock_connect.assert_called_once() 322 | 323 | # Check results 324 | self.assertTrue(result["success"], "Query should succeed") 325 | self.assertEqual(result["row_count"], 1, "Should return correct row count") 326 | self.assertEqual(len(result["files_used"]), 1, "Should track files used") 327 | self.assertIn(str(file_path), result["files_used"][0], "Should include file path used") 328 | 329 | 330 | if __name__ == "__main__": 331 | unittest.main() 332 | -------------------------------------------------------------------------------- /tests/test_transaction_by_hash.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | import sys 4 | from cryo_mcp.server import get_transaction_by_hash 5 | 6 | # Set ETH_RPC_URL 7 | os.environ["ETH_RPC_URL"] = "http://10.0.0.48:8545" 8 | print(f"Using ETH_RPC_URL: {os.environ['ETH_RPC_URL']}") 9 | 10 | def test_get_transaction_by_hash(): 11 | """Test getting transaction details by hash""" 12 | 13 | # Test with a known transaction hash from our previous tests 14 | # You can replace this with any valid transaction hash you want to test with 15 | tx_hash = "0xbee5a5c9024d9d6dde31c180c71b21aba1ebb7a726cf148a4b2781cf0ca7b7e6" 16 | 17 | print(f"Looking up transaction: {tx_hash}") 18 | tx_info = get_transaction_by_hash(tx_hash) 19 | 20 | if "error" in tx_info: 21 | print(f"❌ Error: {tx_info['error']}") 22 | return False 23 | 24 | # Print transaction details 25 | print("\nTransaction Details:") 26 | for key, value in tx_info.items(): 27 | # Skip printing the full input data which can be very long 28 | if key == "input" and value and len(value) > 100: 29 | print(f" {key}: {value[:50]}...{value[-50:]}") 30 | else: 31 | print(f" {key}: {value}") 32 | 33 | # Test with an invalid transaction hash 34 | invalid_hash = "0x1234567890123456789012345678901234567890123456789012345678901234" 35 | 36 | print(f"\nLooking up invalid transaction: {invalid_hash}") 37 | invalid_tx = get_transaction_by_hash(invalid_hash) 38 | 39 | if "error" in invalid_tx: 40 | print(f"✅ Expected error for invalid hash: {invalid_tx['error']}") 41 | else: 42 | print(f"❌ Unexpected success for invalid hash: {invalid_tx}") 43 | return False 44 | 45 | return True 46 | 47 | if __name__ == "__main__": 48 | test_get_transaction_by_hash() -------------------------------------------------------------------------------- 
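The `get_transaction_by_hash` helper exercised above is imported from `cryo_mcp.server`, whose implementation is not shown in this file. Presumably it wraps the standard `eth_getTransactionByHash` JSON-RPC call, roughly along these lines; the function name `fetch_transaction` and the error shape here are illustrative assumptions, not the actual server code.

```python
# Illustrative sketch only -- not the actual cryo_mcp.server implementation.
import os
import requests

def fetch_transaction(tx_hash: str) -> dict:
    """Look up a transaction via the standard eth_getTransactionByHash RPC method."""
    rpc_url = os.environ.get("ETH_RPC_URL", "http://localhost:8545")
    payload = {
        "jsonrpc": "2.0",
        "method": "eth_getTransactionByHash",
        "params": [tx_hash],
        "id": 1,
    }
    data = requests.post(rpc_url, json=payload).json()
    if data.get("result") is None:
        # Unknown hashes come back with result: null
        return {"error": f"Transaction not found: {tx_hash}"}
    return data["result"]
```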
/tests/test_transactions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import os 4 | import subprocess 5 | from pathlib import Path 6 | 7 | # Set ETH_RPC_URL 8 | os.environ["ETH_RPC_URL"] = "http://10.0.0.48:8545" 9 | print(f"Using ETH_RPC_URL: {os.environ['ETH_RPC_URL']}") 10 | 11 | def test_transactions(): 12 | """Test fetching transactions for a specific block""" 13 | 14 | # Use a known block number 15 | block_num = 22001067 # You can replace this with any block number you want to test with 16 | block_range = f"{block_num}:{block_num+1}" 17 | 18 | # Create a temp directory for output 19 | temp_dir = Path("/tmp/cryo_tx_test") 20 | temp_dir.mkdir(exist_ok=True) 21 | 22 | cmd = ["cryo", "transactions", "-b", block_range, "-r", os.environ["ETH_RPC_URL"], "--json", "-o", str(temp_dir)] 23 | 24 | print(f"Running command: {' '.join(cmd)}") 25 | result = subprocess.run(cmd, capture_output=True, text=True) 26 | 27 | print(f"Return code: {result.returncode}") 28 | print(f"STDOUT: {result.stdout[:500]}...") 29 | 30 | # Find and read the output file 31 | output_files = list(temp_dir.glob("*transactions*.json")) 32 | print(f"Output files: {output_files}") 33 | 34 | if output_files: 35 | with open(output_files[0], 'r') as f: 36 | data = json.load(f) 37 | 38 | print(f"Number of transactions: {len(data)}") 39 | if data: 40 | print(f"First transaction hash: {data[0].get('transaction_hash')}") 41 | print(f"First transaction block number: {data[0].get('block_number')}") 42 | 43 | # Save the first transaction to a file for inspection 44 | print(f"Saving first transaction to ethereum__blocks_{block_num}_to_{block_num}.json") 45 | with open(f"ethereum__blocks_{block_num}_to_{block_num}.json", 'w') as outfile: 46 | json.dump(data, outfile, indent=2) 47 | 48 | return data 49 | 50 | return None 51 | 52 | if __name__ == "__main__": 53 | test_transactions() --------------------------------------------------------------------------------
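A quick usage sketch for `get_table_schema` from `cryo_mcp/sql.py`, pointed at the parquet fixture bundled under `tests/data/`; it assumes the repo root as the working directory.

```python
# Sketch: inspect the schema of the bundled test fixture.
from cryo_mcp.sql import get_table_schema

info = get_table_schema("tests/data/ethereum__blocks__00001000_to_00001004.parquet")
if info["success"]:
    # Column records come from DuckDB's information_schema.columns
    print([col["column_name"] for col in info["columns"]])
    print(f"{info['row_count']} rows")
else:
    print(info["error"])
```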