├── main.py ├── Dockerfile ├── requirements.txt ├── .gitignore ├── config.yml ├── utils └── func.py ├── config.py ├── LICENSE ├── prometheus.yml ├── promtail.yml ├── modules ├── version.py ├── epoch.py ├── node_health.py ├── balance.py ├── vote.py ├── block.py ├── leader_slot.py ├── slot.py └── validator.py ├── docker-compose.yml ├── exporter ├── collector.py └── exporter.py ├── prometheus └── metrics.py └── README.md /main.py: -------------------------------------------------------------------------------- 1 | from exporter.exporter import main 2 | 3 | if __name__ == "__main__": 4 | 5 | main() 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | WORKDIR /app 3 | COPY . . 4 | RUN pip install --root-user-action=ignore -r requirements.txt 5 | 6 | CMD ["python", "-u", "main.py"] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.10.5 2 | aiosignal==1.3.1 3 | loguru==0.7.2 4 | prometheus_client==0.21.0 5 | python-telegram-bot==21.6 6 | requests==2.32.3 7 | PyYAML==6.0.2 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | 3 | # Environments 4 | .venv 5 | env/ 6 | venv/ 7 | ENV/ 8 | env.bak/ 9 | venv.bak/ 10 | prometheus_data/ 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # pycharm 18 | .idea 19 | .DS_Store 20 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | pub_key: YOUR_PUB_KEY 2 | vote_pub_key: YOUR_VOTE_KEY 3 | network_rpc_endpoint: https://api.testnet.solana.com 4 | validator_rpc_endpoint: http://localhost:8899 5 | 6 | sleep_time: 45 7 | metric_port: 1234 8 | 9 | thread_pool_size: 2 10 | log_level: DEBUG # INFO/WARNING/SUCCESS/ERROR 11 | retry: 10 12 | -------------------------------------------------------------------------------- /utils/func.py: -------------------------------------------------------------------------------- 1 | def update_metric(metric, value, labels=None): 2 | """ 3 | Update Prometheus metric with optional labels and log the update if value is not None. 4 | 5 | :param metric: Prometheus metric to update 6 | :param value: Value to set for the metric 7 | :param labels: Optional dictionary of labels for the metric 8 | """ 9 | if value is not None: 10 | if labels: 11 | # If labels are provided, update metric with multiple labels 12 | metric.labels(**labels).set(value) 13 | else: 14 | # Update metric without labels 15 | metric.set(value) 16 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | HEADERS = {'Content-Type': 'application/json'} 4 | 5 | with open("config.yml", "r") as config_file: 6 | config = yaml.safe_load(config_file) 7 | 8 | PUB_KEY = config.get("pub_key") 9 | VOTE_PUB_KEY = config.get("vote_pub_key") 10 | NETWORK_RPC_ENDPOINT = config.get("network_rpc_endpoint", "https://api.testnet.solana.com") 11 | VALIDATOR_RPC_ENDPOINT = config.get("validator_rpc_endpoint", "http://localhost:8899") 12 | SOLANA_BINARY_PATH = config.get("solana_binary_path", "solana") 13 | 14 | THREAD_POOL_SIZE = config.get("thread_pool_size", 4) 15 | SLEEP_TIME = config.get("sleep_time", 45) 16 | PORT = config.get("metric_port", 1234) 17 | LOG_LEVEL = config.get("log_level", "INFO") 18 | RETRY = config.get("retry", 5) 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 qskyhigh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /prometheus.yml: -------------------------------------------------------------------------------- 1 | # my global config 2 | global: 3 | scrape_interval: 60s # Set the scrape interval to every 15 seconds. Default is every 1 minute. 4 | evaluation_interval: 60s # Evaluate rules every 15 seconds. The default is every 1 minute. 5 | # scrape_timeout is set to the global default (10s). 6 | external_labels: 7 | host: "solana-monitor-testnet" 8 | # Alertmanager configuration 9 | alerting: 10 | alertmanagers: 11 | - static_configs: 12 | - targets: 13 | # - alertmanager:9093 14 | 15 | # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. 16 | rule_files: 17 | # - "first_rules.yml" 18 | # - "second_rules.yml" 19 | 20 | # A scrape configuration containing exactly one endpoint to scrape: 21 | # Here it's Prometheus itself. 22 | scrape_configs: 23 | # The job name is added as a label `job=` to any timeseries scraped from this config. 24 | - job_name: 'prometheus' 25 | 26 | # metrics_path defaults to '/metrics' 27 | # scheme defaults to 'http'. 28 | 29 | static_configs: 30 | - targets: ['localhost:9090'] 31 | 32 | - job_name: 'solana' 33 | 34 | static_configs: 35 | - targets: ['localhost:1234'] 36 | labels: 37 | instance: 'solana-monitor' 38 | 39 | 40 | - job_name: 'node_exporter' 41 | 42 | static_configs: 43 | - targets: ['localhost:9100'] 44 | labels: 45 | instance: 'SolanaTestnet' 46 | 47 | remote_write: 48 | - url: https://prometheus-prod-13-prod-us-east-0.grafana.net/api/prom/push 49 | basic_auth: 50 | username: YOUR_USERNAME 51 | password: YOUR_API_TOKEN 52 | -------------------------------------------------------------------------------- /promtail.yml: -------------------------------------------------------------------------------- 1 | server: 2 | http_listen_port: 9080 3 | grpc_listen_port: 0 4 | 5 | positions: 6 | filename: /tmp/positions.yaml 7 | 8 | clients: 9 | - url: https://YOUR_USERNAME:YOUR_API_TOKEN@logs-prod-006.grafana.net/loki/api/v1/push 10 | scrape_configs: 11 | - job_name: "monitor-logs" 12 | static_configs: 13 | - targets: 14 | - localhost 15 | labels: 16 | __path__: /app/logs/monitor.log 17 | job: monitor-logs 18 | pipeline_stages: 19 | - regex: 20 | expression: '(?P\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) \| (?P\S+) *\| (?P.*?) - (?P.*)$' 21 | - labels: 22 | level: 23 | source: 24 | - timestamp: 25 | source: timestamp 26 | format: '2006-01-02 15:04:05.000' 27 | location: Europe/Moscow 28 | - output: 29 | source: message 30 | - job_name: "solana-logs" 31 | static_configs: 32 | - targets: 33 | - localhost 34 | labels: 35 | __path__: /var/log/solana.log 36 | job: solana-logs 37 | pipeline_stages: 38 | - regex: 39 | expression: '\[(?P.*?)\s+(?P.*?)\s+(?P.*?)\]\s+(?P.*)' 40 | - labels: 41 | level: 42 | source: 43 | - timestamp: 44 | source: timestamp 45 | format: RFC3339 46 | location: Europe/Moscow 47 | - match: 48 | selector: '{level!="ERROR"}' 49 | action: drop 50 | drop_counter_reason: promtail_noisy_error 51 | - output: 52 | source: message 53 | 54 | -------------------------------------------------------------------------------- /modules/version.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | from loguru import logger 3 | from config import HEADERS, VALIDATOR_RPC_ENDPOINT, PORT 4 | from utils.func import update_metric 5 | from prometheus.metrics import solana_node_version 6 | 7 | PROMETHEUS_METRICS_URL = f"http://localhost:{PORT}/metrics" 8 | 9 | 10 | async def get_versions_from_prometheus() -> list: 11 | try: 12 | async with aiohttp.ClientSession() as session: 13 | async with session.get(PROMETHEUS_METRICS_URL) as response: 14 | response.raise_for_status() 15 | metrics = await response.text() 16 | 17 | all_versions = [ 18 | line.split('{')[1].split('}')[0].split('=')[1].replace('"', '') 19 | for line in metrics.splitlines() 20 | if line.startswith(f'{solana_node_version._name}') and '1.0' in line 21 | ] 22 | return all_versions 23 | except Exception as e: 24 | logger.error(f"Failed to fetch version from Prometheus: {e}") 25 | return [] 26 | 27 | 28 | async def get_version(): 29 | all_versions = await get_versions_from_prometheus() 30 | payload = {"jsonrpc": "2.0", "id": 1, "method": "getVersion"} 31 | 32 | try: 33 | async with aiohttp.ClientSession() as session: 34 | async with session.post(VALIDATOR_RPC_ENDPOINT, json=payload, headers=HEADERS) as response: 35 | response.raise_for_status() 36 | result = await response.json() 37 | 38 | current_version = result['result'].get('solana-core') 39 | for version in all_versions: 40 | if version and version != current_version: 41 | update_metric(solana_node_version, 0, labels={"version": version}) 42 | 43 | update_metric(solana_node_version, 1, labels={"version": current_version}) 44 | logger.info(f"Node version of solana: {current_version}") 45 | 46 | except Exception as e: 47 | logger.error(f"Error getting version node: {e}") 48 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | solana-monitor: 3 | container_name: solana-monitor 4 | restart: unless-stopped 5 | build: 6 | context: . 7 | dockerfile: Dockerfile 8 | image: monitor-image:latest 9 | network_mode: "host" 10 | volumes: 11 | - /root/.local/share/solana/install/active_release/bin:/solana 12 | - ~/.config/solana:/root/.config/solana 13 | - app_logs:/app/logs 14 | - ./config.yml:/app/config.yml 15 | environment: 16 | - PATH=/solana:$PATH 17 | - TZ=Europe/Moscow 18 | 19 | promtail: 20 | container_name: promtail 21 | restart: unless-stopped 22 | image: grafana/promtail 23 | volumes: 24 | - ./promtail.yml:/etc/agent/agent.yaml 25 | - /root/solana/solana.log:/var/log/solana.log 26 | - app_logs:/app/logs 27 | environment: 28 | - TZ=Europe/Moscow 29 | command: 30 | - --config.file=/etc/agent/agent.yaml 31 | 32 | prometheus: 33 | container_name: prometheus 34 | restart: unless-stopped 35 | image: prom/prometheus 36 | network_mode: "host" 37 | volumes: 38 | - ./prometheus.yml:/etc/prometheus/prometheus.yml 39 | - prometheus_data:/prometheus 40 | command: 41 | - '--config.file=/etc/prometheus/prometheus.yml' 42 | - '--storage.tsdb.path=/prometheus' 43 | - '--web.console.libraries=/etc/prometheus/console_libraries' 44 | - '--web.console.templates=/etc/prometheus/consoles' 45 | - '--web.enable-lifecycle' 46 | 47 | node-exporter: 48 | container_name: node-exporter 49 | restart: unless-stopped 50 | image: prom/node-exporter:latest 51 | network_mode: "host" 52 | volumes: 53 | - /proc:/host/proc:ro 54 | - /sys:/host/sys:ro 55 | - /:/rootfs:ro 56 | command: 57 | - '--path.procfs=/host/proc' 58 | - '--path.rootfs=/rootfs' 59 | - '--path.sysfs=/host/sys' 60 | - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' 61 | 62 | networks: 63 | solana-monitor_monitoring: 64 | external: true 65 | 66 | volumes: 67 | app_logs: 68 | prometheus_data: 69 | 70 | -------------------------------------------------------------------------------- /exporter/collector.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from loguru import logger 3 | from concurrent.futures import ThreadPoolExecutor, as_completed 4 | from config import THREAD_POOL_SIZE 5 | from modules.balance import balance_metrics 6 | from modules.block import block_metrics 7 | from modules.epoch import get_epoch_information 8 | from modules.validator import validator_metrics, get_vote_accounts 9 | from modules.version import get_version 10 | from modules.leader_slot import leader_slot_metrics 11 | from modules.node_health import get_health 12 | from modules.slot import get_block_height, get_slots 13 | from modules.vote import get_votes 14 | 15 | 16 | def run_sync_tasks(): 17 | sync_tasks = [ 18 | block_metrics, 19 | validator_metrics 20 | ] 21 | 22 | with ThreadPoolExecutor(max_workers=THREAD_POOL_SIZE) as executor: 23 | futures = [executor.submit(task) for task in sync_tasks] 24 | 25 | for future in as_completed(futures): 26 | try: 27 | future.result() 28 | except Exception as e: 29 | logger.error(f"Error collecting sync metric: {e}, task: {future}") 30 | 31 | 32 | async def run_async_tasks(): 33 | async_tasks = { 34 | "get_block_height": get_block_height(), 35 | "get_slots": get_slots(), 36 | "get_votes": get_votes(), 37 | "balance_metrics": balance_metrics(), 38 | "get_vote_accounts": get_vote_accounts(), 39 | "leader_slot_metrics": leader_slot_metrics(), 40 | "get_epoch_information": get_epoch_information(), 41 | "get_health": get_health(), 42 | "get_version": get_version() 43 | } 44 | 45 | try: 46 | results = await asyncio.gather(*async_tasks.values(), return_exceptions=True) 47 | 48 | for task, result in zip(async_tasks.keys(), results): 49 | if isinstance(result, Exception): 50 | logger.error(f"{task.upper()}: {result}", exc_info=True) 51 | except Exception as e: 52 | logger.error(f"Error collecting async metric: {e}") 53 | 54 | 55 | async def collect(): 56 | loop = asyncio.get_event_loop() 57 | sync_task = loop.run_in_executor(None, run_sync_tasks) 58 | async_task = run_async_tasks() 59 | await asyncio.gather(sync_task, async_task) 60 | -------------------------------------------------------------------------------- /modules/epoch.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | from loguru import logger 3 | from config import NETWORK_RPC_ENDPOINT, HEADERS 4 | from utils.func import update_metric 5 | from prometheus.metrics import (solana_network_epoch, solana_tx_count, solana_slot_in_epoch, solana_slot_index) 6 | 7 | 8 | async def get_epoch_information(): 9 | """Fetches epoch information from Solana network and updates Prometheus metrics asynchronously.""" 10 | payload = { 11 | "jsonrpc": "2.0", "id": 1, "method": "getEpochInfo" 12 | } 13 | 14 | try: 15 | # Create async session and send request to Solana RPC endpoint 16 | # logger.info("Sending async request to fetch epoch information from Solana network.") 17 | async with aiohttp.ClientSession() as session: 18 | async with session.post(NETWORK_RPC_ENDPOINT, json=payload, headers=HEADERS) as response: 19 | response.raise_for_status() # Raise error for HTTP issues 20 | result = await response.json() 21 | 22 | # Validate required keys in the result 23 | if 'result' not in result: 24 | raise ValueError("Invalid response format: 'result' field is missing") 25 | 26 | # Extract metrics from result 27 | epoch = result['result'].get('epoch') 28 | update_metric(solana_network_epoch, epoch) 29 | slot_in_epoch = result['result'].get('slotsInEpoch') 30 | update_metric(solana_slot_in_epoch, slot_in_epoch) 31 | slot_index = result['result'].get('slotIndex') 32 | update_metric(solana_slot_index, slot_index) 33 | tx_count = result['result'].get('transactionCount') 34 | update_metric(solana_tx_count, tx_count) 35 | 36 | # logger.info("Successfully retrieved epoch information.") 37 | logger.debug(f"Epoch: {epoch}, Slot in Epoch: {slot_in_epoch}, Slot Index: {slot_index}, Transaction Count: {tx_count}") 38 | 39 | except aiohttp.ClientError as e: 40 | # Log network or connection errors 41 | logger.error(f"Network error occurred while fetching epoch information: {e}") 42 | except ValueError as e: 43 | # Log invalid response format issues 44 | logger.error(f"Data format error: {e}") 45 | except Exception as e: 46 | # Log any other unexpected errors 47 | logger.error(f"Unexpected error while getting epoch information: {e}") 48 | -------------------------------------------------------------------------------- /modules/node_health.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | from loguru import logger 3 | from config import VALIDATOR_RPC_ENDPOINT, HEADERS 4 | from utils.func import update_metric 5 | from prometheus.metrics import solana_node_health, solana_node_slots_behind 6 | 7 | 8 | async def get_health(): 9 | payload = { 10 | "jsonrpc": "2.0", "id": 1, "method": "getHealth" 11 | } 12 | 13 | try: 14 | # Create async session and send request to Validator RPC endpoint 15 | async with aiohttp.ClientSession() as session: 16 | async with session.post(VALIDATOR_RPC_ENDPOINT, json=payload, headers=HEADERS) as response: 17 | response.raise_for_status() # Raise error for HTTP issues 18 | result = await response.json() 19 | 20 | if "result" in result and result["result"] == "ok": 21 | update_metric(solana_node_health, 1, labels={"status": "healthy", "cause": "none"}) 22 | logger.info("Node is healthy") 23 | elif "error" in result: 24 | error_message = result["error"].get("message", "Unknown error") 25 | slots_behind = result["error"]["data"]["numSlotsBehind"] 26 | solana_node_health.labels(status="unhealthy", cause="slots_behind").set(0) 27 | update_metric(solana_node_health, 1, labels={"status": "unhealthy", "cause": "slots_behind"}) 28 | update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"}) 29 | update_metric(solana_node_slots_behind, slots_behind) 30 | logger.error(f"Node is unhealthy: {error_message}.") 31 | 32 | else: 33 | logger.error("Unexpected response format") 34 | update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"}) 35 | 36 | except aiohttp.ClientError as e: 37 | logger.error(f"Network error occurred while fetching node information: {e}") 38 | update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"}) 39 | except ValueError as e: 40 | logger.error(f"Data format error: {e}") 41 | update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"}) 42 | except Exception as e: 43 | logger.error(f"Error getting node status: {e}") 44 | update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"}) 45 | -------------------------------------------------------------------------------- /exporter/exporter.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import signal 3 | from prometheus_client import start_http_server 4 | import time 5 | from exporter.collector import collect 6 | from loguru import logger 7 | from config import SLEEP_TIME, PORT, LOG_LEVEL 8 | 9 | 10 | async def graceful_shutdown(loop, sig=None): 11 | """Handles graceful shutdown on receiving a signal""" 12 | if sig: 13 | logger.info(f"Received exit signal {sig.name}...") 14 | 15 | tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()] 16 | 17 | logger.info(f"Cancelling {len(tasks)} outstanding tasks") 18 | for task in tasks: 19 | task.cancel() 20 | 21 | await asyncio.gather(*tasks, return_exceptions=True) 22 | loop.stop() 23 | 24 | 25 | def setup_signals(loop): 26 | """Setup signal handling for graceful shutdown""" 27 | for sig in (signal.SIGINT, signal.SIGTERM): 28 | loop.add_signal_handler(sig, lambda: asyncio.create_task(graceful_shutdown(loop, sig))) 29 | 30 | 31 | async def run_exporter(): 32 | """Main function to run the Prometheus exporter""" 33 | logger.info(f"Starting Prometheus metrics server on localhost:{PORT}/metrics") 34 | start_http_server(PORT) 35 | 36 | while True: 37 | start_time = time.time() 38 | logger.info("Starting collection of metrics") 39 | try: 40 | await collect() 41 | logger.info(f"Metrics collected successfully in {time.time() - start_time:.2f} seconds") 42 | except Exception as e: 43 | logger.error(f"Error during metrics collection: {e}") 44 | 45 | logger.info(f"💤 Sleeping for {SLEEP_TIME} seconds") 46 | await asyncio.sleep(SLEEP_TIME) 47 | 48 | 49 | def main(): 50 | logger.add("logs/monitor.log", 51 | level=LOG_LEVEL, 52 | rotation="00:00", 53 | retention="6 days", 54 | compression=None, 55 | backtrace=True, 56 | diagnose=True, 57 | enqueue=True) 58 | 59 | loop = asyncio.get_event_loop() 60 | setup_signals(loop) # Graceful shutdown signal setup 61 | try: 62 | loop.run_until_complete(run_exporter()) 63 | except Exception as e: 64 | logger.error(f"Unexpected error: {e}") 65 | finally: 66 | logger.info("Shutting down Prometheus exporter") 67 | loop.close() 68 | -------------------------------------------------------------------------------- /modules/balance.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import time 3 | import aiohttp 4 | from loguru import logger 5 | from utils.func import update_metric 6 | from prometheus.metrics import solana_account_balance, solana_vote_account_balance 7 | from config import PUB_KEY, VOTE_PUB_KEY, NETWORK_RPC_ENDPOINT, HEADERS 8 | 9 | 10 | # Unified async function to fetch both balances 11 | async def fetch_balances(): 12 | # Payload for fetching both identity and vote account balances 13 | payload = [ 14 | {"jsonrpc": "2.0", "id": 1, "method": "getBalance", "params": [PUB_KEY]}, 15 | {"jsonrpc": "2.0", "id": 2, "method": "getBalance", "params": [VOTE_PUB_KEY]} 16 | ] 17 | 18 | try: 19 | # Send the requests asynchronously 20 | async with aiohttp.ClientSession() as session: 21 | async with session.post(NETWORK_RPC_ENDPOINT, json=payload, headers=HEADERS) as response: 22 | response.raise_for_status() 23 | results = await response.json() 24 | 25 | try: 26 | # Parsing the results for both accounts 27 | identity_balance = results[0]['result'].get('value') / 10 ** 9 if 'result' in results[0] else None 28 | vote_acc_balance = results[1]['result'].get('value') / 10 ** 9 if 'result' in results[1] else None 29 | 30 | logger.debug(f"Identity balance: {identity_balance} SOL, Vote account balance: {vote_acc_balance} SOL") 31 | 32 | return identity_balance, vote_acc_balance 33 | except Exception as e: 34 | logger.error(f"Error processing balance data: {e}") 35 | return None, None 36 | 37 | except aiohttp.ClientError as e: 38 | logger.error(f"Error making request to Solana RPC: {e}") 39 | return None, None 40 | 41 | 42 | # Main async function to gather and update modules 43 | async def balance_metrics(): 44 | logger.info(f"{inspect.currentframe().f_code.co_name}: Starting metrics collection process.") 45 | start_time = time.time() 46 | 47 | # Fetch both balances asynchronously 48 | identity_balance, vote_acc_balance = await fetch_balances() 49 | 50 | # Function to update Prometheus modules if values are valid 51 | update_metric(solana_account_balance, identity_balance) 52 | update_metric(solana_vote_account_balance, vote_acc_balance) 53 | 54 | end_time = time.time() 55 | logger.success(f"{inspect.currentframe().f_code.co_name}: Metrics successfully collected and exported to " 56 | f"Prometheus. Time: {end_time - start_time}") 57 | -------------------------------------------------------------------------------- /modules/vote.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import time 3 | import aiohttp 4 | import asyncio 5 | from loguru import logger 6 | from utils.func import update_metric 7 | from config import VALIDATOR_RPC_ENDPOINT, NETWORK_RPC_ENDPOINT, VOTE_PUB_KEY, HEADERS, RETRY 8 | from prometheus.metrics import solana_validator_vote_height, solana_network_vote_height, solana_vote_height_diff 9 | 10 | rpc_urls = { 11 | "network": NETWORK_RPC_ENDPOINT, 12 | "validator": VALIDATOR_RPC_ENDPOINT 13 | } 14 | 15 | 16 | def get_vote_accounts(result): 17 | if "error" in result: 18 | error_code = result["error"].get("code", "Unknown code") 19 | error_message = result["error"].get("message", "Unknown message") 20 | logger.error(f"Error fetching vote accounts - code: {error_code}, message: {error_message}") 21 | return None 22 | return result.get('result', {}).get('current', []) or result.get('result', {}).get('delinquent', []) 23 | 24 | 25 | async def measure_rpc_response_time(url, session, payload): 26 | try: 27 | start_time = time.time() 28 | async with session.post(url, json=payload, headers=HEADERS) as response: 29 | result = await response.json() 30 | end_time = time.time() 31 | return result, end_time - start_time 32 | except Exception as e: 33 | logger.error(f"Error while accessing {url}: {e}") 34 | return None, None 35 | 36 | 37 | async def make_requests(payload, func_name): 38 | async with aiohttp.ClientSession() as session: 39 | raw_results = await asyncio.gather( 40 | *[measure_rpc_response_time(url, session, payload) for url in rpc_urls.values()], 41 | return_exceptions=True 42 | ) 43 | 44 | results = [] 45 | response_times = {} 46 | 47 | for (row, response_time), name in zip(raw_results, rpc_urls.keys()): 48 | if row is not None: 49 | results.append(row) 50 | response_times[name] = response_time 51 | network_time = response_times.get('network') 52 | validator_time = response_times.get('validator') 53 | 54 | if network_time is not None: 55 | logger.debug(f"{func_name.upper()} Response time for network: {network_time:.4f} seconds") 56 | else: 57 | logger.warning(f"{func_name.upper()} No valid response time for network") 58 | 59 | if validator_time is not None: 60 | logger.debug(f"{func_name.upper()} Response time for validator: {validator_time:.4f} seconds") 61 | else: 62 | logger.warning(f"{func_name.upper()} No valid response time for validator") 63 | 64 | return results, response_times 65 | 66 | 67 | async def get_votes(): 68 | payload = { 69 | "jsonrpc": "2.0", 70 | "id": 1, 71 | "method": "getVoteAccounts", 72 | "params": [ 73 | { 74 | "votePubkey": VOTE_PUB_KEY 75 | } 76 | ] 77 | } 78 | func_name = inspect.currentframe().f_code.co_name 79 | blocks, response_times = await make_requests(payload, func_name) 80 | 81 | retry_count = 0 82 | last_blocks = None 83 | while retry_count < RETRY and any(t is not None and t > 1 for t in response_times.values()): 84 | retry_count += 1 85 | logger.info("One or more requests took longer than 1 second. Retrying...") 86 | blocks, response_times = await make_requests(payload, func_name) 87 | 88 | last_blocks = blocks 89 | 90 | if retry_count == RETRY: 91 | blocks = last_blocks 92 | 93 | try: 94 | results = blocks 95 | validator_vote_height = network_vote_height = None 96 | if results: 97 | vote_accounts_network = get_vote_accounts(results[0]) if len(results) > 0 else None 98 | if vote_accounts_network: 99 | network_vote_height = vote_accounts_network[0]['lastVote'] 100 | update_metric(solana_network_vote_height, network_vote_height, labels={"rpc": "network"}) 101 | logger.debug(f"Network vote height: {network_vote_height}") 102 | else: 103 | logger.warning(f"{func_name.upper()} No vote data for network") 104 | 105 | vote_accounts_validator = get_vote_accounts(results[1]) if len(results) > 1 else None 106 | if vote_accounts_validator: 107 | validator_vote_height = vote_accounts_validator[0]['lastVote'] 108 | update_metric(solana_validator_vote_height, validator_vote_height, labels={"rpc": "validator"}) 109 | logger.debug(f"Validator vote height: {validator_vote_height}") 110 | else: 111 | logger.warning(f"{func_name.upper()} No vote data for validator") 112 | 113 | if vote_accounts_network and vote_accounts_validator: 114 | update_metric(solana_vote_height_diff, validator_vote_height - network_vote_height) 115 | logger.debug(f"Diff vote height: {validator_vote_height - network_vote_height}") 116 | else: 117 | logger.error(f"Error processing vote data: {results}") 118 | except Exception as e: 119 | logger.error(f"Error processing vote data: {e}") 120 | -------------------------------------------------------------------------------- /prometheus/metrics.py: -------------------------------------------------------------------------------- 1 | from prometheus_client import Gauge 2 | 3 | # Prometheus Gauges 4 | # balance module 5 | solana_account_balance = Gauge('solana_account_balance', 'Identity account balance') 6 | solana_vote_account_balance = Gauge('solana_vote_account_balance', 'Vote account balance') 7 | 8 | # block module 9 | solana_net_skip_rate = Gauge('solana_net_skip_rate', 'Network skip rate') 10 | solana_skipped_total = Gauge('solana_skipped_total', 'Total skipped slots of network in current epoch') 11 | solana_val_blocks_produced = Gauge('solana_val_blocks_produced', 'Blocks produced of a validator in current epoch') 12 | solana_val_skip_rate = Gauge('solana_val_skip_rate', 'Validator skip rate') 13 | solana_val_skipped_slots = Gauge('solana_val_skipped_slots', 'Skipped slots of a validator in current epoch') 14 | solana_total_blocks_produced = Gauge('solana_total_blocks_produced', 'Total blocks produced in current epoch') 15 | solana_skip_rate_diff = Gauge('solana_skip_rate_diff', 'Skip rate difference of network and validator') 16 | solana_val_leader_slots = Gauge('solana_val_leader_slots', 'Leader slots of a validator in current epoch') 17 | solana_total_slots = Gauge('solana_total_slots', 'Total slots in current epoch') 18 | solana_confirmed_epoch_first_slot = Gauge('solana_confirmed_epoch_first_slot', 'First slot in current epoch') 19 | solana_confirmed_epoch_last_slot = Gauge('solana_confirmed_epoch_last_slot', 'Last slot in current epoch') 20 | 21 | # epoch module 22 | solana_node_version = Gauge('solana_node_version', 'Node version of solana', ['version']) 23 | solana_network_epoch = Gauge('solana_network_epoch', 'Current epoch of network (max confirmation)') 24 | solana_tx_count = Gauge('solana_tx_count', 'solana transaction count') 25 | solana_slot_in_epoch = Gauge('solana_slot_in_epoch', 'solana_slot_in_epoch') 26 | solana_slot_index = Gauge('solana_slot_index', 'solana_slot_index') 27 | 28 | # leader_slot module 29 | solana_val_total_leader_slots = Gauge('solana_val_total_leader_slots', 'Total number of leader slots in current epoch') 30 | solana_next_leader_slot = Gauge('solana_next_leader_slot', 'The next leader slot') 31 | solana_time_to_next_slot = Gauge('solana_time_to_next_slot', 'Time until the next leader slot in seconds') 32 | solana_avg_slot_duration = Gauge('solana_avg_slot_duration', 'Average slot duration in seconds') 33 | solana_next_slot_time = Gauge('solana_next_slot_time', 'Time of the next leader slot') 34 | solana_previous_leader_slot = Gauge('solana_previous_leader_slot', 'The previous leader slot') 35 | 36 | # node_health module 37 | solana_node_health = Gauge('solana_node_health', 'Health status of the Solana node', ['status', 'cause']) 38 | solana_node_slots_behind = Gauge('solana_node_slots_behind', 'Number of slots the Solana node is behind') 39 | 40 | # slot module 41 | solana_block_height = Gauge('solana_block_height', 'Current Block Height of validator') 42 | solana_network_block_height = Gauge('solana_network_block_height', 'Current Block Height of network') 43 | solana_block_height_diff = Gauge('solana_block_height_diff', 'Current Block Height difference of network and validator') 44 | solana_current_slot = Gauge('solana_current_slot', 'Current validator slot height') 45 | solana_net_current_slot = Gauge('solana_net_current_slot', 'Current network slot height') 46 | solana_slot_diff = Gauge('solana_slot_diff', 'Current slot difference of network and validator') 47 | solana_net_max_shred_insert_slot = Gauge('solana_net_max_shred_insert_slot', 'Get the max NETWORK slot seen from after shred insert') 48 | solana_net_max_retransmit_slot = Gauge('solana_net_max_retransmit_slot', 'Get the max NETWORK slot seen from retransmit stage') 49 | solana_val_max_shred_insert_slot = Gauge('solana_val_max_shred_insert_slot', 'Get the max VALIDATOR slot seen from after shred insert') 50 | solana_val_max_retransmit_slot = Gauge('solana_val_max_retransmit_slot', 'Get the max VALIDATOR slot seen from retransmit stage') 51 | 52 | # validator module 53 | solana_active_stake = Gauge('solana_active_stake', 'Active Stake SOLs') 54 | solana_current_stake = Gauge('solana_current_stake', 'Current Stake SOLs') 55 | solana_delinquent_stake = Gauge('solana_delinquent_stake', 'Delinquent Stake SOLs') 56 | solana_val_commission = Gauge('solana_val_commission', 'Solana validator current commission', ['commission']) 57 | solana_active_validators = Gauge('solana_active_validators', 'Total number of active validators by state', ['state']) 58 | solana_validator_activated_stake = Gauge('solana_validator_activated_stake', 'Activated stake per validator', 59 | ['pubkey', 'votekey']) 60 | solana_val_status = Gauge('solana_val_status', 'Solana validator voting status i.e., voting or jailed', ['state']) 61 | solana_vote_credits = Gauge('solana_vote_credits', 'Solana validator vote credits of current epoch') 62 | solana_avg_vote_credits = Gauge('solana_avg_vote_credits', 'Average network vote credits of current epoch') 63 | solana_total_credits = Gauge('solana_total_credits', 'Solana validator vote credits of all epochs') 64 | 65 | # vote module 66 | solana_validator_vote_height = Gauge('solana_validator_vote_height', 67 | 'Most recent VALIDATOR slot voted on by this vote account', 68 | ['rpc']) 69 | solana_network_vote_height = Gauge('solana_network_vote_height', 70 | 'Most recent NETWORK slot voted on by this vote account', 71 | ['rpc']) 72 | solana_vote_height_diff = Gauge('solana_vote_height_diff', 'Vote height difference of validator and network') 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Solana Monitor 3 | 4 | This project monitors Solana validator metrics using Docker and Docker Compose. It also integrates with Prometheus and Grafana for visualization. 5 | 6 | ## Prerequisites 7 | 8 | - **Docker**: Container management 9 | - **Docker Compose**: Multi-container orchestration 10 | - **Git**: To clone the repository 11 | 12 | ## Path Configuration 13 | 14 | ### Solana Node Path 15 | If you installed the Solana node under a different user (not `root`), you need to modify the path in `docker-compose.yml`. 16 | 17 | Find this line: 18 | ```yaml 19 | - /root/.local/share/solana/install/active_release/bin:/solana 20 | ``` 21 | 22 | You can find your Solana installation path by running: 23 | ```bash 24 | which solana 25 | ``` 26 | 27 | Similarly, update the Solana log path in the promtail service if needed: 28 | ```yaml 29 | - /root/solana/solana.log:/var/log/solana.log # Change /root to your user's home directory 30 | ``` 31 | 32 | These path adjustments ensure the containers can access your Solana installation and log files correctly. 33 | 34 | ## Building the Project 35 | 36 | ### 1. Install Docker and Docker Compose 37 | 38 | Set up Docker by adding the official Docker repository and installing the required packages. 39 | 40 | #### Install Docker: 41 | 42 | Follow the official [Docker installation guide](https://docs.docker.com/engine/install/ubuntu/) for more details. 43 | 44 | ```bash 45 | # Update packages and install dependencies 46 | sudo apt-get update 47 | sudo apt-get install ca-certificates curl 48 | 49 | # Add Docker's official GPG key and set up the repository 50 | sudo install -m 0755 -d /etc/apt/keyrings 51 | sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc 52 | sudo chmod a+r /etc/apt/keyrings/docker.asc 53 | echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] \ 54 | https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \ 55 | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null 56 | ``` 57 | ```bash 58 | sudo apt-get update 59 | sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin 60 | ``` 61 | #### Install Docker Compose: 62 | 63 | ```shell 64 | sudo curl -L https://github.com/docker/compose/releases/download/v2.29.7/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose 65 | sudo chmod +x /usr/local/bin/docker-compose 66 | ``` 67 | 68 | ### 2. Clone the Repository 69 | Download the project source code: 70 | ```bash 71 | git clone https://github.com/qskyhigh/solana-monitor-public 72 | cd solana-monitor-public 73 | ``` 74 | 75 | ### 3. Build and Start the Application 76 | Use Docker Compose to build the project and run the services in the background: 77 | ```bash 78 | docker-compose build --no-cache 79 | docker-compose up -d 80 | ``` 81 | 82 | ## Grafana Cloud API Token Configuration 83 | To connect Prometheus and Loki with Grafana Cloud, you need to generate your own API tokens and update the relevant configuration files. 84 | ### 1. Prometheus Configuration (`prometheus.yml`) 85 | 86 | In the `prometheus.yml` file, replace the `username` and `password` with your own Grafana Cloud Prometheus API credentials. 87 | ```yml 88 | remote_write: 89 | - url: https://prometheus-prod-13-prod-us-east-0.grafana.net/api/prom/push 90 | basic_auth: 91 | username: YOUR_USERNAME 92 | password: YOUR_API_TOKEN 93 | ``` 94 | 95 | ### 2. Loki Configuration (`promtail.yml`) 96 | In the `promtail.yml` file, replace the `username` and `token` with your own Grafana Cloud Loki credentials. 97 | ```yml 98 | clients: 99 | - url: https://YOUR_USERNAME:YOUR_API_TOKEN@logs-prod-006.grafana.net/loki/api/v1/push 100 | ``` 101 | 102 | #### How to Obtain API Tokens 103 | 1. Log in to your Grafana Cloud account. 104 | 2. Go to the API Keys section under Settings. 105 | 3. Generate API tokens for both Prometheus and Loki. 106 | 4. Use the generated tokens to replace the placeholders in `prometheus.yml` and `promtail.yml`. 107 | 108 | ## Grafana Dashboard Configuration 109 | 110 | The dashboard can be imported from the docs/ directory to your Grafana instance
111 | - Default is to utilize a label applied by the collector `host: solana-monitor-testnet` (you can change the global label in `prometheus.yml`) 112 | 113 | ## Testing 114 | You can check the running Docker containers with: 115 | ```bash 116 | docker ps 117 | ``` 118 | Once the containers are up, access Grafana to visualize Solana metrics. For more details on the dashboard configuration, refer to the provided Grafana screenshot: 119 | dqskyhigh-grafana 120 | 121 | ### Node Exporter Metrics 122 | 123 | To monitor system-level metrics such as CPU, memory, and disk usage, you can use the **Node Exporter**. 124 | 125 | You can download and import the Node Exporter dashboard with **ID 1860** from Grafana's dashboard library: 126 | 127 | 1. Go to Grafana and navigate to **Dashboards** > **Import**. 128 | 2. Enter the **Dashboard ID: `1860`** and click **Load**. 129 | 3. Select your Prometheus datasource and click **Import**. 130 | 131 | This will provide a comprehensive overview of your system's performance using Node Exporter metrics. 132 | 133 | ### 134 | If you found this project helpful, feel free to support by donating SOL to my wallet. 135 | 136 | **SOL Wallet Address**: `FNu9BCwCmgSmeCa56LCAErPBNeAdgnQJsBrrLgVbbMKt` 137 | 138 | Buy Me A Coffee -------------------------------------------------------------------------------- /modules/block.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import time 3 | import inspect 4 | import json 5 | from loguru import logger 6 | from config import PUB_KEY, SOLANA_BINARY_PATH 7 | from utils.func import update_metric 8 | from prometheus.metrics import (solana_net_skip_rate, solana_skipped_total, solana_val_blocks_produced, 9 | solana_val_skip_rate, solana_val_skipped_slots, solana_total_blocks_produced, 10 | solana_skip_rate_diff, solana_val_leader_slots, solana_total_slots, 11 | solana_confirmed_epoch_first_slot, solana_confirmed_epoch_last_slot) 12 | 13 | 14 | # Function to fetch block production data from Solana CLI 15 | def get_block_production(): 16 | # Run solana block-production command and capture output 17 | try: 18 | result = subprocess.run( 19 | [SOLANA_BINARY_PATH, "block-production", "--output", "json-compact"], 20 | capture_output=True, text=True, check=True 21 | ) 22 | # logger.info("Block production command executed successfully.") 23 | except subprocess.CalledProcessError as e: 24 | logger.error(f"Error executing solana block-production command: {e}") 25 | return None 26 | 27 | # Remove any "Note:" lines that might be in the output 28 | block_production = "\n".join([line for line in result.stdout.splitlines() if "Note:" not in line]) 29 | 30 | try: 31 | block_production_data = json.loads(block_production) 32 | # logger.info("Block production data successfully parsed.") 33 | except json.JSONDecodeError as e: 34 | logger.error(f"Error decoding JSON from block production data: {e}") 35 | return None 36 | 37 | # Return parsed data 38 | return block_production_data 39 | 40 | 41 | # Function to extract validator's modules and send them to Prometheus 42 | def process_metrics(block_production_data): 43 | 44 | if not block_production_data: 45 | logger.warning("No block production data available to process.") 46 | return 47 | 48 | # Retrieve network modules 49 | try: 50 | total_slots_skipped = block_production_data.get('total_slots_skipped') 51 | update_metric(solana_skipped_total, total_slots_skipped) 52 | total_slots = block_production_data.get('total_slots') 53 | update_metric(solana_total_slots, total_slots) 54 | total_blocks_produced = block_production_data.get('total_blocks_produced') 55 | update_metric(solana_total_blocks_produced, total_blocks_produced) 56 | start_slot = block_production_data.get('start_slot') 57 | update_metric(solana_confirmed_epoch_first_slot, start_slot) 58 | end_slot = block_production_data.get('end_slot') 59 | update_metric(solana_confirmed_epoch_last_slot, end_slot) 60 | total_net_skip_rate = (total_slots_skipped / total_slots) * 100 61 | update_metric(solana_net_skip_rate, total_net_skip_rate) 62 | logger.debug(f"Network metrics - Total slots skipped: {total_slots_skipped}, Total slots: {total_slots}, " 63 | f"Total blocks produced: {total_blocks_produced}, Start slot: {start_slot}, " 64 | f"End slot: {end_slot}, Net skip rate: {total_net_skip_rate}") 65 | except KeyError as e: 66 | logger.error(f"Key error when extracting network metrics: {e}") 67 | return 68 | 69 | validator_block_production = [ 70 | leader for leader in block_production_data.get("leaders", []) 71 | if leader.get("identityPubkey") == PUB_KEY 72 | ] 73 | 74 | # Retrieve validator-specific modules 75 | if validator_block_production: 76 | try: 77 | val_slots_skipped = validator_block_production[0].get('skippedSlots') 78 | update_metric(solana_val_skipped_slots, val_slots_skipped) 79 | val_leader_slots = validator_block_production[0].get('leaderSlots') 80 | # update_metric(solana_val_leader_slots, val_leader_slots) 81 | val_blocks_produced = validator_block_production[0].get('blocksProduced') 82 | update_metric(solana_val_blocks_produced, val_blocks_produced) 83 | val_skip_rate = (val_slots_skipped / val_leader_slots) * 100 84 | update_metric(solana_val_skip_rate, val_skip_rate) 85 | skip_rate_diff = val_skip_rate - total_net_skip_rate 86 | update_metric(solana_skip_rate_diff, skip_rate_diff) 87 | logger.debug( 88 | f"Validator metrics - blocks produced: {val_blocks_produced}, " 89 | f"skip rate: {val_skip_rate:.2f}%, slots skipped: {val_slots_skipped}, " 90 | f"leader_slots: {val_leader_slots}, skip rate diff: {skip_rate_diff}") 91 | except KeyError as e: 92 | logger.error(f"Key error when extracting validator-specific metrics: {e}") 93 | return 94 | else: 95 | logger.warning("No validator block production data found.") 96 | update_metric(solana_val_skipped_slots, 0) 97 | update_metric(solana_val_blocks_produced, 0) 98 | update_metric(solana_val_skip_rate, 0) 99 | update_metric(solana_skip_rate_diff, -total_net_skip_rate) 100 | 101 | 102 | # Main function to collect block production data and process it 103 | def block_metrics(): 104 | logger.info(f"{inspect.currentframe().f_code.co_name}: Starting metrics collection process.") 105 | start_time = time.time() 106 | 107 | # Fetch block production data 108 | block_production_data = get_block_production() 109 | 110 | # Process and send modules to Prometheus 111 | process_metrics(block_production_data) 112 | 113 | end_time = time.time() 114 | logger.success(f"{inspect.currentframe().f_code.co_name}: All metrics have been successfully collected and sent " 115 | f"to Prometheus. Time: {end_time - start_time}") 116 | -------------------------------------------------------------------------------- /modules/leader_slot.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import aiohttp 3 | import asyncio 4 | import time 5 | from datetime import datetime, timedelta 6 | from loguru import logger 7 | from utils.func import update_metric 8 | from config import NETWORK_RPC_ENDPOINT, PUB_KEY, HEADERS 9 | from prometheus.metrics import (solana_val_total_leader_slots, solana_next_leader_slot, solana_time_to_next_slot, 10 | solana_avg_slot_duration, solana_next_slot_time, solana_previous_leader_slot) 11 | 12 | 13 | # Generalized async function to fetch data from the Solana RPC 14 | async def fetch_rpc_data(session, method, params=None): 15 | payload = { 16 | "jsonrpc": "2.0", "id": 1, "method": method, "params": params or [] 17 | } 18 | 19 | try: 20 | async with session.post(NETWORK_RPC_ENDPOINT, headers=HEADERS, json=payload) as response: 21 | response.raise_for_status() 22 | return await response.json() 23 | except Exception as e: 24 | logger.error(f"Error fetching {method} from RPC: {e}") 25 | return None 26 | 27 | 28 | # Get current slot 29 | async def get_current_slot(session): 30 | result = await fetch_rpc_data(session, "getSlot", [{"commitment": "confirmed"}]) 31 | return result.get('result') if result else None 32 | 33 | 34 | # Get leader schedule 35 | async def get_leader_schedule(session): 36 | result = await fetch_rpc_data(session, "getLeaderSchedule", [None, {"identity": PUB_KEY}]) 37 | return result.get('result', {}).get(PUB_KEY, []) if result else None 38 | 39 | 40 | # Get epoch information 41 | async def get_epoch(session): 42 | epoch_schedule = await fetch_rpc_data(session, "getEpochSchedule") 43 | epoch_info = await fetch_rpc_data(session, "getEpochInfo") 44 | 45 | if epoch_schedule and epoch_info: 46 | return (epoch_schedule['result'].get('firstNormalEpoch'), 47 | epoch_schedule['result'].get('firstNormalSlot'), 48 | epoch_schedule['result'].get('slotsPerEpoch'), 49 | epoch_info['result'].get('epoch')) 50 | else: 51 | logger.error("Error fetching epoch data") 52 | return None, None, None, None 53 | 54 | 55 | # Calculate average slot duration 56 | async def calculate_slot_duration(session): 57 | result = await fetch_rpc_data(session, "getRecentPerformanceSamples", [1]) 58 | if result and len(result['result']) > 0: 59 | sample = result['result'][0] 60 | sample_period_secs = sample.get('samplePeriodSecs') 61 | num_slots = sample.get('numSlots') 62 | if num_slots != 0: 63 | return sample_period_secs / num_slots 64 | else: 65 | logger.warning("Warning: Division by zero, setting result to 0") 66 | return 0 67 | else: 68 | logger.error("Error fetching slot duration data") 69 | return None 70 | 71 | 72 | # Main function to gather and set Prometheus modules 73 | async def leader_slot_metrics(): 74 | logger.info(f"{inspect.currentframe().f_code.co_name}: Starting metrics collection process.") 75 | start_time = time.time() 76 | async with aiohttp.ClientSession() as session: 77 | # Parallel requests to Solana RPC 78 | current_slot, leader_slots_in_epoch, epoch_data, slot_duration = await asyncio.gather( 79 | get_current_slot(session), 80 | get_leader_schedule(session), 81 | get_epoch(session), 82 | calculate_slot_duration(session) 83 | ) 84 | 85 | if current_slot is None or leader_slots_in_epoch is None or epoch_data is None or slot_duration is None: 86 | logger.error("Failed to fetch all required data. Skipping metric collection.") 87 | return 88 | 89 | first_normal_epoch, first_normal_slot, slots_per_epoch, epoch = epoch_data 90 | if not all([first_normal_epoch, first_normal_slot, slots_per_epoch, epoch]): 91 | logger.error("Incomplete epoch data. Skipping metric collection.") 92 | return 93 | 94 | # Calculate next and previous leader slots 95 | first_slot_in_epoch = (epoch - first_normal_epoch) * slots_per_epoch + first_normal_slot 96 | next_slot = next((slot for slot in leader_slots_in_epoch if slot + first_slot_in_epoch > current_slot), None) 97 | previous_slot = next( 98 | (slot for slot in reversed(leader_slots_in_epoch) if slot + first_slot_in_epoch < current_slot), 0) 99 | 100 | if next_slot: 101 | next_slot_epoch = first_slot_in_epoch + next_slot 102 | time_to_next_slot = (next_slot_epoch - current_slot) * slot_duration 103 | next_slot_time = datetime.now() + timedelta(seconds=time_to_next_slot) 104 | next_slot_time = next_slot_time.replace(second=0, microsecond=0) 105 | next_slot_time_unix = time.mktime(next_slot_time.timetuple()) 106 | logger.debug(f"Next leader slot: {next_slot_epoch} in {time_to_next_slot:.2f}s") 107 | 108 | # Update Prometheus modules 109 | update_metric(solana_next_leader_slot, next_slot_epoch) 110 | update_metric(solana_time_to_next_slot, time_to_next_slot) 111 | update_metric(solana_next_slot_time, next_slot_time_unix) 112 | else: 113 | logger.warning("No upcoming leader slots found.") 114 | solana_next_leader_slot.set(0) 115 | solana_time_to_next_slot.set(0) 116 | solana_next_slot_time.set(0) 117 | 118 | previous_slot_epoch = first_slot_in_epoch + previous_slot 119 | update_metric(solana_previous_leader_slot, previous_slot_epoch) 120 | update_metric(solana_val_total_leader_slots, len(leader_slots_in_epoch)) 121 | update_metric(solana_avg_slot_duration, slot_duration) 122 | logger.debug(f"Previous leader slot: {previous_slot_epoch}, Total_leader_slots: {len(leader_slots_in_epoch)}, " 123 | f"Avg slot duration: {slot_duration}") 124 | end_time = time.time() 125 | logger.success(f"{inspect.currentframe().f_code.co_name}: Metrics successfully collected and exported to " 126 | f"Prometheus. Time: {end_time - start_time}") 127 | -------------------------------------------------------------------------------- /modules/slot.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import time 3 | import aiohttp 4 | import asyncio 5 | from loguru import logger 6 | from config import VALIDATOR_RPC_ENDPOINT, NETWORK_RPC_ENDPOINT, HEADERS, RETRY 7 | from utils.func import update_metric 8 | from prometheus.metrics import (solana_block_height, solana_network_block_height, solana_current_slot, 9 | solana_net_current_slot, solana_net_max_shred_insert_slot, 10 | solana_net_max_retransmit_slot, solana_slot_diff, solana_block_height_diff, 11 | solana_val_max_shred_insert_slot, solana_val_max_retransmit_slot) 12 | 13 | rpc_urls = { 14 | "network": NETWORK_RPC_ENDPOINT, 15 | "validator": VALIDATOR_RPC_ENDPOINT 16 | } 17 | 18 | 19 | def extract_slot(data, req_id): 20 | return next((item['result'] for item in data if item['id'] == req_id and 'error' not in item), None) 21 | 22 | 23 | async def measure_rpc_response_time(url, session, payload): 24 | try: 25 | start_time = time.time() 26 | async with session.post(url, json=payload, headers=HEADERS) as response: 27 | result = await response.json() 28 | end_time = time.time() 29 | return result, end_time - start_time 30 | except Exception as e: 31 | logger.error(f"Error while accessing {url}: {e}") 32 | return None, None 33 | 34 | 35 | async def make_requests(payload, func_name): 36 | 37 | async with aiohttp.ClientSession() as session: 38 | raw_results = await asyncio.gather( 39 | *[measure_rpc_response_time(url, session, payload) for url in rpc_urls.values()], 40 | return_exceptions=True 41 | ) 42 | 43 | results = [] 44 | response_times = {} 45 | 46 | for (row, response_time), name in zip(raw_results, rpc_urls.keys()): 47 | results.append(row) 48 | response_times[name] = response_time 49 | network_time = response_times.get('network') 50 | validator_time = response_times.get('validator') 51 | 52 | if network_time is not None: 53 | logger.debug(f"{func_name.upper()} Response time for network: {network_time:.4f} seconds") 54 | else: 55 | logger.warning(f"{func_name.upper()} No valid response time for network") 56 | 57 | if validator_time is not None: 58 | logger.debug(f"{func_name.upper()} Response time for validator: {validator_time:.4f} seconds") 59 | else: 60 | logger.warning(f"{func_name.upper()} No valid response time for validator") 61 | 62 | return results, response_times 63 | 64 | 65 | async def get_slots(): 66 | payload = [ 67 | {"jsonrpc": "2.0", "id": 1, "method": "getMaxRetransmitSlot"}, 68 | {"jsonrpc": "2.0", "id": 2, "method": "getMaxShredInsertSlot"}, 69 | {"jsonrpc": "2.0", "id": 3, "method": "getSlot", "params": [{"commitment": "confirmed"}]} 70 | ] 71 | func_name = inspect.currentframe().f_code.co_name 72 | slots, response_times = await make_requests(payload, func_name) 73 | 74 | retry_count = 0 75 | last_slots = None 76 | while retry_count < RETRY and any(t is not None and t > 1 for t in response_times.values()): 77 | retry_count += 1 78 | logger.info("One or more requests took longer than 1 second. Retrying...") 79 | slots, response_times = await make_requests(payload, func_name) 80 | 81 | last_slots = slots 82 | 83 | if retry_count == RETRY: 84 | slots = last_slots 85 | 86 | try: 87 | val_slot = net_slot = None 88 | if slots[0] is not None: 89 | net_slot = extract_slot(slots[0], 3) 90 | update_metric(solana_current_slot, net_slot) 91 | net_max_shred_insert_slot = extract_slot(slots[0], 2) 92 | update_metric(solana_net_max_shred_insert_slot, net_max_shred_insert_slot) 93 | net_max_retransmit_slot = extract_slot(slots[0], 1) 94 | update_metric(solana_net_max_retransmit_slot, net_max_retransmit_slot) 95 | logger.debug(f"Network slot: {net_slot}, " 96 | f"net_max_shred_insert_slot: {net_max_shred_insert_slot}, net_max_retransmit_slot: {net_max_retransmit_slot}") 97 | else: 98 | logger.warning(f"{func_name.upper()} No slot data for network") 99 | 100 | if slots[1] is not None: 101 | val_slot = extract_slot(slots[1], 3) 102 | update_metric(solana_net_current_slot, val_slot) 103 | val_max_shred_insert_slot = extract_slot(slots[1], 2) 104 | update_metric(solana_val_max_shred_insert_slot, val_max_shred_insert_slot) 105 | val_max_retransmit_slot = extract_slot(slots[1], 1) 106 | update_metric(solana_val_max_retransmit_slot, val_max_retransmit_slot) 107 | logger.debug(f"Validator slot: {val_slot}, " 108 | f"val_max_shred_insert_slot: {val_max_shred_insert_slot}, val_max_retransmit_slot: {val_max_retransmit_slot}") 109 | else: 110 | logger.warning(f"{func_name.upper()} No slot data for validator") 111 | 112 | if val_slot is not None and net_slot is not None: 113 | update_metric(solana_slot_diff, val_slot - net_slot) 114 | logger.debug(f"Slot diff: {val_slot - net_slot}") 115 | 116 | except Exception as e: 117 | logger.error(f"Error processing slots data: {e}") 118 | 119 | 120 | async def get_block_height(): 121 | payload = { 122 | "jsonrpc": "2.0", "id": 1, "method": "getBlockHeight" 123 | } 124 | 125 | func_name = inspect.currentframe().f_code.co_name 126 | blocks, response_times = await make_requests(payload, func_name) 127 | 128 | retry_count = 0 129 | last_blocks = None 130 | while retry_count < RETRY and any(t is not None and t > 1 for t in response_times.values()): 131 | retry_count += 1 132 | logger.info("One or more requests took longer than 1 second. Retrying...") 133 | blocks, response_times = await make_requests(payload, func_name) 134 | 135 | last_blocks = blocks 136 | 137 | if retry_count == RETRY: 138 | blocks = last_blocks 139 | 140 | try: 141 | val_block_height = net_block_height = None 142 | if blocks[0] is not None: 143 | net_block_height = blocks[0]['result'] 144 | update_metric(solana_network_block_height, net_block_height) 145 | else: 146 | logger.warning(f"{func_name.upper()} No block height data for network") 147 | 148 | if blocks[1] is not None: 149 | val_block_height = blocks[1]['result'] 150 | update_metric(solana_block_height, val_block_height) 151 | else: 152 | logger.warning(f"{func_name.upper()} No block height data for validator") 153 | 154 | if val_block_height and net_block_height: 155 | update_metric(solana_block_height_diff, val_block_height-net_block_height) 156 | logger.debug(f"Block diff: {val_block_height - net_block_height}") 157 | 158 | logger.debug(f"Network block height: {net_block_height}, Validator block height: {val_block_height}") 159 | 160 | except Exception as e: 161 | logger.error(f"Error processing blocks data: {e}") 162 | -------------------------------------------------------------------------------- /modules/validator.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import json 3 | import time 4 | import aiohttp 5 | import subprocess 6 | from loguru import logger 7 | from config import PUB_KEY, VOTE_PUB_KEY, NETWORK_RPC_ENDPOINT, SOLANA_BINARY_PATH, HEADERS 8 | from utils.func import update_metric 9 | from prometheus.metrics import (solana_active_stake, solana_current_stake, solana_delinquent_stake, solana_vote_credits, 10 | solana_active_validators, solana_validator_activated_stake, solana_val_status, 11 | solana_total_credits, solana_val_commission, solana_avg_vote_credits) 12 | 13 | 14 | def get_validators(): 15 | """Fetch validators information using Solana CLI and update Prometheus modules.""" 16 | try: 17 | result = subprocess.run( 18 | [SOLANA_BINARY_PATH, "validators", "--output", "json-compact"], 19 | capture_output=True, text=True, check=True 20 | ) 21 | logger.info("Successfully executed solana validators command.") 22 | except subprocess.CalledProcessError as e: 23 | logger.error(f"Error while running solana validators command: {e}") 24 | return None 25 | 26 | # Remove any "Note:" lines that might be in the output 27 | validators = "\n".join([line for line in result.stdout.splitlines() if "Note:" not in line]) 28 | 29 | try: 30 | # Parse the JSON data from Solana validators 31 | validators_data = json.loads(validators) 32 | logger.info("Block production data successfully parsed.") 33 | except json.JSONDecodeError as e: 34 | logger.error(f"Error decoding JSON from block production data: {e}") 35 | return None 36 | 37 | # Return parsed data 38 | return validators_data 39 | 40 | 41 | def process_metrics(validators_data): 42 | if not validators_data: 43 | logger.warning("No validator data available to process.") 44 | return 45 | 46 | # Retrieve network modules 47 | try: 48 | active_stake = validators_data.get('totalActiveStake') / 10 ** 9 49 | current_stake = validators_data.get('totalCurrentStake') / 10 ** 9 50 | delinquent_stake = validators_data.get('totalDelinquentStake') / 10 ** 9 51 | logger.debug(f'Active Stake: {round(active_stake, 2)}, Current Stake: {round(current_stake, 2)}, ' 52 | f'Delinquent Stake: {round(delinquent_stake, 2)}') 53 | except KeyError as e: 54 | logger.error(f"EKey error when extracting validator-specific modules: {e}") 55 | return 56 | 57 | # Dictionary mapping Prometheus Gauges to their corresponding values 58 | update_metric(solana_active_stake, active_stake) 59 | update_metric(solana_current_stake, current_stake) 60 | update_metric(solana_delinquent_stake, delinquent_stake) 61 | 62 | 63 | def validator_metrics(): 64 | logger.info(f"{inspect.currentframe().f_code.co_name}: Starting modules collection process.") 65 | start_time = time.time() 66 | 67 | # Fetch validators data 68 | validators_data = get_validators() 69 | 70 | # Process and send modules to Prometheus 71 | process_metrics(validators_data) 72 | 73 | end_time = time.time() 74 | logger.success(f"{inspect.currentframe().f_code.co_name}: All modules have been successfully collected and sent " 75 | f"to Prometheus. Time: {end_time - start_time}") 76 | 77 | 78 | async def get_vote_accounts(): 79 | """Fetch vote account information using RPC and update Prometheus metrics.""" 80 | payload = [ 81 | {"jsonrpc": "2.0", "id": 1, "method": "getVoteAccounts", "params": [{"commitment": "recent"}]}, 82 | {"jsonrpc": "2.0", "id": 2, "method": "getEpochInfo"} 83 | ] 84 | 85 | logger.info(f"{inspect.currentframe().f_code.co_name}: Starting modules collection process.") 86 | start_time = time.time() 87 | 88 | try: 89 | async with aiohttp.ClientSession() as session: 90 | async with session.post(NETWORK_RPC_ENDPOINT, json=payload, headers=HEADERS) as response: 91 | response.raise_for_status() 92 | result = await response.json() 93 | 94 | current_val = result[0]['result'].get('current', []) 95 | delinquent_val = result[0]['result'].get('delinquent', []) 96 | current_epoch = result[1]['result']['epoch'] 97 | 98 | update_metric(solana_active_validators, len(current_val), labels={"state": "current"}) 99 | update_metric(solana_active_validators, len(delinquent_val), labels={"state": "delinquent"}) 100 | 101 | logger.debug(f'Current: {len(current_val)}, Delinquent: {len(delinquent_val)}') 102 | 103 | all_vote_credits = [] 104 | vote_account = None 105 | 106 | all_accounts = current_val + delinquent_val 107 | 108 | for account in all_accounts: 109 | if account.get('nodePubkey') == PUB_KEY: 110 | vote_account = account 111 | if account in delinquent_val: 112 | logger.error("Your Solana validator is in DELINQUENT state") 113 | 114 | activated_stake = account.get('activatedStake') 115 | epoch_credits = account.get('epochCredits', []) 116 | 117 | if activated_stake and epoch_credits: 118 | last_epoch = epoch_credits[-1] 119 | if last_epoch and current_epoch == last_epoch[0]: 120 | vote_credits = last_epoch[1] - last_epoch[2] 121 | all_vote_credits.append(vote_credits) 122 | 123 | if all_vote_credits: 124 | avg_vote_credits = sum(all_vote_credits) / len(all_vote_credits) 125 | update_metric(solana_avg_vote_credits, avg_vote_credits) 126 | logger.debug(f'Average Network Vote credits: {avg_vote_credits}') 127 | 128 | if vote_account: 129 | process_vote_account(vote_account) 130 | else: 131 | logger.error("Validator account not found in both current and delinquent lists.") 132 | 133 | except Exception as e: 134 | logger.error(f"Error fetching or processing vote accounts: {e}") 135 | 136 | end_time = time.time() 137 | logger.success(f"{inspect.currentframe().f_code.co_name}: Collection completed in {end_time - start_time:.2f} seconds.") 138 | 139 | 140 | def process_vote_account(vote_account): 141 | """Process and update metrics for the given vote account.""" 142 | val_stake = vote_account.get('activatedStake') / 10 ** 9 143 | commission = vote_account.get('commission') 144 | epoch_vote = vote_account.get('epochVoteAccount') 145 | root_slot = vote_account.get('rootSlot') 146 | last_epoch = vote_account.get('epochCredits')[-1] 147 | vote_credits = last_epoch[1] - last_epoch[2] 148 | total_credits = last_epoch[1] 149 | 150 | update_metric(solana_validator_activated_stake, val_stake, labels={"pubkey": PUB_KEY, "votekey": VOTE_PUB_KEY}) 151 | update_metric(solana_val_commission, commission, labels={"commission": str(commission)}) 152 | 153 | logger.debug(f'Validator Stake: {round(val_stake, 2)}, Commission: {commission}, Epoch vote: {epoch_vote}, ' 154 | f'Vote credits: {vote_credits}, Total credits: {total_credits}') 155 | 156 | update_metric(solana_val_status, 1 if epoch_vote else 0, labels={"state": "voting" if epoch_vote else "not voting"}) 157 | update_metric(solana_vote_credits, vote_credits) 158 | update_metric(solana_total_credits, total_credits) 159 | 160 | logger.info("Updated Prometheus metrics for validator.") 161 | --------------------------------------------------------------------------------