├── main.py
├── Dockerfile
├── requirements.txt
├── .gitignore
├── config.yml
├── utils
    └── func.py
├── config.py
├── LICENSE
├── prometheus.yml
├── promtail.yml
├── modules
    ├── version.py
    ├── epoch.py
    ├── node_health.py
    ├── balance.py
    ├── vote.py
    ├── block.py
    ├── leader_slot.py
    ├── slot.py
    └── validator.py
├── docker-compose.yml
├── exporter
    ├── collector.py
    └── exporter.py
├── prometheus
    └── metrics.py
└── README.md


/main.py:
--------------------------------------------------------------------------------
1 | from exporter.exporter import main
2 | 
3 | if __name__ == "__main__":
4 | 
5 |     main()
6 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.12-slim
2 | WORKDIR /app
3 | COPY . .
4 | RUN pip install --root-user-action=ignore -r requirements.txt
5 | 
6 | CMD ["python", "-u",  "main.py"]


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.10.5
2 | aiosignal==1.3.1
3 | loguru==0.7.2
4 | prometheus_client==0.21.0
5 | python-telegram-bot==21.6
6 | requests==2.32.3
7 | PyYAML==6.0.2


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.log
 2 | 
 3 | # Environments
 4 | .venv
 5 | env/
 6 | venv/
 7 | ENV/
 8 | env.bak/
 9 | venv.bak/
10 | prometheus_data/
11 | 
12 | # Byte-compiled / optimized / DLL files
13 | __pycache__/
14 | *.py[cod]
15 | *$py.class
16 | 
17 | # pycharm
18 | .idea
19 | .DS_Store
20 | 


--------------------------------------------------------------------------------
/config.yml:
--------------------------------------------------------------------------------
 1 | pub_key: YOUR_PUB_KEY
 2 | vote_pub_key: YOUR_VOTE_KEY
 3 | network_rpc_endpoint: https://api.testnet.solana.com
 4 | validator_rpc_endpoint: http://localhost:8899
 5 | 
 6 | sleep_time: 45
 7 | metric_port: 1234
 8 | 
 9 | thread_pool_size: 2
10 | log_level: DEBUG  # INFO/WARNING/SUCCESS/ERROR
11 | retry: 10
12 | 


--------------------------------------------------------------------------------
/utils/func.py:
--------------------------------------------------------------------------------
 1 | def update_metric(metric, value, labels=None):
 2 |     """
 3 |     Update Prometheus metric with optional labels and log the update if value is not None.
 4 | 
 5 |     :param metric: Prometheus metric to update
 6 |     :param value: Value to set for the metric
 7 |     :param labels: Optional dictionary of labels for the metric
 8 |     """
 9 |     if value is not None:
10 |         if labels:
11 |             # If labels are provided, update metric with multiple labels
12 |             metric.labels(**labels).set(value)
13 |         else:
14 |             # Update metric without labels
15 |             metric.set(value)
16 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | import yaml
 2 | 
 3 | HEADERS = {'Content-Type': 'application/json'}
 4 | 
 5 | with open("config.yml", "r") as config_file:
 6 |     config = yaml.safe_load(config_file)
 7 | 
 8 | PUB_KEY = config.get("pub_key")
 9 | VOTE_PUB_KEY = config.get("vote_pub_key")
10 | NETWORK_RPC_ENDPOINT = config.get("network_rpc_endpoint", "https://api.testnet.solana.com")
11 | VALIDATOR_RPC_ENDPOINT = config.get("validator_rpc_endpoint", "http://localhost:8899")
12 | SOLANA_BINARY_PATH = config.get("solana_binary_path", "solana")
13 | 
14 | THREAD_POOL_SIZE = config.get("thread_pool_size", 4)
15 | SLEEP_TIME = config.get("sleep_time", 45)
16 | PORT = config.get("metric_port", 1234)
17 | LOG_LEVEL = config.get("log_level", "INFO")
18 | RETRY = config.get("retry", 5)
19 | 
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 qskyhigh
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/prometheus.yml:
--------------------------------------------------------------------------------
 1 | # my global config
 2 | global:
 3 |   scrape_interval:     60s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
 4 |   evaluation_interval: 60s # Evaluate rules every 15 seconds. The default is every 1 minute.
 5 |   # scrape_timeout is set to the global default (10s).
 6 |   external_labels:
 7 |     host: "solana-monitor-testnet"
 8 | # Alertmanager configuration
 9 | alerting:
10 |   alertmanagers:
11 |   - static_configs:
12 |     - targets:
13 | #       - alertmanager:9093
14 | 
15 | # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
16 | rule_files:
17 | #   - "first_rules.yml"
18 | #   - "second_rules.yml"
19 | 
20 | # A scrape configuration containing exactly one endpoint to scrape:
21 | # Here it's Prometheus itself.
22 | scrape_configs:
23 |   # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
24 |   - job_name: 'prometheus'
25 | 
26 |     # metrics_path defaults to '/metrics'
27 |     # scheme defaults to 'http'.
28 | 
29 |     static_configs:
30 |     - targets: ['localhost:9090']
31 | 
32 |   - job_name: 'solana'
33 | 
34 |     static_configs:
35 |     - targets: ['localhost:1234']
36 |       labels:
37 |         instance: 'solana-monitor'
38 | 
39 | 
40 |   - job_name: 'node_exporter'
41 | 
42 |     static_configs:
43 |     - targets: ['localhost:9100']
44 |       labels:
45 |         instance: 'SolanaTestnet'
46 | 
47 | remote_write:
48 | - url: https://prometheus-prod-13-prod-us-east-0.grafana.net/api/prom/push
49 |   basic_auth:
50 |     username: YOUR_USERNAME
51 |     password: YOUR_API_TOKEN
52 | 


--------------------------------------------------------------------------------
/promtail.yml:
--------------------------------------------------------------------------------
 1 | server:
 2 |   http_listen_port: 9080
 3 |   grpc_listen_port: 0
 4 | 
 5 | positions:
 6 |   filename: /tmp/positions.yaml
 7 | 
 8 | clients:
 9 |   - url: https://YOUR_USERNAME:YOUR_API_TOKEN@logs-prod-006.grafana.net/loki/api/v1/push
10 | scrape_configs:
11 |   - job_name: "monitor-logs"
12 |     static_configs:
13 |       - targets:
14 |           - localhost
15 |         labels:
16 |           __path__: /app/logs/monitor.log
17 |           job: monitor-logs
18 |     pipeline_stages:
19 |       - regex:
20 |           expression: '(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}) \| (?P<level>\S+) *\| (?P<source>.*?) - (?P<message>.*)$'
21 |       - labels:
22 |           level:
23 |           source:
24 |       - timestamp:
25 |           source: timestamp
26 |           format: '2006-01-02 15:04:05.000'
27 |           location: Europe/Moscow
28 |       - output:
29 |           source: message
30 |   - job_name: "solana-logs"
31 |     static_configs:
32 |       - targets:
33 |           - localhost
34 |         labels:
35 |           __path__: /var/log/solana.log
36 |           job: solana-logs
37 |     pipeline_stages:
38 |       - regex:
39 |           expression: '\[(?P<timestamp>.*?)\s+(?P<level>.*?)\s+(?P<source>.*?)\]\s+(?P<message>.*)'
40 |       - labels:
41 |           level:
42 |           source:
43 |       - timestamp:
44 |           source: timestamp
45 |           format: RFC3339
46 |           location: Europe/Moscow
47 |       - match:
48 |           selector: '{level!="ERROR"}'
49 |           action: drop
50 |           drop_counter_reason: promtail_noisy_error
51 |       - output:
52 |           source: message
53 | 
54 | 


--------------------------------------------------------------------------------
/modules/version.py:
--------------------------------------------------------------------------------
 1 | import aiohttp
 2 | from loguru import logger
 3 | from config import HEADERS, VALIDATOR_RPC_ENDPOINT, PORT
 4 | from utils.func import update_metric
 5 | from prometheus.metrics import solana_node_version
 6 | 
 7 | PROMETHEUS_METRICS_URL = f"http://localhost:{PORT}/metrics"
 8 | 
 9 | 
10 | async def get_versions_from_prometheus() -> list:
11 |     try:
12 |         async with aiohttp.ClientSession() as session:
13 |             async with session.get(PROMETHEUS_METRICS_URL) as response:
14 |                 response.raise_for_status()
15 |                 metrics = await response.text()
16 | 
17 |         all_versions = [
18 |             line.split('{')[1].split('}')[0].split('=')[1].replace('"', '')
19 |             for line in metrics.splitlines()
20 |             if line.startswith(f'{solana_node_version._name}') and '1.0' in line
21 |         ]
22 |         return all_versions
23 |     except Exception as e:
24 |         logger.error(f"Failed to fetch version from Prometheus: {e}")
25 |         return []
26 | 
27 | 
28 | async def get_version():
29 |     all_versions = await get_versions_from_prometheus()
30 |     payload = {"jsonrpc": "2.0", "id": 1, "method": "getVersion"}
31 | 
32 |     try:
33 |         async with aiohttp.ClientSession() as session:
34 |             async with session.post(VALIDATOR_RPC_ENDPOINT, json=payload, headers=HEADERS) as response:
35 |                 response.raise_for_status()
36 |                 result = await response.json()
37 | 
38 |         current_version = result['result'].get('solana-core')
39 |         for version in all_versions:
40 |             if version and version != current_version:
41 |                 update_metric(solana_node_version, 0, labels={"version": version})
42 | 
43 |         update_metric(solana_node_version, 1, labels={"version": current_version})
44 |         logger.info(f"Node version of solana: {current_version}")
45 | 
46 |     except Exception as e:
47 |         logger.error(f"Error getting version node: {e}")
48 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |   solana-monitor:
 3 |     container_name: solana-monitor
 4 |     restart: unless-stopped
 5 |     build:
 6 |       context: .
 7 |       dockerfile: Dockerfile
 8 |     image: monitor-image:latest
 9 |     network_mode: "host"
10 |     volumes:
11 |       - /root/.local/share/solana/install/active_release/bin:/solana
12 |       - ~/.config/solana:/root/.config/solana
13 |       - app_logs:/app/logs
14 |       - ./config.yml:/app/config.yml
15 |     environment:
16 |       - PATH=/solana:$PATH
17 |       - TZ=Europe/Moscow
18 | 
19 |   promtail:
20 |     container_name: promtail
21 |     restart: unless-stopped
22 |     image: grafana/promtail
23 |     volumes:
24 |       - ./promtail.yml:/etc/agent/agent.yaml
25 |       - /root/solana/solana.log:/var/log/solana.log
26 |       - app_logs:/app/logs
27 |     environment:
28 |       - TZ=Europe/Moscow
29 |     command:
30 |       - --config.file=/etc/agent/agent.yaml
31 | 
32 |   prometheus:
33 |     container_name: prometheus
34 |     restart: unless-stopped
35 |     image: prom/prometheus
36 |     network_mode: "host"
37 |     volumes:
38 |       - ./prometheus.yml:/etc/prometheus/prometheus.yml
39 |       - prometheus_data:/prometheus
40 |     command:
41 |       - '--config.file=/etc/prometheus/prometheus.yml'
42 |       - '--storage.tsdb.path=/prometheus'
43 |       - '--web.console.libraries=/etc/prometheus/console_libraries'
44 |       - '--web.console.templates=/etc/prometheus/consoles'
45 |       - '--web.enable-lifecycle'
46 | 
47 |   node-exporter:
48 |     container_name: node-exporter
49 |     restart: unless-stopped
50 |     image: prom/node-exporter:latest
51 |     network_mode: "host"
52 |     volumes:
53 |       - /proc:/host/proc:ro
54 |       - /sys:/host/sys:ro
55 |       - /:/rootfs:ro
56 |     command:
57 |       - '--path.procfs=/host/proc'
58 |       - '--path.rootfs=/rootfs'
59 |       - '--path.sysfs=/host/sys'
60 |       - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
61 | 
62 | networks:
63 |   solana-monitor_monitoring:
64 |     external: true
65 | 
66 | volumes:
67 |   app_logs:
68 |   prometheus_data:
69 | 
70 | 


--------------------------------------------------------------------------------
/exporter/collector.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from loguru import logger
 3 | from concurrent.futures import ThreadPoolExecutor, as_completed
 4 | from config import THREAD_POOL_SIZE
 5 | from modules.balance import balance_metrics
 6 | from modules.block import block_metrics
 7 | from modules.epoch import get_epoch_information
 8 | from modules.validator import validator_metrics, get_vote_accounts
 9 | from modules.version import get_version
10 | from modules.leader_slot import leader_slot_metrics
11 | from modules.node_health import get_health
12 | from modules.slot import get_block_height, get_slots
13 | from modules.vote import get_votes
14 | 
15 | 
16 | def run_sync_tasks():
17 |     sync_tasks = [
18 |         block_metrics,
19 |         validator_metrics
20 |     ]
21 | 
22 |     with ThreadPoolExecutor(max_workers=THREAD_POOL_SIZE) as executor:
23 |         futures = [executor.submit(task) for task in sync_tasks]
24 | 
25 |         for future in as_completed(futures):
26 |             try:
27 |                 future.result()
28 |             except Exception as e:
29 |                 logger.error(f"Error collecting sync metric: {e}, task: {future}")
30 | 
31 | 
32 | async def run_async_tasks():
33 |     async_tasks = {
34 |         "get_block_height": get_block_height(),
35 |         "get_slots": get_slots(),
36 |         "get_votes": get_votes(),
37 |         "balance_metrics": balance_metrics(),
38 |         "get_vote_accounts": get_vote_accounts(),
39 |         "leader_slot_metrics": leader_slot_metrics(),
40 |         "get_epoch_information": get_epoch_information(),
41 |         "get_health": get_health(),
42 |         "get_version": get_version()
43 |     }
44 | 
45 |     try:
46 |         results = await asyncio.gather(*async_tasks.values(), return_exceptions=True)
47 | 
48 |         for task, result in zip(async_tasks.keys(), results):
49 |             if isinstance(result, Exception):
50 |                 logger.error(f"{task.upper()}: {result}", exc_info=True)
51 |     except Exception as e:
52 |         logger.error(f"Error collecting async metric: {e}")
53 | 
54 | 
55 | async def collect():
56 |     loop = asyncio.get_event_loop()
57 |     sync_task = loop.run_in_executor(None, run_sync_tasks)
58 |     async_task = run_async_tasks()
59 |     await asyncio.gather(sync_task, async_task)
60 | 


--------------------------------------------------------------------------------
/modules/epoch.py:
--------------------------------------------------------------------------------
 1 | import aiohttp
 2 | from loguru import logger
 3 | from config import NETWORK_RPC_ENDPOINT, HEADERS
 4 | from utils.func import update_metric
 5 | from prometheus.metrics import (solana_network_epoch, solana_tx_count, solana_slot_in_epoch, solana_slot_index)
 6 | 
 7 | 
 8 | async def get_epoch_information():
 9 |     """Fetches epoch information from Solana network and updates Prometheus metrics asynchronously."""
10 |     payload = {
11 |         "jsonrpc": "2.0", "id": 1, "method": "getEpochInfo"
12 |     }
13 | 
14 |     try:
15 |         # Create async session and send request to Solana RPC endpoint
16 |         # logger.info("Sending async request to fetch epoch information from Solana network.")
17 |         async with aiohttp.ClientSession() as session:
18 |             async with session.post(NETWORK_RPC_ENDPOINT, json=payload, headers=HEADERS) as response:
19 |                 response.raise_for_status()  # Raise error for HTTP issues
20 |                 result = await response.json()
21 | 
22 |         # Validate required keys in the result
23 |         if 'result' not in result:
24 |             raise ValueError("Invalid response format: 'result' field is missing")
25 | 
26 |         # Extract metrics from result
27 |         epoch = result['result'].get('epoch')
28 |         update_metric(solana_network_epoch, epoch)
29 |         slot_in_epoch = result['result'].get('slotsInEpoch')
30 |         update_metric(solana_slot_in_epoch, slot_in_epoch)
31 |         slot_index = result['result'].get('slotIndex')
32 |         update_metric(solana_slot_index, slot_index)
33 |         tx_count = result['result'].get('transactionCount')
34 |         update_metric(solana_tx_count, tx_count)
35 | 
36 |         # logger.info("Successfully retrieved epoch information.")
37 |         logger.debug(f"Epoch: {epoch}, Slot in Epoch: {slot_in_epoch}, Slot Index: {slot_index}, Transaction Count: {tx_count}")
38 | 
39 |     except aiohttp.ClientError as e:
40 |         # Log network or connection errors
41 |         logger.error(f"Network error occurred while fetching epoch information: {e}")
42 |     except ValueError as e:
43 |         # Log invalid response format issues
44 |         logger.error(f"Data format error: {e}")
45 |     except Exception as e:
46 |         # Log any other unexpected errors
47 |         logger.error(f"Unexpected error while getting epoch information: {e}")
48 | 


--------------------------------------------------------------------------------
/modules/node_health.py:
--------------------------------------------------------------------------------
 1 | import aiohttp
 2 | from loguru import logger
 3 | from config import VALIDATOR_RPC_ENDPOINT, HEADERS
 4 | from utils.func import update_metric
 5 | from prometheus.metrics import solana_node_health, solana_node_slots_behind
 6 | 
 7 | 
 8 | async def get_health():
 9 |     payload = {
10 |         "jsonrpc": "2.0", "id": 1, "method": "getHealth"
11 |     }
12 | 
13 |     try:
14 |         # Create async session and send request to Validator RPC endpoint
15 |         async with aiohttp.ClientSession() as session:
16 |             async with session.post(VALIDATOR_RPC_ENDPOINT, json=payload, headers=HEADERS) as response:
17 |                 response.raise_for_status()  # Raise error for HTTP issues
18 |                 result = await response.json()
19 | 
20 |         if "result" in result and result["result"] == "ok":
21 |             update_metric(solana_node_health, 1, labels={"status": "healthy", "cause": "none"})
22 |             logger.info("Node is healthy")
23 |         elif "error" in result:
24 |             error_message = result["error"].get("message", "Unknown error")
25 |             slots_behind = result["error"]["data"]["numSlotsBehind"]
26 |             solana_node_health.labels(status="unhealthy", cause="slots_behind").set(0)
27 |             update_metric(solana_node_health, 1, labels={"status": "unhealthy", "cause": "slots_behind"})
28 |             update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"})
29 |             update_metric(solana_node_slots_behind, slots_behind)
30 |             logger.error(f"Node is unhealthy: {error_message}.")
31 | 
32 |         else:
33 |             logger.error("Unexpected response format")
34 |             update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"})
35 | 
36 |     except aiohttp.ClientError as e:
37 |         logger.error(f"Network error occurred while fetching node information: {e}")
38 |         update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"})
39 |     except ValueError as e:
40 |         logger.error(f"Data format error: {e}")
41 |         update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"})
42 |     except Exception as e:
43 |         logger.error(f"Error getting node status: {e}")
44 |         update_metric(solana_node_health, 0, labels={"status": "healthy", "cause": "none"})
45 | 


--------------------------------------------------------------------------------
/exporter/exporter.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import signal
 3 | from prometheus_client import start_http_server
 4 | import time
 5 | from exporter.collector import collect
 6 | from loguru import logger
 7 | from config import SLEEP_TIME, PORT, LOG_LEVEL
 8 | 
 9 | 
10 | async def graceful_shutdown(loop, sig=None):
11 |     """Handles graceful shutdown on receiving a signal"""
12 |     if sig:
13 |         logger.info(f"Received exit signal {sig.name}...")
14 | 
15 |     tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
16 | 
17 |     logger.info(f"Cancelling {len(tasks)} outstanding tasks")
18 |     for task in tasks:
19 |         task.cancel()
20 | 
21 |     await asyncio.gather(*tasks, return_exceptions=True)
22 |     loop.stop()
23 | 
24 | 
25 | def setup_signals(loop):
26 |     """Setup signal handling for graceful shutdown"""
27 |     for sig in (signal.SIGINT, signal.SIGTERM):
28 |         loop.add_signal_handler(sig, lambda: asyncio.create_task(graceful_shutdown(loop, sig)))
29 | 
30 | 
31 | async def run_exporter():
32 |     """Main function to run the Prometheus exporter"""
33 |     logger.info(f"Starting Prometheus metrics server on localhost:{PORT}/metrics")
34 |     start_http_server(PORT)
35 | 
36 |     while True:
37 |         start_time = time.time()
38 |         logger.info("Starting collection of metrics")
39 |         try:
40 |             await collect()
41 |             logger.info(f"Metrics collected successfully in {time.time() - start_time:.2f} seconds")
42 |         except Exception as e:
43 |             logger.error(f"Error during metrics collection: {e}")
44 | 
45 |         logger.info(f"💤 Sleeping for {SLEEP_TIME} seconds")
46 |         await asyncio.sleep(SLEEP_TIME)
47 | 
48 | 
49 | def main():
50 |     logger.add("logs/monitor.log",
51 |                level=LOG_LEVEL,
52 |                rotation="00:00",
53 |                retention="6 days",
54 |                compression=None,
55 |                backtrace=True,
56 |                diagnose=True,
57 |                enqueue=True)
58 | 
59 |     loop = asyncio.get_event_loop()
60 |     setup_signals(loop)  # Graceful shutdown signal setup
61 |     try:
62 |         loop.run_until_complete(run_exporter())
63 |     except Exception as e:
64 |         logger.error(f"Unexpected error: {e}")
65 |     finally:
66 |         logger.info("Shutting down Prometheus exporter")
67 |         loop.close()
68 | 


--------------------------------------------------------------------------------
/modules/balance.py:
--------------------------------------------------------------------------------
 1 | import inspect
 2 | import time
 3 | import aiohttp
 4 | from loguru import logger
 5 | from utils.func import update_metric
 6 | from prometheus.metrics import solana_account_balance, solana_vote_account_balance
 7 | from config import PUB_KEY, VOTE_PUB_KEY, NETWORK_RPC_ENDPOINT, HEADERS
 8 | 
 9 | 
10 | # Unified async function to fetch both balances
11 | async def fetch_balances():
12 |     # Payload for fetching both identity and vote account balances
13 |     payload = [
14 |         {"jsonrpc": "2.0", "id": 1, "method": "getBalance", "params": [PUB_KEY]},
15 |         {"jsonrpc": "2.0", "id": 2, "method": "getBalance", "params": [VOTE_PUB_KEY]}
16 |     ]
17 | 
18 |     try:
19 |         # Send the requests asynchronously
20 |         async with aiohttp.ClientSession() as session:
21 |             async with session.post(NETWORK_RPC_ENDPOINT, json=payload, headers=HEADERS) as response:
22 |                 response.raise_for_status()
23 |                 results = await response.json()
24 | 
25 |         try:
26 |             # Parsing the results for both accounts
27 |             identity_balance = results[0]['result'].get('value') / 10 ** 9 if 'result' in results[0] else None
28 |             vote_acc_balance = results[1]['result'].get('value') / 10 ** 9 if 'result' in results[1] else None
29 | 
30 |             logger.debug(f"Identity balance: {identity_balance} SOL, Vote account balance: {vote_acc_balance} SOL")
31 | 
32 |             return identity_balance, vote_acc_balance
33 |         except Exception as e:
34 |             logger.error(f"Error processing balance data: {e}")
35 |             return None, None
36 | 
37 |     except aiohttp.ClientError as e:
38 |         logger.error(f"Error making request to Solana RPC: {e}")
39 |         return None, None
40 | 
41 | 
42 | # Main async function to gather and update modules
43 | async def balance_metrics():
44 |     logger.info(f"{inspect.currentframe().f_code.co_name}: Starting metrics collection process.")
45 |     start_time = time.time()
46 | 
47 |     # Fetch both balances asynchronously
48 |     identity_balance, vote_acc_balance = await fetch_balances()
49 | 
50 |     # Function to update Prometheus modules if values are valid
51 |     update_metric(solana_account_balance, identity_balance)
52 |     update_metric(solana_vote_account_balance, vote_acc_balance)
53 | 
54 |     end_time = time.time()
55 |     logger.success(f"{inspect.currentframe().f_code.co_name}: Metrics successfully collected and exported to "
56 |                    f"Prometheus. Time: {end_time - start_time}")
57 | 


--------------------------------------------------------------------------------
/modules/vote.py:
--------------------------------------------------------------------------------
  1 | import inspect
  2 | import time
  3 | import aiohttp
  4 | import asyncio
  5 | from loguru import logger
  6 | from utils.func import update_metric
  7 | from config import VALIDATOR_RPC_ENDPOINT, NETWORK_RPC_ENDPOINT, VOTE_PUB_KEY, HEADERS, RETRY
  8 | from prometheus.metrics import solana_validator_vote_height, solana_network_vote_height, solana_vote_height_diff
  9 | 
 10 | rpc_urls = {
 11 |     "network": NETWORK_RPC_ENDPOINT,
 12 |     "validator": VALIDATOR_RPC_ENDPOINT
 13 | }
 14 | 
 15 | 
 16 | def get_vote_accounts(result):
 17 |     if "error" in result:
 18 |         error_code = result["error"].get("code", "Unknown code")
 19 |         error_message = result["error"].get("message", "Unknown message")
 20 |         logger.error(f"Error fetching vote accounts - code: {error_code}, message: {error_message}")
 21 |         return None
 22 |     return result.get('result', {}).get('current', []) or result.get('result', {}).get('delinquent', [])
 23 | 
 24 | 
 25 | async def measure_rpc_response_time(url, session, payload):
 26 |     try:
 27 |         start_time = time.time()
 28 |         async with session.post(url, json=payload, headers=HEADERS) as response:
 29 |             result = await response.json()
 30 |         end_time = time.time()
 31 |         return result, end_time - start_time
 32 |     except Exception as e:
 33 |         logger.error(f"Error while accessing {url}: {e}")
 34 |         return None, None
 35 | 
 36 | 
 37 | async def make_requests(payload, func_name):
 38 |     async with aiohttp.ClientSession() as session:
 39 |         raw_results = await asyncio.gather(
 40 |             *[measure_rpc_response_time(url, session, payload) for url in rpc_urls.values()],
 41 |             return_exceptions=True
 42 |         )
 43 | 
 44 |         results = []
 45 |         response_times = {}
 46 | 
 47 |         for (row, response_time), name in zip(raw_results, rpc_urls.keys()):
 48 |             if row is not None:
 49 |                 results.append(row)
 50 |             response_times[name] = response_time
 51 |         network_time = response_times.get('network')
 52 |         validator_time = response_times.get('validator')
 53 | 
 54 |         if network_time is not None:
 55 |             logger.debug(f"{func_name.upper()} Response time for network: {network_time:.4f} seconds")
 56 |         else:
 57 |             logger.warning(f"{func_name.upper()} No valid response time for network")
 58 | 
 59 |         if validator_time is not None:
 60 |             logger.debug(f"{func_name.upper()} Response time for validator: {validator_time:.4f} seconds")
 61 |         else:
 62 |             logger.warning(f"{func_name.upper()} No valid response time for validator")
 63 | 
 64 |     return results, response_times
 65 | 
 66 | 
 67 | async def get_votes():
 68 |     payload = {
 69 |         "jsonrpc": "2.0",
 70 |         "id": 1,
 71 |         "method": "getVoteAccounts",
 72 |         "params": [
 73 |             {
 74 |                 "votePubkey": VOTE_PUB_KEY
 75 |             }
 76 |         ]
 77 |     }
 78 |     func_name = inspect.currentframe().f_code.co_name
 79 |     blocks, response_times = await make_requests(payload, func_name)
 80 | 
 81 |     retry_count = 0
 82 |     last_blocks = None
 83 |     while retry_count < RETRY and any(t is not None and t > 1 for t in response_times.values()):
 84 |         retry_count += 1
 85 |         logger.info("One or more requests took longer than 1 second. Retrying...")
 86 |         blocks, response_times = await make_requests(payload, func_name)
 87 | 
 88 |         last_blocks = blocks
 89 | 
 90 |     if retry_count == RETRY:
 91 |         blocks = last_blocks
 92 | 
 93 |     try:
 94 |         results = blocks
 95 |         validator_vote_height = network_vote_height = None
 96 |         if results:
 97 |             vote_accounts_network = get_vote_accounts(results[0]) if len(results) > 0 else None
 98 |             if vote_accounts_network:
 99 |                 network_vote_height = vote_accounts_network[0]['lastVote']
100 |                 update_metric(solana_network_vote_height, network_vote_height, labels={"rpc": "network"})
101 |                 logger.debug(f"Network vote height: {network_vote_height}")
102 |             else:
103 |                 logger.warning(f"{func_name.upper()} No vote data for network")
104 | 
105 |             vote_accounts_validator = get_vote_accounts(results[1]) if len(results) > 1 else None
106 |             if vote_accounts_validator:
107 |                 validator_vote_height = vote_accounts_validator[0]['lastVote']
108 |                 update_metric(solana_validator_vote_height, validator_vote_height, labels={"rpc": "validator"})
109 |                 logger.debug(f"Validator vote height: {validator_vote_height}")
110 |             else:
111 |                 logger.warning(f"{func_name.upper()} No vote data for validator")
112 | 
113 |             if vote_accounts_network and vote_accounts_validator:
114 |                 update_metric(solana_vote_height_diff, validator_vote_height - network_vote_height)
115 |                 logger.debug(f"Diff vote height: {validator_vote_height - network_vote_height}")
116 |         else:
117 |             logger.error(f"Error processing vote data: {results}")
118 |     except Exception as e:
119 |         logger.error(f"Error processing vote data: {e}")
120 | 


--------------------------------------------------------------------------------
/prometheus/metrics.py:
--------------------------------------------------------------------------------
 1 | from prometheus_client import Gauge
 2 | 
 3 | # Prometheus Gauges
 4 | # balance module
 5 | solana_account_balance = Gauge('solana_account_balance', 'Identity account balance')
 6 | solana_vote_account_balance = Gauge('solana_vote_account_balance', 'Vote account balance')
 7 | 
 8 | # block module
 9 | solana_net_skip_rate = Gauge('solana_net_skip_rate', 'Network skip rate')
10 | solana_skipped_total = Gauge('solana_skipped_total', 'Total skipped slots of network in current epoch')
11 | solana_val_blocks_produced = Gauge('solana_val_blocks_produced', 'Blocks produced of a validator in current epoch')
12 | solana_val_skip_rate = Gauge('solana_val_skip_rate', 'Validator skip rate')
13 | solana_val_skipped_slots = Gauge('solana_val_skipped_slots', 'Skipped slots of a validator in current epoch')
14 | solana_total_blocks_produced = Gauge('solana_total_blocks_produced', 'Total blocks produced in current epoch')
15 | solana_skip_rate_diff = Gauge('solana_skip_rate_diff', 'Skip rate difference of network and validator')
16 | solana_val_leader_slots = Gauge('solana_val_leader_slots', 'Leader slots of a validator in current epoch')
17 | solana_total_slots = Gauge('solana_total_slots', 'Total slots in current epoch')
18 | solana_confirmed_epoch_first_slot = Gauge('solana_confirmed_epoch_first_slot', 'First slot in current epoch')
19 | solana_confirmed_epoch_last_slot = Gauge('solana_confirmed_epoch_last_slot', 'Last slot in current epoch')
20 | 
21 | # epoch module
22 | solana_node_version = Gauge('solana_node_version', 'Node version of solana', ['version'])
23 | solana_network_epoch = Gauge('solana_network_epoch', 'Current epoch of network (max confirmation)')
24 | solana_tx_count = Gauge('solana_tx_count', 'solana transaction count')
25 | solana_slot_in_epoch = Gauge('solana_slot_in_epoch', 'solana_slot_in_epoch')
26 | solana_slot_index = Gauge('solana_slot_index', 'solana_slot_index')
27 | 
28 | # leader_slot module
29 | solana_val_total_leader_slots = Gauge('solana_val_total_leader_slots', 'Total number of leader slots in current epoch')
30 | solana_next_leader_slot = Gauge('solana_next_leader_slot', 'The next leader slot')
31 | solana_time_to_next_slot = Gauge('solana_time_to_next_slot', 'Time until the next leader slot in seconds')
32 | solana_avg_slot_duration = Gauge('solana_avg_slot_duration', 'Average slot duration in seconds')
33 | solana_next_slot_time = Gauge('solana_next_slot_time', 'Time of the next leader slot')
34 | solana_previous_leader_slot = Gauge('solana_previous_leader_slot', 'The previous leader slot')
35 | 
36 | # node_health module
37 | solana_node_health = Gauge('solana_node_health', 'Health status of the Solana node', ['status', 'cause'])
38 | solana_node_slots_behind = Gauge('solana_node_slots_behind', 'Number of slots the Solana node is behind')
39 | 
40 | # slot module
41 | solana_block_height = Gauge('solana_block_height', 'Current Block Height of validator')
42 | solana_network_block_height = Gauge('solana_network_block_height', 'Current Block Height of network')
43 | solana_block_height_diff = Gauge('solana_block_height_diff', 'Current Block Height difference of network and validator')
44 | solana_current_slot = Gauge('solana_current_slot', 'Current validator slot height')
45 | solana_net_current_slot = Gauge('solana_net_current_slot', 'Current network slot height')
46 | solana_slot_diff = Gauge('solana_slot_diff', 'Current slot difference of network and validator')
47 | solana_net_max_shred_insert_slot = Gauge('solana_net_max_shred_insert_slot', 'Get the max NETWORK slot seen from after shred insert')
48 | solana_net_max_retransmit_slot = Gauge('solana_net_max_retransmit_slot', 'Get the max NETWORK slot seen from retransmit stage')
49 | solana_val_max_shred_insert_slot = Gauge('solana_val_max_shred_insert_slot', 'Get the max VALIDATOR slot seen from after shred insert')
50 | solana_val_max_retransmit_slot = Gauge('solana_val_max_retransmit_slot', 'Get the max VALIDATOR slot seen from retransmit stage')
51 | 
52 | # validator module
53 | solana_active_stake = Gauge('solana_active_stake', 'Active Stake SOLs')
54 | solana_current_stake = Gauge('solana_current_stake', 'Current Stake SOLs')
55 | solana_delinquent_stake = Gauge('solana_delinquent_stake', 'Delinquent Stake SOLs')
56 | solana_val_commission = Gauge('solana_val_commission', 'Solana validator current commission', ['commission'])
57 | solana_active_validators = Gauge('solana_active_validators', 'Total number of active validators by state', ['state'])
58 | solana_validator_activated_stake = Gauge('solana_validator_activated_stake', 'Activated stake per validator',
59 |                                          ['pubkey', 'votekey'])
60 | solana_val_status = Gauge('solana_val_status', 'Solana validator voting status i.e., voting or jailed', ['state'])
61 | solana_vote_credits = Gauge('solana_vote_credits', 'Solana validator vote credits of current epoch')
62 | solana_avg_vote_credits = Gauge('solana_avg_vote_credits', 'Average network vote credits of current epoch')
63 | solana_total_credits = Gauge('solana_total_credits', 'Solana validator vote credits of all epochs')
64 | 
65 | # vote module
66 | solana_validator_vote_height = Gauge('solana_validator_vote_height',
67 |                                      'Most recent VALIDATOR slot voted on by this vote account',
68 |                                      ['rpc'])
69 | solana_network_vote_height = Gauge('solana_network_vote_height',
70 |                                    'Most recent NETWORK slot voted on by this vote account',
71 |                                    ['rpc'])
72 | solana_vote_height_diff = Gauge('solana_vote_height_diff', 'Vote height difference of validator and network')
73 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Solana Monitor
  3 | 
  4 | This project monitors Solana validator metrics using Docker and Docker Compose. It also integrates with Prometheus and Grafana for visualization.
  5 | 
  6 | ## Prerequisites
  7 | 
  8 | - **Docker**: Container management
  9 | - **Docker Compose**: Multi-container orchestration
 10 | - **Git**: To clone the repository
 11 | 
 12 | ## Path Configuration
 13 | 
 14 | ### Solana Node Path
 15 | If you installed the Solana node under a different user (not `root`), you need to modify the path in `docker-compose.yml`. 
 16 | 
 17 | Find this line:
 18 | ```yaml
 19 | - /root/.local/share/solana/install/active_release/bin:/solana
 20 | ```
 21 | 
 22 | You can find your Solana installation path by running:
 23 | ```bash
 24 | which solana
 25 | ```
 26 | 
 27 | Similarly, update the Solana log path in the promtail service if needed:
 28 | ```yaml
 29 | - /root/solana/solana.log:/var/log/solana.log  # Change /root to your user's home directory
 30 | ```
 31 | 
 32 | These path adjustments ensure the containers can access your Solana installation and log files correctly.
 33 | 
 34 | ## Building the Project
 35 | 
 36 | ### 1. Install Docker and Docker Compose
 37 | 
 38 | Set up Docker by adding the official Docker repository and installing the required packages.
 39 | 
 40 | #### Install Docker:
 41 | 
 42 | Follow the official [Docker installation guide](https://docs.docker.com/engine/install/ubuntu/) for more details.
 43 | 
 44 | ```bash
 45 | # Update packages and install dependencies
 46 | sudo apt-get update
 47 | sudo apt-get install ca-certificates curl
 48 | 
 49 | # Add Docker's official GPG key and set up the repository
 50 | sudo install -m 0755 -d /etc/apt/keyrings
 51 | sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
 52 | sudo chmod a+r /etc/apt/keyrings/docker.asc
 53 | echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] \
 54 | https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
 55 | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
 56 | ```
 57 | ```bash
 58 | sudo apt-get update
 59 | sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
 60 | ```
 61 | #### Install Docker Compose:
 62 | 
 63 | ```shell
 64 | sudo curl -L https://github.com/docker/compose/releases/download/v2.29.7/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose
 65 | sudo chmod +x /usr/local/bin/docker-compose
 66 | ```
 67 | 
 68 | ### 2. Clone the Repository
 69 | Download the project source code:
 70 | ```bash
 71 | git clone https://github.com/qskyhigh/solana-monitor-public
 72 | cd solana-monitor-public
 73 | ```
 74 | 
 75 | ### 3. Build and Start the Application
 76 | Use Docker Compose to build the project and run the services in the background:
 77 | ```bash
 78 | docker-compose build --no-cache
 79 | docker-compose up -d
 80 | ```
 81 | 
 82 | ## Grafana Cloud API Token Configuration
 83 | To connect Prometheus and Loki with Grafana Cloud, you need to generate your own API tokens and update the relevant configuration files.
 84 | ### 1. Prometheus Configuration (`prometheus.yml`)
 85 | 
 86 | In the `prometheus.yml` file, replace the `username` and `password` with your own Grafana Cloud Prometheus API credentials.
 87 | ```yml
 88 | remote_write:
 89 | - url: https://prometheus-prod-13-prod-us-east-0.grafana.net/api/prom/push
 90 |   basic_auth:
 91 |     username: YOUR_USERNAME
 92 |     password: YOUR_API_TOKEN
 93 | ```
 94 | 
 95 | ### 2. Loki Configuration (`promtail.yml`)
 96 | In the `promtail.yml` file, replace the `username` and `token` with your own Grafana Cloud Loki credentials.
 97 | ```yml
 98 | clients:
 99 |   - url: https://YOUR_USERNAME:YOUR_API_TOKEN@logs-prod-006.grafana.net/loki/api/v1/push
100 | ```
101 | 
102 | #### How to Obtain API Tokens
103 | 1. Log in to your Grafana Cloud account.
104 | 2. Go to the API Keys section under Settings.
105 | 3. Generate API tokens for both Prometheus and Loki.
106 | 4. Use the generated tokens to replace the placeholders in `prometheus.yml` and `promtail.yml`.
107 | 
108 | ## Grafana Dashboard Configuration
109 | 
110 | The dashboard can be imported from the docs/ directory to your Grafana instance<br>
111 |   - Default is to utilize a label applied by the collector `host: solana-monitor-testnet` (you can change the global label in `prometheus.yml`)
112 | 
113 | ## Testing
114 | You can check the running Docker containers with:
115 | ```bash
116 | docker ps
117 | ```
118 | Once the containers are up, access Grafana to visualize Solana metrics. For more details on the dashboard configuration, refer to the provided Grafana screenshot:
119 | <img src="https://i.ibb.co/7kc3L8g/dqskyhigh-grafana.png" alt="dqskyhigh-grafana" border="0" style="width:100%;">
120 | 
121 | ### Node Exporter Metrics
122 | 
123 | To monitor system-level metrics such as CPU, memory, and disk usage, you can use the **Node Exporter**.
124 | 
125 | You can download and import the Node Exporter dashboard with **ID 1860** from Grafana's dashboard library:
126 | 
127 | 1. Go to Grafana and navigate to **Dashboards** > **Import**.
128 | 2. Enter the **Dashboard ID: `1860`** and click **Load**.
129 | 3. Select your Prometheus datasource and click **Import**.
130 | 
131 | This will provide a comprehensive overview of your system's performance using Node Exporter metrics.
132 | 
133 | ### 
134 | If you found this project helpful, feel free to support by donating SOL to my wallet.
135 | 
136 | **SOL Wallet Address**: `FNu9BCwCmgSmeCa56LCAErPBNeAdgnQJsBrrLgVbbMKt`
137 | 
138 | <img src="https://www.buymeacoffee.com/assets/img/custom_images/yellow_img.png" alt="Buy Me A Coffee">


--------------------------------------------------------------------------------
/modules/block.py:
--------------------------------------------------------------------------------
  1 | import subprocess
  2 | import time
  3 | import inspect
  4 | import json
  5 | from loguru import logger
  6 | from config import PUB_KEY, SOLANA_BINARY_PATH
  7 | from utils.func import update_metric
  8 | from prometheus.metrics import (solana_net_skip_rate, solana_skipped_total, solana_val_blocks_produced,
  9 |                                 solana_val_skip_rate, solana_val_skipped_slots, solana_total_blocks_produced,
 10 |                                 solana_skip_rate_diff, solana_val_leader_slots, solana_total_slots,
 11 |                                 solana_confirmed_epoch_first_slot, solana_confirmed_epoch_last_slot)
 12 | 
 13 | 
 14 | # Function to fetch block production data from Solana CLI
 15 | def get_block_production():
 16 |     # Run solana block-production command and capture output
 17 |     try:
 18 |         result = subprocess.run(
 19 |             [SOLANA_BINARY_PATH, "block-production", "--output", "json-compact"],
 20 |             capture_output=True, text=True, check=True
 21 |         )
 22 |         # logger.info("Block production command executed successfully.")
 23 |     except subprocess.CalledProcessError as e:
 24 |         logger.error(f"Error executing solana block-production command: {e}")
 25 |         return None
 26 | 
 27 |     # Remove any "Note:" lines that might be in the output
 28 |     block_production = "\n".join([line for line in result.stdout.splitlines() if "Note:" not in line])
 29 | 
 30 |     try:
 31 |         block_production_data = json.loads(block_production)
 32 |         # logger.info("Block production data successfully parsed.")
 33 |     except json.JSONDecodeError as e:
 34 |         logger.error(f"Error decoding JSON from block production data: {e}")
 35 |         return None
 36 | 
 37 |     # Return parsed data
 38 |     return block_production_data
 39 | 
 40 | 
 41 | # Function to extract validator's modules and send them to Prometheus
 42 | def process_metrics(block_production_data):
 43 | 
 44 |     if not block_production_data:
 45 |         logger.warning("No block production data available to process.")
 46 |         return
 47 | 
 48 |     # Retrieve network modules
 49 |     try:
 50 |         total_slots_skipped = block_production_data.get('total_slots_skipped')
 51 |         update_metric(solana_skipped_total, total_slots_skipped)
 52 |         total_slots = block_production_data.get('total_slots')
 53 |         update_metric(solana_total_slots, total_slots)
 54 |         total_blocks_produced = block_production_data.get('total_blocks_produced')
 55 |         update_metric(solana_total_blocks_produced, total_blocks_produced)
 56 |         start_slot = block_production_data.get('start_slot')
 57 |         update_metric(solana_confirmed_epoch_first_slot, start_slot)
 58 |         end_slot = block_production_data.get('end_slot')
 59 |         update_metric(solana_confirmed_epoch_last_slot, end_slot)
 60 |         total_net_skip_rate = (total_slots_skipped / total_slots) * 100
 61 |         update_metric(solana_net_skip_rate, total_net_skip_rate)
 62 |         logger.debug(f"Network metrics - Total slots skipped: {total_slots_skipped}, Total slots: {total_slots}, "
 63 |                      f"Total blocks produced: {total_blocks_produced}, Start slot: {start_slot}, "
 64 |                      f"End slot: {end_slot}, Net skip rate: {total_net_skip_rate}")
 65 |     except KeyError as e:
 66 |         logger.error(f"Key error when extracting network metrics: {e}")
 67 |         return
 68 | 
 69 |     validator_block_production = [
 70 |         leader for leader in block_production_data.get("leaders", [])
 71 |         if leader.get("identityPubkey") == PUB_KEY
 72 |     ]
 73 | 
 74 |     # Retrieve validator-specific modules
 75 |     if validator_block_production:
 76 |         try:
 77 |             val_slots_skipped = validator_block_production[0].get('skippedSlots')
 78 |             update_metric(solana_val_skipped_slots, val_slots_skipped)
 79 |             val_leader_slots = validator_block_production[0].get('leaderSlots')
 80 |             # update_metric(solana_val_leader_slots, val_leader_slots)
 81 |             val_blocks_produced = validator_block_production[0].get('blocksProduced')
 82 |             update_metric(solana_val_blocks_produced, val_blocks_produced)
 83 |             val_skip_rate = (val_slots_skipped / val_leader_slots) * 100
 84 |             update_metric(solana_val_skip_rate, val_skip_rate)
 85 |             skip_rate_diff = val_skip_rate - total_net_skip_rate
 86 |             update_metric(solana_skip_rate_diff, skip_rate_diff)
 87 |             logger.debug(
 88 |                 f"Validator metrics - blocks produced: {val_blocks_produced}, "
 89 |                 f"skip rate: {val_skip_rate:.2f}%, slots skipped: {val_slots_skipped}, "
 90 |                 f"leader_slots: {val_leader_slots}, skip rate diff: {skip_rate_diff}")
 91 |         except KeyError as e:
 92 |             logger.error(f"Key error when extracting validator-specific metrics: {e}")
 93 |             return
 94 |     else:
 95 |         logger.warning("No validator block production data found.")
 96 |         update_metric(solana_val_skipped_slots, 0)
 97 |         update_metric(solana_val_blocks_produced, 0)
 98 |         update_metric(solana_val_skip_rate, 0)
 99 |         update_metric(solana_skip_rate_diff, -total_net_skip_rate)
100 | 
101 | 
102 | # Main function to collect block production data and process it
103 | def block_metrics():
104 |     logger.info(f"{inspect.currentframe().f_code.co_name}: Starting metrics collection process.")
105 |     start_time = time.time()
106 | 
107 |     # Fetch block production data
108 |     block_production_data = get_block_production()
109 | 
110 |     # Process and send modules to Prometheus
111 |     process_metrics(block_production_data)
112 | 
113 |     end_time = time.time()
114 |     logger.success(f"{inspect.currentframe().f_code.co_name}: All metrics have been successfully collected and sent "
115 |                    f"to Prometheus. Time: {end_time - start_time}")
116 | 


--------------------------------------------------------------------------------
/modules/leader_slot.py:
--------------------------------------------------------------------------------
  1 | import inspect
  2 | import aiohttp
  3 | import asyncio
  4 | import time
  5 | from datetime import datetime, timedelta
  6 | from loguru import logger
  7 | from utils.func import update_metric
  8 | from config import NETWORK_RPC_ENDPOINT, PUB_KEY, HEADERS
  9 | from prometheus.metrics import (solana_val_total_leader_slots, solana_next_leader_slot, solana_time_to_next_slot,
 10 |                                 solana_avg_slot_duration, solana_next_slot_time, solana_previous_leader_slot)
 11 | 
 12 | 
 13 | # Generalized async function to fetch data from the Solana RPC
 14 | async def fetch_rpc_data(session, method, params=None):
 15 |     payload = {
 16 |         "jsonrpc": "2.0", "id": 1, "method": method, "params": params or []
 17 |     }
 18 | 
 19 |     try:
 20 |         async with session.post(NETWORK_RPC_ENDPOINT, headers=HEADERS, json=payload) as response:
 21 |             response.raise_for_status()
 22 |             return await response.json()
 23 |     except Exception as e:
 24 |         logger.error(f"Error fetching {method} from RPC: {e}")
 25 |         return None
 26 | 
 27 | 
 28 | # Get current slot
 29 | async def get_current_slot(session):
 30 |     result = await fetch_rpc_data(session, "getSlot", [{"commitment": "confirmed"}])
 31 |     return result.get('result') if result else None
 32 | 
 33 | 
 34 | # Get leader schedule
 35 | async def get_leader_schedule(session):
 36 |     result = await fetch_rpc_data(session, "getLeaderSchedule", [None, {"identity": PUB_KEY}])
 37 |     return result.get('result', {}).get(PUB_KEY, []) if result else None
 38 | 
 39 | 
 40 | # Get epoch information
 41 | async def get_epoch(session):
 42 |     epoch_schedule = await fetch_rpc_data(session, "getEpochSchedule")
 43 |     epoch_info = await fetch_rpc_data(session, "getEpochInfo")
 44 | 
 45 |     if epoch_schedule and epoch_info:
 46 |         return (epoch_schedule['result'].get('firstNormalEpoch'),
 47 |                 epoch_schedule['result'].get('firstNormalSlot'),
 48 |                 epoch_schedule['result'].get('slotsPerEpoch'),
 49 |                 epoch_info['result'].get('epoch'))
 50 |     else:
 51 |         logger.error("Error fetching epoch data")
 52 |         return None, None, None, None
 53 | 
 54 | 
 55 | # Calculate average slot duration
 56 | async def calculate_slot_duration(session):
 57 |     result = await fetch_rpc_data(session, "getRecentPerformanceSamples", [1])
 58 |     if result and len(result['result']) > 0:
 59 |         sample = result['result'][0]
 60 |         sample_period_secs = sample.get('samplePeriodSecs')
 61 |         num_slots = sample.get('numSlots')
 62 |         if num_slots != 0:
 63 |             return sample_period_secs / num_slots
 64 |         else:
 65 |             logger.warning("Warning: Division by zero, setting result to 0")
 66 |             return 0
 67 |     else:
 68 |         logger.error("Error fetching slot duration data")
 69 |         return None
 70 | 
 71 | 
 72 | # Main function to gather and set Prometheus modules
 73 | async def leader_slot_metrics():
 74 |     logger.info(f"{inspect.currentframe().f_code.co_name}: Starting metrics collection process.")
 75 |     start_time = time.time()
 76 |     async with aiohttp.ClientSession() as session:
 77 |         # Parallel requests to Solana RPC
 78 |         current_slot, leader_slots_in_epoch, epoch_data, slot_duration = await asyncio.gather(
 79 |             get_current_slot(session),
 80 |             get_leader_schedule(session),
 81 |             get_epoch(session),
 82 |             calculate_slot_duration(session)
 83 |         )
 84 | 
 85 |     if current_slot is None or leader_slots_in_epoch is None or epoch_data is None or slot_duration is None:
 86 |         logger.error("Failed to fetch all required data. Skipping metric collection.")
 87 |         return
 88 | 
 89 |     first_normal_epoch, first_normal_slot, slots_per_epoch, epoch = epoch_data
 90 |     if not all([first_normal_epoch, first_normal_slot, slots_per_epoch, epoch]):
 91 |         logger.error("Incomplete epoch data. Skipping metric collection.")
 92 |         return
 93 | 
 94 |     # Calculate next and previous leader slots
 95 |     first_slot_in_epoch = (epoch - first_normal_epoch) * slots_per_epoch + first_normal_slot
 96 |     next_slot = next((slot for slot in leader_slots_in_epoch if slot + first_slot_in_epoch > current_slot), None)
 97 |     previous_slot = next(
 98 |         (slot for slot in reversed(leader_slots_in_epoch) if slot + first_slot_in_epoch < current_slot), 0)
 99 | 
100 |     if next_slot:
101 |         next_slot_epoch = first_slot_in_epoch + next_slot
102 |         time_to_next_slot = (next_slot_epoch - current_slot) * slot_duration
103 |         next_slot_time = datetime.now() + timedelta(seconds=time_to_next_slot)
104 |         next_slot_time = next_slot_time.replace(second=0, microsecond=0)
105 |         next_slot_time_unix = time.mktime(next_slot_time.timetuple())
106 |         logger.debug(f"Next leader slot: {next_slot_epoch} in {time_to_next_slot:.2f}s")
107 | 
108 |         # Update Prometheus modules
109 |         update_metric(solana_next_leader_slot, next_slot_epoch)
110 |         update_metric(solana_time_to_next_slot, time_to_next_slot)
111 |         update_metric(solana_next_slot_time, next_slot_time_unix)
112 |     else:
113 |         logger.warning("No upcoming leader slots found.")
114 |         solana_next_leader_slot.set(0)
115 |         solana_time_to_next_slot.set(0)
116 |         solana_next_slot_time.set(0)
117 | 
118 |     previous_slot_epoch = first_slot_in_epoch + previous_slot
119 |     update_metric(solana_previous_leader_slot, previous_slot_epoch)
120 |     update_metric(solana_val_total_leader_slots, len(leader_slots_in_epoch))
121 |     update_metric(solana_avg_slot_duration, slot_duration)
122 |     logger.debug(f"Previous leader slot: {previous_slot_epoch}, Total_leader_slots: {len(leader_slots_in_epoch)}, "
123 |                  f"Avg slot duration: {slot_duration}")
124 |     end_time = time.time()
125 |     logger.success(f"{inspect.currentframe().f_code.co_name}: Metrics successfully collected and exported to "
126 |                    f"Prometheus. Time: {end_time - start_time}")
127 | 


--------------------------------------------------------------------------------
/modules/slot.py:
--------------------------------------------------------------------------------
  1 | import inspect
  2 | import time
  3 | import aiohttp
  4 | import asyncio
  5 | from loguru import logger
  6 | from config import VALIDATOR_RPC_ENDPOINT, NETWORK_RPC_ENDPOINT, HEADERS, RETRY
  7 | from utils.func import update_metric
  8 | from prometheus.metrics import (solana_block_height, solana_network_block_height, solana_current_slot,
  9 |                                 solana_net_current_slot, solana_net_max_shred_insert_slot,
 10 |                                 solana_net_max_retransmit_slot, solana_slot_diff, solana_block_height_diff,
 11 |                                 solana_val_max_shred_insert_slot, solana_val_max_retransmit_slot)
 12 | 
 13 | rpc_urls = {
 14 |     "network": NETWORK_RPC_ENDPOINT,
 15 |     "validator": VALIDATOR_RPC_ENDPOINT
 16 | }
 17 | 
 18 | 
 19 | def extract_slot(data, req_id):
 20 |     return next((item['result'] for item in data if item['id'] == req_id and 'error' not in item), None)
 21 | 
 22 | 
 23 | async def measure_rpc_response_time(url, session, payload):
 24 |     try:
 25 |         start_time = time.time()
 26 |         async with session.post(url, json=payload, headers=HEADERS) as response:
 27 |             result = await response.json()
 28 |         end_time = time.time()
 29 |         return result, end_time - start_time
 30 |     except Exception as e:
 31 |         logger.error(f"Error while accessing {url}: {e}")
 32 |         return None, None
 33 | 
 34 | 
 35 | async def make_requests(payload, func_name):
 36 | 
 37 |     async with aiohttp.ClientSession() as session:
 38 |         raw_results = await asyncio.gather(
 39 |             *[measure_rpc_response_time(url, session, payload) for url in rpc_urls.values()],
 40 |             return_exceptions=True
 41 |         )
 42 | 
 43 |         results = []
 44 |         response_times = {}
 45 | 
 46 |         for (row, response_time), name in zip(raw_results, rpc_urls.keys()):
 47 |             results.append(row)
 48 |             response_times[name] = response_time
 49 |         network_time = response_times.get('network')
 50 |         validator_time = response_times.get('validator')
 51 | 
 52 |         if network_time is not None:
 53 |             logger.debug(f"{func_name.upper()} Response time for network: {network_time:.4f} seconds")
 54 |         else:
 55 |             logger.warning(f"{func_name.upper()} No valid response time for network")
 56 | 
 57 |         if validator_time is not None:
 58 |             logger.debug(f"{func_name.upper()} Response time for validator: {validator_time:.4f} seconds")
 59 |         else:
 60 |             logger.warning(f"{func_name.upper()} No valid response time for validator")
 61 | 
 62 |     return results, response_times
 63 | 
 64 | 
 65 | async def get_slots():
 66 |     payload = [
 67 |         {"jsonrpc": "2.0", "id": 1, "method": "getMaxRetransmitSlot"},
 68 |         {"jsonrpc": "2.0", "id": 2, "method": "getMaxShredInsertSlot"},
 69 |         {"jsonrpc": "2.0", "id": 3, "method": "getSlot", "params": [{"commitment": "confirmed"}]}
 70 |     ]
 71 |     func_name = inspect.currentframe().f_code.co_name
 72 |     slots, response_times = await make_requests(payload, func_name)
 73 | 
 74 |     retry_count = 0
 75 |     last_slots = None
 76 |     while retry_count < RETRY and any(t is not None and t > 1 for t in response_times.values()):
 77 |         retry_count += 1
 78 |         logger.info("One or more requests took longer than 1 second. Retrying...")
 79 |         slots, response_times = await make_requests(payload, func_name)
 80 | 
 81 |         last_slots = slots
 82 | 
 83 |     if retry_count == RETRY:
 84 |         slots = last_slots
 85 | 
 86 |     try:
 87 |         val_slot = net_slot = None
 88 |         if slots[0] is not None:
 89 |             net_slot = extract_slot(slots[0], 3)
 90 |             update_metric(solana_current_slot, net_slot)
 91 |             net_max_shred_insert_slot = extract_slot(slots[0], 2)
 92 |             update_metric(solana_net_max_shred_insert_slot, net_max_shred_insert_slot)
 93 |             net_max_retransmit_slot = extract_slot(slots[0], 1)
 94 |             update_metric(solana_net_max_retransmit_slot, net_max_retransmit_slot)
 95 |             logger.debug(f"Network slot: {net_slot}, "
 96 |                          f"net_max_shred_insert_slot: {net_max_shred_insert_slot}, net_max_retransmit_slot: {net_max_retransmit_slot}")
 97 |         else:
 98 |             logger.warning(f"{func_name.upper()} No slot data for network")
 99 | 
100 |         if slots[1] is not None:
101 |             val_slot = extract_slot(slots[1], 3)
102 |             update_metric(solana_net_current_slot, val_slot)
103 |             val_max_shred_insert_slot = extract_slot(slots[1], 2)
104 |             update_metric(solana_val_max_shred_insert_slot, val_max_shred_insert_slot)
105 |             val_max_retransmit_slot = extract_slot(slots[1], 1)
106 |             update_metric(solana_val_max_retransmit_slot, val_max_retransmit_slot)
107 |             logger.debug(f"Validator slot: {val_slot}, "
108 |                          f"val_max_shred_insert_slot: {val_max_shred_insert_slot}, val_max_retransmit_slot: {val_max_retransmit_slot}")
109 |         else:
110 |             logger.warning(f"{func_name.upper()} No slot data for validator")
111 | 
112 |         if val_slot is not None and net_slot is not None:
113 |             update_metric(solana_slot_diff, val_slot - net_slot)
114 |             logger.debug(f"Slot diff: {val_slot - net_slot}")
115 | 
116 |     except Exception as e:
117 |         logger.error(f"Error processing slots data: {e}")
118 | 
119 | 
120 | async def get_block_height():
121 |     payload = {
122 |         "jsonrpc": "2.0", "id": 1, "method": "getBlockHeight"
123 |     }
124 | 
125 |     func_name = inspect.currentframe().f_code.co_name
126 |     blocks, response_times = await make_requests(payload, func_name)
127 | 
128 |     retry_count = 0
129 |     last_blocks = None
130 |     while retry_count < RETRY and any(t is not None and t > 1 for t in response_times.values()):
131 |         retry_count += 1
132 |         logger.info("One or more requests took longer than 1 second. Retrying...")
133 |         blocks, response_times = await make_requests(payload, func_name)
134 | 
135 |         last_blocks = blocks
136 | 
137 |     if retry_count == RETRY:
138 |         blocks = last_blocks
139 | 
140 |     try:
141 |         val_block_height = net_block_height = None
142 |         if blocks[0] is not None:
143 |             net_block_height = blocks[0]['result']
144 |             update_metric(solana_network_block_height, net_block_height)
145 |         else:
146 |             logger.warning(f"{func_name.upper()} No block height data for network")
147 | 
148 |         if blocks[1] is not None:
149 |             val_block_height = blocks[1]['result']
150 |             update_metric(solana_block_height, val_block_height)
151 |         else:
152 |             logger.warning(f"{func_name.upper()} No block height data for validator")
153 | 
154 |         if val_block_height and net_block_height:
155 |             update_metric(solana_block_height_diff, val_block_height-net_block_height)
156 |             logger.debug(f"Block diff: {val_block_height - net_block_height}")
157 | 
158 |         logger.debug(f"Network block height: {net_block_height}, Validator block height: {val_block_height}")
159 | 
160 |     except Exception as e:
161 |         logger.error(f"Error processing blocks data: {e}")
162 | 


--------------------------------------------------------------------------------
/modules/validator.py:
--------------------------------------------------------------------------------
  1 | import inspect
  2 | import json
  3 | import time
  4 | import aiohttp
  5 | import subprocess
  6 | from loguru import logger
  7 | from config import PUB_KEY, VOTE_PUB_KEY, NETWORK_RPC_ENDPOINT, SOLANA_BINARY_PATH, HEADERS
  8 | from utils.func import update_metric
  9 | from prometheus.metrics import (solana_active_stake, solana_current_stake, solana_delinquent_stake, solana_vote_credits,
 10 |                                 solana_active_validators, solana_validator_activated_stake, solana_val_status,
 11 |                                 solana_total_credits, solana_val_commission, solana_avg_vote_credits)
 12 | 
 13 | 
 14 | def get_validators():
 15 |     """Fetch validators information using Solana CLI and update Prometheus modules."""
 16 |     try:
 17 |         result = subprocess.run(
 18 |             [SOLANA_BINARY_PATH, "validators", "--output", "json-compact"],
 19 |             capture_output=True, text=True, check=True
 20 |         )
 21 |         logger.info("Successfully executed solana validators command.")
 22 |     except subprocess.CalledProcessError as e:
 23 |         logger.error(f"Error while running solana validators command: {e}")
 24 |         return None
 25 | 
 26 |     # Remove any "Note:" lines that might be in the output
 27 |     validators = "\n".join([line for line in result.stdout.splitlines() if "Note:" not in line])
 28 | 
 29 |     try:
 30 |         # Parse the JSON data from Solana validators
 31 |         validators_data = json.loads(validators)
 32 |         logger.info("Block production data successfully parsed.")
 33 |     except json.JSONDecodeError as e:
 34 |         logger.error(f"Error decoding JSON from block production data: {e}")
 35 |         return None
 36 | 
 37 |     # Return parsed data
 38 |     return validators_data
 39 | 
 40 | 
 41 | def process_metrics(validators_data):
 42 |     if not validators_data:
 43 |         logger.warning("No validator data available to process.")
 44 |         return
 45 | 
 46 |     # Retrieve network modules
 47 |     try:
 48 |         active_stake = validators_data.get('totalActiveStake') / 10 ** 9
 49 |         current_stake = validators_data.get('totalCurrentStake') / 10 ** 9
 50 |         delinquent_stake = validators_data.get('totalDelinquentStake') / 10 ** 9
 51 |         logger.debug(f'Active Stake: {round(active_stake, 2)}, Current Stake: {round(current_stake, 2)}, '
 52 |                      f'Delinquent Stake: {round(delinquent_stake, 2)}')
 53 |     except KeyError as e:
 54 |         logger.error(f"EKey error when extracting validator-specific modules: {e}")
 55 |         return
 56 | 
 57 |     # Dictionary mapping Prometheus Gauges to their corresponding values
 58 |     update_metric(solana_active_stake, active_stake)
 59 |     update_metric(solana_current_stake, current_stake)
 60 |     update_metric(solana_delinquent_stake, delinquent_stake)
 61 | 
 62 | 
 63 | def validator_metrics():
 64 |     logger.info(f"{inspect.currentframe().f_code.co_name}: Starting modules collection process.")
 65 |     start_time = time.time()
 66 | 
 67 |     # Fetch validators data
 68 |     validators_data = get_validators()
 69 | 
 70 |     # Process and send modules to Prometheus
 71 |     process_metrics(validators_data)
 72 | 
 73 |     end_time = time.time()
 74 |     logger.success(f"{inspect.currentframe().f_code.co_name}: All modules have been successfully collected and sent "
 75 |                    f"to Prometheus. Time: {end_time - start_time}")
 76 | 
 77 | 
 78 | async def get_vote_accounts():
 79 |     """Fetch vote account information using RPC and update Prometheus metrics."""
 80 |     payload = [
 81 |         {"jsonrpc": "2.0", "id": 1, "method": "getVoteAccounts", "params": [{"commitment": "recent"}]},
 82 |         {"jsonrpc": "2.0", "id": 2, "method": "getEpochInfo"}
 83 |     ]
 84 | 
 85 |     logger.info(f"{inspect.currentframe().f_code.co_name}: Starting modules collection process.")
 86 |     start_time = time.time()
 87 | 
 88 |     try:
 89 |         async with aiohttp.ClientSession() as session:
 90 |             async with session.post(NETWORK_RPC_ENDPOINT, json=payload, headers=HEADERS) as response:
 91 |                 response.raise_for_status()
 92 |                 result = await response.json()
 93 | 
 94 |         current_val = result[0]['result'].get('current', [])
 95 |         delinquent_val = result[0]['result'].get('delinquent', [])
 96 |         current_epoch = result[1]['result']['epoch']
 97 | 
 98 |         update_metric(solana_active_validators, len(current_val), labels={"state": "current"})
 99 |         update_metric(solana_active_validators, len(delinquent_val), labels={"state": "delinquent"})
100 | 
101 |         logger.debug(f'Current: {len(current_val)}, Delinquent: {len(delinquent_val)}')
102 | 
103 |         all_vote_credits = []
104 |         vote_account = None
105 | 
106 |         all_accounts = current_val + delinquent_val
107 | 
108 |         for account in all_accounts:
109 |             if account.get('nodePubkey') == PUB_KEY:
110 |                 vote_account = account
111 |                 if account in delinquent_val:
112 |                     logger.error("Your Solana validator is in DELINQUENT state")
113 | 
114 |             activated_stake = account.get('activatedStake')
115 |             epoch_credits = account.get('epochCredits', [])
116 | 
117 |             if activated_stake and epoch_credits:
118 |                 last_epoch = epoch_credits[-1]
119 |                 if last_epoch and current_epoch == last_epoch[0]:
120 |                     vote_credits = last_epoch[1] - last_epoch[2]
121 |                     all_vote_credits.append(vote_credits)
122 | 
123 |         if all_vote_credits:
124 |             avg_vote_credits = sum(all_vote_credits) / len(all_vote_credits)
125 |             update_metric(solana_avg_vote_credits, avg_vote_credits)
126 |             logger.debug(f'Average Network Vote credits: {avg_vote_credits}')
127 | 
128 |         if vote_account:
129 |             process_vote_account(vote_account)
130 |         else:
131 |             logger.error("Validator account not found in both current and delinquent lists.")
132 | 
133 |     except Exception as e:
134 |         logger.error(f"Error fetching or processing vote accounts: {e}")
135 | 
136 |     end_time = time.time()
137 |     logger.success(f"{inspect.currentframe().f_code.co_name}: Collection completed in {end_time - start_time:.2f} seconds.")
138 | 
139 | 
140 | def process_vote_account(vote_account):
141 |     """Process and update metrics for the given vote account."""
142 |     val_stake = vote_account.get('activatedStake') / 10 ** 9
143 |     commission = vote_account.get('commission')
144 |     epoch_vote = vote_account.get('epochVoteAccount')
145 |     root_slot = vote_account.get('rootSlot')
146 |     last_epoch = vote_account.get('epochCredits')[-1]
147 |     vote_credits = last_epoch[1] - last_epoch[2]
148 |     total_credits = last_epoch[1]
149 | 
150 |     update_metric(solana_validator_activated_stake, val_stake, labels={"pubkey": PUB_KEY, "votekey": VOTE_PUB_KEY})
151 |     update_metric(solana_val_commission, commission, labels={"commission": str(commission)})
152 | 
153 |     logger.debug(f'Validator Stake: {round(val_stake, 2)}, Commission: {commission}, Epoch vote: {epoch_vote}, '
154 |                  f'Vote credits: {vote_credits}, Total credits: {total_credits}')
155 | 
156 |     update_metric(solana_val_status, 1 if epoch_vote else 0, labels={"state": "voting" if epoch_vote else "not voting"})
157 |     update_metric(solana_vote_credits, vote_credits)
158 |     update_metric(solana_total_credits, total_credits)
159 | 
160 |     logger.info("Updated Prometheus metrics for validator.")
161 | 


--------------------------------------------------------------------------------