├── etc └── zabbix │ ├── scripts │ ├── postgresql │ │ ├── pgsql.ping.time.sql │ │ ├── pgsql.replication.recovery_role.sql │ │ ├── pgsql.connections.prepared.sql │ │ ├── pgsql.uptime.sql │ │ ├── pgsql.cache.hit.sql │ │ ├── pgsql.discovery.db.sql │ │ ├── pgsql.scans.sql │ │ ├── pgsql.dbstat.sql │ │ ├── pgsql.config.hash.sql │ │ ├── pgsql.bgwriter.sql │ │ ├── pgsql.frozenxid.sql │ │ ├── pgsql.replication.status.sql │ │ ├── pgsql.dbstat.sum.sql │ │ ├── pgsql.wal.stat.sql │ │ ├── pgsql.replication.lag.sql │ │ ├── pgsql.locks.sql │ │ ├── pgsql.connections.sum.sql │ │ ├── pgsql.connections.sql │ │ ├── pgsql.transactions.sql │ │ └── pgsql.query.time.sql │ ├── zabbix_files.sh │ ├── script_version.sh │ ├── kontena_grid.sh │ ├── process.sh │ ├── db2snapshot.pl │ ├── discover_subprocess.sh │ ├── discover_certificates.py │ ├── db2stat.pl │ ├── docker_swarm.py │ ├── check_certificate.py │ ├── zabbix_sender_psk.py │ ├── docker.sh │ ├── kubernetes_monitoring.py │ └── pacemaker.py │ └── zabbix_agentd.d │ ├── pacemaker.conf │ ├── script_version.conf │ ├── zabbix_files.conf │ ├── kontena_grid.conf │ ├── docker_swarm.conf │ ├── mysql.conf │ ├── process.conf │ ├── docker.conf │ ├── certificates.conf │ ├── kubernetes_monitoring.conf │ ├── postgresql_monitoring.conf │ └── galera.conf ├── custom ├── conf │ ├── elastizabbix.conf │ ├── fileTimestamp.conf │ ├── curl.conf │ ├── alfresco.conf │ ├── zapache.conf │ ├── nginx_monitoring.conf │ └── discover_apache.conf └── scripts │ ├── curl.sh │ ├── alfresco-pdf.sh │ ├── alfresco-pdfa.sh │ ├── discover_apache-backends.sh │ ├── discover_responsecodes.sh │ ├── fileTimestamp.vbs │ ├── elastizabbix.py │ └── zapache ├── documentation ├── docker.png ├── process.png ├── certificates.png ├── kubernetes_monitoring │ ├── csr.yml │ ├── config │ └── access.yml ├── kontena_grid.md ├── process.md ├── docker_swarm.md ├── pacemaker.md ├── certificates.md ├── mysql-galera.md ├── db2stat.md ├── docker.md ├── db2stat-testing.md └── kubernetes_monitoring.md ├── README.md ├── opt └── cron │ └── docker_stats.sh └── templates ├── process.xml ├── docker_trapper.xml ├── process_active.xml ├── pacemaker.xml └── pacemaker_active.xml /etc/zabbix/scripts/postgresql/pgsql.ping.time.sql: -------------------------------------------------------------------------------- 1 | \timing 2 | SELECT 1; 3 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/zabbix_files.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Version: 1.0 3 | ls -la $1 4 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.replication.recovery_role.sql: -------------------------------------------------------------------------------- 1 | SELECT pg_is_in_recovery()::int 2 | -------------------------------------------------------------------------------- /custom/conf/elastizabbix.conf: -------------------------------------------------------------------------------- 1 | UserParameter=elastizabbix[*],/etc/zabbix/scripts/elastizabbix.py $1 $2 $3 2 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.connections.prepared.sql: -------------------------------------------------------------------------------- 1 | SELECT count(*) 2 | FROM pg_prepared_xacts 3 | -------------------------------------------------------------------------------- /custom/conf/fileTimestamp.conf: 
-------------------------------------------------------------------------------- 1 | UserParameter=fileTimestamp[*],cscript c:\zabbix\scripts\fileTimestamp.vbs //nologo $1 -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.uptime.sql: -------------------------------------------------------------------------------- 1 | SELECT date_part('epoch', now() - pg_postmaster_start_time())::int 2 | -------------------------------------------------------------------------------- /documentation/docker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digiapulssi/zabbix-monitoring-scripts/HEAD/documentation/docker.png -------------------------------------------------------------------------------- /documentation/process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digiapulssi/zabbix-monitoring-scripts/HEAD/documentation/process.png -------------------------------------------------------------------------------- /custom/conf/curl.conf: -------------------------------------------------------------------------------- 1 | # First parameter is the header. Second is the URL. 2 | UserParameter=curl[*],/etc/zabbix/scripts/curl.sh "$1" "$2" 3 | -------------------------------------------------------------------------------- /documentation/certificates.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digiapulssi/zabbix-monitoring-scripts/HEAD/documentation/certificates.png -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.cache.hit.sql: -------------------------------------------------------------------------------- 1 | SELECT round(sum(blks_hit)*100/sum(blks_hit+blks_read), 2) 2 | FROM pg_stat_database 3 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/pacemaker.conf: -------------------------------------------------------------------------------- 1 | UserParameter=pacemaker.status[*],/etc/zabbix/scripts/pacemaker.py $1 $2 $3 $4 $5 $6 $7 $8 2 | 3 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/script_version.conf: -------------------------------------------------------------------------------- 1 | UserParameter=script.version[*],/etc/zabbix/scripts/script_version.sh /etc/zabbix/scripts/ 2 | -------------------------------------------------------------------------------- /custom/scripts/curl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Description: 5 | # Simple curl with one header 6 | 7 | set -e 8 | curl -s -H "$1" "$2" 9 | 10 | -------------------------------------------------------------------------------- /custom/conf/alfresco.conf: -------------------------------------------------------------------------------- 1 | UserParameter=alfresco.pdf[*],/etc/zabbix/scripts/alfresco-pdf.sh $1 $2 $3 2 | UserParameter=alfresco.pdfa[*],/etc/zabbix/scripts/alfresco-pdfa.sh $1 $2 $3 3 | -------------------------------------------------------------------------------- /custom/conf/zapache.conf: -------------------------------------------------------------------------------- 1 | # 2 | # This is a sample zabbix_agentd config file. 3 | # Edit to your needs.
4 | # 5 | UserParameter=zapache[*],/etc/zabbix/scripts/zapache "$1" "$2" "$3" 6 | -------------------------------------------------------------------------------- /custom/conf/nginx_monitoring.conf: -------------------------------------------------------------------------------- 1 | # ~> Zabbix3.4 2 | UserParameter=nginx.json[*],curl -s 'http://$1:$2/nginx_status' | tr -d a-zA-Z\\n | tr ' :' ',' | sed -e s/',,*'/,/g -e s/'^,'/'{"nginx":['/g -e s/',$'/']}'/g 3 | -------------------------------------------------------------------------------- /custom/conf/discover_apache.conf: -------------------------------------------------------------------------------- 1 | UserParameter=discover.backends[*],/etc/zabbix/scripts/discover_apache-backends.sh "$1" 2 | UserParameter=discover.responsecodes[*],/etc/zabbix/scripts/discover_responsecodes.sh "$1" 3 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/zabbix_files.conf: -------------------------------------------------------------------------------- 1 | UserParameter=agent.scripts[*],/etc/zabbix/scripts/zabbix_files.sh /etc/zabbix/scripts 2 | UserParameter=agent.confs[*],/etc/zabbix/scripts/zabbix_files.sh /etc/zabbix/zabbix_agentd.d 3 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/script_version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Version: 1.0 3 | set -e 4 | 5 | echo -n '{"data":[' 6 | # format to json with sed 7 | ls $1 | sed 's/\(.*\)/{"{#SCRIPT}":"\1"}/g' | sed '$!s/$/,/' | tr '\n' ' ' 8 | echo -n ']}' 9 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/kontena_grid.conf: -------------------------------------------------------------------------------- 1 | UserParameter=kontena.grid.discover_nodes[*],/etc/zabbix/scripts/kontena_grid.sh discover "$1" "$2" "$3" 2 | UserParameter=kontena.grid.node.connected[*],/etc/zabbix/scripts/kontena_grid.sh stat "$1" "$2" "$3" "$4" connected 3 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.discovery.db.sql: -------------------------------------------------------------------------------- 1 | WITH T AS ( 2 | SELECT 3 | datname AS "{#DBNAME}" 4 | FROM pg_database 5 | WHERE 6 | NOT datistemplate 7 | AND datname != 'postgres' 8 | ) 9 | SELECT '{"data":'|| regexp_replace(coalesce(json_agg(T), '[]'::json)::text, E'[\\n\\r\\s]+', '', 'g') || '}' 10 | FROM T 11 | -------------------------------------------------------------------------------- /documentation/kubernetes_monitoring/csr.yml: -------------------------------------------------------------------------------- 1 | apiVersion: certificates.k8s.io/v1beta1 2 | kind: CertificateSigningRequest 3 | metadata: 4 | name: zabbix 5 | spec: 6 | groups: 7 | - system:authenticated 8 | request: 9 | signerName: kubernetes.io/kube-apiserver-client 10 | usages: 11 | - client auth 12 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.scans.sql: -------------------------------------------------------------------------------- 1 | WITH T AS ( 2 | SELECT 3 | sum(CASE WHEN relkind IN ('r', 't', 'm') THEN pg_stat_get_numscans(oid) END) seq, 4 | sum(CASE WHEN relkind = 'i' THEN pg_stat_get_numscans(oid) END) idx 5 | FROM pg_class 6 | WHERE relkind IN ('r', 't', 'm', 'i') 7 | ) 8 | SELECT row_to_json(T) 9 | 
FROM T 10 | -------------------------------------------------------------------------------- /custom/scripts/alfresco-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Takes IP and port as arguments. 3 | # The script gets content from the given URL and returns only lines containing "SUMMARY". 4 | set -e 5 | url='http://'${1}':'${2}'/alfresco/s/enterprise/admin/admin-testtransform-test?operation=getTransformationStatistics&arg1=&arg2=&arg3=pdf' 6 | header=${3} 7 | curl -v -s -H "$header" "$url" --stderr - | grep SUMMARY 8 | -------------------------------------------------------------------------------- /custom/scripts/alfresco-pdfa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Takes IP and port as arguments. 3 | # The script gets content from the given URL and returns only lines containing "SUMMARY". 4 | set -e 5 | url='http://'${1}':'${2}'/alfresco/s/enterprise/admin/admin-testtransform-test?operation=getTransformationStatistics&arg1=&arg2=&arg3=pdfa' 6 | header=${3} 7 | curl -v -s -H "$header" "$url" --stderr - | grep SUMMARY 8 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.dbstat.sql: -------------------------------------------------------------------------------- 1 | SELECT json_object_agg(datname, row_to_json(T)) FROM ( 2 | SELECT datname, 3 | numbackends, 4 | xact_commit, 5 | xact_rollback, 6 | blks_read, 7 | blks_hit, 8 | tup_returned, 9 | tup_fetched, 10 | tup_inserted, 11 | tup_updated, 12 | tup_deleted, 13 | conflicts, 14 | temp_files, 15 | temp_bytes, 16 | deadlocks 17 | FROM pg_stat_database 18 | WHERE datname IS NOT NULL) T 19 | -------------------------------------------------------------------------------- /documentation/kontena_grid.md: -------------------------------------------------------------------------------- 1 | # Kontena Grid Monitoring 2 | 3 | Monitor Kontena grid nodes.
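For reference, the discovery item returns standard Zabbix low-level discovery JSON built from the grid's node names (the node names below are illustrative):
```
{"data":[{"{#NODE}":"node-1"},{"{#NODE}":"node-2"}]}
```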
4 | 5 | Requirements: 6 | - jq 7 | - curl 8 | 9 | ## Usage 10 | 11 | Item Syntax | Description | Units | 12 | ----------- | ----------- | ----- | 13 | kontena.grid.discover_nodes[<master address>, <auth token>, <grid>] | Discover nodes in Kontena grid | | 14 | kontena.grid.node.connected[<master address>, <auth token>, <grid>, {#NODE}] | Node connection status | true/false | 15 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.config.hash.sql: -------------------------------------------------------------------------------- 1 | SELECT md5( 2 | json_build_object( 3 | 'extensions', ( 4 | SELECT array_agg(extname) FROM ( 5 | SELECT extname 6 | FROM pg_extension 7 | ORDER BY extname 8 | ) AS e 9 | ), 10 | 'settings', ( 11 | SELECT json_object(array_agg(name), array_agg(setting)) FROM ( 12 | SELECT name, setting 13 | FROM pg_settings 14 | WHERE name != 'application_name' 15 | ORDER BY name 16 | ) AS s 17 | ) 18 | )::text); 19 | -------------------------------------------------------------------------------- /documentation/kubernetes_monitoring/config: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | clusters: 3 | - cluster: 4 | certificate-authority: /path/to/ca.crt 5 | server: https://127.0.0.1:8443 6 | name: <cluster> 7 | contexts: 8 | - context: 9 | cluster: <cluster> 10 | user: <user> 11 | name: <context> 12 | current-context: <context> 13 | kind: Config 14 | preferences: {} 15 | users: 16 | - name: <user> 17 | user: 18 | client-certificate: /path/to/client.crt 19 | client-key: /path/to/client.key 20 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/docker_swarm.conf: -------------------------------------------------------------------------------- 1 | UserParameter=docker.swarm.discover.services[*],/etc/zabbix/scripts/docker_swarm.py "discovery" 2 | 3 | # Metric retrievals for Zabbix 4.0 compatibility. Use dependent discoveries on Zabbix 4.2+ 4 | UserParameter=docker.swarm.hostname[*],/etc/zabbix/scripts/docker_swarm.py "hostname" --service "$1" 5 | UserParameter=docker.swarm.status[*],/etc/zabbix/scripts/docker_swarm.py "status" --service "$1" 6 | UserParameter=docker.swarm.uptime[*],/etc/zabbix/scripts/docker_swarm.py "uptime" --service "$1" 7 | -------------------------------------------------------------------------------- /documentation/kubernetes_monitoring/access.yml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: zabbix-role 5 | rules: 6 | - apiGroups: ["", "batch"] 7 | resources: ["pods", "nodes", "services", "jobs"] 8 | verbs: ["get", "list"] 9 | --- 10 | apiVersion: rbac.authorization.k8s.io/v1 11 | kind: ClusterRoleBinding 12 | metadata: 13 | name: zabbix-role 14 | subjects: 15 | - kind: User 16 | name: zabbix 17 | roleRef: 18 | kind: ClusterRole 19 | name: zabbix-role 20 | apiGroup: rbac.authorization.k8s.io 21 | -------------------------------------------------------------------------------- /custom/scripts/discover_apache-backends.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Version: 1.0 3 | 4 | # This script takes the path to an Apache configuration folder as an argument and reads only lines containing 'ProxyPass' or 'Location'. 5 | # It extracts each backend between the first and second slash and prints them out in JSON format.
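# Example output, matching the JSON built below (the URIs are illustrative):
# {"data":[{"{#URI}":"/app"}, {"{#URI}":"/service"}]}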
6 | 7 | set -e 8 | 9 | echo -n '{"data":[' 10 | 11 | # Removes duplicates with awk and formats to json with sed 12 | grep -r 'ProxyPass\|Location' $1 | grep -Po '(?<=[[:blank:]])\/[^\/ \s]*' | awk '!a[$0]++' | sed 's/\(.*\)/{"{#URI}":"\1"}/g' | sed '$!s/$/,/' | tr '\n' ' ' 13 | echo -n ']}' 14 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.bgwriter.sql: -------------------------------------------------------------------------------- 1 | SELECT row_to_json(T) 2 | FROM 3 | (SELECT checkpoints_timed, 4 | checkpoints_req, 5 | checkpoint_write_time, 6 | checkpoint_sync_time, 7 | current_setting('block_size')::int*buffers_checkpoint AS buffers_checkpoint, 8 | current_setting('block_size')::int*buffers_clean AS buffers_clean, 9 | maxwritten_clean, 10 | current_setting('block_size')::int*buffers_backend AS buffers_backend, 11 | buffers_backend_fsync, 12 | current_setting('block_size')::int*buffers_alloc AS buffers_alloc 13 | FROM pg_stat_bgwriter) T 14 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.frozenxid.sql: -------------------------------------------------------------------------------- 1 | WITH T AS ( 2 | SELECT 3 | age(relfrozenxid), 4 | current_setting('autovacuum_freeze_max_age')::integer fma 5 | FROM pg_class 6 | WHERE relkind IN ('r', 't')) 7 | SELECT row_to_json(T2) 8 | FROM ( 9 | SELECT extract(epoch FROM now())::integer ts, 10 | ( 11 | SELECT ((1 - max(age)::double precision / current_setting('autovacuum_freeze_max_age')::integer) * 100)::numeric(9,6) 12 | FROM T 13 | WHERE age < fma 14 | ) prc_before_av, 15 | ( 16 | SELECT ((1 - max(age)::double precision / -((1 << 31) + 1)) * 100)::numeric(9,6) 17 | FROM T 18 | ) prc_before_stop 19 | ) T2 20 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.replication.status.sql: -------------------------------------------------------------------------------- 1 | DO LANGUAGE plpgsql $$ 2 | DECLARE 3 | ver integer; 4 | res text := 2; 5 | BEGIN 6 | SELECT current_setting('server_version_num') INTO ver; 7 | 8 | IF (SELECT pg_is_in_recovery()) THEN 9 | IF (ver >= 90600) THEN 10 | SELECT * INTO res from ( 11 | SELECT COUNT(*) FROM pg_stat_wal_receiver 12 | ) T; 13 | ELSE 14 | res := 'ZBX_NOTSUPPORTED: Requires PostgreSQL version 9.6 or higher'; 15 | END IF; 16 | END IF; 17 | 18 | perform set_config('zbx_tmp.repl_status_res', res, false); 19 | END $$; 20 | 21 | SELECT current_setting('zbx_tmp.repl_status_res'); 22 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/kontena_grid.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | kontena_api_v1() { 5 | RESPONSE=$(curl -k -s \ 6 | -H "Authorization: Bearer $AUTH_TOKEN" \ 7 | -H "Accept: application/json" \ 8 | "$1") 9 | 10 | echo $RESPONSE 11 | } 12 | 13 | CMD=$1 14 | MASTER_ADDRESS=$2 15 | AUTH_TOKEN=$3 16 | GRID=$4 17 | 18 | if [ "$CMD" == "discover" ]; then 19 | kontena_api_v1 https://$MASTER_ADDRESS/v1/grids/$GRID/nodes | jq '.nodes | map({"{#NODE}": .name}) | { "data": . 
}' 20 | elif [ "$CMD" == "stat" ]; then 21 | NODE=$5 22 | STAT=$6 23 | kontena_api_v1 https://$MASTER_ADDRESS/v1/nodes/$GRID/$NODE | jq '.'$STAT 24 | fi 25 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.dbstat.sum.sql: -------------------------------------------------------------------------------- 1 | SELECT row_to_json(T) from ( 2 | SELECT sum(numbackends) AS numbackends, 3 | sum(xact_commit) AS xact_commit, 4 | sum(xact_rollback) AS xact_rollback, 5 | sum(blks_read) AS blks_read, 6 | sum(blks_hit) AS blks_hit, 7 | sum(tup_returned) AS tup_returned, 8 | sum(tup_fetched) AS tup_fetched, 9 | sum(tup_inserted) AS tup_inserted, 10 | sum(tup_updated) AS tup_updated, 11 | sum(tup_deleted) AS tup_deleted, 12 | sum(conflicts) AS conflicts, 13 | sum(temp_files) AS temp_files, 14 | sum(temp_bytes) AS temp_bytes, 15 | sum(deadlocks) AS deadlocks 16 | FROM pg_stat_database) T 17 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/mysql.conf: -------------------------------------------------------------------------------- 1 | #copied from https://github.com/zabbix/zabbix/blob/master/conf/zabbix_agentd/userparameter_mysql.conf 2 | 3 | UserParameter=mysql.ping, HOME=/var/lib/zabbix mysqladmin ping 4 | UserParameter=mysql.get_status_variables, HOME=/var/lib/zabbix mysql -sNX -e "show global status" 5 | UserParameter=mysql.version, HOME=/var/lib/zabbix mysqladmin -s version 6 | UserParameter=mysql.db.discovery, HOME=/var/lib/zabbix mysql -sN -e "show databases" 7 | UserParameter=mysql.db.size[*], HOME=/var/lib/zabbix mysql -sN -e "SELECT SUM(DATA_LENGTH + INDEX_LENGTH) FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_SCHEMA='$1'" 8 | UserParameter=mysql.replication.discovery, HOME=/var/lib/zabbix mysql -sNX -e "show slave status" 9 | UserParameter=mysql.slave_status, HOME=/var/lib/zabbix mysql -sNX -e "show slave status" 10 | -------------------------------------------------------------------------------- /documentation/process.md: -------------------------------------------------------------------------------- 1 | # Process Monitoring 2 | 3 | Discover running processes in Unix/Linux/AIX environments. 
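The discovery item returns standard Zabbix low-level discovery JSON; for example (process names are illustrative):
```
{"data":[{"{#COMMAND}":"sshd"},{"{#COMMAND}":"zabbix_agentd"}]}
```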
4 | 5 | Discovery uses the process name in /proc/pid/status that's truncated to 15 characters 6 | because it's the most reliable name used by Zabbix process monitoring items (see https://www.zabbix.com/documentation/3.0/manual/appendix/items/proc_mem_num_notes) 7 | 8 | ## Usage 9 | 10 | Item Syntax | Description | Units | 11 | ----------- | ----------- | ----- | 12 | discover.processes | Discover all processes | Provides the following template variables: {#COMMAND} | 13 | proc.cpu.util["{#COMMAND}"] | Process CPU utilization | % | 14 | proc.mem["{#COMMAND}"] | Process memory usage | bytes | 15 | proc.num["{#COMMAND}"] | Number of processes with the same command | (number) | 16 | 17 | ## Example 18 | 19 | ![Screenshot](process.png) 20 | 21 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.wal.stat.sql: -------------------------------------------------------------------------------- 1 | DO LANGUAGE plpgsql $$ 2 | DECLARE 3 | ver integer; 4 | res text := '{"write":0,"count":0}'; 5 | BEGIN 6 | SELECT current_setting('server_version_num') INTO ver; 7 | 8 | IF (SELECT NOT pg_is_in_recovery()) THEN 9 | IF (ver >= 100000) THEN 10 | SELECT row_to_json(T) INTO res FROM ( 11 | SELECT pg_wal_lsn_diff(pg_current_wal_lsn(),'0/00000000') AS WRITE, 12 | count(*) FROM pg_ls_waldir() AS COUNT 13 | ) T; 14 | 15 | ELSE 16 | SELECT row_to_json(T) INTO res FROM ( 17 | SELECT pg_xlog_location_diff(pg_current_xlog_location(),'0/00000000') AS WRITE, 18 | count(*) FROM pg_ls_dir('pg_xlog') AS COUNT 19 | ) T; 20 | END IF; 21 | END IF; 22 | 23 | perform set_config('zbx_tmp.wal_json_res', res, false); 24 | END $$; 25 | 26 | select current_setting('zbx_tmp.wal_json_res'); 27 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.replication.lag.sql: -------------------------------------------------------------------------------- 1 | DO LANGUAGE plpgsql $$ 2 | DECLARE 3 | ver integer; 4 | res text; 5 | BEGIN 6 | SELECT current_setting('server_version_num') INTO ver; 7 | 8 | IF (ver >= 100000) THEN 9 | SELECT * INTO res from ( 10 | SELECT 11 | CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() 12 | THEN 0 13 | ELSE COALESCE(EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp())::integer, 0) 14 | END 15 | ) T; 16 | 17 | ELSE 18 | SELECT * INTO res from ( 19 | SELECT 20 | CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() 21 | THEN 0 22 | ELSE COALESCE(EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp())::integer, 0) 23 | END 24 | ) T; 25 | END IF; 26 | 27 | perform set_config('zbx_tmp.repl_lag_res', res, false); 28 | END $$; 29 | 30 | select current_setting('zbx_tmp.repl_lag_res'); 31 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Version: 1.0 3 | set -e 4 | 5 | # Discover all running process names 6 | # Use the process name in /proc/pid/status that's truncated to 15 characters 7 | # because it's the most reliable name used by Zabbix process monitoring items 8 | # See https://www.zabbix.com/documentation/3.0/manual/appendix/items/proc_mem_num_notes 9 | 10 | echo -n '{"data":[' 11 | # Filter away processes with no cumulative CPU time with grep -v 12 | # Filter away kernel processes (and also zombie processes) by filtering out processes that don't use any user memory (vsz == 0) 
13 | # Take only 15 characters (to leave out the time portion) with cut 14 | # Remove duplicates with awk, format to json with sed 15 | ps -A -o comm= -o time= -o vsz= | grep -v ' 00:00:00' | awk '$3 != 0' | cut -c-15 | sed 's/ *$//' | awk '!a[$0]++' | sed 's/\(.*\)/{"{#COMMAND}":"\1"}/g' | sed '$!s/$/,/' | tr '\n' ' ' 16 | echo -n ']}' 17 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/db2snapshot.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -wT 2 | # Version: 1.0 3 | # Usage: db2snapshot <dbname> <lines> 4 | # 5 | # dbname - Name of db2 database 6 | # lines - How many lines to output 7 | 8 | use File::Spec; 9 | 10 | # Directory where snapshots are cached. 11 | my $SNAPSHOT_DIR = File::Spec->tmpdir(); 12 | 13 | # Get database name and line count args 14 | my $dbname = shift @ARGV; 15 | my $lines = shift @ARGV; 16 | 17 | # Untaint 18 | if ($lines =~ /^(\d+)$/) { 19 | $lines = $1; 20 | } else { 21 | die "Bad lines value"; 22 | } 23 | 24 | if ($dbname =~ /^([-\w.]+)$/) { 25 | $dbname = $1; 26 | } else { 27 | die "Bad dbname argument"; 28 | } 29 | 30 | # Generate stat file name 31 | my $statfile = "$SNAPSHOT_DIR/$dbname.txt"; 32 | 33 | # Open db2 snapshot file and output x lines 34 | my $i=1; 35 | open FILE, "<$statfile"; 36 | while (<FILE>) { 37 | if ($i > $lines) { 38 | last; 39 | } 40 | print $_; 41 | $i++; 42 | } 43 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/process.conf: -------------------------------------------------------------------------------- 1 | UserParameter=discover.processes,/etc/zabbix/scripts/process.sh 2 | 3 | # This is exactly the same as discover.processes but can be used in Zabbix as a separate discovery with filtering for critical processes 4 | UserParameter=discover.critical.processes,/etc/zabbix/scripts/process.sh 5 | 6 | # Discover sub-processes, arguments: 7 | # - Main process name (eg. DataFlowEngine) by which sub-processes are filtered from ps command output 8 | # - Argument order number to return as {#PARAM1} LLD macro; 1 for the first command-line argument to the process, 2 for the second etc. 9 | # - Argument order number to return as {#PARAM2} LLD macro (if not used you can use eg.
0 to return the process path as macro value) 10 | # 11 | # The LLD macros can then be used as argument for proc.mem, proc.cpu.util and proc.num items 12 | UserParameter=discover.subprocess[*],/etc/zabbix/scripts/discover_subprocess.sh "$1" "$2" "$3" 13 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/docker.conf: -------------------------------------------------------------------------------- 1 | UserParameter=docker.containers.discovery,/etc/zabbix/scripts/docker.sh discovery 2 | UserParameter=docker.containers.count,/etc/zabbix/scripts/docker.sh count 3 | UserParameter=docker.containers.discovery.all,/etc/zabbix/scripts/docker.sh discovery_all 4 | UserParameter=docker.containers.count.all,/etc/zabbix/scripts/docker.sh count_all 5 | 6 | # First parameter: container id 7 | # Second parameter: one of netin, netout, cpu, disk, memory, uptime, up or status 8 | UserParameter=docker.containers[*],/etc/zabbix/scripts/docker.sh "$1" "$2" 9 | 10 | ####################################################################### 11 | # Compatibility with www.monitoringartist.com docker templates 12 | 13 | UserParameter=docker.discovery,/etc/zabbix/scripts/docker.sh discovery 14 | UserParameter=docker.up[*],/etc/zabbix/scripts/docker.sh "$1" up 15 | 16 | # Ignore the second argument for docker.cpu (system vs user) 17 | UserParameter=docker.cpu[*],/etc/zabbix/scripts/docker.sh "$1" cpu 18 | 19 | # Ignore the second argument for docker.mem (total_cache vs total_rss vs total_swap) 20 | UserParameter=docker.mem[*],/etc/zabbix/scripts/docker.sh "$1" memory 21 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/certificates.conf: -------------------------------------------------------------------------------- 1 | UserParameter=certificates.discovery[*],/etc/zabbix/scripts/discover_certificates.py "$1" 2 | UserParameter=certificate.status[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" status 3 | UserParameter=certificate.startdate[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" startdate 4 | UserParameter=certificate.enddate[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" enddate 5 | UserParameter=certificate.lifetime[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" lifetime 6 | UserParameter=certificate.lifetime_days[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" lifetime_days 7 | UserParameter=certificate.serial[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" serial 8 | UserParameter=certificate.subject[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" subject 9 | UserParameter=certificate.issuer[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" issuer 10 | UserParameter=certificate.subject_hash[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" subject_hash 11 | UserParameter=certificate.issuer_hash[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" issuer_hash 12 | UserParameter=certificate.fingerprint[*],/etc/zabbix/scripts/check_certificate.py "$1" "$2" fingerprint 13 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.locks.sql: -------------------------------------------------------------------------------- 1 | WITH T AS 2 | (SELECT db.datname dbname, 3 | lower(replace(Q.mode, 'Lock', '')) AS MODE, 4 | coalesce(T.qty, 0) val 5 | FROM pg_database db 6 | JOIN ( 7 | VALUES ('AccessShareLock') ,('RowShareLock') ,('RowExclusiveLock') ,('ShareUpdateExclusiveLock') ,('ShareLock') 
,('ShareRowExclusiveLock') ,('ExclusiveLock') ,('AccessExclusiveLock')) Q(MODE) ON TRUE NATURAL 8 | LEFT JOIN 9 | (SELECT datname, 10 | MODE, 11 | count(MODE) qty 12 | FROM pg_locks lc 13 | RIGHT JOIN pg_database db ON db.oid = lc.database 14 | GROUP BY 1, 2) T 15 | WHERE NOT db.datistemplate 16 | ORDER BY 1, 2) 17 | SELECT json_object_agg(dbname, row_to_json(T2)) 18 | FROM 19 | (SELECT dbname, 20 | sum(val) AS total, 21 | sum(CASE 22 | WHEN MODE = 'accessexclusive' THEN val 23 | END) AS accessexclusive, 24 | sum(CASE 25 | WHEN MODE = 'accessshare' THEN val 26 | END) AS accessshare, 27 | sum(CASE 28 | WHEN MODE = 'exclusive' THEN val 29 | END) AS EXCLUSIVE, 30 | sum(CASE 31 | WHEN MODE = 'rowexclusive' THEN val 32 | END) AS rowexclusive, 33 | sum(CASE 34 | WHEN MODE = 'rowshare' THEN val 35 | END) AS rowshare, 36 | sum(CASE 37 | WHEN MODE = 'share' THEN val 38 | END) AS SHARE, 39 | sum(CASE 40 | WHEN MODE = 'sharerowexclusive' THEN val 41 | END) AS sharerowexclusive, 42 | sum(CASE 43 | WHEN MODE = 'shareupdateexclusive' THEN val 44 | END) AS shareupdateexclusive 45 | FROM T 46 | GROUP BY dbname) T2 47 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/discover_subprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Version: 1.0 3 | set -e 4 | 5 | 6 | if [ "$#" -ne 3 ] 7 | then 8 | echo "Missing or too many command line arguments. Usage: discover.subprocess[, , ]" 9 | exit 1 10 | fi 11 | 12 | # Discover all running processes with the given process name (first cmdline parameter) 13 | # Print the startup arguments of the processes using awk 14 | # The first argument is fifth column in ps command output 15 | PROCESS="$1" 16 | PARAM1_COL=`expr $2 + 4` 17 | PARAM2_COL=`expr $3 + 4` 18 | 19 | echo -n '{"data":[' 20 | 21 | # Uses first command line argument to filter processes 22 | # Filter away processes with no cumulative CPU time with grep -v 23 | # Filter away kernel processes (and also zombie processes) by filtering out processes that don't use any user memory (vsz == 0) 24 | 25 | # Example output of ps command: 26 | # DataFlowEngine 00:01:16 2822924 DataFlowEngine ACEBET1 00000000-0000-0000-0000-000000000000 EJSGRP1 27 | # DataFlowEngine 00:00:22 3427168 DataFlowEngine ACEBET1 00000000-0000-0000-0000-000000000000 HTTPGRP1 28 | # DataFlowEngine 00:00:07 2466424 DataFlowEngine ACEBET1 00000000-0000-0000-0000-000000000000 MONITORGRP1 29 | 30 | ps -A -o comm= -o time= -o vsz= -o args= | egrep "^$PROCESS " | grep -v ' 00:00:00' | awk '$3 != 0' | awk -v a="$PARAM1_COL" -v b="$PARAM2_COL" '{print $1 " " $a " " $b}' | sed 's/\(.*\) \(.*\) \(.*\)/{"{#COMMAND}":"\1", "{#PARAM1}":"\2", "{#PARAM2}":"\3"}/g' | sed '$!s/$/,/' | tr '\n' ' ' 31 | 32 | echo -n ']}' 33 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.connections.sum.sql: -------------------------------------------------------------------------------- 1 | DO LANGUAGE plpgsql $$ 2 | DECLARE 3 | ver integer; 4 | res text; 5 | BEGIN 6 | SELECT current_setting('server_version_num') INTO ver; 7 | 8 | IF (ver >= 90600) THEN 9 | SELECT row_to_json(T) INTO res from ( 10 | SELECT 11 | sum(CASE WHEN state = 'active' THEN 1 ELSE 0 END) AS active, 12 | sum(CASE WHEN state = 'idle' THEN 1 ELSE 0 END) AS idle, 13 | sum(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END) AS idle_in_transaction, 14 | count(*) AS total, 15 | count(*)*100/(SELECT 
current_setting('max_connections')::int) AS total_pct, 16 | sum(CASE WHEN wait_event IS NOT NULL THEN 1 ELSE 0 END) AS waiting, 17 | (SELECT count(*) FROM pg_prepared_xacts) AS prepared 18 | FROM pg_stat_activity WHERE datid is not NULL 19 | ) T; 20 | 21 | ELSE 22 | SELECT row_to_json(T) INTO res from ( 23 | SELECT 24 | sum(CASE WHEN state = 'active' THEN 1 ELSE 0 END) AS active, 25 | sum(CASE WHEN state = 'idle' THEN 1 ELSE 0 END) AS idle, 26 | sum(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END) AS idle_in_transaction, 27 | count(*) AS total, 28 | count(*)*100/(SELECT current_setting('max_connections')::int) AS total_pct, 29 | sum(CASE WHEN waiting IS TRUE THEN 1 ELSE 0 END) AS waiting, 30 | (SELECT count(*) FROM pg_prepared_xacts) AS prepared 31 | FROM pg_stat_activity 32 | ) T; 33 | END IF; 34 | 35 | perform set_config('zbx_tmp.conn_json_res', res, false); 36 | END $$; 37 | 38 | select current_setting('zbx_tmp.conn_json_res'); 39 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.connections.sql: -------------------------------------------------------------------------------- 1 | DO LANGUAGE plpgsql $$ 2 | DECLARE 3 | ver integer; 4 | res text; 5 | 6 | BEGIN 7 | SELECT current_setting('server_version_num') INTO ver; 8 | 9 | IF (ver >= 90600) THEN 10 | SELECT json_object_agg(datname, row_to_json(T)) INTO res from ( 11 | SELECT 12 | datname, 13 | sum(CASE WHEN state = 'active' THEN 1 ELSE 0 END) AS active, 14 | sum(CASE WHEN state = 'idle' THEN 1 ELSE 0 END) AS idle, 15 | sum(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END) AS idle_in_transaction, 16 | count(*) AS total, 17 | count(*)*100/(SELECT current_setting('max_connections')::int) AS total_pct, 18 | sum(CASE WHEN wait_event IS NOT NULL THEN 1 ELSE 0 END) AS waiting 19 | FROM pg_stat_activity WHERE datid is not NULL GROUP BY datname ) T; 20 | 21 | ELSE 22 | SELECT json_object_agg(datname, row_to_json(T)) INTO res from ( 23 | SELECT 24 | datname, 25 | sum(CASE WHEN state = 'active' THEN 1 ELSE 0 END) AS active, 26 | sum(CASE WHEN state = 'idle' THEN 1 ELSE 0 END) AS idle, 27 | sum(CASE WHEN state = 'idle in transaction' THEN 1 ELSE 0 END) AS idle_in_transaction, 28 | count(*) AS total, 29 | count(*)*100/(SELECT current_setting('max_connections')::int) AS total_pct, 30 | sum(CASE WHEN waiting IS TRUE THEN 1 ELSE 0 END) AS waiting 31 | FROM pg_stat_activity GROUP BY datname ) T; 32 | END IF; 33 | 34 | perform set_config('zbx_tmp.db_conn_json_res', res, false); 35 | 36 | END $$; 37 | 38 | SELECT current_setting('zbx_tmp.db_conn_json_res'); 39 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.transactions.sql: -------------------------------------------------------------------------------- 1 | DO LANGUAGE plpgsql $$ 2 | DECLARE 3 | ver integer; 4 | res text; 5 | BEGIN 6 | SELECT current_setting('server_version_num') INTO ver; 7 | 8 | IF (ver >= 90600) THEN 9 | SELECT row_to_json(T) INTO res from ( 10 | SELECT 11 | coalesce(extract(epoch FROM max(CASE WHEN state = 'idle in transaction' THEN age(now(), query_start) END)), 0) AS idle, 12 | coalesce(extract(epoch FROM max(CASE WHEN state <> 'idle in transaction' AND state <> 'idle' THEN age(now(), query_start) END)), 0) AS active, 13 | coalesce(extract(epoch FROM max(CASE WHEN wait_event IS NOT NULL THEN age(now(), query_start) END)), 0) AS waiting, 14 | (SELECT coalesce(extract(epoch FROM max(age(now(), prepared))), 0) FROM 
pg_prepared_xacts) AS prepared 15 | FROM pg_stat_activity) T; 16 | 17 | ELSE 18 | SELECT row_to_json(T) INTO res from ( 19 | SELECT 20 | coalesce(extract(epoch FROM max(CASE WHEN state = 'idle in transaction' THEN age(now(), query_start) END)), 0) AS idle, 21 | coalesce(extract(epoch FROM max(CASE WHEN state <> 'idle in transaction' AND state <> 'idle' THEN age(now(), query_start) END)), 0) AS active, 22 | coalesce(extract(epoch FROM max(CASE WHEN waiting IS TRUE THEN age(now(), query_start) END)), 0) AS waiting, 23 | (SELECT coalesce(extract(epoch FROM max(age(now(), prepared))), 0) FROM pg_prepared_xacts) AS prepared 24 | FROM pg_stat_activity) T; 25 | END IF; 26 | 27 | perform set_config('zbx_tmp.trans_json_res', res, false); 28 | END $$; 29 | 30 | SELECT current_setting('zbx_tmp.trans_json_res'); 31 | -------------------------------------------------------------------------------- /documentation/docker_swarm.md: -------------------------------------------------------------------------------- 1 | # Docker Swarm service discovery and monitoring 2 | 3 | Requirements: 4 | - Python 2.7.13 or Python 3.6.8 5 | - Libraries for Python: docker, requests, urllib3, python-dateutil. 6 | 7 | 8 | ## For Python version 3, install dependencies using pip: 9 | ``` 10 | pip3 install docker requests urllib3 python-dateutil 11 | ``` 12 | 13 | 14 | ## For Python version 2, install specific versions of libraries: 15 | ``` 16 | pip install docker==2.7.0 requests==2.23.0 urllib3==1.24.3 python-dateutil==2.8.1 17 | ``` 18 | 19 | 20 | The zabbix user must have sufficient privileges to monitor Docker: 21 | 22 | * Add the zabbix user to the docker group: `sudo usermod -aG docker zabbix` 23 | 24 | 25 | ## Usage 26 | 27 | Item Syntax | Description | Units | 28 | ----------- | ----------- | ----- | 29 | docker.swarm.discover.services | Discover all running Docker services | Provides the following template variables: {#SERVICE}. Also provides service information in an array: hostname, status, uptime. | 30 | docker.swarm.hostname[<service>] | Retrieve hostname(s) for specified service. | Hostname(s) as a comma separated list. | 31 | docker.swarm.status[<service>] | Current service status. | String containing either "running" or "not running". | 32 | docker.swarm.uptime[<service>] | Retrieve uptime for specified service. | Seconds. | 33 | 34 | 35 | ## Retrieving data from discovery using JSONPath 36 | 37 | In this example, service data can be retrieved using JSONPath: 38 | ``` 39 | $.data[?(@.service == "<service>")].hostname 40 | $.data[?(@.service == "<service>")].status 41 | $.data[?(@.service == "<service>")].uptime 42 | ``` 43 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/kubernetes_monitoring.conf: -------------------------------------------------------------------------------- 1 | # Discoveries. Possible arguments are: pods/nodes/services/cronjobs, config_file, field-selector.
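# Example item key (the config path and field selector are illustrative):
#   kubernetes.discover.pods[/etc/zabbix/kubernetes_monitoring/config,status.phase=Running]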
2 | UserParameter=kubernetes.discover.pods[*],source /opt/virtualenv/kube-monitoring/bin/activate && python /etc/zabbix/scripts/kubernetes_monitoring.py "pods" --config "$1" --field-selector "$2" 3 | UserParameter=kubernetes.discover.nodes[*],source /opt/virtualenv/kube-monitoring/bin/activate && python /etc/zabbix/scripts/kubernetes_monitoring.py "nodes" --config "$1" --field-selector "$2" 4 | UserParameter=kubernetes.discover.services[*],source /opt/virtualenv/kube-monitoring/bin/activate && python /etc/zabbix/scripts/kubernetes_monitoring.py "services" --config "$1" --field-selector "$2" 5 | UserParameter=kubernetes.discover.cronjobs[*],source /opt/virtualenv/kube-monitoring/bin/activate && python /etc/zabbix/scripts/kubernetes_monitoring.py "cronjobs" --config "$1" 6 | 7 | # Poller(s) for trapper item data. Possible arguments are: config_file, field-selector, host name, minutes. 8 | UserParameter=kubernetes.poller.cronjobs[*],source /opt/virtualenv/kube-monitoring/bin/activate && python /etc/zabbix/scripts/kubernetes_monitoring.py "cronjobs" --config "$1" --host-name "$2" --minutes "$3" 9 | 10 | # Default field selectors for pods. 11 | # Possible status phase values are: Pending, Running, Succeeded, Failed or Unknown. 12 | UserParameter=kubernetes.discover.pods.default[*],source /opt/virtualenv/kube-monitoring/bin/activate && python /etc/zabbix/scripts/kubernetes_monitoring.py "pods" --config "$1" --field-selector "metadata.namespace!=kube-system,status.phase=Running" 13 | -------------------------------------------------------------------------------- /custom/scripts/discover_responsecodes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Version: 1.0 3 | 4 | # This script takes path to apache configuration folder as an argument and reads lines only with 'ProxyPass' or 'Location'. 5 | # It saves string between first and second slash to "URIS" array. 6 | # After that it loops through "STATUSCODES" array and "URIS" array and prints all combinations as json. 7 | 8 | set -e 9 | STATUSCODES=(100 101 102 200 201 202 203 204 205 206 207 208 226 300 301 302 303 304 305 306 307 308 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 426 428 429 431 440 444 449 450 451 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 520 598 599) 10 | 11 | # Reads files from folder line by line and takes backends from ProxyPass or Location lines and push them into array. 12 | URIS=() 13 | while IFS= read -r line; do 14 | URIS+="$line " 15 | done < <( grep -r 'ProxyPass\|Location' $1 | grep -Po '(?<=[[:blank:]])\/[^\/ \s]*' | awk '!a[$0]++' ) 16 | 17 | IFS=$' ' read -ra URIS <<< "$URIS" 18 | 19 | echo -n '{"data":[' 20 | 21 | var1=0 22 | var3=$[ ${#URIS[@]} - 1 ] 23 | 24 | while [ $var1 -lt "${#URIS[@]}" ] 25 | do 26 | for (( var2 = 0; $var2 <= 76; var2++ )) 27 | do 28 | # trims last "," from last line. 
29 | if [[ $var1 -eq $var3 && $var2 -eq 76 ]] 30 | then 31 | echo '{"{#URI}":"'${URIS[$var1]}'","{#RESPONSE}": "'${STATUSCODES[$var2]}'"}' | tr '\n' ' ' 32 | else 33 | echo '{"{#URI}":"'${URIS[$var1]}'","{#RESPONSE}": "'${STATUSCODES[$var2]}'"},' | tr '\n' ' ' 34 | fi 35 | done 36 | var1=$[ $var1 + 1 ] 37 | done 38 | echo -n ']}' 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Zabbix Monitoring Scripts 2 | 3 | This project contains various custom Zabbix monitoring scripts used as user parameters by Zabbix agent. 4 | 5 | ## Installation 6 | 7 | The repository includes ready-to-install files for Zabbix Agent. 8 | 9 | * Copy the files under [etc/zabbix/scripts](etc/zabbix/scripts) to `/etc/zabbix/scripts` 10 | * Copy the files under [etc/zabbix/zabbix_agentd.d](etc/zabbix/zabbix_agentd.d) to `/etc/zabbix/zabbix_agentd.d` 11 | 12 | ## Templates 13 | 14 | Each monitoring script has a corresponding template that can be imported to Zabbix Server. Templates can be found under [templates](templates). 15 | 16 | ## Version Numbering Scheme 17 | 18 | Each script has version information at the beginning of the script. 19 | [Semantic versioning](https://semver.org/) scheme is used with major.minor syntax. 20 | 21 | * Major version changes when you make incompatible changes with existing items / configuration syntax 22 | * Minor version changes when you add functionality or bug-fixes in a backwards-compatible manner 23 | 24 | ## Usage 25 | 26 | See the below documentation for each monitoring script. 27 | 28 | - [DB2 database snapshot statistics](documentation/db2stat.md) 29 | - [Docker discovery and monitoring](documentation/docker.md) 30 | - [Docker Swarm service discovery and monitoring](documentation/docker_swarm.md) 31 | - [Process discovery and monitoring](documentation/process.md) 32 | - [Pacemaker monitoring](documentation/pacemaker.md) 33 | - [PEM file certificate monitoring](documentation/certificates.md) 34 | - [Kontena grid monitoring](documentation/kontena_grid.md) 35 | - [Kubernetes monitoring](documentation/kubernetes_monitoring.md) 36 | - [MySQL & Galera monitoring](documentation/mysql-galera.md) 37 | 38 | -------------------------------------------------------------------------------- /documentation/pacemaker.md: -------------------------------------------------------------------------------- 1 | # Pacemaker Monitoring 2 | 3 | Get Pacemaker status. Adding the -v option to the command prints a more verbose string. Otherwise the script returns numeric or single-word statuses. 4 | 5 | See [user parameter configuration file](../etc/zabbix/zabbix_agentd.d/pacemaker.conf) for Zabbix item format. 6 | 7 | ## Script Usage 8 | 9 | | Command | Description | Units | 10 | | ------- | ----------- | ----- | 11 | pacemaker_status.py -i cluster -v | Get the cluster status in verbose format | text | 12 | pacemaker_status.py -i cluster | Cluster status in integer format | 0 if no nodes, 1 if running ok, 2 if any in standby, 3 if any in maintenance, 4 if any in shutdown | 13 | pacemaker_status.py -i cluster -p failed | Count the resources in a given state, e.g. how many failed | number | 14 | pacemaker_status.py -i resource -n Grafana | Get status of a single resource. Returns count of resources running | number | 15 | pacemaker_status.py -i resource -n Grafana -N application1 -p managed | Get the property value for a single resource on a given node.
| If node is not given, returns true if all the nodes have the property set to "true" | 16 | pacemaker_status.py -i node -n application1 | Get the status on a node | returns count of services running | 17 | pacemaker_status.py -i node -n application1 -v | Get the status on a node | returns verbose string of resource status | 18 | pacemaker_status.py -i resource -n Grafana -l | Get the nodes where resource is active. | Text format resource:node1,node2 | 19 | pacemaker_status.py -i cluster -l | Get all resources in the cluster and nodes where they are active. | Returns each resource and the nodes, separated by space | 20 | 21 | ## Example verbose output 22 | 23 | `application1:online:standby:resources_running=0 application2:online:resources_running=10 resources=10/12` 24 | -------------------------------------------------------------------------------- /documentation/certificates.md: -------------------------------------------------------------------------------- 1 | # Certificate Monitoring 2 | 3 | Discover PEM files and monitor certificates stored within. 4 | 5 | The discovery item scans the configured path recursively for files containing PEM 6 | formatted certificates. Directories and files that are not readable to the agent 7 | are skipped. Make sure the Zabbix agent user has access to the monitored files. 8 | 9 | The monitoring script requires the following Python modules to be installed on the system (confirmed working with the versions in parentheses): 10 | 11 | * pyOpenSSL (17.3.0) URL: https://pypi.python.org/pypi/pyOpenSSL 12 | * pem (17.1.0) URL: https://pypi.python.org/pypi/pem 13 | 14 | ## Usage 15 | 16 | Item Syntax | Description | Units | 17 | ----------- | ----------- | ----- | 18 | certificates.discovery[{$CERT_FILE_PATH}] | Discover certificates from path | Provides the following template variables: {#CRT_SUBJECT} {#CRT_FILE} {#CRT_INDEX}, {#CRT_CN} | 19 | certificate.status[{#CRT_FILE},{#CRT_INDEX}] | Certificate status | 0 = Valid, 1 = Not yet valid, 2 = Expired | 20 | certificate.startdate[{#CRT_FILE},{#CRT_INDEX}] | Certificate not before | ISO Date | 21 | certificate.enddate[{#CRT_FILE},{#CRT_INDEX}] | Certificate not after | ISO Date | 22 | certificate.lifetime[{#CRT_FILE},{#CRT_INDEX}] | Certificate lifetime until expiration (seconds) | | 23 | certificate.lifetime_days[{#CRT_FILE},{#CRT_INDEX}] | Certificate lifetime until expiration (days) | | 24 | certificate.serial[{#CRT_FILE},{#CRT_INDEX}] | Certificate serial | | 25 | certificate.subject[{#CRT_FILE},{#CRT_INDEX}] | Certificate subject | | 26 | certificate.issuer[{#CRT_FILE},{#CRT_INDEX}] | Certificate issuer | | 27 | certificate.subject_hash[{#CRT_FILE},{#CRT_INDEX}] | Certificate subject hash | | 28 | certificate.issuer_hash[{#CRT_FILE},{#CRT_INDEX}] | Certificate issuer hash | | 29 | certificate.fingerprint[{#CRT_FILE},{#CRT_INDEX}] | Certificate fingerprint | SHA-1 | 30 | 31 | ## Example 32 | 33 | ![Screenshot](certificates.png) 34 | -------------------------------------------------------------------------------- /documentation/mysql-galera.md: -------------------------------------------------------------------------------- 1 | This documentation describes how to configure MySQL and Galera monitoring permissions. 2 | 3 | **Use sudo/root to configure the following steps** 4 | 5 | 1. Install Zabbix agent and MySQL client. If necessary, add the path to the mysql and mysqladmin utilities to the global environment variable PATH. 2.
Create a MySQL user for monitoring (<password> at your discretion): 7 | ``` 8 | CREATE USER 'zbx_monitor'@'%' IDENTIFIED BY '<password>'; 9 | GRANT REPLICATION CLIENT,PROCESS,SHOW DATABASES,SHOW VIEW ON *.* TO 'zbx_monitor'@'%'; 10 | ``` 11 | For more information, please see the MySQL documentation: https://dev.mysql.com/doc/refman/8.0/en/grant.html 12 | 13 | 3. Create .my.cnf in the home directory of the Zabbix agent for Linux (/var/lib/zabbix by default) or my.cnf in c:\ for Windows. The file must contain the following three lines: 14 | ``` 15 | [client] 16 | user='zbx_monitor' 17 | password='<password>' 18 | ``` 19 | 20 | 21 | Add the rule to the SELinux policy (example for CentOS): 22 | ``` 23 | # cat <<EOF > zabbix_home.te 24 | 25 | module zabbix_home 1.0; 26 | 27 | require { 28 | type zabbix_agent_t; 29 | type zabbix_var_lib_t; 30 | type mysqld_etc_t; 31 | type mysqld_port_t; 32 | type mysqld_var_run_t; 33 | class file { open read }; 34 | class tcp_socket name_connect; 35 | class sock_file write; 36 | } 37 | 38 | #============= zabbix_agent_t ============== 39 | 40 | allow zabbix_agent_t zabbix_var_lib_t:file read; 41 | allow zabbix_agent_t zabbix_var_lib_t:file open; 42 | allow zabbix_agent_t mysqld_etc_t:file read; 43 | allow zabbix_agent_t mysqld_port_t:tcp_socket name_connect; 44 | allow zabbix_agent_t mysqld_var_run_t:sock_file write; 45 | EOF 46 | # checkmodule -M -m -o zabbix_home.mod zabbix_home.te 47 | # semodule_package -o zabbix_home.pp -m zabbix_home.mod 48 | # semodule -i zabbix_home.pp 49 | # restorecon -R /var/lib/zabbix 50 | ``` 51 | 52 | 4. To test the MySQL connection, run 53 | ``` 54 | zabbix_agentd -t mysql.version 55 | zabbix_agentd -t mysql.get_status_variables 56 | ``` 57 | 5. To test the Galera connection, run 58 | ``` 59 | zabbix_agentd -t galera.cluster_status 60 | ``` 61 | -------------------------------------------------------------------------------- /opt/cron/docker_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Script for running docker monitoring script actions and posting results via trapper items.
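# Intended to be run periodically from cron; an illustrative crontab entry (the schedule is an assumption):
#   */5 * * * * /opt/cron/docker_stats.sh stats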
3 | # 4 | # USAGE: 5 | # Discover running containers: 6 | # /opt/cron/docker_stats.sh discovery 7 | # Discover all containers: 8 | # /opt/cron/docker_stats.sh discovery_all 9 | # Count running containers: 10 | # /opt/cron/docker_stats.sh count 11 | # Count all containers: 12 | # /opt/cron/docker_stats.sh count_all 13 | # Send stats to trapper items: 14 | # /opt/cron/docker_stats.sh stats "<stats>" "<containers>" 15 | # - OPTIONAL stats: space-delimited list of stats, defaults to all supported stats (cpu disk netin netout memory status uptime) 16 | # - OPTIONAL containers: space-delimited list of container names or ids, defaults to all containers 17 | # 18 | set -e 19 | 20 | # Path to Zabbix agent script docker.sh 21 | ZBX_DOCKER_SCRIPT=/etc/zabbix/scripts/docker.sh 22 | # Path to temporary stats file 23 | STATS_FILE=/tmp/docker_stats.txt 24 | 25 | SCRIPT_ACTION=$1 26 | shift 27 | 28 | rm -f $STATS_FILE 29 | if [ "$SCRIPT_ACTION" == "stats" ]; then 30 | stats=${1:-cpu disk netin netout memory status uptime} 31 | containers=$2 32 | if [ -z "$containers" ]; then 33 | containers=$($ZBX_DOCKER_SCRIPT discovery_all | jq -r '.[][]["{#CONTAINERNAME}"]') 34 | fi 35 | 36 | for c in $containers; do 37 | for s in $stats; do 38 | value=$($ZBX_DOCKER_SCRIPT $c $s) 39 | echo "- docker.containers[$c,$s] $value" >>$STATS_FILE 40 | done 41 | done 42 | elif [ "$SCRIPT_ACTION" == "discovery" ]; then 43 | value=$($ZBX_DOCKER_SCRIPT discovery) 44 | echo "- docker.containers.discovery $value" >>$STATS_FILE 45 | elif [ "$SCRIPT_ACTION" == "discovery_all" ]; then 46 | value=$($ZBX_DOCKER_SCRIPT discovery_all) 47 | echo "- docker.containers.discovery.all $value" >>$STATS_FILE 48 | elif [ "$SCRIPT_ACTION" == "count" ]; then 49 | value=$($ZBX_DOCKER_SCRIPT count) 50 | echo "- docker.containers.count $value" >>$STATS_FILE 51 | elif [ "$SCRIPT_ACTION" == "count_all" ]; then 52 | value=$($ZBX_DOCKER_SCRIPT count_all) 53 | echo "- docker.containers.count.all $value" >>$STATS_FILE 54 | fi 55 | 56 | # Send results if we got some 57 | if [ -e $STATS_FILE ]; then 58 | zabbix_sender -vv -c /etc/zabbix/zabbix_agentd.conf -i $STATS_FILE 59 | rm -f $STATS_FILE 60 | fi -------------------------------------------------------------------------------- /custom/scripts/fileTimestamp.vbs: -------------------------------------------------------------------------------- 1 | ' SCRIPT NAME 2 | ' fileTimestamp.vbs 3 | ' SUMMARY 4 | ' This script loops through the given folder's subfolders recursively 5 | ' and returns the latest timestamp it can find, including the starting folder 6 | ' PARAMS 7 | ' starting folder 8 | ' RETURNS 9 | ' timestamp in seconds (unix time) in UTC time 10 | ' If starting folder doesn't exist, returns 0. 11 | ' If some other error occurs, returns compiler's error message and code.
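' EXAMPLE
' Agent-side invocation as wired in custom/conf/fileTimestamp.conf (the folder is illustrative):
'   cscript c:\zabbix\scripts\fileTimestamp.vbs //nologo c:\data\incoming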
12 | 13 | Public latestTime 'the latest timestamp from all folders' files 14 | Dim objFSO, objFolder, startFolder 15 | 16 | On Error Resume Next 17 | 18 | Set objFSO = CreateObject("Scripting.FileSystemObject") 19 | startFolder = Replace(WScript.Arguments(0),"/","\") 20 | 'startFolder = "f:\startFolder" 'for testing 21 | 22 | Set objFolder = objFSO.GetFolder(startFolder) 23 | 'checking that the given path exists 24 | If Err.Number <> 0 Then 25 | 'if path doesn't exist, returns 0 26 | WScript.StdOut.Write 0 27 | WScript.Quit 28 | End If 29 | 30 | 'goes through given folder's subfolders and hunts for the latest timestamp 31 | FindLatestTimestamp objFolder 32 | 33 | 'sets the latest local timestamp to UTC time 34 | Set dateTime = CreateObject("WbemScripting.SWbemDateTime") 35 | dateTime.SetVarDate(latestTime) 36 | 37 | 'writes the latest timestamp in unix time (seconds from 1.1.1970) 38 | WScript.StdOut.Write DateDiff("s", "1/1/1970", dateTime.GetVarDate(false)) 39 | 40 | 'Method that loops given folder's subfolders recursively and finds the latest timestamp of all files 41 | Sub FindLatestTimestamp(Folder) 42 | 43 | 'first check the files for given folder 44 | Set objFolder = objFSO.GetFolder(Folder) 45 | If objFolder.Files.Count > 0 Then 46 | 47 | Set colFiles = objFolder.Files 48 | 49 | For Each objFile In colFiles 50 | 'check if the file is the latest so far 51 | If DateDiff("s", objFile.DateLastModified, latestTime) < 0 Then 52 | latestTime = objFile.DateLastModified 53 | End If 54 | Next 55 | 56 | End If 57 | 58 | 'then recursively start to check another folder 59 | For Each Subfolder In Folder.SubFolders 60 | FindLatestTimestamp Subfolder 61 | Next 62 | 63 | End Sub 64 | 65 | 'checking if other errors have occurred 66 | If Err.Number <> 0 Then 67 | WScript.StdOut.Write "Error: " & Err.Description & " (" & Err.Number & ")" 68 | WScript.Quit 69 | End If 70 | -------------------------------------------------------------------------------- /documentation/db2stat.md: -------------------------------------------------------------------------------- 1 | 2 | # DB2 Database Snapshot Statistics (db2stat) 3 | 4 | This script generates database snapshots (i.e. get snapshot for database) from 5 | DB2 and retrieves statistics from it. 6 | 7 | Because the DB2 install location varies by system and installation method, the path 8 | to the DB2 executable must be edited into the PATH environment variable set up in the 9 | script. 10 | 11 | The Zabbix agent user must also have permission to create database snapshots. See 12 | below for how to do this. 13 | 14 | See the [script file](../etc/zabbix/scripts/db2stat.pl) for detailed information. 15 | 16 | ## Enabling DB2 Snapshots for Zabbix User 17 | 18 | To allow the Zabbix agent user to create snapshots, it must have the capability in DB2 19 | to do that. For monitoring purposes the best match is the SYSMON permission. The 20 | operating system group for this is set via DB2 configuration parameters. 21 | 22 | To enable snapshots: 23 | 24 | 1. Create a sysmon group in the operating system and add the zabbix agent user to it (Zabbix agent must be installed so that the zabbix user is present). 25 | - Linux systems: `groupadd sysmon && usermod -a -G sysmon zabbix` 26 | - AIX systems: `mkgroup sysmon` 27 | `chgrpmem -m + zabbix sysmon` 28 | 2.
2. Configure the sysmon group to have the SYSMON authority in the database by executing the following *as the db2 user*:
   - Configure sysmon group: `db2 update dbm cfg using sysmon_group sysmon`
   - Restart database: `db2stop && db2start`

To test taking the snapshot with the zabbix user in Linux as root:
`su -s /bin/bash -c "/bin/db2 get snapshot for database on <dbname>" zabbix`
To test taking the snapshot with the zabbix user in AIX as root:
`su - zabbix -c "/bin/db2 get snapshot for database on <dbname>"`

## Installing Items from Template

A Zabbix template for all items supported in the configuration is
[included](../templates/db2stat.xml). To configure it, at least the macro
value for DATABASE_NAME must be updated.

## Manual Item Configuration

The provided user parameter configuration contains several parameters. Consult the
[configuration file](../etc/zabbix/zabbix_agentd.d/db2stat.conf) for a full list.

Simple statistics can be retrieved with two parameters, the maximum snapshot age in seconds and the database name:

`db2stat.database_status[60,SAMPLE]`

Retrieving memory statistics additionally requires a node number:

`db2stat.package_cache_heap_size[60,SAMPLE,0]`
--------------------------------------------------------------------------------
/etc/zabbix/scripts/discover_certificates.py:
--------------------------------------------------------------------------------
#!/usr/bin/python2
# Version: 1.0
"""
Zabbix discovery for certificates stored in PEM files.
"""

from __future__ import print_function
from OpenSSL.crypto import load_certificate, FILETYPE_PEM
import sys
import pem
import os
import json


class CertEntry():
    """Certificate entry model."""

    def __init__(self, file_name, index, cert):
        self.file_name = file_name
        self.index = index
        self.cert = cert

    def __str__(self):
        return (self.file_name + "[" + str(self.index) + "]:" +
                str(self.cert.get_subject()))


def get_certificates_from_pem(file_name, certificates):
    """Finds all certificate entries from PEM file and adds them to given list.
    """
    entries = pem.parse_file(file_name)
    index = 0
    for entry in entries:
        if type(entry) == pem.Certificate:
            cert = load_certificate(FILETYPE_PEM, entry.as_bytes())
            certificates.append(CertEntry(file_name, index, cert))
        index = index + 1


def format_x509_name(x509_name):
    """Formats X509Name object into string representation."""
    name = ""
    for c in x509_name.get_components():
        name += '/'
        name += c[0].decode("utf-8")
        name += '='
        name += c[1].decode("utf-8")
    return name


def get_name_component(x509_name, component):
    """Gets single name component from X509 name."""
    value = ""
    for c in x509_name.get_components():
        if c[0].decode("utf-8") == component:
            value = c[1].decode("utf-8")
    return value


def json_output(entries):
    """Outputs list of certificate entries as Zabbix compatible discovery JSON.
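    Example output shape (file name and values invented for illustration):
    {"data": [{"{#CRT_FILE}": "/etc/ssl/certs/example.pem", "{#CRT_INDEX}": 0,
               "{#CRT_SUBJECT}": "/CN=example", "{#CRT_CN}": "example"}]}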
    """
    data = []
    output = {
        'data': data
    }
    for entry in entries:
        data.append({
            '{#CRT_FILE}': entry.file_name,
            '{#CRT_INDEX}': entry.index,
            '{#CRT_SUBJECT}': format_x509_name(entry.cert.get_subject()),
            '{#CRT_CN}': get_name_component(entry.cert.get_subject(), 'CN')
        })
    print(json.dumps(output))


def search_certificates(path, entries):
    """Searches certificates from PEM files in given path recursively.
    """
    if os.path.isdir(path) and os.access(path, os.R_OK):
        for child in os.listdir(path):
            search_certificates(os.path.join(path, child), entries)
    elif os.path.isfile(path) and os.access(path, os.R_OK):
        get_certificates_from_pem(path, entries)


if __name__ == '__main__':
    entries = []
    for path in sys.argv[1:]:
        search_certificates(path, entries)

    json_output(entries)
--------------------------------------------------------------------------------
/custom/scripts/elastizabbix.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import os
import sys
import json
import urllib.request
import time
import errno

# Check parameter count
if len(sys.argv) < 4:
    sys.exit('This script needs at least 3 parameters: ip, api, stat.')

ttl = 60
ip = sys.argv[1]

cluster_url = 'http://%s:9200/_cluster/stats' % ip
nodes_url = 'http://%s:9200/_nodes/stats' % ip
indices_url = 'http://%s:9200/_stats' % ip
health_url = 'http://%s:9200/_cluster/health' % ip

stats = {
    'cluster': cluster_url,
    'nodes': nodes_url,
    'indices': indices_url,
    'health': health_url
}

def created_file(name):
    try:
        fd = os.open(name, os.O_WRONLY | os.O_CREAT | os.O_EXCL)
        os.close(fd)
        return True
    except OSError as e:
        if e.errno == errno.EEXIST:
            return False
        raise

def is_older_than(name, ttl):
    age = time.time() - os.path.getmtime(name)
    return age > ttl

def get_cache(api):
    cache = '/tmp/elastizabbix-{0}.json'.format(api)
    lock = '/tmp/elastizabbix-{0}.lock'.format(api)
    should_update = (not os.path.exists(cache)) or is_older_than(cache, ttl)
    if should_update and created_file(lock):
        try:
            # read() returns bytes; decode before writing to the text-mode cache file
            d = urllib.request.urlopen(stats[api]).read().decode()
            with open(cache, 'w') as f:
                f.write(d)
        except Exception:
            pass
        if os.path.exists(lock):
            os.remove(lock)
    # Remove a stale lock left behind by a crashed updater
    if os.path.exists(lock) and is_older_than(lock, 300):
        os.remove(lock)
    ret_data = {}
    try:
        with open(cache) as data_file:
            ret_data = json.load(data_file)
    except Exception:
        ret_data = json.loads(urllib.request.urlopen(stats[api]).read())
    return ret_data

def get_stat(api, stat):
    d = get_cache(api)
    keys = []
    for i in stat.split('.'):
        keys.append(i)
        key = '.'.join(keys)
        if key in d:
            d = d.get(key)
            keys = []
    return d

def discover_nodes():
    d = {'data': []}
    for k, v in get_stat('nodes', 'nodes').items():
        d['data'].append({'{#NAME}': v['name'], '{#NODE}': k})
    return json.dumps(d)

def discover_indices():
    d = {'data': []}
    for k, v in get_stat('indices', 'indices').items():
        d['data'].append({'{#NAME}': k})
    return json.dumps(d)


if __name__ == '__main__':
    api = sys.argv[2]
    stat = sys.argv[3]
if api == 'discover': 94 | if stat == 'nodes': 95 | print(discover_nodes()) 96 | if stat == 'indices': 97 | print(discover_indices()) 98 | 99 | else: 100 | stat = get_stat(api, stat) 101 | if isinstance(stat, dict): 102 | print('') 103 | else: 104 | print(stat) 105 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/postgresql_monitoring.conf: -------------------------------------------------------------------------------- 1 | UserParameter=pgsql.bgwriter[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.bgwriter.sql" 2 | 3 | UserParameter=pgsql.connections.sum[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.connections.sum.sql" 4 | UserParameter=pgsql.connections[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.connections.sql" 5 | UserParameter=pgsql.connections.prepared[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.connections.prepared.sql" 6 | 7 | UserParameter=pgsql.dbstat.sum[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.dbstat.sum.sql" 8 | UserParameter=pgsql.dbstat[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.dbstat.sql" 9 | 10 | UserParameter=pgsql.transactions[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.transactions.sql" 11 | UserParameter=pgsql.config.hash[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.config.hash.sql" 12 | UserParameter=pgsql.wal.stat[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.wal.stat.sql" 13 | UserParameter=pgsql.locks[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.locks.sql" 14 | UserParameter=pgsql.queries[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -v tmax=$5 -f "/etc/zabbix/scripts/postgresql/pgsql.query.time.sql" 15 | UserParameter=pgsql.uptime[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.uptime.sql" 16 | UserParameter=pgsql.cache.hit[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.cache.hit.sql" 17 | UserParameter=pgsql.scans[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.scans.sql" 18 | UserParameter=pgsql.frozenxid[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.frozenxid.sql" 19 | 20 | UserParameter=pgsql.discovery.db[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.discovery.db.sql" 21 | UserParameter=pgsql.db.size[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -c "SELECT pg_database_size('$5')" 22 | UserParameter=pgsql.ping[*], pg_isready -h "$1" -p "$2" -U "$3" -d "$4" 23 | UserParameter=pgsql.ping.time[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.ping.time.sql" 24 | UserParameter=pgsql.version[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -c "SELECT version();" 25 | 26 | UserParameter=pgsql.replication.count[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -c "SELECT count(*) FROM pg_stat_replication" 27 | UserParameter=pgsql.replication.recovery_role[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.replication.recovery_role.sql" 28 | UserParameter=pgsql.replication.lag.sec[*], psql -qtAX -h "$1" -p 
"$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.replication.lag.sql" 29 | UserParameter=pgsql.replication.status[*], psql -qtAX -h "$1" -p "$2" -U "$3" -d "$4" -f "/etc/zabbix/scripts/postgresql/pgsql.replication.status.sql" 30 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/postgresql/pgsql.query.time.sql: -------------------------------------------------------------------------------- 1 | WITH T AS 2 | (SELECT db.datname, 3 | coalesce(T.query_time_max, 0) query_time_max, 4 | coalesce(T.tx_time_max, 0) tx_time_max, 5 | coalesce(T.mro_time_max, 0) mro_time_max, 6 | coalesce(T.query_time_sum, 0) query_time_sum, 7 | coalesce(T.tx_time_sum, 0) tx_time_sum, 8 | coalesce(T.mro_time_sum, 0) mro_time_sum, 9 | coalesce(T.query_slow_count, 0) query_slow_count, 10 | coalesce(T.tx_slow_count, 0) tx_slow_count, 11 | coalesce(T.mro_slow_count, 0) mro_slow_count 12 | FROM pg_database db NATURAL 13 | LEFT JOIN ( 14 | SELECT datname, 15 | extract(epoch FROM now())::integer ts, 16 | coalesce(max(extract('epoch' FROM (clock_timestamp() - query_start))::integer * (state NOT IN ('idle', 'idle in transaction', 'idle in transaction (aborted)') AND query !~* E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) query_time_max, 17 | coalesce(max(extract('epoch' FROM (clock_timestamp() - query_start))::integer * (state NOT IN ('idle') AND query !~* E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) tx_time_max, 18 | coalesce(max(extract('epoch' FROM (clock_timestamp() - query_start))::integer * (state NOT IN ('idle') AND query ~* E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) mro_time_max, 19 | coalesce(sum(extract('epoch' FROM (clock_timestamp() - query_start))::integer * (state NOT IN ('idle', 'idle in transaction', 'idle in transaction (aborted)') AND query !~* E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) query_time_sum, 20 | coalesce(sum(extract('epoch' FROM (clock_timestamp() - query_start))::integer * (state NOT IN ('idle') AND query !~* E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) tx_time_sum, 21 | coalesce(sum(extract('epoch' FROM (clock_timestamp() - query_start))::integer * (state NOT IN ('idle') AND query ~* E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) mro_time_sum, 22 | 23 | coalesce(sum((extract('epoch' FROM (clock_timestamp() - query_start)) > :tmax)::integer * (state NOT IN ('idle', 'idle in transaction', 'idle in transaction (aborted)') AND query !~* E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) query_slow_count, 24 | coalesce(sum((extract('epoch' FROM (clock_timestamp() - query_start)) > :tmax)::integer * (state NOT IN ('idle') AND query !~* E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) tx_slow_count, 25 | coalesce(sum((extract('epoch' FROM (clock_timestamp() - query_start)) > :tmax)::integer * (state NOT IN ('idle') AND query ~* 
E'^(\\s*(--[^\\n]*\\n|/\\*.*\\*/|\\n))*(autovacuum|VACUUM|ANALYZE|REINDEX|CLUSTER|CREATE|ALTER|TRUNCATE|DROP)')::integer), 0) mro_slow_count
   FROM pg_stat_activity
   WHERE pid <> pg_backend_pid()
   GROUP BY 1) T
   WHERE NOT db.datistemplate )
SELECT json_object_agg(datname, row_to_json(T))
FROM T
--------------------------------------------------------------------------------
/etc/zabbix/scripts/db2stat.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl -wT
# Version: 1.2
# Usage: db2stat <timeout> <db2instance> <dbpath> <dbname> [<key> <value> ...] <stat>
#
# timeout - Timeout of snapshot in seconds (new snapshot will be taken if
#           previous is older than timeout)
# db2instance - Db2 instance
# dbpath - Path of db2 database
# dbname - Name of db2 database
# key/value - Key value pairs to match (e.g. "Node number" "0")
# stat - Actual stat to search for (e.g. "Database status")
#
# Generates snapshot from db2 into file in tmpdir directory if one does not
# already exist. The file is updated if it is older than specified timeout by
# taking new snapshot.
#
# After that searches for named stat line from generated file and returns its
# value. If key value pairs are specified, those must be found preceding the
# stat line in specified order and not have empty line in between.
#
# Note that key, value and stat are case sensitive and the way db2 uses
# capitalization is quite inconsistent.
#
# Examples:
#
# Retrieve simple stat "Database status" that is at most 10 seconds old:
# db2stat 10 db2instance "/usr/bin" mydb "Database status"
#
# Retrieve current size of package cache heap on node 0 at most 60 seconds old:
# db2stat 60 db2instance "/usr/bin" mydb "Node number" "0" "Memory Pool Type" "Package Cache Heap"
# "Current size (bytes)"

use File::Spec;

# Directory where snapshots are cached.
my $SNAPSHOT_DIR = File::Spec->tmpdir();

# Get database path, name and timeout args
my $timeout = shift @ARGV;
my $dbinstance = shift @ARGV;
my $dbpath = shift @ARGV;
my $dbname = shift @ARGV;

# Untaint
if ($timeout =~ /^(\d+)$/) {
    $timeout = $1;
} else {
    die "Bad timeout value";
}

if ($dbinstance =~ /^([-\/\w.]+)$/) {
    $dbinstance = $1;
} else {
    die "Bad dbinstance argument";
}

if ($dbpath =~ /^([-\/\w.]+)$/) {
    $dbpath = $1;
} else {
    die "Bad dbpath argument";
}

if ($dbname =~ /^([-\w.]+)$/) {
    $dbname = $1;
} else {
    die "Bad dbname argument";
}

# Set environment variable for db2 instance
$ENV{'DB2INSTANCE'} = "$dbinstance";

# Set path of db2 executable
$ENV{'PATH'} = "$dbpath";

# Generate stat file name
my $statfile = "$SNAPSHOT_DIR/$dbname.txt";
my $tmpstatfile = "$SNAPSHOT_DIR/$dbname.txt.tmp";

# Regenerate stats if file too old
if (!-f $statfile or (time - (stat($statfile))[10]) > $timeout) {
    # first touch the file to prevent another run from regenerating it in parallel
    system("/usr/bin/touch $statfile");
    # then generate tmp data
    system("db2 get snapshot for database on $dbname >$tmpstatfile");
    # finally swap files
    system("/usr/bin/cp -p $tmpstatfile $statfile");
}

# Generate regular expressions to match from args
while (@ARGV) {
    my $key = shift(@ARGV);
    my $value = shift(@ARGV);
    if (defined $value) {
        # Add preceding line match
        push(@RE, qr/\s*\Q$key\E\s*=\s*\Q$value\E\s*/);
    } else {
        # Add stat line match
        push(@RE, qr/\s*\Q$key\E\s*=\s*(.+)\s*/);
    }
}

# Try to find value with preceding line matches and stat line match
my $idx = 0;
open(DATA, "<$statfile");
while (<DATA>) {
    my $line = $_;
    if ($line =~ $RE[$idx]) {
        my $match = $1;

        # Return the match if all expressions have been matched
        $idx++;
        if ($idx == scalar @RE) {
            print "$match\n";
            exit 0;
        }
    }

    # Reset matches on an empty line (lines still carry their newline here)
    if ($line =~ /^\s*$/) {
        $idx = 0;
    }
}
--------------------------------------------------------------------------------
/documentation/docker.md:
--------------------------------------------------------------------------------
# Docker containers discovery and monitoring

Requirements:
- Python 2.7.13
- netcat (ubuntu: `sudo apt-get install netcat`)
- jq (ubuntu: `sudo apt-get install jq`)

In addition to the provided [template](../templates), the script is compatible with the www.monitoringartist.com docker monitoring templates that are included by default in [zabbix-xxl](https://github.com/monitoringartist/dockbix-xxl).

The zabbix user must have enough privileges to monitor docker:

* Either add the zabbix user to the docker group: `sudo usermod -aG docker zabbix`
* Or add a file under `/etc/sudoers.d` containing the line `zabbix ALL=(ALL:ALL) NOPASSWD: /bin/netcat`

## Usage

Item Syntax | Description | Units |
----------- | ----------- | ----- |
docker.containers.discovery | Discover all running Docker containers | Provides the following template variables: {#CONTAINERID}, {#CONTAINERNAME}, {#HCONTAINERID}, {#IMAGENAME}, {#IMAGETAG} |
docker.containers.count | Number of all running Docker containers | (number) |
docker.containers.discovery.all | Discover all Docker containers | Provides the following template variables: {#CONTAINERID}, {#CONTAINERNAME}, {#HCONTAINERID}, {#IMAGENAME}, {#IMAGETAG} |
docker.containers.count.all | Number of all Docker containers | (number) |
docker.containers[{#CONTAINERID}, netin] | Incoming network traffic (eth0) of the container | bytes per second (B/s) |
docker.containers[{#CONTAINERID}, netout] | Outgoing network traffic (eth0) of the container | bytes per second (B/s) |
docker.containers[{#CONTAINERID}, cpu] | Container CPU usage | % |
docker.containers[{#CONTAINERID}, disk] | Container disk usage | bytes |
docker.containers[{#CONTAINERID}, memory] | Container memory usage | bytes |
docker.containers[{#CONTAINERID}, uptime] | Container uptime | uptime (seconds) |
docker.containers[{#CONTAINERID}, up] | Is the container up and running? | 1 (yes), 0 (no) |
docker.containers[{#CONTAINERID}, status] | Container status | 0 (exited with error or no such container), 1 (running), 2 (not started or shut down) |
docker.containers[{#IMAGENAME}, image_netin] | Incoming network traffic (eth0) of the only container running the given image | bytes per second (B/s) |
docker.containers[{#IMAGENAME}, image_netout] | Outgoing network traffic (eth0) of the only container running the given image | bytes per second (B/s) |
docker.containers[{#IMAGENAME}, image_cpu] | CPU usage of the only container running the given image | % |
docker.containers[{#IMAGENAME}, image_disk] | Disk usage of the only container running the given image | bytes |
docker.containers[{#IMAGENAME}, image_memory] | Memory usage of the only container running the given image | bytes |
docker.containers[{#IMAGENAME}, image_uptime] | Uptime of the only container running the given image | uptime (seconds) |
docker.containers[{#IMAGENAME}, image_up] | Is the single container running the given image up and running? | 1 (yes), 0 (no) |
docker.containers[{#IMAGENAME}, image_containerids] | List of running container IDs with the image name | container IDs, one per line |
docker.containers[{#IMAGENAME}, image_containerids_all] | List of all container IDs with the image name | container IDs, one per line |

* Items returning container metrics or status by image name will produce an error if multiple containers running the image are up
* Items with an image name also allow specifying imagename + tag (i.e. {#IMAGENAME}:{#IMAGETAG}); see the example below
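For example, with a hypothetical container named `my-nginx` running the image `nginx:1.19` (names invented for illustration), concrete item keys would look like:
```
docker.containers[my-nginx, cpu]
docker.containers[nginx:1.19, image_cpu]
```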
### Trapper Based Execution

The folder /opt/cron includes a wrapper script that allows posting stats into trapper items instead.

It has some benefits over the standard approach:
- All container stats are sent in one bulk request to Zabbix
- It can be set up to run under a separate account from zabbix, to avoid granting docker permissions to the Zabbix agent

The main disadvantage is that it requires setting up separate cron jobs to execute discovery and stats gathering (also container count if necessary).

Additional requirements:
- zabbix_sender installed in the system and available in the user's path
- Hostname set in the Zabbix agent configuration file (/etc/zabbix/zabbix_agentd.conf)

The Zabbix template for the trapper version of monitoring is named docker_trapper.xml.
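For reference, the wrapper writes one metric line per container/stat pair into a temporary file and sends the whole file with `zabbix_sender` in a single batch. A hypothetical file for two containers (names and values invented for illustration) would look like this; the leading `-` tells `zabbix_sender` to use the Hostname from the agent configuration file, which is why that setting is required:
```
- docker.containers[web1,cpu] 3.2
- docker.containers[web1,memory] 104857600
- docker.containers[db1,status] 1
```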
*Example crontab setup:*
```
0 * * * * /opt/cron/docker_stats.sh discovery_all >>/var/log/docker_stats.log 2>&1
* * * * * /opt/cron/docker_stats.sh count >>/var/log/docker_stats.log 2>&1
* * * * * /opt/cron/docker_stats.sh stats >>/var/log/docker_stats.log 2>&1
```

## Example

![Screenshot](docker.png)
--------------------------------------------------------------------------------
/etc/zabbix/scripts/docker_swarm.py:
--------------------------------------------------------------------------------
#!/usr/bin/python2

"""
Docker Swarm service monitoring
Version: 1.0.1

Usage:
python3 docker_swarm.py discovery
python3 docker_swarm.py <mode> --service <service>

Discover Docker Swarm services (with service data as an array):
python3 docker_swarm.py discovery

Retrieve service hostname, status or uptime:
python3 docker_swarm.py hostname --service <service>
python3 docker_swarm.py status --service <service>
python3 docker_swarm.py uptime --service <service>
"""

# Python imports
from argparse import ArgumentParser
import datetime
import json

# 3rd party imports
import dateutil.parser
import docker

# Declare variables
modes = ["discovery", "hostname", "status", "uptime"]  # Available modes
services = {}  # Dictionary for Docker service(s) data

# Parse command-line arguments
parser = ArgumentParser(
    description="Discover or retrieve metrics from Docker Swarm services."
)
parser.add_argument("mode", choices=modes, help="Discovery or metric: " +
                    ", ".join(modes))
parser.add_argument("-s", "--service", type=str,
                    help="Service name to retrieve information from.")
args = parser.parse_args()

# Retrieve docker client instance using environment settings
client = docker.from_env()

# Parse system time from Docker
system_time = dateutil.parser.parse(client.info().get("SystemTime"))

# Limit results to specific service if service parameter is used
service_filters = {}
if args.service:
    service_filters["name"] = args.service

# Loop services and tasks and retrieve information
for service in client.services.list(filters=service_filters):

    # Reset task variables for each service
    created_date = None  # Task's creation date
    nodes = []  # A list of nodes where task is currently running
    task_created = None  # A datetime object for latest task's creation date
    task_status = "not running"  # Task status, default is "not running"
    uptime = datetime.timedelta()  # A datetime object for latest task's uptime

    # Loop tasks to collect data, but only from running tasks
    for task in service.tasks({"desired-state": "running"}):

        # Parse task creation date for comparison
        created_date = dateutil.parser.parse(task.get("CreatedAt"))

        # First time around, grab the first task
        if not task_created:
            task_created = created_date
            task_status = task.get("Status").get("State")
        # Compare previous task's date to current one
        elif task_created < created_date:
            task_created = created_date
            task_status = task.get("Status").get("State")

        # Grab node ID for later matching from nodes list
        nodes.append(task.get("NodeID"))

    # Count uptime
    if task_created:
        uptime = system_time - task_created

    # Append service data to dictionary
services[service.name] = { 88 | "hostname": "", 89 | "nodes": nodes, 90 | "status": task_status, 91 | "uptime": uptime.total_seconds() 92 | } 93 | 94 | # Loop services and nodes to retrieve additional information 95 | for node in client.nodes.list(): 96 | for name, service in services.items(): 97 | 98 | # Match node ID to service's node IDs 99 | if node.attrs.get("ID") in service.get("nodes"): 100 | 101 | # Add comma if hostnames already have items 102 | if services[name].get("hostname"): 103 | services[name]["hostname"] += ", " 104 | 105 | # Add node hostname to services dictionary 106 | services[name]["hostname"] += "{}".format( 107 | node.attrs.get("Description").get("Hostname") 108 | ) 109 | 110 | # Loop service data and create discovery 111 | if args.mode == "discovery": 112 | output = [] 113 | for name, service in services.items(): 114 | output.append({ 115 | "{#SERVICE}": name, 116 | "hostname": service.get("hostname"), 117 | "uptime": service.get("uptime"), 118 | "service": name, 119 | "status": service.get("status") 120 | }) 121 | 122 | # Dump discovery 123 | discovery = {"data": output} 124 | print(json.dumps(discovery)) 125 | 126 | # Retrieve service information using command-line arguments 127 | else: 128 | if not services.get(args.service): 129 | print("Invalid service name.") 130 | elif not services[args.service].get(args.mode): 131 | print("Invalid mode argument.") 132 | else: 133 | print(services[args.service].get(args.mode)) 134 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/check_certificate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | # Version: 1.0 3 | """ 4 | Script for retrieving certificate information items from PEM file. 5 | """ 6 | 7 | from __future__ import print_function 8 | from OpenSSL.crypto import load_certificate, FILETYPE_PEM 9 | from datetime import datetime 10 | import argparse 11 | import pem 12 | import sys 13 | import os 14 | 15 | 16 | def format_x509_name(x509_name): 17 | """Formats X509Name object into string representation.""" 18 | name = "" 19 | for c in x509_name.get_components(): 20 | name += '/' 21 | name += c[0].decode("utf-8") 22 | name += '=' 23 | name += c[1].decode("utf-8") 24 | return name 25 | 26 | 27 | def from_asn1_date(asn1date): 28 | """Converts ASN1 formatted datetime into datetime object. 29 | """ 30 | return datetime.strptime(asn1date.decode("utf-8"), '%Y%m%d%H%M%SZ') 31 | 32 | 33 | def get_certificate(file_name, index): 34 | """Retrieves certificate from PEM file. Returns None if certificate cannot 35 | be extracted and writes reason to stdout. 36 | """ 37 | if os.path.isfile(file_name) and os.access(file_name, os.R_OK): 38 | entries = pem.parse_file(file_name) 39 | if index >= len(entries): 40 | print('Requested entry at index', index, 'while file has only', 41 | len(entries), 'entries.') 42 | elif type(entries[index]) == pem.Certificate: 43 | return load_certificate(FILETYPE_PEM, entries[index].as_bytes()) 44 | else: 45 | print('Entry at index', index, 'is not a certificate.') 46 | else: 47 | print('Unable to read file [' + file_name + '].') 48 | 49 | 50 | def execute_command(command, cert): 51 | """Dispatches certificate to command function. 52 | """ 53 | module = sys.modules[__name__] 54 | return getattr(module, 'cmd_' + command, None)(cert) 55 | 56 | 57 | def cmd_status(cert): 58 | """Returns certificate status based on certificate validity period. 
Values: 59 | 0 - valid 60 | 1 - not yet valid 61 | 2 - expired 62 | """ 63 | not_before = from_asn1_date(cert.get_notBefore()) 64 | not_after = from_asn1_date(cert.get_notAfter()) 65 | at = datetime.now() 66 | if at < not_before: 67 | return 1 68 | elif at > not_after: 69 | return 2 70 | else: 71 | return 0 72 | 73 | 74 | def cmd_startdate(cert): 75 | """Returns not before date of certificate. 76 | """ 77 | return from_asn1_date(cert.get_notBefore()) 78 | 79 | 80 | def cmd_enddate(cert): 81 | """Returns not after date of certificate. 82 | """ 83 | return from_asn1_date(cert.get_notAfter()) 84 | 85 | 86 | def cmd_serial(cert): 87 | """Returns serial number of certificate. 88 | """ 89 | return cert.get_serial_number() 90 | 91 | 92 | def cmd_subject(cert): 93 | """Returns subject of certificate. 94 | """ 95 | return format_x509_name(cert.get_subject()) 96 | 97 | 98 | def cmd_subject_hash(cert): 99 | """Returns hash of certificate subject. 100 | """ 101 | return cert.get_subject().hash() 102 | 103 | 104 | def cmd_issuer(cert): 105 | """Returns issuer of certificate. 106 | """ 107 | return format_x509_name(cert.get_issuer()) 108 | 109 | 110 | def cmd_issuer_hash(cert): 111 | """Returns hash of certificate issuer. 112 | """ 113 | return cert.get_issuer().hash() 114 | 115 | 116 | def cmd_fingerprint(cert): 117 | """Returns SHA-1 fingerprint of certificate. 118 | """ 119 | return cert.digest('sha1') 120 | 121 | 122 | def cmd_lifetime(cert): 123 | """Returns remaining lifetime of certificate in seconds. 124 | """ 125 | delta = (from_asn1_date(cert.get_notAfter()) - datetime.now()) 126 | return delta.days * 86400 + delta.seconds 127 | 128 | 129 | def cmd_lifetime_days(cert): 130 | """Returns remaining lifetime of certificate in full days. 131 | """ 132 | delta = (from_asn1_date(cert.get_notAfter()) - datetime.now()) 133 | return delta.days 134 | 135 | 136 | if __name__ == '__main__': 137 | commands = ['status', 'startdate', 'enddate', 'lifetime', 'lifetime_days', 138 | 'serial', 'subject', 'issuer', 'subject_hash', 'issuer_hash', 139 | 'fingerprint'] 140 | # Define and parse arguments 141 | parser = argparse.ArgumentParser( 142 | description='Check certificate in PEM file') 143 | parser.add_argument('file', nargs='?', help='PEM file') 144 | parser.add_argument('index', nargs='?', type=int, 145 | help='Certificate\'s index in file') 146 | parser.add_argument('stat', nargs='?', default='status', 147 | choices=commands, help='Information to return') 148 | args = parser.parse_args() 149 | 150 | # Retrieve certificate and execute command function on it 151 | cert = get_certificate(args.file, args.index) 152 | if cert is not None: 153 | print(execute_command(args.stat, cert)) 154 | -------------------------------------------------------------------------------- /documentation/db2stat-testing.md: -------------------------------------------------------------------------------- 1 | # How to test db2stat.pl in a docker container 2 | 3 | IBM Db2 docker container URL 4 | https://hub.docker.com/r/ibmcom/db2 5 | 6 | Download and run docker container 7 | ``` 8 | docker run -itd --name mydb2 --privileged=true -p 50000:50000 -e LICENSE=accept -e DB2INST1_PASSWORD=root -e DBNAME=testdb -v /tmp/database:/database ibmcom/db2 9 | ``` 10 | 11 | 12 | Log on to the container 13 | ``` 14 | docker exec -ti mydb2 bash 15 | ``` 16 | 17 | 18 | Set correct rights for database configurations 19 | ``` 20 | groupadd sysmon 21 | usermod -a -G sysmon db2inst1 22 | ``` 23 | 24 | 25 | Install nano and perl 26 | ``` 27 | yum install -y 
nano perl
```


Switch to user db2inst1
```
su - db2inst1
```


Modify the database manager configuration
```
db2 update dbm cfg using sysmon_group sysmon
```


Start the Db2 command line application
```
db2
```


Connect to database
```
connect to testdb user db2inst1 using root
```


Create table and insert test data into it
```
CREATE TABLE test (sarake1 INT PRIMARY KEY NOT NULL);
INSERT INTO test VALUES (1);
INSERT INTO test VALUES (2);
INSERT INTO test VALUES (3);
```


Test database snapshot
```
get snapshot for database on testdb
```


Exit db2 command line
```
CTRL+D
```


Change directory to /tmp
```
cd /tmp
```


Create db2stat.pl using nano
- In the script, change the db2 application path to "/opt/ibm/db2/V11.5/bin/db2"
```
nano db2stat.pl
```


Test db2stat script
```
perl -T ./db2stat.pl 60 db2inst1 /opt/ibm/db2/V11.5/ testdb "Database status"
```


The script should return the output:
```
Active
```


## Alternative way using a docker with multiple Db2 instances

Angoca's Db2 docker container URL:
https://github.com/angoca/db2-docker/tree/master/db2-install/expc


Download and run docker container
```
sudo docker run -i -t --privileged=true --name="db2inst1" -p 50000:50000 angoca/db2-install
```


Change directory to /tmp/db2_conf
```
cd /tmp/db2_conf
```


Update apt lists
```
apt update
```


Download and install nano and perl
```
apt install -y nano perl
```


Create a new database instance
```
./createInstance db2inst1
```


Create a new user for the second instance
(User db2inst1 exists in the docker container already)
```
useradd -g db2grp1 -m db2inst2
```


Set db2inst2 user password
```
passwd db2inst2
```


Create new Db2 instance
```
./createInstance db2inst2
```

(If you receive an error about an invalid UID for user "db2inst2", change the
user UID value in the file "/tmp/db2_conf/db2expc_instance.rsp". The value is
set on the line "DB2_INST.UID". You can check the user's UID by running the
command "id db2inst2".)


Change directory to /tmp
```
cd /tmp
```


Create db2stat.pl using nano
- In the script, change the db2 application path to "/opt/ibm/db2/V11.5/bin/db2"
- Change the cp and touch command paths to "/bin/cp" and "/bin/touch".
170 | ``` 171 | nano db2stat.pl 172 | ``` 173 | 174 | 175 | Change group and permissions for db2stat.pl file 176 | ``` 177 | chgrp db2grp1 db2stat.pl 178 | chmod 770 db2stat.pl 179 | ``` 180 | 181 | 182 | Switch user 183 | ``` 184 | su - db2inst1 185 | ``` 186 | 187 | 188 | Start the database manager 189 | ``` 190 | db2start 191 | ``` 192 | 193 | 194 | Start the Db2 command line application 195 | ``` 196 | db2 197 | ``` 198 | 199 | 200 | Create a new database 201 | ``` 202 | create database testdb1 using codeset UTF-8 territory en 203 | ``` 204 | 205 | 206 | Connect to database 207 | ``` 208 | connect to testdb1 user db2inst1 using db2inst1 209 | ``` 210 | 211 | 212 | Create table and insert test data into it 213 | ``` 214 | CREATE TABLE test (sarake1 INT PRIMARY KEY NOT NULL) 215 | INSERT INTO test VALUES (1) 216 | INSERT INTO test VALUES (2) 217 | INSERT INTO test VALUES (3) 218 | ``` 219 | 220 | 221 | Exit db2 command line 222 | ``` 223 | CTRL+D 224 | ``` 225 | 226 | 227 | Test db2stat.pl script 228 | ``` 229 | perl -T ./db2stat.pl 60 db2inst1 /opt/ibm/db2/V11.5/ testdb1 "Database status" 230 | ``` 231 | 232 | 233 | The script should return the output: 234 | ``` 235 | Active 236 | ``` 237 | 238 | 239 | Exit db2inst1 user shell 240 | ``` 241 | exit 242 | ``` 243 | 244 | 245 | Switch user 246 | ``` 247 | su - db2inst2 248 | ``` 249 | 250 | 251 | Start the database manager 252 | ``` 253 | db2start 254 | ``` 255 | 256 | 257 | Start the Db2 command line application 258 | ``` 259 | db2 260 | ``` 261 | 262 | 263 | Create a new database 264 | ``` 265 | create database testdb2 using codeset UTF-8 territory en 266 | ``` 267 | 268 | 269 | Connect to database 270 | ``` 271 | connect to testdb2 user db2inst2 using db2inst2 272 | ``` 273 | 274 | 275 | Create table and insert test data into it 276 | ``` 277 | CREATE TABLE test (sarake1 INT PRIMARY KEY NOT NULL) 278 | INSERT INTO test VALUES (1) 279 | INSERT INTO test VALUES (2) 280 | INSERT INTO test VALUES (3) 281 | ``` 282 | 283 | 284 | Exit db2 command line 285 | ``` 286 | CTRL+D 287 | ``` 288 | 289 | 290 | Test db2stat.pl script 291 | ``` 292 | perl -T ./db2stat.pl 60 db2inst2 /opt/ibm/db2/V11.5/ testdb2 "Database status" 293 | ``` 294 | 295 | 296 | The script should return the output: 297 | ``` 298 | Active 299 | ``` 300 | 301 | 302 | Exit db2inst2 user shell 303 | ``` 304 | exit 305 | ``` 306 | -------------------------------------------------------------------------------- /etc/zabbix/zabbix_agentd.d/galera.conf: -------------------------------------------------------------------------------- 1 | # Copied and adapted from https://github.com/MogiePete/zabbix-galera-template/blob/master/userparameter_galera.conf 2 | 3 | #Total number of cluster membership changes happened. 4 | UserParameter=galera.cluster_conf_id,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_cluster_conf_id';" | HOME=/var/lib/zabbix mysql -N 5 | 6 | #Current number of members in the cluster. 7 | UserParameter=galera.cluster_size,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_cluster_size';" | HOME=/var/lib/zabbix mysql -N 8 | 9 | #Status of this cluster component. That is, whether the node is part of a PRIMARY or NON_PRIMARY component. 
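#A healthy node normally reports "Primary"; "non-Primary" or "Disconnected" indicates a partitioned node.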
10 | UserParameter=galera.cluster_status,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_cluster_status';" | HOME=/var/lib/zabbix mysql -N 11 | 12 | #If the value is OFF, the node has not yet connected to any of the cluster components. 13 | UserParameter=galera.wsrep_connected,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_connected';" | HOME=/var/lib/zabbix mysql -N 14 | 15 | #Shows the internal state of the EVS Protocol 16 | UserParameter=galera.wsrep_evs_state,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_evs_state';" | HOME=/var/lib/zabbix mysql -N 17 | 18 | #How much the slave lag is slowing down the cluster. 19 | UserParameter=galera.wsrep_flow_control_paused,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_flow_control_paused';" | HOME=/var/lib/zabbix mysql -N 20 | 21 | #The total time spent in a paused state measured in nanoseconds. 22 | UserParameter=galera.wsrep_flow_control_paused_ns,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_flow_control_paused_ns';" | HOME=/var/lib/zabbix mysql -N 23 | 24 | #Returns the number of FC_PAUSE events the node has received. Does not reset over time 25 | UserParameter=galera.wsrep_flow_control_recv,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_flow_control_recv';" | HOME=/var/lib/zabbix mysql -N 26 | 27 | #Returns the number of FC_PAUSE events the node has sent. Does not reset over time 28 | UserParameter=galera.wsrep_flow_control_sent,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_flow_control_sent';" | HOME=/var/lib/zabbix mysql -N 29 | 30 | #Displays the group communications UUID. 31 | UserParameter=galera.wsrep_gcom_uuid,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_gcomm_uuid';" | HOME=/var/lib/zabbix mysql -N 32 | 33 | #The sequence number, or seqno, of the last committed transaction. 34 | UserParameter=galera.wsrep_last_committed,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_last_committed';" | HOME=/var/lib/zabbix mysql -N 35 | 36 | #Internal Galera Cluster FSM state number. 37 | UserParameter=galera.wsrep_local_state,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_local_state';" | HOME=/var/lib/zabbix mysql -N 38 | 39 | #Total number of local transactions that were aborted by slave transactions while in execution. 40 | UserParameter=galera.wsrep_local_bf_aborts,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_local_bf_aborts';" | HOME=/var/lib/zabbix mysql -N 41 | 42 | #Current (instantaneous) length of the recv queue. 43 | UserParameter=galera.wsrep_local_recv_queue,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_local_recv_queue';" | HOME=/var/lib/zabbix mysql -N 44 | 45 | #Current (instantaneous) length of the send queue. 46 | UserParameter=galera.wsrep_local_send_queue,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_local_send_queue';" | HOME=/var/lib/zabbix mysql -N 47 | 48 | #Human-readable explanation of the state. 
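#Typical values include: Joining, Waiting on SST, Joined, Synced and Donor/Desynced.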
UserParameter=galera.wsrep_local_state_comment,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_local_state_comment';" | HOME=/var/lib/zabbix mysql -N

#The UUID of the state stored on this node.
UserParameter=galera.wsrep_local_state_uuid,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_local_state_uuid';" | HOME=/var/lib/zabbix mysql -N

#Whether the server is ready to accept queries.
UserParameter=galera.wsrep_ready,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_ready';" | HOME=/var/lib/zabbix mysql -N

#Total size of write-sets received from other nodes.
UserParameter=galera.wsrep_received_bytes,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_received_bytes';" | HOME=/var/lib/zabbix mysql -N

#Total size of write-sets replicated.
UserParameter=galera.replicated_bytes,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_replicated_bytes';" | HOME=/var/lib/zabbix mysql -N

#Total size of data replicated.
UserParameter=galera.wsrep_repl_data_bytes,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_repl_data_bytes';" | HOME=/var/lib/zabbix mysql -N

#Total number of keys replicated.
UserParameter=galera.wsrep_repl_keys,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_repl_keys';" | HOME=/var/lib/zabbix mysql -N

#Total size of keys replicated in bytes.
UserParameter=galera.wsrep_repl_keys_bytes,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_repl_keys_bytes';" | HOME=/var/lib/zabbix mysql -N

#Total size of other bits replicated.
UserParameter=galera.wsrep_repl_other_bytes,echo "select VARIABLE_VALUE from information_schema.GLOBAL_STATUS where VARIABLE_NAME = 'wsrep_repl_other_bytes';" | HOME=/var/lib/zabbix mysql -N
--------------------------------------------------------------------------------
/documentation/kubernetes_monitoring.md:
--------------------------------------------------------------------------------
# Kubernetes pods and nodes discovery and monitoring

Requirements:
- Python 3.6.8 or later
- VirtualEnv 15.1.0 or later
- 3rd party libraries for Python: kubernetes and py-zabbix


## Creating and activating VirtualEnv for Python:
```
mkdir /opt/virtualenv
cd /opt/virtualenv
virtualenv -p python3 kubernetes-monitoring
cd kubernetes-monitoring
source bin/activate
```


## Install Python dependencies using pip3:
```
pip3 install kubernetes py-zabbix
```

PSK key support additionally requires sslpsk; installing it requires development
packages for Python 3 and OpenSSL. The development packages can be removed after
installation.

(RedHat/CentOS):
```bash
sudo yum install python3-devel openssl-devel
pip3 install sslpsk
sudo yum remove python3-devel openssl-devel
```


## Configuring access for user zabbix

The zabbix user must have enough privileges to read Kubernetes configurations
and access the Kubernetes objects.
It is recommended that you create a context
that specifies the cluster, the user and the namespace that the monitoring
script will use when making calls to the API server. To achieve this, you need
to create a kubeconfig file. A kubeconfig file requires the URL of the API
server, a cluster CA certificate and credentials in the form of a key and a
certificate signed by the cluster CA.

This documentation provides steps to create certificates and to have them
accepted by an existing Kubernetes cluster. If you already have existing
certificates and configurations, you may skip the first part where certificates
are created and approved. There is a template for the configuration file in case
you already have the certificates but do not have a configuration file:
[There is an example file here](kubernetes_monitoring/config).

### Creating a certificate signing request (CSR) and retrieving certificates

First we run the OpenSSL command to generate a new private key and CSR. You may
change the subject fields to suit your needs. At least the "/CN=zabbix" field
should be checked, since the role based access control (RBAC) sub-system will
determine the username from that field:
```
openssl req -new -newkey rsa:4096 -nodes -keyout zabbix.key -out zabbix.csr -subj "/C=FI/ST=Pirkanmaa/L=Tampere/O=Digia Oyj/OU=Digia Iiris/CN=zabbix"
```

Then we can take the CSR file and encode it using the base64 command:
```
cat zabbix.csr | base64 | tr -d '\n'
```

Then we paste the base64-encoded CSR into the certificate signing request YAML file.
[There is an example file here](kubernetes_monitoring/csr.yml).

Then we send the request to the API server:
```
kubectl create -f csr.yml
```

Then we check the condition of the request using the following command:
```
kubectl get csr
```

We should receive output that looks somewhat like this:
```
NAME     AGE   SIGNERNAME                            REQUESTOR       CONDITION
zabbix   10s   kubernetes.io/kube-apiserver-client   minikube-user   Pending
```

The next thing we need to do is approve the request:
```
kubectl certificate approve zabbix
```

When we check the status of the request again, the request should be approved:
```
kubectl get csr

NAME     AGE   SIGNERNAME                            REQUESTOR       CONDITION
zabbix   30s   kubernetes.io/kube-apiserver-client   minikube-user   Approved,Issued
```

Now that our request is approved, we can retrieve the certificate. We pipe the
output to the base64 command for decoding and finally save it to a file:
```
kubectl get csr zabbix -o jsonpath='{.status.certificate}' | base64 --decode > zabbix.crt
```

The next thing we need is the cluster CA certificate.
We pipe it to the base64
command for decoding and save it into a file as with the previous command:
```
kubectl get secret -o jsonpath="{.items[?(@.type==\"kubernetes.io/service-account-token\")].data['ca\.crt']}" | base64 --decode >ca.crt
```


### Setting up the configuration using existing certificates

Retrieve the cluster name:
```
kubectl config view -o jsonpath='{.clusters[0].name}'
```

Retrieve the cluster server address:
```
kubectl config view -o jsonpath='{.clusters[0].cluster.server}'
```

Pull details from the existing Kubernetes configuration (the angle-bracket placeholders stand for your own values from the previous steps):
```
kubectl config set-cluster <cluster name> --server=<server address> --certificate-authority=<path to ca.crt> --kubeconfig=<path to config file> --embed-certs
```

Set up the user:
```
kubectl config set-credentials zabbix --client-certificate=<path to zabbix.crt> --client-key=<path to zabbix.key> --kubeconfig=<path to config file> --embed-certs
```

Create a context:
```
kubectl config set-context zabbix --cluster=<cluster name> --namespace=default --user=zabbix --kubeconfig=<path to config file>
```

Specify the context for user zabbix:
```
kubectl config use-context zabbix --kubeconfig=<path to config file>
```

Test the configuration:
```
kubectl version --kubeconfig=<path to config file>
```

You should now see a version listing from client and server, similar to the following:
```
Client Version: version.Info{Major:"1", Minor:"17", GitVersion:"v1.17.4", GitCommit:"", GitTreeState:"clean", BuildDate:"2020-03-12T21:03:42Z", GoVersion:"go1.13.8", Compiler:"gc", Platform:"linux/amd64"}
Server Version: version.Info{Major:"1", Minor:"18", GitVersion:"v1.18.0", GitCommit:"", GitTreeState:"clean", BuildDate:"2020-03-25T14:50:46Z", GoVersion:"go1.13.8", Compiler:"gc", Platform:"linux/amd64"}
```


### Authorize user zabbix to list pods, nodes and services from the Kubernetes cluster

[There is an example file here](kubernetes_monitoring/access.yml).

```
kubectl create -f kubernetes_monitoring/access.yml
```


## Usage

Item Syntax | Description | Units |
----------- | ----------- | ----- |
kubernetes.discover.pods | Discover all Kubernetes pods | Provides the following template variables: {#POD}. Also provides pod information in an array: ip, namespace, pod, restart_count, uptime. |
kubernetes.discover.pods.default | Discover all Kubernetes pods using default field selectors | Provides the following template variables: {#POD}. Also provides pod information in an array: ip, namespace, pod, restart_count, uptime. |
kubernetes.discover.nodes | Discover all Kubernetes nodes | Provides the following template variables: {#NODE}. Also provides node information in an array: node, machine_id, status, system_uuid. |
kubernetes.discover.services | Discover all Kubernetes services | Provides the following template variables: {#SERVICE}. Also provides service information in an array: namespace, service, uid. |

## Retrieving data from discovery using JSONPath

In this example, data can be retrieved from the discovery arrays using JSONPath (the angle-bracket placeholders stand for a concrete pod, node or service name):
```
$.data[?(@.pod == "<pod name>")].pod
$.data[?(@.pod == "<pod name>")].restart_count

$.data[?(@.node == "<node name>")].node
$.data[?(@.node == "<node name>")].status

$.data[?(@.service == "<service name>")].service
$.data[?(@.service == "<service name>")].uid
```
--------------------------------------------------------------------------------
/custom/scripts/zapache:
--------------------------------------------------------------------------------
#! /bin/bash
#
# Name: zapache
#
# Checks Apache activity.
#
# Author: Alejandro Michavila
# Modified for Scoreboard Values: Murat Koc, murat@profelis.com.tr
# Modified for using also as external script: Murat Koc, murat@profelis.com.tr
# Modified for outputting usage or ZBX_NOTSUPPORTED: Alejandro Michavila
# Modified to do caching for performance, dmitry.frolov@gmail.com
#
# Version: 1.5
#

zapachever="1.5"
rval=0
value=""
cache_seconds="60"
HEADER_PARAM=""
curl="`which curl`"
wget="`which wget`"
[ "$TMPDIR" ] || TMPDIR=/tmp

function usage()
{
	echo "zapache version: $zapachever"
	echo "usage:"
	echo "	$0 [<url>] [<header>] TotalAccesses - Check total accesses."
	echo "	$0 [<url>] [<header>] TotalKBytes - Check total KBytes."
	echo "	$0 [<url>] [<header>] CPULoad - Check CPU load."
	echo "	$0 [<url>] [<header>] Uptime - Check uptime."
	echo "	$0 [<url>] [<header>] ReqPerSec - Check requests per second."
	echo "	$0 [<url>] [<header>] BytesPerSec - Check Bytes per second."
	echo "	$0 [<url>] [<header>] BytesPerReq - Check Bytes per request."
	echo "	$0 [<url>] [<header>] BusyWorkers - Check busy workers."
	echo "	$0 [<url>] [<header>] IdleWorkers - Check idle workers."
	echo "	$0 [<url>] [<header>] version - Version of this script."
	echo "	$0 [<url>] [<header>] ping - Check if Apache is up."
	echo "	$0 [<url>] [<header>] WaitingForConnection - Check Waiting for Connection processes."
	echo "	$0 [<url>] [<header>] StartingUp - Check Starting Up processes."
	echo "	$0 [<url>] [<header>] ReadingRequest - Check Reading Request processes."
	echo "	$0 [<url>] [<header>] SendingReply - Check Sending Reply processes."
	echo "	$0 [<url>] [<header>] KeepAlive - Check KeepAlive processes."
	echo "	$0 [<url>] [<header>] DNSLookup - Check DNSLookup processes."
	echo "	$0 [<url>] [<header>] ClosingConnection - Check Closing Connection processes."
	echo "	$0 [<url>] [<header>] Logging - Check Logging processes."
	echo "	$0 [<url>] [<header>] GracefullyFinishing - Check Gracefully Finishing processes."
	echo "	$0 [<url>] [<header>] IdleCleanupOfWorker - Check Idle Cleanup of Worker processes."
	echo "	$0 [<url>] [<header>] OpenSlotWithNoCurrentProcess - Check Open Slots with No Current Process."
}

########
# Main #
########

if [[ $# == 1 ]];then
	#Agent Mode
	STATUS_URL="http://localhost/server-status?auto"
	CASE_VALUE="$1"
elif [[ $# == 2 ]];then
	#External Script Mode
	STATUS_URL="$1"
	case "$STATUS_URL" in
		http://*|https://*) ;;
		*) STATUS_URL="http://$STATUS_URL/server-status?auto";;
	esac
	CASE_VALUE="$2"
elif [[ $# == 3 ]];then
	#External Script Mode with custom header
	STATUS_URL="$1"
	case "$STATUS_URL" in
		http://*|https://*) ;;
		*) STATUS_URL="http://$STATUS_URL/server-status?auto";;
	esac
	if [ "$curl" ]; then
		HEADER_PARAM="-H \"$2\""
	else
		HEADER_PARAM="--header \"$2\""
	fi
	CASE_VALUE="$3"
else
	#No Parameter
	usage
	exit 0
fi

case "$CASE_VALUE" in
	'version')
		echo "$zapachever"
		exit 0;;
esac

umask 077

# $UID is bash-specific
cache_prefix="zapache-$UID-${STATUS_URL//[^a-zA-Z0-9_-]/_}"
cache="$TMPDIR/$cache_prefix.cache"
cache_timestamp_check="$TMPDIR/$cache_prefix.ts"
# This assumes touch from coreutils
touch -d "@$((`date +%s` - ($cache_seconds - 1)))" "$cache_timestamp_check"

if [ "$cache" -ot "$cache_timestamp_check" ]; then
	if [ "$curl" ]; then
		fetch_url() { $curl --insecure --silent --location $HEADER_PARAM -H "Cache-Control: no-cache" "$@"; }
	else
		if [ "$wget" ]; then
			fetch_url() { $wget --no-check-certificate --quiet $HEADER_PARAM --header "Cache-Control: no-cache" -O - "$@"; }
		else
			echo "ZBX_NOTSUPPORTED"
			exit 1
		fi
	fi

	fetch_url "$STATUS_URL" > "$cache"
	rval=$?
	if [ $rval != 0 ]; then
		echo "ZBX_NOTSUPPORTED"
		exit 1
	fi
fi

case "$CASE_VALUE" in
	'ping')
		if [ ! -s "$cache" -o "$cache" -ot "$cache_timestamp_check" ]; then
			echo "0"
		else
			echo "1"
		fi
		exit 0;;
esac

[ -s "$cache" ]; then 134 | echo "ZBX_NOTSUPPORTED" 135 | exit 1 136 | fi 137 | 138 | case "$CASE_VALUE" in 139 | 'TotalAccesses') 140 | value="`awk '/^Total Accesses:/ {print $3}' < \"$cache\"`" 141 | rval=$?;; 142 | 'TotalKBytes') 143 | value="`awk '/^Total kBytes:/ {print $3}' < \"$cache\"`" 144 | rval=$?;; 145 | 'CPULoad') 146 | value="`awk '/^CPULoad:/ {print $2}' < \"$cache\"`" 147 | rval=$?;; 148 | 'Uptime') 149 | value="`awk '/^Uptime:/ {print $2}' < \"$cache\"`" 150 | rval=$?;; 151 | 'ReqPerSec') 152 | value="`awk '/^ReqPerSec:/ {print $2}' < \"$cache\"`" 153 | rval=$?;; 154 | 'BytesPerSec') 155 | value="`awk '/^BytesPerSec:/ {print $2}' < \"$cache\"`" 156 | rval=$?;; 157 | 'BytesPerReq') 158 | value="`awk '/^BytesPerReq:/ {print $2}' < \"$cache\"`" 159 | rval=$?;; 160 | 'BusyWorkers') 161 | value="`awk '/^BusyWorkers:/ {print $2}' < \"$cache\"`" 162 | rval=$?;; 163 | 'IdleWorkers') 164 | value="`awk '/^IdleWorkers:/ {print $2}' < \"$cache\"`" 165 | rval=$?;; 166 | 'WaitingForConnection') 167 | value="`awk '/^Scoreboard:/ {print split($2,notused,"_")-1}' < \"$cache\"`" 168 | rval=$?;; 169 | 'StartingUp') 170 | value="`awk '/^Scoreboard:/ {print split($2,notused,"S")-1}' < \"$cache\"`" 171 | rval=$?;; 172 | 'ReadingRequest') 173 | value="`awk '/^Scoreboard:/ {print split($2,notused,"R")-1}' < \"$cache\"`" 174 | rval=$?;; 175 | 'SendingReply') 176 | value="`awk '/^Scoreboard:/ {print split($2,notused,"W")-1}' < \"$cache\"`" 177 | rval=$?;; 178 | 'KeepAlive') 179 | value="`awk '/^Scoreboard:/ {print split($2,notused,"K")-1}' < \"$cache\"`" 180 | rval=$?;; 181 | 'DNSLookup') 182 | value="`awk '/^Scoreboard:/ {print split($2,notused,"D")-1}' < \"$cache\"`" 183 | rval=$?;; 184 | 'ClosingConnection') 185 | value="`awk '/^Scoreboard:/ {print split($2,notused,"C")-1}' < \"$cache\"`" 186 | rval=$?;; 187 | 'Logging') 188 | value="`awk '/^Scoreboard:/ {print split($2,notused,"L")-1}' < \"$cache\"`" 189 | rval=$?;; 190 | 'GracefullyFinishing') 191 | value="`awk '/^Scoreboard:/ {print split($2,notused,"G")-1}' < \"$cache\"`" 192 | rval=$?;; 193 | 'IdleCleanupOfWorker') 194 | value="`awk '/^Scoreboard:/ {print split($2,notused,"I")-1}' < \"$cache\"`" 195 | rval=$?;; 196 | 'OpenSlotWithNoCurrentProcess') 197 | value="`awk '/^Scoreboard:/ {print split($2,notused,".")-1}' < \"$cache\"`" 198 | rval=$?;; 199 | *) 200 | usage 201 | exit 1;; 202 | esac 203 | 204 | if [ "$rval" -eq 0 -a -z "$value" ]; then 205 | case "$CASE_VALUE" in 206 | # Theese metrics are output only if non-zero 207 | 'CPULoad' | 'ReqPerSec' | 'BytesPerSec' | 'BytesPerReq') 208 | value=0 209 | ;; 210 | *) 211 | rval=1 212 | ;; 213 | esac 214 | fi 215 | 216 | if [ "$rval" -ne 0 ]; then 217 | echo "ZBX_NOTSUPPORTED" 218 | fi 219 | 220 | echo "$value" 221 | exit $rval 222 | 223 | # 224 | # end zapache 225 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/zabbix_sender_psk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Provides functionally extended version of py-zabbix ZabbixSender and pure python utility for 4 | sending trapper data. 
5 | """ 6 | from argparse import ArgumentParser 7 | from configparser import RawConfigParser 8 | from datetime import datetime 9 | from io import StringIO 10 | from typing import Callable, List, Optional, Tuple 11 | import functools 12 | import sys 13 | 14 | from pyzabbix import ZabbixSender, ZabbixMetric, ZabbixResponse 15 | 16 | # NOTE: Python 3 and OpenSSL development files required to install sslpsk 17 | # Packages needed only during installation and can be removed afterwards 18 | # --------------------------------------------------------------------- 19 | # RedHat/CentOS: sudo yum install python3-devel openssl-devel ; pip install sslpsk 20 | 21 | # Socket wrapper implementation adapted from GitHub issue: 22 | # https://github.com/adubkov/py-zabbix/issues/114 23 | class PyZabbixPSKSocketWrapper: 24 | """Implements ssl.wrap_socket with PSK instead of certificates. 25 | 26 | Proxies calls to a `socket` instance. 27 | """ 28 | 29 | def __init__(self, sock, *, identity, psk): 30 | self.__sock = sock 31 | self.__identity = identity 32 | self.__psk = psk 33 | 34 | def connect(self, *args, **kwargs): 35 | """ 36 | Opens socket connection. 37 | """ 38 | # PSK is optional to use so SSL dependencies are only imported when actually needed 39 | # Otherwise script would require bunch of system packages to be installed unnecessarily 40 | import ssl # pylint: disable=import-outside-toplevel 41 | import sslpsk # pylint: disable=import-outside-toplevel 42 | 43 | # `sslpsk.wrap_socket` must be called *after* socket.connect, 44 | # while the `ssl.wrap_socket` must be called *before* socket.connect. 45 | self.__sock.connect(*args, **kwargs) 46 | 47 | # `sslv3 alert bad record mac` exception means incorrect PSK 48 | self.__sock = sslpsk.wrap_socket( 49 | self.__sock, 50 | # https://github.com/zabbix/zabbix/blob/f0a1ad397e5653238638cd1a65a25ff78c6809bb/src/libs/zbxcrypto/tls.c#L3231 51 | ssl_version=ssl.PROTOCOL_TLSv1_2, 52 | # https://github.com/zabbix/zabbix/blob/f0a1ad397e5653238638cd1a65a25ff78c6809bb/src/libs/zbxcrypto/tls.c#L3179 53 | ciphers="PSK-AES128-CBC-SHA", 54 | psk=(self.__psk, self.__identity), 55 | ) 56 | 57 | def __getattr__(self, name): 58 | return getattr(self.__sock, name) 59 | 60 | 61 | class ZabbixSenderPSK(ZabbixSender): 62 | """ 63 | Extends py-zabbix library's ZabbixSender by implementing PSK support and sending semantics 64 | of command line sender (command line version =>4.2). 65 | 66 | User can also specify error_listener function which is called in case send call fail. If 67 | listener raises another error, the send is terminated. 68 | 69 | This version always uses Zabbix agent configuration file. 
70 | """ 71 | 72 | def __init__(self, 73 | config_file: str = None, 74 | error_listener: Callable[[OSError], None] = None): 75 | if config_file is None: 76 | config_file = '/etc/zabbix/zabbix_agentd.conf' 77 | self.config_file = config_file 78 | self.error_listener = error_listener 79 | self._config = None 80 | 81 | psk_info = self._get_psk_info() 82 | if psk_info: 83 | wrapper = functools.partial( 84 | PyZabbixPSKSocketWrapper, 85 | identity=psk_info[0], 86 | psk=psk_info[1]) 87 | ZabbixSender.__init__(self, use_config=config_file, socket_wrapper=wrapper) 88 | else: 89 | ZabbixSender.__init__(self, use_config=config_file) 90 | 91 | def _load_agent_config(self): 92 | if self._config is None: 93 | with open(self.config_file, 'r') as file_handle: 94 | config_file_data = '[root]\n' + file_handle.read() 95 | 96 | config_file_fp = StringIO(config_file_data) 97 | config = RawConfigParser(strict=False) 98 | config.read_file(config_file_fp) 99 | self._config = config 100 | return self._config 101 | 102 | def _get_psk_info(self) -> Optional[Tuple[str, bytearray]]: 103 | config = self._load_agent_config() 104 | 105 | tls_connect = config.get('root', 'TLSConnect', fallback=None) 106 | if tls_connect and tls_connect == 'psk': 107 | psk_identity = config.get('root', 'TLSPSKIdentity', fallback=None) 108 | if psk_identity is None: 109 | raise ValueError('Error in config file, TLSPSKIdentity missing') 110 | 111 | psk_file = config.get('root', 'TLSPSKFile', fallback=None) 112 | if psk_file is None: 113 | raise ValueError('Error in config file, TLSPSKFile missing.') 114 | 115 | with open(psk_file, 'r') as file_handle: 116 | psk_key = bytes.fromhex(file_handle.read().strip()) 117 | 118 | return (psk_identity, psk_key) 119 | 120 | return None 121 | 122 | def get_agent_config(self): 123 | """ 124 | Returns the agent configuration. 125 | """ 126 | return self._load_agent_config() 127 | 128 | def send(self, metrics: List[ZabbixMetric]) -> ZabbixResponse: 129 | zabbix_uris = self.zabbix_uri 130 | response = None 131 | for uri in zabbix_uris: 132 | try: 133 | self.zabbix_uri = [uri] 134 | response = ZabbixSender.send(self, metrics) 135 | except OSError as ex: 136 | if self.error_listener: 137 | self.error_listener(ex) 138 | self.zabbix_uri = zabbix_uris 139 | 140 | # Only last successful response is returned, this follows ZabbixSender semantics 141 | if response is None: 142 | raise OSError('Could not send values to any Zabbix server.') 143 | return response 144 | 145 | def _print_cfg_value(config: RawConfigParser, key: str): 146 | print(f"{key}: {config.get('root', key, fallback='-')}") 147 | 148 | 149 | 150 | def display_config(sender: ZabbixSenderPSK): 151 | """ 152 | Prints trapper related Zabbix agent configuration options to stdout. 153 | """ 154 | config = sender.get_agent_config() 155 | _print_cfg_value(config, 'ServerActive') 156 | _print_cfg_value(config, 'Hostname') 157 | _print_cfg_value(config, 'TLSConnect') 158 | _print_cfg_value(config, 'TLSPSKIdentity') 159 | _print_cfg_value(config, 'TLSPSKFile') 160 | 161 | 162 | def send_from_file(sender: ZabbixSenderPSK, input_file: str, with_timestamps: bool = False): 163 | """ 164 | Sends values from file to Zabbix server. 
165 | """ 166 | metrics = [] 167 | with sys.stdin if input_file == '-' else open(input_file, 'r') as file_handle: 168 | for line in file_handle: 169 | line = line.strip() # Remove newline 170 | if with_timestamps: 171 | parts = line.split(' ', 4) 172 | metrics.append(ZabbixMetric(parts[0], parts[1], parts[3], clock(parts[2]))) 173 | else: 174 | parts = line.split(' ', 3) 175 | metrics.append(ZabbixMetric(parts[0], parts[1], parts[2])) 176 | 177 | response = sender.send(metrics) 178 | print(response) 179 | 180 | 181 | def send_value(sender: ZabbixSenderPSK, host: str, key: str, value: str, clock_value: int): 182 | """ 183 | Sends single value to Zabbix server. 184 | """ 185 | if host is None: 186 | config = sender.get_agent_config() 187 | host = config.get('root', 'Hostname', fallback=None) 188 | if host is None: 189 | raise ValueError('Cannot resolve trapper hostname.') 190 | metric = ZabbixMetric(host, key, value, clock_value) 191 | response = sender.send([metric]) 192 | print(response) 193 | 194 | 195 | def run_sender(args): 196 | """ 197 | Executes the sender utility. 198 | """ 199 | sender = ZabbixSenderPSK(args.config) 200 | if args.display_config: 201 | display_config(sender) 202 | sys.exit(0) 203 | 204 | if args.input_file: 205 | send_from_file(sender, args.input_file, args.with_timestamps) 206 | else: 207 | if args.key and args.value: 208 | send_value(sender, args.host, args.key, args.value, args.clock) 209 | else: 210 | sys.exit('Invalid arguments: specify either key and value or input file.') 211 | 212 | 213 | def clock(value: str) -> int: 214 | """ 215 | Tries to parse clock value from string in multiple formats. 216 | 217 | Supported formats: 218 | - Bare clock seconds value from unix epoch 219 | - ISO datetime without timezone 220 | """ 221 | try: 222 | return int(value) 223 | except ValueError: 224 | return int(datetime.strptime(value, '%Y-%m-%dT%H:%M:%S').timestamp()) 225 | 226 | 227 | if __name__ == '__main__': 228 | parser = ArgumentParser() 229 | parser.add_argument('-c', '--config', default=None, 230 | help='Path to Zabbix agentd configuration file') 231 | parser.add_argument('-s', '--host', default=None, 232 | help='Specify host name the item belongs to') 233 | parser.add_argument('-k', '--key', 234 | help='Specify item key') 235 | parser.add_argument('-o', '--value', 236 | help='Specify item value') 237 | parser.add_argument('-t', '--clock', type=clock, default=None, 238 | help='Specify item clock') 239 | parser.add_argument('-i', '--input-file', 240 | help='Load values from input file. 
Specify - for standard input.')
241 | parser.add_argument('-T', '--with-timestamps', action='store_true',
242 | help='Each line of file contains whitespace delimited:\n' \
243 | ' <host> <key> <clock> <value>')
244 | parser.add_argument('-d', '--display-config', action='store_true',
245 | help='Print trapper-related Zabbix agent configuration')
246 | cmd_args = parser.parse_args()
247 |
248 | run_sender(cmd_args)
249 |
-------------------------------------------------------------------------------- /templates/process.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 3.2 4 | 2017-09-28T11:28:48Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 221 | 222 | 223 | -------------------------------------------------------------------------------- /templates/docker_trapper.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5.0 4 | 2020-11-19T08:26:56Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 205 | 206 | 207 | 208 | Service state 209 | 210 | 211 | 0 212 | Down 213 | 214 | 215 | 1 216 | Up 217 | 218 | 219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /templates/process_active.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 3.2 4 | 2017-09-28T11:28:55Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 224 | 225 | 226 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/docker.sh: --------------------------------------------------------------------------------

1 | #!/bin/bash
2 | # Version: 1.0
3 | set -e
4 |
5 | # Find netcat command or die
6 | NC_CMD=$(command -v netcat || command -v nc || exit 1)
7 | # Path of docker socket
8 | DOCKER_SOCKET=/var/run/docker.sock
9 | # Statistics directory (parent directory must exist and be writable for user running script)
10 | STATS_DIR=/tmp/zabbix-docker-stats
11 |
12 | # Check if docker socket is writable with the current user
13 | if [ -w "$DOCKER_SOCKET" ]; then
14 | NC="$NC_CMD"
15 | else
16 | # Current user does not belong to docker group, use sudo (requires that sudo rights are given correctly in the system)
17 | NC="sudo $NC_CMD"
18 | fi
19 |
20 | # Create statistics directory if it does not exist
21 | if [ ! -e "$STATS_DIR" ]; then
22 | mkdir -p $STATS_DIR
23 | fi
24 |
25 | # Executes GET command to docker socket
26 | # Parameters: 1 - docker command
27 | docker_get() {
28 | RESPONSE=$(printf "GET $1 HTTP/1.0\r\n\r\n" | $NC -U $DOCKER_SOCKET | tail -n 1)
29 | }
30 |
31 | # Executes command in docker container
32 | # Parameters: 1 - Container name
33 | # 2 - Command and arguments in quoted comma separated list (e.g.
"ls", "-l") 34 | docker_exec() { 35 | # Create command execution 36 | local BODY="{\"AttachStdout\": true, \"Cmd\": [$2]}" 37 | local CREATE_RESPONSE 38 | CREATE_RESPONSE=$(printf "POST /containers/$1/exec HTTP/1.0\r\nContent-Type: application/json\r\nContent-Length: ${#BODY}\r\n\r\n${BODY}" | $NC -U $DOCKER_SOCKET | tail -n 1) 39 | local RUN_ID=$(echo $CREATE_RESPONSE | jq ".Id // empty " | sed -e 's/"//g') 40 | 41 | # Start execution 42 | if [ "$RUN_ID" != "" ]; then 43 | # tr at end is used to suppress warning on bash >=4.4 44 | RESPONSE=$(printf "POST /exec/$RUN_ID/start HTTP/1.0\r\nContent-Type: application/json\r\nContent-Length: 2\r\n\r\n{}" | $NC -U $DOCKER_SOCKET | tr -d '\0') 45 | else 46 | RESPONSE="" 47 | fi 48 | } 49 | 50 | # Obtains last line from execution of cat file on docker container 51 | # Parameters:: 1 - Container name 52 | # 2 - File in container 53 | cat_single_value() { 54 | local CMD="\"cat\", \"$2\"" 55 | docker_exec $1 "$CMD" 56 | local VALUE=$(echo "$RESPONSE" | tail -n 1 | tr -cd "[:print:]") 57 | echo $VALUE 58 | } 59 | 60 | # Updates timestamp of statistic and returns the time elapsed since last update 61 | # in nanoseconds 62 | # Parameters: 1 - Container name 63 | # 2 - Statistic name 64 | update_stat_time() { 65 | local UTIME_FILE="$STATS_DIR/$1/$2.utime" 66 | local NEW_VALUE=$(date +%s%N) 67 | 68 | if [ ! -e "$UTIME_FILE" ]; then 69 | printf "0" >$UTIME_FILE 70 | fi 71 | local OLD_VALUE=$(cat $UTIME_FILE) 72 | 73 | printf "$NEW_VALUE" >$UTIME_FILE 74 | TIMEDIFF=$((NEW_VALUE-OLD_VALUE)) 75 | printf $TIMEDIFF 76 | } 77 | 78 | # Updates statistic value and prints the old value 79 | # Parameters: 1 - Container name 80 | # 2 - Statistic name 81 | # 3 - New monitored value 82 | update_stat() { 83 | local STAT_FILE="$STATS_DIR/$1/$2" 84 | local NEW_VALUE=$3 85 | if [ ! -e "$STATS_DIR/$1" ]; then 86 | mkdir -p "$STATS_DIR/$1" 87 | fi 88 | 89 | if [ ! 
-e "$STAT_FILE" ]; then 90 | printf "0" >$STAT_FILE 91 | fi 92 | 93 | cat $STAT_FILE 94 | printf "$NEW_VALUE" >$STAT_FILE 95 | } 96 | 97 | # Statistic: Number of running docker containers 98 | # Parameters: 1 - all or running; defaults to running 99 | count() { 100 | if [ "$1" = "all" ]; then 101 | docker_get "/containers/json?all=true" 102 | else 103 | docker_get "/containers/json" 104 | fi 105 | echo $RESPONSE | jq "length" 106 | } 107 | 108 | count_all() { 109 | count all 110 | } 111 | 112 | # Docker container discovery 113 | # Parameters: 1 - all or running; defaults to running 114 | discovery() { 115 | if [ "$1" = "all" ]; then 116 | docker_get "/containers/json?all=true" 117 | else 118 | docker_get "/containers/json" 119 | fi 120 | LEN=$(echo $RESPONSE | jq "length") 121 | for I in $(seq 0 $((LEN-1))) 122 | do 123 | NAME=$(echo "$RESPONSE"|jq --raw-output ".[$I].Names[0]"|sed -e 's/^\///') 124 | ID=$(echo "$RESPONSE"|jq --raw-output ".[$I].Id") 125 | IMAGENAME=$(echo "$RESPONSE"|jq --raw-output ".[$I].Image"|sed -e 's/:.*//') 126 | IMAGETAG=$(echo "$RESPONSE"|jq --raw-output ".[$I].Image"|sed -e 's/.*://') 127 | 128 | DATA="$DATA,"'{"{#CONTAINERNAME}":"'$NAME'","{#CONTAINERID}":"'$ID'","{#IMAGENAME}":"'$IMAGENAME'","{#IMAGETAG}":"'$IMAGETAG'"' 129 | 130 | # Compatibility with www.monitoringartist.com Docker template 131 | DATA="$DATA,"'"{#HCONTAINERID}":"'$ID'"}' 132 | 133 | done 134 | echo '{"data":['${DATA#,}']}' 135 | } 136 | 137 | discovery_all() { 138 | discovery all 139 | } 140 | 141 | # Statistic: Container status 142 | status() { 143 | docker_get "/containers/$1/json" 144 | STATUS=$(echo $RESPONSE | jq ".State.Status" 2>/dev/null | sed -e 's/\"//g') 145 | 146 | if [ "$STATUS" = "running" ]; then 147 | # Running 148 | echo "1" 149 | elif [ "$STATUS" = "created" ] || [ "$STATUS" = "paused" ] || [ "$STATUS" = "restarting" ]; then 150 | # Not started (purposefully) 151 | echo "2" 152 | elif [ "$STATUS" = "exited" ] && [ "$(echo $RESPONSE | jq '.State.ExitCode')" = "0" ]; then 153 | # Stopped purposefully with ok exit code => status is "not started" 154 | echo "2" 155 | elif [ "$STATUS" = "exited" ] && [ "$(echo $RESPONSE | jq '.State.ExitCode')" = "137" ]; then 156 | # Stopped purposefully with sigkill (exit code 137) => status is "not started" 157 | echo "2" 158 | else 159 | # Exited with error (accidentally) or no such container exists 160 | echo "0" 161 | fi 162 | } 163 | 164 | # Container up and runnig? 
1 (yes) or 0 (no)
165 | up() {
166 | docker_get "/containers/$1/json"
167 | STATUS=$(echo $RESPONSE | jq ".State.Status" 2>/dev/null | sed -e 's/\"//g')
168 |
169 | # Running
170 | if [ "$STATUS" = "running" ]; then
171 | echo "1"
172 | else
173 | echo "0"
174 | fi
175 | }
176 |
177 | # Statistic: Container uptime
178 | uptime() {
179 | docker_get "/containers/$1/json"
180 | # if running
181 | if [ "$(echo $RESPONSE | jq '.State.Running')" = "true" ]; then
182 | local STARTED=$(echo $RESPONSE | jq ".State.StartedAt" | sed -e 's/\"//g')
183 | local STARTED_S=$(date -d $STARTED +%s)
184 | local NOW_S=$(date +%s)
185 | UPTIME=$((NOW_S-STARTED_S))
186 | echo $UPTIME
187 | else
188 | # not running, uptime always zero (must output some number so that zabbix item won't get into error state)
189 | echo "0"
190 | fi
191 | }
192 |
193 | # Statistic: Container memory
194 | memory() {
195 | NEW_VALUE=$(cat_single_value $1 "/sys/fs/cgroup/memory/memory.usage_in_bytes")
196 | if [ "$NEW_VALUE" = "" ]; then
197 | echo "0"
198 | else
199 | echo $NEW_VALUE
200 | fi
201 | }
202 |
203 | # Statistic: Container disk usage
204 | disk() {
205 | docker_get "/containers/$1/json?size=1"
206 | echo $RESPONSE | jq ".SizeRootFs"
207 | }
208 |
209 | # Statistic: Container CPU usage
210 | cpu() {
211 | NEW_VALUE=$(cat_single_value $1 "/sys/fs/cgroup/cpuacct/cpuacct.usage")
212 | if [ "$NEW_VALUE" = "" ]; then
213 | echo "0.0000"
214 | else
215 | OLD_VALUE=$(update_stat $1 "cpuacct.usage" "$NEW_VALUE")
216 | TIMEDIFF=$(update_stat_time $1 "cpuacct.usage")
217 | perl -e "print sprintf(\"%.4f\", (($NEW_VALUE-$OLD_VALUE)<0?0:($NEW_VALUE-$OLD_VALUE)/$TIMEDIFF*100))" # cpu percent
218 | fi
219 | }
220 |
221 | # Statistic: Container network traffic in
222 | netin() {
223 | NEW_VALUE=$(cat_single_value $1 "/sys/devices/virtual/net/eth0/statistics/rx_bytes")
224 | if [ "$NEW_VALUE" = "" ]; then
225 | echo "0"
226 | else
227 | OLD_VALUE=$(update_stat $1 "rx_bytes" "$NEW_VALUE")
228 | TIMEDIFF=$(update_stat_time $1 "rx_bytes")
229 | perl -e "print int(($NEW_VALUE-$OLD_VALUE)<0?0:($NEW_VALUE-$OLD_VALUE)/$TIMEDIFF*1000000000)" # bytes per second; TIMEDIFF is in nanoseconds
230 | fi
231 | }
232 |
233 | # Statistic: Container network traffic out
234 | netout() {
235 | NEW_VALUE=$(cat_single_value $1 "/sys/devices/virtual/net/eth0/statistics/tx_bytes")
236 | if [ "$NEW_VALUE" = "" ]; then
237 | echo "0"
238 | else
239 | OLD_VALUE=$(update_stat $1 "tx_bytes" "$NEW_VALUE")
240 | TIMEDIFF=$(update_stat_time $1 "tx_bytes")
241 | perl -e "print int(($NEW_VALUE-$OLD_VALUE)<0?0:($NEW_VALUE-$OLD_VALUE)/$TIMEDIFF*1000000000)" # bytes per second; TIMEDIFF is in nanoseconds
242 | fi
243 | }
244 |
245 | # Container image up and running?
1 (yes) or 0 (no)
246 | image_up() {
247 | running_containerid $1
248 | if [ "$CONTAINER_ID" = "" ]; then
249 | echo 0
250 | else
251 | echo 1
252 | fi
253 | }
254 |
255 | # Statistic: Container image uptime
256 | image_uptime() {
257 | running_containerid $1
258 | if [ "$CONTAINER_ID" = "" ]; then
259 | echo 0
260 | else
261 | uptime $CONTAINER_ID
262 | fi
263 | }
264 |
265 | # Statistic: Container image memory
266 | image_memory() {
267 | running_containerid $1
268 | if [ "$CONTAINER_ID" = "" ]; then
269 | echo 0
270 | else
271 | memory $CONTAINER_ID
272 | fi
273 | }
274 |
275 | # Statistic: Container image disk usage
276 | image_disk() {
277 | running_containerid $1
278 | if [ "$CONTAINER_ID" = "" ]; then
279 | echo 0
280 | else
281 | disk $CONTAINER_ID
282 | fi
283 | }
284 |
285 | # Statistic: Container image CPU usage
286 | image_cpu() {
287 | running_containerid $1
288 | if [ "$CONTAINER_ID" = "" ]; then
289 | echo 0
290 | else
291 | cpu $CONTAINER_ID
292 | fi
293 | }
294 |
295 | # Statistic: Container image network traffic in
296 | image_netin() {
297 | running_containerid $1
298 | if [ "$CONTAINER_ID" = "" ]; then
299 | echo 0
300 | else
301 | netin $CONTAINER_ID
302 | fi
303 | }
304 |
305 | # Statistic: Container image network traffic out
306 | image_netout() {
307 | running_containerid $1
308 | if [ "$CONTAINER_ID" = "" ]; then
309 | echo 0
310 | else
311 | netout $CONTAINER_ID
312 | fi
313 | }
314 |
315 | # Returns all running container IDs for image, one per line
316 | image_containerids() {
317 | containerids $1
318 | for i in $CONTAINER_IDS; do
319 | echo $i
320 | done
321 | }
322 |
323 | # Returns all existing container IDs for image, one per line
324 | image_containerids_all() {
325 | containerids $1 all
326 | for i in $CONTAINER_IDS; do
327 | echo $i
328 | done
329 | }
330 |
331 | # Get sole running container ID for image into CONTAINER_ID
332 | # - Exit with message if multiple running
333 | # - Empty CONTAINER_ID if there are no running containers
334 | running_containerid() {
335 | CONTAINER_ID=""
336 | containerids $1
337 | for i in $CONTAINER_IDS; do
338 | if [ "$CONTAINER_ID" = "" ]; then
339 | CONTAINER_ID=$i
340 | else
341 | echo "Multiple running containers for image"
342 | exit 1
343 | fi
344 | done
345 | }
346 |
347 | # Get container IDs for imagename into CONTAINER_IDS
348 | containerids() {
349 | IMAGENAME=$1
350 | if [ "$2" = "all" ]; then
351 | docker_get "/containers/json?all=true"
352 | else
353 | docker_get "/containers/json"
354 | fi
355 | CONTAINER_IDS=$(echo $RESPONSE | jq '.[]|select(.Image|test("^'$IMAGENAME'(:.*)?$"))|.Id' | sed -e 's/\"//g')
356 | }
357 |
358 | if [ $# -eq 0 ]; then
359 | echo "No arguments"
360 | exit 1
361 | elif [ $# -eq 1 ]; then
362 | $1
363 | elif [ $# -eq 2 ]; then
364 | # Compatibility with www.monitoringartist.com docker template:
365 | # Remove leading slash from container id
366 | CONT_ID=$(echo "$1" | sed 's/^\///')
367 |
368 | # Execute statistic function with container argument
369 | $2 "$CONT_ID"
370 | fi
371 |
-------------------------------------------------------------------------------- /etc/zabbix/scripts/kubernetes_monitoring.py: --------------------------------------------------------------------------------

1 | #!/usr/bin/env python3
2 |
3 | """
4 | Kubernetes monitoring
5 | Version: 1.2
6 |
7 | Usage:
8 | python kubernetes_monitoring.py pods
9 | python kubernetes_monitoring.py pods -c <config file> -f <field selector>
10 |
11 | python kubernetes_monitoring.py nodes
12 |
13 | python kubernetes_monitoring.py services
14 |
15 | python
kubernetes_monitoring.py cronjobs
16 | python kubernetes_monitoring.py cronjobs -c <config file> -f <field selector>
17 | python kubernetes_monitoring.py cronjobs -c <config file> -f <field selector>
18 | --host-name <Zabbix host name>
19 | --minutes <interval in minutes>
20 | """
21 |
22 | # Python imports
23 | from argparse import ArgumentParser
24 | import datetime
25 | import json
26 | import os
27 | import sys
28 |
29 | # 3rd party imports
30 | from kubernetes import client, config
31 | from pyzabbix import ZabbixMetric
32 |
33 | from zabbix_sender_psk import ZabbixSenderPSK as ZabbixSender
34 |
35 | epoch_start = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
36 | system_time = datetime.datetime.now(datetime.timezone.utc)
37 |
38 | # Loop cron jobs and create discovery
39 | def cronjobs(args, v1):
40 |
41 | # Retrieve cron jobs from Kubernetes API v1
42 | api_client = client.ApiClient()
43 | api_instance = client.BatchV1Api(api_client)
44 | api_response = api_instance.list_job_for_all_namespaces(
45 | watch=False,
46 | field_selector=args.field_selector
47 | )
48 |
49 | # Declare variables (packet is declared here, outside the loop, so it is
50 | # defined even when the API returns no items)
51 | cronjobs = {}
52 | packet = []
53 | start_interval = int(((
54 | system_time - datetime.timedelta(minutes=args.minutes)) - epoch_start
55 | ).total_seconds())
56 |
57 | # Check API response before listing
58 | if not api_response:
59 | raise Exception("Unable to retrieve API response.")
60 |
61 | # Loop API response items
62 | for item in api_response.items:
63 |
64 | # Reset loop variables
65 | completion_time = None
66 | job_length = 0
67 | job_name = None
68 | job_status = 0
69 | start_time = None
70 |
71 | # Discard active cron jobs
72 | if item.status.active is not None:
73 | continue
74 |
75 | # Check and convert completion time to epoch
76 | if item.status.completion_time:
77 | completion_time = int(
78 | (item.status.completion_time - epoch_start).total_seconds()
79 | )
80 |
81 | # Skip completed jobs that are outside the interval range
82 | if completion_time and completion_time < start_interval:
83 | continue
84 |
85 | # Check and convert start time to epoch
86 | if item.status.start_time:
87 | start_time = int(
88 | (item.status.start_time - epoch_start).total_seconds()
89 | )
90 |
91 | # Calculate cron job length
92 | if completion_time and start_time:
93 | job_length = int(completion_time - start_time)
94 |
95 | # Only retrieve data from cron jobs (jobs without owner references are skipped)
96 | for owner_reference in item.metadata.owner_references or []:
97 | if owner_reference.kind != "CronJob":
98 | continue
99 |
100 | # Retrieve job name
101 | job_name = owner_reference.name
102 |
103 | # If job name was not retrieved, kind was not CronJob
104 | if not job_name:
105 | continue
106 |
107 | # Check job status comparing succeeded and status fields
108 | if item.status.succeeded and item.status.succeeded > 0 and item.status.failed is None:
109 | job_status = 1
110 |
111 | # Set job data to dictionary
112 | cronjobs[job_name] = {
113 | "{#CRONJOB}": job_name,
114 | "completion_time": completion_time,
115 | "length": job_length,
116 | "name": job_name,
117 | "start_time": start_time,
118 | "status": job_status,
119 | "uid": item.metadata.uid
120 | }
121 |
122 | # If instance name is not set, we output discovery
123 | if not args.host_name:
124 |
125 | # Loop and append jobs to output list
126 | for cron_job in cronjobs:
127 | output.append(cronjobs[cron_job])
128 |
129 | # Dump discovery
130 | discovery = {"data": output}
131 | print(json.dumps(discovery))
132 |
133 | else:
134 | # Append item data to list
135 | for cron_job in cronjobs:
136 | packet.append(ZabbixMetric(
137 | args.host_name,
f'kubernetes.cronjob["{cron_job}"]', 138 | json.dumps(cronjobs[cron_job]), 139 | cronjobs[cron_job].get("completion_time") 140 | )) 141 | 142 | # Send data using ZabbixSender 143 | result = ZabbixSender().send(packet) 144 | 145 | # Print result 146 | print(result) 147 | 148 | 149 | # Loop pods and create discovery 150 | def pods(args, v1): 151 | 152 | # Retrieve pods from Kubernetes API v1 153 | pods = v1.list_pod_for_all_namespaces( 154 | watch=False, 155 | field_selector=args.field_selector 156 | ) 157 | 158 | # Check pods before listing 159 | if pods: 160 | for pod in pods.items: 161 | 162 | # Retrieve container's restart counts 163 | container_started = None # Container's start time 164 | kind = None # Pod's kind found under metadata.owner_references 165 | restart_count = 0 # Container's restart count 166 | started_at = None # Latest start time 167 | uptime = datetime.timedelta() # Datetime object for latest uptime 168 | 169 | # Loop possible owner_references and retrieve "kind"-field 170 | if pod.metadata.owner_references: 171 | for ref in pod.metadata.owner_references: 172 | kind = ref.kind 173 | 174 | # Pods that are identified as "Job" are skipped 175 | if kind == "Job": 176 | continue 177 | 178 | # Check if container_statuses is available 179 | if pod.status.container_statuses: 180 | 181 | # Loop containers and retrieve information 182 | for container in pod.status.container_statuses: 183 | restart_count = int(container.restart_count) 184 | 185 | # Check "running"-state first, then "terminated"-state 186 | if container.state.running is not None: 187 | container_started = container.state.running.started_at 188 | elif container.state.terminated is not None: 189 | container_started = container.state.terminated.started_at 190 | else: 191 | continue 192 | 193 | # First time around, grab the first start time 194 | if not started_at: 195 | started_at = container_started 196 | # Compare previous container's start time to current one 197 | elif started_at < container_started: 198 | started_at = container_started 199 | 200 | # Count uptime 201 | if started_at: 202 | uptime = system_time - started_at 203 | 204 | # Append information to output list 205 | output.append({ 206 | "{#POD}": pod.metadata.name, 207 | "restart_count": restart_count, 208 | "ip": pod.status.pod_ip, 209 | "namespace": pod.metadata.namespace, 210 | "pod": pod.metadata.name, 211 | "uptime": uptime.total_seconds() 212 | }) 213 | 214 | # Dump discovery 215 | discovery = {"data": output} 216 | print(json.dumps(discovery)) 217 | 218 | 219 | # Loop nodes and create discovery 220 | def nodes(args, v1): 221 | 222 | # Retrieve nodes from Kubernetes API v1 223 | nodes = v1.list_node( 224 | watch=False, 225 | field_selector=args.field_selector 226 | ) 227 | 228 | # Check nodes before listing 229 | if nodes: 230 | for node in nodes.items: 231 | 232 | # Node status is retrieved from node's conditions. Possible 233 | # conditions are: Ready, MemoryPressure, PIDPressure, DiskPressure 234 | # and NetworkUnavailable. We are interested only in the main one, 235 | # "Ready", which describes if the node is healthy and ready to 236 | # accept pods. 
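# For illustration, a healthy node typically reports a condition object along
# the lines of {"type": "Ready", "status": "True", ...} (assumed shape, based
# on the Kubernetes NodeCondition API), so the loop below reduces the whole
# condition list to that single status string.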
237 | status = "" 238 | for condition in node.status.conditions: 239 | if condition.type == "Ready": 240 | status = condition.status 241 | 242 | # Append information to output list 243 | output.append({ 244 | "{#NODE}": node.status.node_info.machine_id, 245 | "node": next((i.address for i in node.status.addresses if i.type == "Hostname"), node.status.node_info.machine_id), 246 | "allocatable_cpu": node.status.allocatable.get("cpu"), 247 | "allocatable_storage": node.status.allocatable.get("ephemeral-storage"), 248 | "allocatable_memory": node.status.allocatable.get("memory"), 249 | "capacity_cpu": node.status.capacity.get("cpu"), 250 | "capacity_storage": node.status.capacity.get("ephemeral-storage"), 251 | "capacity_memory": node.status.capacity.get("memory"), 252 | "external_ip": next((i.address for i in node.status.addresses if i.type == "ExternalIP"), ""), 253 | "machine_id": node.status.node_info.machine_id, 254 | "status": status, 255 | "system_uuid": node.status.node_info.system_uuid 256 | }) 257 | 258 | # Dump discovery 259 | discovery = {"data": output} 260 | print(json.dumps(discovery)) 261 | 262 | 263 | # Loop services and create discovery 264 | def services(args, v1): 265 | 266 | # Retrieve services from Kubernetes API v1 267 | services = v1.list_service_for_all_namespaces( 268 | watch=False, 269 | field_selector=args.field_selector 270 | ) 271 | 272 | # Check services before listing 273 | if services: 274 | for service in services.items: 275 | 276 | # Append information to output list 277 | output.append({ 278 | "{#SERVICE}": service.metadata.name, 279 | "namespace": service.metadata.namespace, 280 | "service": service.metadata.name, 281 | "uid": service.metadata.uid 282 | }) 283 | 284 | # Dump discovery 285 | discovery = {"data": output} 286 | print(json.dumps(discovery)) 287 | 288 | 289 | if __name__ == "__main__": 290 | 291 | # Declare variables 292 | output = [] # List for output data 293 | 294 | # Parse command-line arguments 295 | parser = ArgumentParser( 296 | description="Discover and retrieve metrics from Kubernetes.", 297 | ) 298 | 299 | # Use sub-parsers run functions using mandatory positional argument 300 | subparsers = parser.add_subparsers() 301 | parser_cronjobs = subparsers.add_parser("cronjobs") 302 | parser_cronjobs.set_defaults(func=cronjobs) 303 | parser_pods = subparsers.add_parser("pods") 304 | parser_pods.set_defaults(func=pods) 305 | parser_services = subparsers.add_parser("services") 306 | parser_services.set_defaults(func=services) 307 | parser_nodes = subparsers.add_parser("nodes") 308 | parser_nodes.set_defaults(func=nodes) 309 | 310 | # Each subparser has the same optional arguments. For now. 
311 | for item in [parser_cronjobs, parser_pods, parser_nodes, parser_services]: 312 | item.add_argument("-c", "--config", default="", dest="config", 313 | type=str, 314 | help="Configuration file for Kubernetes client.") 315 | item.add_argument("-f", "--field-selector", default="", 316 | dest="field_selector", type=str, 317 | help="Filter results using field selectors.") 318 | item.add_argument("-hn", "--host-name", default="", 319 | dest="host_name", type=str, 320 | help="Zabbix host name for sending item data.") 321 | item.add_argument("-m", "--minutes", default=5, 322 | dest="minutes", type=int, 323 | help="Interval for cron job retrieval.") 324 | 325 | args = parser.parse_args() 326 | 327 | # Check configuration file 328 | if args.config != "": 329 | if not os.path.isfile(args.config): 330 | print("Configuration file is not valid.") 331 | sys.exit() 332 | 333 | # Load kubernetes configuration 334 | try: 335 | if args.config != "": 336 | config.load_kube_config(config_file=args.config) 337 | else: 338 | config.load_kube_config() 339 | except Exception as e: 340 | print(f"Unable to load Kubernetes configuration file. Error: {e}") 341 | sys.exit() 342 | 343 | # Initialize Kubernetes client 344 | v1 = client.CoreV1Api() 345 | 346 | # Run specified mode 347 | args.func(args, v1) 348 | -------------------------------------------------------------------------------- /etc/zabbix/scripts/pacemaker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python2 2 | # Version: 1.0 3 | # Get Pacemaker status. Adding -v option to command prints a more verbose string 4 | # Otherwise returns decimal or single word statuses. 5 | # 6 | # Get the cluster status in verbose format: 7 | # pacemaker_status.py -i cluster -v 8 | # Get the simple cluster status, 0 if no nodes, 1 if running ok, 2 if any in standby 9 | # 3 if any in maintenance, 4 if any in shutdown 10 | # pacemaker_status.py -i cluster 11 | # Count the resources in given state. e.g. how many failed: 12 | # pacemaker_status.py -i cluster -p failed 13 | # Sum the failcount in cluster: 14 | # pacemaker_status.py -i cluster -p fail-count 15 | # Get status of the single resource. Returns count of resources running 16 | # pacemaker_status.py -i resource -n Grafana 17 | # Get the property value for single resource in given node. If node is not given 18 | # returns true if all the nodes have the property set to "true". 19 | # pacemaker_status.py -i resource -n Grafana -N application1 -p managed 20 | # Get the status on node, returns count of services running 21 | # pacemaker_status.py -i node -n application1 22 | # Get the status on node, returns verbose string of resource status 23 | # pacemaker_status.py -i node -n application1 -v 24 | # Get the nodes where resource is active. Returns in format resource:node1,node2 25 | # pacemaker_status.py -i resource -n Grafana -l 26 | # Get all resources in the cluster and nodes where they are active. 
Returns each 27 | # resource and the nodes, separated by space 28 | # pacemaker_status.py -i cluster -l 29 | # Get last failure in cluster 30 | # pacemaker_status.py -i cluster -f 31 | 32 | 33 | import argparse 34 | import sys 35 | import subprocess 36 | from datetime import datetime 37 | from lxml import etree 38 | 39 | def process_xml(): 40 | command = "sudo crm_mon -X" 41 | process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) 42 | xml, error = process.communicate() 43 | if error: 44 | print("Could not read command output: " + error.decode("utf-8")) 45 | exit() 46 | try: 47 | root = etree.fromstring(xml) 48 | except Exception as e: 49 | if ("Connection to cluster failed: Transport endpoint is not connected" in xml): 50 | # cluster is not running, all queries default to 0 51 | print("0") 52 | exit() 53 | else: 54 | print("Could not get xml from crm_mon, check command righs.") 55 | print(xml) 56 | raise e 57 | return root 58 | 59 | # simple check, return count of active nodes that are running 60 | # or return true if property is true for the nodeset or node, otherwise false 61 | # for fail-count, return the summed up fail-count 62 | def resource_status_simple(args): 63 | root = process_xml() 64 | if args.property and args.property == "fail-count": 65 | xpath = "sum(/crm_mon/node_history/node/resource_history[@id = '" + args.name +"']/@fail-count)" 66 | if args.node: 67 | xpath = "sum(/crm_mon/node_history/node[@name='" + args.node + "']/resource_history[@id = '" + args.name +"']/@fail-count)" 68 | fail_count = root.xpath(xpath) 69 | print(fail_count) 70 | elif args.property: 71 | prop_status = "true" 72 | xpath = "/crm_mon/resources//resource[@id='" + args.name + "']/@" + args.property 73 | if args.node: 74 | # if a node was defined, check only that one with xpath, otherwise print false if 75 | # any of the nodes had false status 76 | xpath = "/crm_mon/resources//resource[node/@name = '" +args.node+ "'][@id='" + args.name + "']/@" + args.property 77 | props = root.xpath(xpath) 78 | for prop in props: 79 | if prop == "false": 80 | prop_status = "false" 81 | print(prop_status) 82 | else: 83 | xpath = "count(/crm_mon/resources//resource[@id='" + args.name + "' and (@role = 'Started' or 'Master')][@active='true'][@orphaned='false'][@managed='true'][@failed='false'][@failure_ignored='false'][@nodes_running_on > 0])" 84 | if args.node: 85 | # if a node was defined, check only that one with xpath 86 | xpath = "count(/crm_mon/resources//resource[node/@name = '" +args.node+ "'][@id='" + args.name + "' and (@role = 'Started' or 'Master')][@active='true'][@orphaned='false'][@managed='true'][@failed='false'][@failure_ignored='false'][@nodes_running_on > 0][node/@name = '" + args.node + "'])" 87 | count = root.xpath(xpath) 88 | print(count) 89 | 90 | # check the status of given resource, return node:status for resources 91 | def resource_status(args): 92 | root = process_xml() 93 | resource_status = "" 94 | if args.node: 95 | status = resource_verbose(root, args.node, args.name) 96 | xpath = "/crm_mon/resources//resource[@id='" + args.name + "'][node/@name = '" +args.node+ "']/@role" 97 | role_query = root.xpath(xpath) 98 | 99 | if role_query: 100 | role = role_query[0] 101 | else: 102 | role = "NotRunning" 103 | 104 | resource_status = args.node + ":" + role 105 | 106 | if status != "": 107 | resource_status += "[" + status + "]" 108 | 109 | else: 110 | # get list of nodes 111 | xpath = "/crm_mon/nodes/node/@name" 112 | nodes = root.xpath(xpath) 113 | 114 | # 
get status for each node 115 | for i in range(len(nodes)): 116 | if i > 0: 117 | resource_status += " " 118 | status = resource_verbose(root, nodes[i], args.name) 119 | xpath = "/crm_mon/resources//resource[@id='" + args.name + "'][node/@name = '" +nodes[i]+ "']/@role" 120 | role_query = root.xpath(xpath) 121 | if role_query: 122 | role = role_query[0] 123 | else: 124 | role = "NotRunning" 125 | 126 | resource_status += nodes[i] + ":" + role 127 | 128 | if status != "": 129 | resource_status += "[" + status + "]" 130 | 131 | print(resource_status) 132 | 133 | # verbose resource printout, used for node and cluster also 134 | def resource_verbose(root,node,resource): 135 | resource_status = "" 136 | resource_statuses = [] 137 | 138 | xpath = "/crm_mon/resources//resource[@id='" + resource + "'][node/@name = '" + node + "']/@active" 139 | active = root.xpath(xpath) 140 | xpath = "/crm_mon/resources//resource[@id='" + resource + "'][node/@name = '" + node + "']/@orphaned" 141 | orphaned = root.xpath(xpath) 142 | xpath = "/crm_mon/resources//resource[@id='" + resource + "'][node/@name = '" + node + "']/@managed" 143 | managed = root.xpath(xpath) 144 | xpath = "/crm_mon/resources//resource[@id='" + resource + "'][node/@name = '" + node + "']/@failed" 145 | failed = root.xpath(xpath) 146 | xpath = "/crm_mon/resources//resource[@id='" + resource + "'][node/@name = '" + node + "']/@failure_ignored" 147 | failure_ignored = root.xpath(xpath) 148 | xpath = "/crm_mon/resources//resource[@id='" + resource + "'][node/@name = '" + node + "']/@nodes_running_on" 149 | nodes_running = root.xpath(xpath) 150 | xpath = "/crm_mon/node_history/node[@name='" + node + "']/resource_history[@id = '" + resource +"']/@fail-count" 151 | fail_count = root.xpath(xpath) 152 | 153 | 154 | if "false" in active: 155 | resource_statuses.append("inactive") 156 | if "false" in managed: 157 | resource_statuses.append("unmanaged") 158 | if "true" in orphaned: 159 | resource_statuses.append("orphaned") 160 | if "true" in failed: 161 | resource_statuses.append("failed") 162 | if "true" in failure_ignored: 163 | resource_statuses.append("failure_ignored") 164 | if "0" in nodes_running: 165 | resource_statuses.append("nodes_running_on=0") 166 | if fail_count: 167 | resource_statuses.append("fail-count=" + fail_count[0]) 168 | 169 | resource_status += ",".join(resource_statuses) 170 | 171 | return resource_status 172 | 173 | def node_status_simple(args): 174 | root = process_xml() 175 | xpath = "/crm_mon/nodes/node[@name='" + args.name + "']/@resources_running" 176 | count = root.xpath(xpath) 177 | if len(count) > 0: 178 | print(count[0]) 179 | else: 180 | print("0") 181 | 182 | # used also in cluster status 183 | def node_verbose(root,node): 184 | xpath = "/crm_mon/nodes/node[@name='" + node + "']/@online" 185 | online = root.xpath(xpath) 186 | xpath = "/crm_mon/nodes/node[@name='" + node + "']/@standby" 187 | standby = root.xpath(xpath) 188 | xpath = "/crm_mon/nodes/node[@name='" + node + "']/@maintenance" 189 | maintenance = root.xpath(xpath) 190 | xpath = "/crm_mon/nodes/node[@name='" + node + "']/@resources_running" 191 | resource_count = root.xpath(xpath) 192 | node_status = node 193 | 194 | if not(online): 195 | node_status += ":Not found" 196 | else: 197 | if (online[0] == 'true'): 198 | node_status += ":online" 199 | if (standby[0] == 'true'): 200 | node_status += ":standby" 201 | if (maintenance[0] == 'true'): 202 | node_status += ":maintenance" 203 | 204 | # prepare resources dict 205 | resources_status = {} 206 | xpath = 
"/crm_mon/resources//resource[node/@name = '" + node + "']/@id" 207 | resources = root.xpath(xpath) 208 | for resource in resources: 209 | resources_status[resource] = [] 210 | 211 | # include also node history 212 | xpath = "/crm_mon/node_history/node[@name='"+node+"']/resource_history/@id" 213 | resources_history = root.xpath(xpath) 214 | for resource in resources_history: 215 | resources_status[resource] = [] 216 | 217 | for resource in resources_status: 218 | status = resource_verbose(root,node,resource) 219 | if len(status) > 0: 220 | node_status += ":" + resource + "[" + status + "]" 221 | 222 | node_status += ":resources_running=" + resource_count[0] 223 | 224 | return node_status 225 | 226 | def node_status(args): 227 | root = process_xml() 228 | node_status = node_verbose(root,args.name) 229 | print(node_status) 230 | 231 | # print cluster status in a string of data 232 | # includes resources information 233 | def cluster_status(): 234 | root = process_xml() 235 | cluster_status = "" 236 | 237 | xpath = "/crm_mon/nodes/node/@name" 238 | nodes = root.xpath(xpath) 239 | xpath = "/crm_mon/nodes/node/@resources_running" 240 | res_running = root.xpath(xpath) 241 | xpath = "/crm_mon/summary/resources_configured/@number" 242 | res_configured = root.xpath(xpath) 243 | res_running_total = 0 244 | 245 | # gather a string of status data 246 | for i in range(len(nodes)): 247 | res_running_total += int(res_running[i]) 248 | if i > 0: 249 | cluster_status += " " 250 | cluster_status += node_verbose(root,nodes[i]) 251 | 252 | cluster_status += " resources=" + str(res_running_total) + "/" + str(res_configured[0]) 253 | print(cluster_status) 254 | 255 | 256 | # simple status, return 0 if no nodes, 1 if at running, 2 if any in standby 257 | # 3 if any in maintenance, 4 if any in shutdown. 5 if status cannot be determined 258 | # Does not care about resource level statuses. 259 | def cluster_status_simple(): 260 | root = process_xml() 261 | 262 | # if no nodes are found, or all nodes are offline 263 | cluster_status = "5" 264 | xpath = "/crm_mon/nodes/node/@online" 265 | online = root.xpath(xpath) 266 | xpath = "/crm_mon/nodes/node/@maintenance" 267 | maintenance = root.xpath(xpath) 268 | xpath = "/crm_mon/nodes/node/@standby" 269 | standby = root.xpath(xpath) 270 | xpath = "/crm_mon/nodes/node/@shutdown" 271 | shutdown = root.xpath(xpath) 272 | 273 | # any one node causes the status to increase 274 | for state in online: 275 | if state == "true": 276 | cluster_status = "1" 277 | for state in standby: 278 | if state == "true": 279 | cluster_status = "2" 280 | for state in maintenance: 281 | if state == "true": 282 | cluster_status = "3" 283 | for state in shutdown: 284 | if state == "true": 285 | cluster_status = "4" 286 | 287 | print(cluster_status) 288 | 289 | # count statuses from all resources for given property 290 | # e.g. how many failed, how many managed. 
291 | def cluster_statuses_simple(args):
292 | root = process_xml()
293 |
294 | if args.property == "nodes_running_on":
295 | print("Nonsensical parameter for cluster property count.")
296 | exit()
297 | elif args.property == "fail-count":
298 | xpath = "sum(/crm_mon/node_history/node/resource_history/@fail-count)"
299 | property_count = root.xpath(xpath)
300 |
301 | else:
302 | xpath = "count(/crm_mon/resources//resource[@" + args.property + " = 'true'])"
303 | property_count = root.xpath(xpath)
304 |
305 | print(property_count)
306 |
307 | # print the resource locations where resources are active
308 | def resource_location(args):
309 | root = process_xml()
310 |
311 | resource_locations = {}
312 | locations = ""
313 | xpath = "/crm_mon/resources//resource[@active = 'true']/@id"
314 | resources = root.xpath(xpath)
315 |
316 | for resource in resources:
317 | xpath = "/crm_mon/resources//resource[@active = 'true'][@id = '"+resource+"']/node/@name"
318 | nodes = root.xpath(xpath)
319 | resource_locations[resource] = nodes
320 |
321 |
322 | if (args.item == "cluster"):
323 | for resource in resource_locations:
324 | locations += resource + ":" + ",".join(resource_locations[resource]) + " "
325 |
326 | else:
327 | if args.name not in resource_locations:
328 | print(args.name+":Not found")
329 | exit()
330 | locations = args.name +":"+ ",".join(resource_locations[args.name])
331 |
332 | print(locations)
333 |
334 | # print last failures
335 | def cluster_failures():
336 | root = process_xml()
337 |
338 | xpath = "/crm_mon/failures/failure/@op_key"
339 | failures = root.xpath(xpath)
340 | # get the latest failure
341 | if len(failures) > 0:
342 | failure_info = ""
343 | newest = datetime(1970, 1, 1, 0, 0)
344 | for failure in failures:
345 |
346 | xpath = "/crm_mon/failures/failure[@op_key = '"+failure+"']"
347 | element = root.xpath(xpath)
348 | # last-rc-change format: Sun Apr 16 21:46:27 2017
349 | failure_time = datetime.strptime(element[0].get("last-rc-change"), "%a %b %d %H:%M:%S %Y")
350 | if failure_time > newest:
351 | newest = failure_time
352 | # keep only the newest failure seen so far
353 | failure_info = element[0].get("node")
354 | failure_info += ":" + element[0].get("op_key")
355 | failure_info += ":" + element[0].get("status")
356 | failure_info += ":" + element[0].get("last-rc-change")
357 |
358 | print(failure_info)
359 |
360 | else:
361 | # no failures
362 | exit()
363 |
364 |
365 |
366 | if __name__ == "__main__":
367 |
368 | parser = argparse.ArgumentParser(prog="pacemaker_status.py", description="Check the pacemaker cluster status")
369 | parser.add_argument("-i", "--item", help="Item type to check", choices=["resource", "node", "cluster"])
370 | parser.add_argument("-n", "--name", help="Resource or node name to check.")
371 | parser.add_argument("-l", "--location", help="Return the node where the resource is running.", action="store_true")
372 | parser.add_argument("-N", "--node", help="Node to check the resource in.
Default checks all nodes.") 372 | parser.add_argument("-p", "--property", help="Check status of resource property", choices=["active","orphaned","managed","failed","failure_ignored","nodes_running_on","fail-count"]) 373 | parser.add_argument("-v", "--verbose", help="Verbose status", action="store_true") 374 | parser.add_argument("-f", "--failures", help="Failures", action="store_true") 375 | 376 | if len(sys.argv) > 1: 377 | 378 | args = parser.parse_args() 379 | 380 | if (args.item == "resource"): 381 | if not(args.name): 382 | print("Must define resource name.") 383 | elif (args.verbose): 384 | resource_status(args) 385 | elif (args.location): 386 | resource_location(args) 387 | else: 388 | resource_status_simple(args) 389 | elif (args.item == "node"): 390 | if not(args.name): 391 | print("Must define node name.") 392 | elif (args.verbose): 393 | node_status(args) 394 | else: 395 | node_status_simple(args) 396 | elif (args.item == "cluster"): 397 | if (args.property): 398 | cluster_statuses_simple(args) 399 | elif (args.failures): 400 | cluster_failures() 401 | elif (args.verbose): 402 | cluster_status() 403 | elif (args.location): 404 | resource_location(args) 405 | else: 406 | cluster_status_simple() 407 | else: 408 | print("No arguments given. Nothing to do.") 409 | parser.print_help() 410 | -------------------------------------------------------------------------------- /templates/pacemaker.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 3.2 4 | 2017-09-28T11:28:33Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 377 | 378 | 379 | -------------------------------------------------------------------------------- /templates/pacemaker_active.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 3.2 4 | 2017-09-28T11:28:40Z 5 | 6 | 7 | Templates 8 | 9 | 10 | 11 | 377 | 378 | 379 | --------------------------------------------------------------------------------