├── exporter_monitoring.xinetd
├── recording_rules.yml
├── httpwrapper
├── monitoring_exporter.yml
├── exporter_monitoring
└── README.md

/exporter_monitoring.xinetd:
--------------------------------------------------------------------------------
service exporter_monitoring
{
    type        = unlisted
    port        = 10100
    socket_type = stream
    wait        = no
    user        = root
    server      = /opt/metrics.d/httpwrapper
    server_args = exporter_monitoring
    disable     = no
    only_from   = {{ ip_address }}
    log_type    = FILE /dev/null
}

--------------------------------------------------------------------------------
/recording_rules.yml:
--------------------------------------------------------------------------------
groups:
  - name: monitoring_recording_rules
    interval: 1m
    rules:
      - record: monitoring_memory_usage_percent
        expr: ((monitoring_meminfo_memtotal_kbytes - (monitoring_meminfo_memavailable_kbytes or (monitoring_meminfo_buffers_kbytes + monitoring_meminfo_cached_kbytes + monitoring_meminfo_memfree_kbytes))) / monitoring_meminfo_memtotal_kbytes) * 100
      - record: monitoring_swap_usage_percent
        expr: ((monitoring_meminfo_swaptotal_kbytes - monitoring_meminfo_swapfree_kbytes) / monitoring_meminfo_swaptotal_kbytes) * 100

--------------------------------------------------------------------------------
/httpwrapper:
--------------------------------------------------------------------------------
#!/bin/bash
# small HTTP wrapper for bash scripts run via xinetd

ulimit -n 20480
ulimit -l 512

root='/opt/metrics.d/'
file="$1"
mime='text/plain'

cd "$root" || exit 1

if [ -f "$root$file" ]; then
    # run the script, buffering its output so we can send a Content-Length header
    "$root$file" > "/tmp/.$$.output"

    size=$(stat -c "%s" "/tmp/.$$.output")

    printf 'HTTP/1.1 200 OK\r\nDate: %s\r\nContent-Length: %s\r\nContent-Type: %s\r\nConnection: close\r\n\r\n' "$(date)" "$size" "$mime"

    cat "/tmp/.$$.output"

    sleep 1
    rm -f "/tmp/.$$.output"
    exit 0
fi

exit 1

--------------------------------------------------------------------------------
/monitoring_exporter.yml:
--------------------------------------------------------------------------------
---
- hosts: all
  vars:
    # set below to the IP address that should be allowed to query the exporter
    ip_address: "123.123.123.123"
  tasks:
    - file:
        path: "/opt/metrics.d"
        state: directory

    - apt:
        pkg: [ "xinetd", "bc", "sysstat", "net-tools", "conntrack" ]
        state: present
      when:
        - "ansible_distribution == 'Ubuntu' or ansible_distribution == 'Debian'"

    - yum:
        pkg: [ "xinetd", "bc", "sysstat", "net-tools", "conntrack-tools" ]
        state: present
      when:
        - "ansible_distribution == 'CentOS' or ansible_distribution == 'CloudLinux'"

    - copy:
        src: "httpwrapper"
        dest: "/opt/metrics.d/httpwrapper"
        mode: "0755"

    - copy:
        src: "exporter_monitoring"
        dest: "/opt/metrics.d/exporter_monitoring"
        mode: "0755"

    - template:
        src: "exporter_monitoring.xinetd"
        dest: "/etc/xinetd.d/exporter_monitoring"

    - service:
        name: xinetd
        state: restarted
        enabled: yes

--------------------------------------------------------------------------------
/exporter_monitoring:
--------------------------------------------------------------------------------
#!/bin/bash
# Pawel Adamski @ https://github.com/pawadski/monitoring_exporter
# tested on: bash >= 4.0
#            coreutils >= 8.4

# BEGIN CONFIG
# edit below
# number of workers (subprocesses) to use
workers=2
# enabled "modules":
# loadavg - load average, jobs, last PID
# stat - /proc/stat (no individual CPU granularity)
# iostat - iostat -x
# filesystem - df
# memory - /proc/meminfo
# netdev - /proc/net/dev
# netstat - netstat, per-port granularity of established connections
# uptime - system uptime
# kernel - kernel version (as label)
# slabinfo - /proc/slabinfo # disabled by default
# file_nr - /proc/sys/fs/file-nr, file descriptor information
# conntrack - information from the conntrack utility
declare -a exporters=(loadavg stat iostat filesystem memory netdev netstat uptime kernel file_nr)
# used to determine if the netstat module should only report the total amount of connections
# useful for servers like OpenVZ hypervisors or generally busy servers, enabled by default
netstat_totalonly=1
# useful for limiting the amount of metrics - enable to report only
# slabs that are of non-zero size
slab_nonzeroonly=1
# optionally specify interfaces you'd like to pull netdev metrics for
# e.g. eth0 eth1, empty by default
netdev_targets=""
# optionally specify filesystem mount points for df
# e.g. / /mnt/otherfs, empty by default
filesystem_targets=""
# optionally specify block devices you'd like to pull iostat metrics for
# e.g. sda sdb, empty by default
iostat_targets=""
# whether to ignore CPUs with IDs > number of processors in conntrack
conntrack_ignore_fake_cpuid=1
# high cardinality metric!
# when 0, will report conntrack stats for each cpu
# when 1, will report a sum of conntrack stats for all cpus
conntrack_sum_all_cpu=1
# defines the exporter name, prepended to every metric name
exporterName="monitoring"
# define path to the interpreter, in case it's different from stock
bashPath="/bin/bash"

# END CONFIG
#
# do not edit below
output=""

# define some associative arrays
declare -A metricContent metricType metricHelp

#
# this function builds the metric type and help data
#
function buildMetric {
    # 1 = metric name
    # 2 = metric type
    # 3 = metric help text
    metricType["${exporterName}_$1"]="$2"
    metricHelp["${exporterName}_$1"]="$3"
    # metricContent["${exporterName}_$1"]=""
}

#
# this function adds actual metric content to a metric
#
function addMetricContent {
    metricName="${exporterName}_$1"
    declare -a contents
    shift
    # add everything
    for item in "$@"; do
        if [ "${item:0:1}" != "{" ]; then
            item=" $item"
        fi
        # metricContent[$metricName]="${metricContent[$metricName]}$metricName$item\n"
        contents+=("$metricName$item")
    done
-z "${metricContent[$metricName]}" ]; then 86 | metricContent[$metricName]=$(join_by "\n" "${metricContent[$metricName]}" "${contents[*]}") 87 | else 88 | metricContent[$metricName]=$(join_by "\n" "${contents[*]}") 89 | fi 90 | } 91 | 92 | # 93 | # this builds and prints final output 94 | # 95 | function buildOutput { 96 | # 97 | # build output 98 | # 99 | for metricName in "${!metricContent[@]}"; do 100 | output="$output# HELP $metricName ${metricHelp[$metricName]}\n# TYPE $metricName ${metricType[$metricName]}\n${metricContent[$metricName]}\n" 101 | done 102 | 103 | # 104 | # print output into temp directory 105 | # 106 | echo -n "$output" 107 | 108 | exit 109 | } 110 | 111 | # 112 | # simulates array join 113 | # 114 | function join_by { local d=$1; shift; echo -n "$1"; shift; printf "%s" "${@/#/$d}"; } 115 | 116 | # 117 | # metric runners 118 | # ideally define your exporters here as metrics_XXXXXX 119 | # 120 | 121 | # 122 | # BEGIN uptime 123 | function metrics_slabinfo { 124 | buildMetric "slabinfo_active_objs" "gauge" "The number of objects that are currently active (i.e., in use)" 125 | buildMetric "slabinfo_num_objs" "gauge" "The total number of allocated objects (i.e., objects that are both in use and not in use)" 126 | buildMetric "slabinfo_objsize_bytes" "gauge" "The size of objects in this slab, in bytes" 127 | buildMetric "slabinfo_objperslab" "gauge" "The number of objects stored in each slab" 128 | buildMetric "slabinfo_pagesperslab" "gauge" "The number of pages allocated for each slab" 129 | 130 | while read slabname active_objs num_objs objsize objperslab pagesperslab etc; do 131 | if [ "$slabname" = "#" ]; then 132 | continue 133 | fi 134 | if [ "$slab_nonzeroonly" = "1" ]; then 135 | if [ "$objsize" = "0" ]; then 136 | continue 137 | fi 138 | fi 139 | addMetricContent "slabinfo_active_objs" "{slab=\"$slabname\"} $active_objs" 140 | addMetricContent "slabinfo_num_objs" "{slab=\"$slabname\"} $num_objs" 141 | addMetricContent "slabinfo_objsize_bytes" "{slab=\"$slabname\"} $objsize" 142 | addMetricContent "slabinfo_objperslab" "{slab=\"$slabname\"} $objperslab" 143 | addMetricContent "slabinfo_pagesperslab" "{slab=\"$slabname\"} $pagesperslab" 144 | done < <(grep tunables /proc/slabinfo) 145 | 146 | buildOutput 147 | } 148 | # END uptime 149 | 150 | # 151 | # BEGIN file_nr 152 | function metrics_file_nr { 153 | file_nr_array=($(cat /proc/sys/fs/file-nr)) 154 | 155 | buildMetric "file_nr_allocated" "gauge" "Total allocated file descriptors" 156 | buildMetric "file_nr_free" "gauge" "Free allocated file descriptors" 157 | buildMetric "file_nr_max" "gauge" "Max allowed open file descriptors" 158 | 159 | addMetricContent "file_nr_allocated" "${file_nr_array[0]}" 160 | addMetricContent "file_nr_free" "${file_nr_array[1]}" 161 | addMetricContent "file_nr_max" "${file_nr_array[2]}" 162 | 163 | buildOutput 164 | } 165 | # END file_nr 166 | 167 | # 168 | # BEGIN uptime 169 | function metrics_kernel { 170 | buildMetric "kernel_version" "gauge" "Current kernel version as label" 171 | addMetricContent "kernel_version" "{version=\"$(uname -r)\"} 1" 172 | 173 | buildOutput 174 | } 175 | # END uptime 176 | 177 | # 178 | # BEGIN uptime 179 | function metrics_uptime { 180 | buildMetric "uptime_seconds_total" "counter" "System uptime" 181 | addMetricContent "uptime_seconds_total" "$(awk '{print $1}' /proc/uptime)" 182 | 183 | buildOutput 184 | } 185 | # END uptime 186 | 187 | # 188 | # BEGIN netstat (sorts netstat ESTABLISHED connections by port label) 189 | function metrics_netstat { 190 | 
buildMetric "netstat_established_total" "gauge" "Total number of Established connections" 191 | 192 | read conn _ <<< $(netstat -s | grep 'connections established$') 193 | addMetricContent "netstat_established_total" "$conn" 194 | 195 | if [ "$netstat_totalonly" = "1" ]; then 196 | buildOutput 197 | return 198 | fi 199 | 200 | conndump=$(netstat -tn | grep ESTABLISHED) 201 | 202 | buildMetric "netstat_established" "gauge" "Number of Established connections per port" 203 | 204 | while read -r count port; do 205 | addMetricContent "netstat_established" "{port=\"$port\"} $count" 206 | done < <(echo "$conndump" | awk '{print $4}' | cut -d':' -f2 | sort | uniq -c | sed 's/^\s*//') 207 | 208 | buildOutput 209 | } 210 | # END netstat 211 | 212 | # 213 | # BEGIN load average stuff 214 | function metrics_loadavg { 215 | buildMetric "loadavg_load1" "gauge" "1-minute load average from /proc/loadavg" 216 | buildMetric "loadavg_load5" "gauge" "10-minute load average from /proc/loadavg" 217 | buildMetric "loadavg_load15" "gauge" "15-minute load average from /proc/loadavg" 218 | buildMetric "loadavg_jobs_running" "gauge" "Running jobs from /proc/loadavg" 219 | buildMetric "loadavg_jobs_background" "gauge" "Background jobs from /proc/loadavg" 220 | buildMetric "loadavg_last_pid" "counter" "Last PID from /proc/loadavg" 221 | 222 | while read -r load1 load2 load3 procs lastpid; do 223 | # sample input: 4.76 3.86 3.58 9/652 17232 224 | addMetricContent "loadavg_load1" $load1 225 | addMetricContent "loadavg_load5" $load2 226 | addMetricContent "loadavg_load15" $load3 227 | while IFS='/' read -r running background; do 228 | addMetricContent "loadavg_jobs_running" $running 229 | addMetricContent "loadavg_jobs_background" $background 230 | done <<< "$procs" 231 | addMetricContent "loadavg_last_pid" $lastpid 232 | done < /proc/loadavg 233 | 234 | buildOutput 235 | } 236 | # END load average stuff 237 | 238 | # 239 | # BEGIN memory stuff 240 | function metrics_memory { 241 | buffer=$(awk '{ print tolower($1), $2 }' /proc/meminfo) 242 | # we export only what we need 243 | for item in memtotal memfree memavailable buffers cached slab swaptotal swapfree sreclaimable committed_as; do 244 | while read -r metric value; do 245 | if [ ! 
-z "$metric" ]; then 246 | buildMetric "meminfo_${item}_kbytes" "gauge" "From /proc/meminfo" 247 | addMetricContent "meminfo_${item}_kbytes" "$value" 248 | fi 249 | done < <(grep "^$item:" <<< "$buffer") 250 | done 251 | 252 | buildOutput 253 | } 254 | # END memory stuff 255 | 256 | # 257 | # BEGIN filesystem usage 258 | function metrics_filesystem { 259 | buffer=$(df -PT $filesystem_targets 2>/dev/null | grep -v '^Filesystem' | tr -s '\t' ' ') 260 | 261 | # Filesystem 1024-blocks Used Available Capacity Mounted on 262 | # /dev/mapper/cl-home 99533328 37320564 62212764 38% / 263 | buildMetric "filesystem_total_kbytes" "gauge" "From df" 264 | buildMetric "filesystem_used_kbytes" "gauge" "From df" 265 | buildMetric "filesystem_avail_kbytes" "gauge" "From df" 266 | buildMetric "filesystem_capacity_percent" "gauge" "From df" 267 | 268 | while read -r device fstype total used avail percent target; do 269 | labels="{mountpoint=\"$target\",source=\"$device\",fstype=\"$fstype\"}" 270 | addMetricContent "filesystem_total_kbytes" "$labels $total" 271 | addMetricContent "filesystem_used_kbytes" "$labels $used" 272 | addMetricContent "filesystem_avail_kbytes" "$labels $avail" 273 | noPercentSign=$(echo "$percent" | sed 's,%,,g') 274 | addMetricContent "filesystem_capacity_percent" "$labels $noPercentSign" 275 | done <<< "$buffer" 276 | 277 | buildOutput 278 | } 279 | # END filesystem usage 280 | 281 | # 282 | # BEGIN /proc/stat CPU stats 283 | function metrics_stat { 284 | # /proc/stat has different column number in various kernel versions, 285 | # so we instantiate a list of columns we're going to loop over later 286 | declare -a cols=(user nice system idle iowait irq softirq steal guest guest_nice) 287 | 288 | # add metric 289 | buildMetric "stat_cpu_seconds_total" "counter" "From /proc/stat" 290 | 291 | # fetch first (we'll use it to build "current" usage later) 292 | cpu1=$(head -n 1 /proc/stat | tr -s '\t' ' ' | sed 's,cpu ,,g') 293 | # sleep 294 | sleep 1 295 | # fetch second 296 | cpu2=$(head -n 1 /proc/stat | tr -s '\t' ' ' | sed 's,cpu ,,g') 297 | 298 | # add metric contents 299 | index=0 300 | while read -r data; do 301 | for item in $data; do 302 | addMetricContent "stat_cpu_seconds_total" "{mode=\"${cols[$index]}\"} $item" 303 | index=$(( index + 1 )) 304 | done 305 | done <<< "$cpu2" 306 | 307 | # create three temporary arrays 308 | read -r -a cpudata1 <<< "$cpu1" 309 | read -r -a cpudata2 <<< "$cpu2" 310 | declare -a difference=() 311 | 312 | # iterate & compare /proc/stat output arrays 313 | index=0 314 | delta_total=0 315 | for item in "${cpudata1[@]}"; do 316 | delta=$(( ${cpudata2[$index]} - ${cpudata1[$index]} )) 317 | 318 | difference[$index]=$delta 319 | delta_total=$(( $delta_total + $delta )) 320 | 321 | index=$(( $index + 1 )) 322 | done 323 | 324 | # total=$(( ${difference[0]} + ${difference[1]} + ${difference[2]} + ${difference[3]} + ${difference[4]} )) 325 | idlePercentage="(( ${difference[3]} + ${difference[4]} ) / $delta_total) * 100.0" 326 | iowaitPercentage="(( ${difference[4]} ) / $delta_total) * 100.0" 327 | cpuutil=$(awk "BEGIN { printf \"%i\", 100.0 - $idlePercentage }" ) 328 | iowait=$(awk "BEGIN { printf \"%i\", $iowaitPercentage }") 329 | 330 | # build "current" usage metric 331 | buildMetric "stat_cpu_current_usage_percent" "gauge" "Calculated instant CPU usage" 332 | buildMetric "stat_cpu_current_iowait_percent" "gauge" "Calculated instant CPU iowait" 333 | 334 | # add content 335 | addMetricContent "stat_cpu_current_usage_percent" $cpuutil 336 | 
addMetricContent "stat_cpu_current_iowait_percent" $iowait 337 | 338 | buildOutput 339 | } 340 | # END /proc/stat CPU stats 341 | 342 | # 343 | # BEGIN extract data from iostat 344 | function metrics_iostat { 345 | # OLD:buffer=$(iostat -xmy 1 1 | grep -v '^$\|^avg\-cpu\|^Linux\|^\s' | tr -s '\t' ' ') 346 | # NEW:support for centos5 iostat 347 | buffer=$(iostat -xm 1 2 | awk '/avg-cpu/{i++}i==2' | grep -v '^$\|^avg\-cpu\|^Linux\|^\s' | tr -s '\t' ' ') 348 | # extract & make columns 349 | declare -a cols=() 350 | 351 | # loop over column names excluding "Device" 352 | for column in $(echo "$buffer" | grep '^Device'); do 353 | if [ ! -z "$(grep -i device <<< $column)" ]; then 354 | continue 355 | fi 356 | 357 | # run some patterns against the column names 358 | col=$(echo $column | sed 's,/s,,g; s,\-,_,g; s,%,,g' | tr '[:upper:]' '[:lower:]') 359 | if [ "$col" = "util" ]; then 360 | col="util_percent" 361 | fi 362 | 363 | # finally add to column array 364 | cols+=($col) 365 | done 366 | 367 | # create our iostat metrics 368 | for column in "${cols[@]}"; do 369 | buildMetric "iostat_$column" "gauge" "From iostat" 370 | done 371 | 372 | # device targets 373 | declare -a only_devices=($iostat_targets) 374 | 375 | # finally add our metric contents 376 | index=0 377 | while read line; do 378 | while read -r device data; do 379 | device="${device/\//_}" # replace / in device names with _ 380 | 381 | if [ ! -z "$iostat_targets" ]; then 382 | if [[ ! "${only_devices[@]}" =~ "$device" ]]; then 383 | continue # not in list of targets 384 | fi 385 | fi 386 | 387 | for item in $data; do 388 | addMetricContent "iostat_${cols[$index]}" "{device=\"$device\"} $item" 389 | index=$(( index + 1 )) 390 | done 391 | done <<< "$line" 392 | index=0 393 | done < <(echo "$buffer" | grep -v '^Device') 394 | 395 | buildOutput 396 | } 397 | # END extract data from iostat 398 | 399 | # 400 | # BEGIN extract data from /proc/net/dev 401 | function metrics_netdev { 402 | buffer="$(tr -s '\t' ' ' < /proc/net/dev)" 403 | # Inter-| Receive | Transmit 404 | # face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed 405 | # lo: 8813597 72990 0 0 0 0 0 0 8813597 72990 0 0 0 0 0 0 406 | 407 | for type in rx tx; do 408 | for counter in bytes packets errs drop fifo frame compressed multicast; do 409 | buildMetric "netdev_${type}_${counter}_total" "counter" "From /proc/net/dev" 410 | done 411 | done 412 | 413 | # device target selection 414 | declare -a only_interfaces=($netdev_targets) 415 | 416 | # dev rxbytes rxpkts rxerrs rxdrop rxfifo rxframe rxcomp rxmulticast txbytes txpkts txerrs txdrop txfifo txframe txcomp txmulticast 417 | while read line; do 418 | newline=$(echo "$line" | sed 's,:, ,g' | tr -s '\t' ' ') 419 | array=($newline) 420 | 421 | if [ ! -z "$netdev_targets" ]; then 422 | if [[ ! 
"${only_interfaces[@]}" =~ "${array[0]}" ]]; then 423 | continue # not in list of targets 424 | fi 425 | fi 426 | 427 | index=1 428 | for type in rx tx; do 429 | for counter in bytes packets errs drop fifo frame compressed multicast; do 430 | addMetricContent "netdev_${type}_${counter}_total" "{device=\"${array[0]}\"} ${array[$index]}" 431 | index=$(( index + 1 )) 432 | done 433 | done 434 | done < <(echo "$buffer" | grep ':') 435 | 436 | buildOutput 437 | } 438 | # 439 | 440 | # 441 | # BEGIN extract data from the conntrack utility 442 | function metrics_conntrack { 443 | # # conntrack -S conntrack 444 | # cpu=0 found=136 invalid=7864574 ignore=1238 insert=0 insert_failed=0 drop=0 early_drop=5904 error=193727 search_restart=12759417 445 | # cpu=1 found=131 invalid=1455753 ignore=468 insert=0 insert_failed=0 drop=0 early_drop=81 error=3 search_restart=5313345 446 | # cpu=2 found=128 invalid=1462495 ignore=552 insert=0 insert_failed=0 drop=0 early_drop=30 error=0 search_restart=5338620 447 | # # conntrack -S expect 448 | # cpu=0 expect_new=0 expect_create=0 expect_delete=0 449 | # cpu=1 expect_new=0 expect_create=0 expect_delete=0 450 | # cpu=2 expect_new=0 expect_create=0 expect_delete=0 451 | 452 | if [ "$conntrack_ignore_fake_cpuid" = "1" ]; then 453 | processors=$(($(nproc)-1)) 454 | fi 455 | 456 | buildMetric "conntrack_counter" "gauge" "From conntrack -C" 457 | buildMetric "conntrack_statistics" "gauge" "From conntrack -S" 458 | 459 | for table in conntrack expect; do 460 | addMetricContent "conntrack_counter" "{table=\"$table\"} $(conntrack -C $table)" 461 | 462 | # sum of all CPUs is way less cardinality 463 | if [ "$conntrack_sum_all_cpu" = "1" ]; then 464 | declare -A cols=() 465 | 466 | while read line; do 467 | read -r cpu_label counters <<< "$line" 468 | IFS="=" read -r cpu_label cpu_value <<< "$cpu_label" 469 | 470 | if [ "$conntrack_ignore_fake_cpuid" = "1" ]; then 471 | if [ "$cpu_value" -gt "$processors" ]; then 472 | continue 473 | fi 474 | fi 475 | 476 | for counter in $counters; do 477 | IFS="=" read -r metric value <<< "$counter" 478 | cols[$metric]=$(( ${cols[$metric]} + $value )) 479 | done 480 | done < <(conntrack -S $table) 481 | 482 | for counter in "${!cols[@]}"; do 483 | addMetricContent "conntrack_statistics" "{counter=\"$counter\",table=\"$table\"} ${cols[$counter]}" 484 | done 485 | else 486 | while read line; do 487 | read -r cpu_label counters <<< "$line" 488 | IFS="=" read -r cpu_label cpu_value <<< "$cpu_label" 489 | 490 | if [ "$conntrack_ignore_fake_cpuid" = "1" ]; then 491 | if [ "$cpu_value" -gt "$processors" ]; then 492 | continue 493 | fi 494 | fi 495 | 496 | for counter in $counters; do 497 | IFS="=" read -r metric value <<< "$counter" 498 | addMetricContent "conntrack_statistics" "{cpu=\"$cpu_value\",counter=\"$metric\",table=\"$table\"} $value" 499 | done 500 | done < <(conntrack -S $table) 501 | fi 502 | done 503 | 504 | buildOutput 505 | } 506 | # 507 | 508 | if [ "$1" = "get" ]; then 509 | metrics_$3 > "/tmp/$2/$3" 510 | else 511 | run=$(join_by , ${exporters[*]}) 512 | 513 | # make temp dir 514 | mkdir -p /tmp/$$/ 515 | 516 | # we're using xargs to run this script in multiple subprocesses 517 | # xargs will handle collecting data for us 518 | output=$(echo "$run" | xargs -n1 -P$workers -d',' -- "$bashPath" "$0" get $$) 519 | 520 | # the -e parameter transforms literal newlines into "actual" newlines 521 | # echo -e "$output" 522 | echo -e "$(cat /tmp/$$/*)" 523 | 524 | rm -rf /tmp/$$/ 525 | fi 526 | 527 | exit 528 | 529 | # BEGIN DEBUG SECTION 
# buildOutput
# END DEBUG SECTION

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Monitoring Exporter

System metrics exporter for Prometheus. Written in Bash and served via Xinetd (plus a small wrapper script that makes the output HTTP-compliant). The exporter provides reasonably fast response times by offloading modules to subprocesses (see `workers=2`). However, the response time will always be at least 1 second, as the exporter calculates current CPU usage by measuring it over a second.

#### A note on monitoring CPU usage

This exporter provides you with two CPU usage metrics. One of them is taken from /proc/stat verbatim (`monitoring_stat_cpu_seconds_total`). The other is a computed CPU usage. The exporter computes CPU usage by measuring it over one second. **The CPU usage calculation ignores iowait - iowait is not CPU usage; the processor is free to do other work while waiting on disk.**

## Table of Contents

1. [Installation](#installation)
2. [Grafana Dashboard](#grafana)
3. [Metrics and Configuration](#metrics-and-configuration)
4. [Extending](#extending)

## Installation

Tested on:
- Debian 7, 8, 9
- Ubuntu 14, 16, 18
- CentOS 5 (with a workaround), 6, 7
- CloudLinux 6, 7

Requires Bash >= 4. On CentOS 5, for example, you will need to install Bash 4 separately (perhaps as a separate binary, e.g. `/bin/bash42`) and point `bashPath` in the exporter file at it. You may also need to change the shebang lines, depending on your system.

### Ansible

A playbook is included in `monitoring_exporter.yml`.

You will likely need to edit the IP address variable `ip_address` - this is the IP address the playbook will add to the appropriate xinetd.d files; it acts as a makeshift whitelist when a single Prometheus instance will be contacting the exporter.

You can run the playbook with variables using the `-e` parameter: `ansible-playbook -l web_servers -e ip_address=123.123.123.123 monitoring_exporter.yml`

Or, fetching your public IP dynamically: `ansible-playbook -l web_servers -e ip_address=$(curl -s icanhazip.com) monitoring_exporter.yml`

### Installing without Ansible

- Install Xinetd
- "Template" the `exporter_monitoring.xinetd` file into /etc/xinetd.d/ (you will need to change the IP address setting inside, or remove it if you do not intend to use the Xinetd whitelist)
- `mkdir -p /opt/metrics.d/`
- Place `exporter_monitoring` in `/opt/metrics.d/exporter_monitoring`
- Place `httpwrapper` in `/opt/metrics.d/httpwrapper`
- `chmod +x /opt/metrics.d/*`
- Restart Xinetd

## Grafana

A Grafana dashboard can be found here: https://grafana.com/grafana/dashboards/12095

## No docker?!

This exporter is designed to run directly on the server. If you do make it work in Docker, do let me know, though.

# Metrics and Configuration

## Port 10100

Provides the following metrics, grouped by function. All module exports are prefixed with `monitoring_` in code. An example request is shown below.
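For example, once the exporter is installed, you can check the endpoint with `curl` from a whitelisted host (the hostname and metric value below are illustrative):

```
curl -s http://target-host.example:10100/
# HELP monitoring_uptime_seconds_total System uptime
# TYPE monitoring_uptime_seconds_total counter
monitoring_uptime_seconds_total 123456.78
```

A matching Prometheus scrape job only needs the host and port 10100 as a static target.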

## Configuring

You are free to select only the metrics relevant to you by editing this line:
`declare -a exporters=(loadavg stat iostat filesystem memory netdev netstat uptime kernel file_nr)`

#### loadavg (default: enabled)

Exports data from `/proc/loadavg`

| name | description | additional labels | units |
|------------------------------------|--------------------------------------------------|-------------------|-------|
| monitoring_loadavg_load1 | 1 minute load average, as seen in /proc/loadavg | none | N |
| monitoring_loadavg_load5 | 5 minute load average, as seen in /proc/loadavg | none | N |
| monitoring_loadavg_load15 | 15 minute load average, as seen in /proc/loadavg | none | N |
| monitoring_loadavg_jobs_running | Number of running jobs | none | N |
| monitoring_loadavg_jobs_background | Number of background jobs | none | N |

#### stat (default: enabled)

Exports *some* data from `/proc/stat`

| name | description | additional labels | units |
|--------------------------------------------|------------------------------------------------------------------------------|-------------------|------------|
| monitoring_stat_cpu_seconds_total | Total time in ticks (usually seconds) the CPU has spent in each mode | mode | Seconds |
| monitoring_stat_cpu_current_usage_percent | CPU usage at the moment of execution `1` | none | Percentage |
| monitoring_stat_cpu_current_iowait_percent | iowait at the moment of execution `2` - the same as you'd see when running "top" | none | Percentage |

`1.` calculated by subtracting `%iowait`+`%idle` from 100 over 1 second: `100 - (idle + iowait)`
`2.` calculated as the percentage of time spent in `iowait` over the same 1-second window

#### filesystem (default: enabled)

Exports data from `df`

| name | description | additional labels | units |
|----------------------------------------|-----------------------------------------------|--------------------------------|------------|
| monitoring_filesystem_total_kbytes | Total amount of space in a filesystem, in kbytes | mountpoint, source, fstype `1` | kbytes |
| monitoring_filesystem_used_kbytes | Used kbytes in a filesystem | mountpoint, source, fstype `1` | kbytes |
| monitoring_filesystem_avail_kbytes | Available kbytes in a filesystem | mountpoint, source, fstype `1` | kbytes |
| monitoring_filesystem_capacity_percent | Percentage usage of the filesystem | mountpoint, source, fstype `1` | Percentage |

`1.` mountpoint: the mount point in the local system
`1.` source: the block device
`1.` fstype: filesystem type

#### iostat (default: enabled)
Exports data from `iostat -xm`

| name | description | additional labels | units |
|----------------------------|--------------------------------------------------------------------------------|-------------------|------------|
| monitoring_iostat_rrqm | The number of read requests merged per second that were queued to the device | device* | N |
| monitoring_iostat_wrqm | The number of write requests merged per second that were queued to the device | device* | N |
| monitoring_iostat_r | The number (after merges) of read requests completed per second for the device | device* | N |
| monitoring_iostat_w | The number (after merges) of write requests completed per second for the device | device* | N |
| monitoring_iostat_rmb | The number of megabytes read from the device per second | device* | N |
| monitoring_iostat_wmb | The number of megabytes written to the device per second | device* | N |
| monitoring_iostat_avgrq_sz | The average size (in sectors) of the requests that were issued to the device | device* | N |
| monitoring_iostat_avgqu_sz | The average queue length of the requests that were issued to the device | device* | N |
| monitoring_iostat_await | The average time (in milliseconds) for I/O requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them | device* | ms |
| monitoring_iostat_r_await | The average time (in milliseconds) for read requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them | device* | ms |
| monitoring_iostat_w_await | The average time (in milliseconds) for write requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them | device* | ms |
| monitoring_iostat_svctm | The average service time (in milliseconds) for I/O requests that were issued to the device. Warning! Do not trust this field any more. This field will be removed in a future sysstat version | device* | ms |
| monitoring_iostat_util | Percentage of elapsed time during which I/O requests were issued to the device (bandwidth utilization for the device). Device saturation occurs when this value is close to 100% | device* | Percentage |

* device: the block device

Configuration variable `iostat_targets` can be used to specify a list of block devices to pull metrics for, space-separated - see the example below.
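For example, to limit collection to specific devices, interfaces, and mount points, the target variables in the config section of `exporter_monitoring` could be set like this (the device and mount names are illustrative):

```
# in the BEGIN CONFIG section of exporter_monitoring
iostat_targets="sda sdb"
netdev_targets="eth0"
filesystem_targets="/ /var"
```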

#### meminfo (default: enabled)

Exports data from `/proc/meminfo`

| name | description | additional labels | units |
|----------------------------------------|------------------------------------------------------------------------------------------------------------------------------|-------------------|-----------|
| monitoring_meminfo_committed_as_kbytes | The amount of memory, in kibibytes, presently allocated on the system (committed address space) | none | kibibytes |
| monitoring_meminfo_buffers_kbytes | The amount, in kibibytes, of temporary storage for raw disk blocks | none | kibibytes |
| monitoring_meminfo_sreclaimable_kbytes | The part of Slab that can be reclaimed, such as caches | none | kibibytes |
| monitoring_meminfo_slab_kbytes | The total amount of memory, in kibibytes, used by the kernel to cache data structures for its own use | none | kibibytes |
| monitoring_meminfo_swaptotal_kbytes | The total amount of swap available, in kibibytes | none | kibibytes |
| monitoring_meminfo_memtotal_kbytes | Total amount of usable RAM, in kibibytes, which is physical RAM minus a number of reserved bits and the kernel binary code | none | kibibytes |
| monitoring_meminfo_swapfree_kbytes | The total amount of swap free, in kibibytes | none | kibibytes |
| monitoring_meminfo_memfree_kbytes | The amount of physical RAM, in kibibytes, left unused by the system | none | kibibytes |
| monitoring_meminfo_cached_kbytes | The amount of physical RAM, in kibibytes, used as cache memory | none | kibibytes |

**Additionally, systems with kernel 3.14 or newer may contain the following metric:**

| name | description | additional labels | units |
|----------------------------------------|----------------------------------------------------------------------------------------------|-------------------|-----------|
| monitoring_meminfo_memavailable_kbytes | An estimate of how much memory is available for starting new applications, without swapping | none | kibibytes |

#### netdev (default: enabled)

Exports data from `/proc/net/dev`

| name | description | additional labels | units |
|---------------------------------------|--------------------------------------------------------------------|-------------------|-------|
| monitoring_netdev_rx_bytes_total | The total number of bytes of data received by the interface | device`1` | N |
| monitoring_netdev_rx_packets_total | The total number of packets received by the interface | device`1` | N |
| monitoring_netdev_rx_errs_total | The total number of receive errors detected by the device driver | device`1` | N |
| monitoring_netdev_rx_drop_total | The total number of packets dropped by the device driver | device`1` | N |
| monitoring_netdev_rx_fifo_total | The number of FIFO buffer errors | device`1` | N |
| monitoring_netdev_rx_frame_total | The number of packet framing errors | device`1` | N |
| monitoring_netdev_rx_compressed_total | The number of compressed packets received by the device driver | device`1` | N |
| monitoring_netdev_rx_multicast_total | The number of multicast frames received by the device driver | device`1` | N |
| monitoring_netdev_tx_bytes_total | The total number of bytes of data transmitted by the interface | device`1` | N |
| monitoring_netdev_tx_packets_total | The total number of packets transmitted by the interface | device`1` | N |
| monitoring_netdev_tx_errs_total | The total number of transmit errors detected by the device driver | device`1` | N |
| monitoring_netdev_tx_drop_total | The total number of packets dropped by the device driver | device`1` | N |
| monitoring_netdev_tx_fifo_total | The number of FIFO buffer errors | device`1` | N |
| monitoring_netdev_tx_frame_total | The number of packet framing errors | device`1` | N |
| monitoring_netdev_tx_compressed_total | The number of compressed packets transmitted by the device driver | device`1` | N |
| monitoring_netdev_tx_multicast_total | The number of multicast frames transmitted by the device driver | device`1` | N |

`1` network device

Configuration variable `netdev_targets` can be used to specify a list of interfaces to pull metrics for, space-separated.

#### netstat (default: enabled)

| name | description | additional labels | units |
|---------------------------------------|-------------------------------------------------|-------------------|-------|
| monitoring_netstat_established_total | The total number of established connections | none | N |
| monitoring_netstat_established | The number of established connections per port | port`1` | N |

NB: Some hosts (like OpenVZ/LXC hypervisors) may have trouble reporting all connections on time with netstat. On such hosts it is advised to disable per-port granularity by setting `netstat_totalonly` to `1` in monitoring_exporter.

`1` source port

#### file_nr (default: enabled)

Provides data from /proc/sys/fs/file-nr

| name | description | additional labels | units |
|------------------------------|---------------------------------|-------------------|-------|
| monitoring_file_nr_allocated | All allocated file descriptors | none | N |
| monitoring_file_nr_free | Free allocated file descriptors | none | N |
| monitoring_file_nr_max | Maximum open file descriptors | none | N |

#### slabinfo (default: disabled)

From /proc/slabinfo - usually lots of output, so disabled by default.

By default, `slab_nonzeroonly` is set to `1` so that this function reports only non-zero slabs.

| name | description | additional labels | units |
|-----------------------------------|-----------------------------------------------------------------------------------------|-------------------|-------|
| monitoring_slabinfo_active_objs | Number of objects that are currently active (i.e., in use) | slab`1` | N |
| monitoring_slabinfo_num_objs | Total number of allocated objects (i.e., objects that are both in use and not in use) | slab`1` | N |
| monitoring_slabinfo_objsize_bytes | Size of objects in this slab, in bytes | slab`1` | Bytes |
| monitoring_slabinfo_objperslab | Number of objects stored in each slab | slab`1` | N |
| monitoring_slabinfo_pagesperslab | Number of pages allocated for each slab | slab`1` | N |

`1` slab identifier

#### conntrack (default: disabled)

| name | description | additional labels | units |
|---------------------------------|----------------------------|------------------------------|-------|
| monitoring_conntrack_counter | Output from `conntrack -C` | table`1` | N |
| monitoring_conntrack_statistics | Output from `conntrack -S` | table`1`, counter`2`, cpu`3` | N |
`1` conntrack table name
`2` conntrack counter name
`3` this label represents the CPU ID if `conntrack_sum_all_cpu` is set to `0`; otherwise the label is not present

Configuration variable `conntrack_ignore_fake_cpuid` controls whether counters for CPU IDs higher than the number of processors are ignored. Default `1`.

## Extending

Instead of modifying this exporter, you may be interested in creating your own workflow following this post: [Exporting Prometheus Metrics with Bash Scripts](https://apawel.me/exporting-prometheus-metrics-with-bash-scripts/)

It is relatively simple to add your own metric functions. Take this function as an example.

```
#
# BEGIN filesystem usage
function metrics_filesystem {
    buffer=$(df -PT $filesystem_targets 2>/dev/null | grep -v '^Filesystem' | tr -s '\t' ' ')

    # Filesystem 1024-blocks Used Available Capacity Mounted on
    # /dev/mapper/cl-home 99533328 37320564 62212764 38% /
    buildMetric "filesystem_total_kbytes" "gauge" "From df"
    buildMetric "filesystem_used_kbytes" "gauge" "From df"
    buildMetric "filesystem_avail_kbytes" "gauge" "From df"
    buildMetric "filesystem_capacity_percent" "gauge" "From df"

    while read -r device fstype total used avail percent target; do
        labels="{mountpoint=\"$target\",source=\"$device\",fstype=\"$fstype\"}"
        addMetricContent "filesystem_total_kbytes" "$labels $total"
        addMetricContent "filesystem_used_kbytes" "$labels $used"
        addMetricContent "filesystem_avail_kbytes" "$labels $avail"
        noPercentSign=$(echo "$percent" | sed 's,%,,g')
        addMetricContent "filesystem_capacity_percent" "$labels $noPercentSign"
    done <<< "$buffer"

    buildOutput
}
# END filesystem usage
```

The function **buildMetric** adds a metric to the collection; in the example above the metric name is filesystem_total_kbytes (the exporter name is prepended automatically), the type is "gauge" and the help text is "From df".

The **addMetricContent** function is used to add the actual metric value and labels. Unfortunately, there is no built-in handling for labels.

**buildOutput** is used to "convert" the metric arrays to Prometheus-compatible output and is placed at the very end.

The function name must be prefixed with metrics_, i.e. metrics_myfunction, for it to be utilized. Once you have your function, add it to the `exporters` array in the config section:

`declare -a exporters=( myfunction ... )`

Note that in the above array we are not using the "metrics_" prefix. A complete (hypothetical) example module is sketched below.
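As a minimal sketch, here is a hypothetical `metrics_myfunction` module (the metric name and help text are invented for illustration) that exports the number of processes reported by `ps`:

```
#
# BEGIN myfunction (hypothetical example)
function metrics_myfunction {
    # exported as monitoring_processes_running
    buildMetric "processes_running" "gauge" "Number of processes, from ps"

    # count the lines printed by ps, minus the header line
    addMetricContent "processes_running" "$(( $(ps -e | wc -l) - 1 ))"

    buildOutput
}
# END myfunction
```

Remember to add `myfunction` to the `exporters` array so the dispatcher picks it up.
--------------------------------------------------------------------------------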