├── exporter_monitoring.xinetd
├── recording_rules.yml
├── httpwrapper
├── monitoring_exporter.yml
├── exporter_monitoring
└── README.md


/exporter_monitoring.xinetd:
--------------------------------------------------------------------------------
 1 | service exporter_monitoring
 2 | {
 3 |   type = unlisted
 4 |   port = 10100
 5 |   socket_type = stream
 6 |   wait = no
 7 |   user = root
 8 |   server = /opt/metrics.d/httpwrapper
 9 |   server_args = exporter_monitoring
10 |   disable = no
11 |   only_from = {{ ip_address }}
12 |   log_type = FILE /dev/null 
13 | }
14 | 


--------------------------------------------------------------------------------
/recording_rules.yml:
--------------------------------------------------------------------------------
1 | groups:
2 | - name: monitoring_recording_rules
3 |   interval: 1m
4 |   rules:
5 |   - record: monitoring_memory_usage_percent
6 |     expr: ((monitoring_meminfo_memtotal_kbytes - (monitoring_meminfo_memavailable_kbytes or (monitoring_meminfo_buffers_kbytes + monitoring_meminfo_cached_kbytes + monitoring_meminfo_memfree_kbytes))) / monitoring_meminfo_memtotal_kbytes) * 100
7 |   - record: monitoring_swap_usage_percent
8 |     expr: ((monitoring_meminfo_swaptotal_kbytes - monitoring_meminfo_swapfree_kbytes) / monitoring_meminfo_swaptotal_kbytes) * 100
9 | 


--------------------------------------------------------------------------------
/httpwrapper:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # small http wrapper for bash scripts via xinetd
 3 | 
 4 | ulimit -n 20480
 5 | ulimit -l 512
 6 | 
 7 | root='/opt/metrics.d/'
 8 | file="$1"
 9 | mime='text/plain'
10 | 
11 | cd $root
12 | 
13 | if [ -f "$root$file" ]; then
14 |   $root$file > /tmp/.$$.output
15 | 
16 |   size=$(stat -c "%s" "/tmp/.$$.output")
17 | 
18 |   printf 'HTTP/1.1 200 OK\r\nDate: %s\r\nContent-Length: %s\r\nContent-Type: %s\r\nConnection: close\r\n\r\n' "$(date)" "$size" "$mime"
19 | 
20 |   cat /tmp/.$$.output
21 | 
22 |   sleep 1
23 |   rm -f /tmp/.$$.output
24 |   exit 0
25 | fi
26 | 
27 | exit 1
28 | 


--------------------------------------------------------------------------------
/monitoring_exporter.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | - hosts: all
 3 |   vars:
 4 |     # set below to an IP address you'd like to allow to read the xinetd files
 5 |     ip_address: "123.123.123.123"
 6 |   tasks:
 7 |   - file:
 8 |       path: "/opt/metrics.d"
 9 |       state: directory
10 | 
11 |   - apt:
12 |       pkg: [ "xinetd", "bc", "sysstat", "net-tools", "conntrack" ]
13 |       state: present
14 |     when:
15 |     - "ansible_distribution == 'Ubuntu' or ansible_distribution == 'Debian'"
16 |     
17 |   - yum:
18 |       pkg: [ "xinetd", "bc", "sysstat", "net-tools", "conntrack-tools" ]
19 |       state: present
20 |     when:
21 |     - "ansible_distribution == 'CentOS' or ansible_distribution == 'CloudLinux'"
22 |   
23 |   - copy: 
24 |       src: "httpwrapper"
25 |       dest: "/opt/metrics.d/httpwrapper"
26 |       mode: 0755
27 | 
28 |   - copy: 
29 |       src: "exporter_monitoring"
30 |       dest: "/opt/metrics.d/exporter_monitoring"
31 |       mode: 0755
32 |       
33 |   - template:
34 |       src: "exporter_monitoring.xinetd"
35 |       dest: "/etc/xinetd.d/exporter_monitoring"
36 | 
37 |   - service:
38 |       name: xinetd
39 |       state: restarted
40 |       enabled: yes
41 | 


--------------------------------------------------------------------------------
/exporter_monitoring:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | #  Pawel Adamski @ https://github.com/pawadski/monitoring_exporter
  3 | #  tested on: bash >= 4.0
  4 | #        coreutils >= 8.4
  5 | 
  6 | # BEGIN CONFIG
  7 | # edit below
  8 | # number of workers (subprocesses) to use
  9 | workers=2
 10 | # enables "modules"
 11 | #       loadavg - load average, jobs, last PID
 12 | #          stat - /proc/stat (no individual CPU granularity)
 13 | #        iostat - iostat -x
 14 | #    filesystem - df
 15 | #        memory - /proc/meminfo
 16 | #        netdev - /proc/net/dev
 17 | #       netstat - netstat, per-port granularity of established connections
 18 | #        uptime - system uptime
 19 | #        kernel - kernel version (as label)
 20 | #      slabinfo - /proc/slabinfo # disabled by default
 21 | #       file_nr - /proc/sys/fs/file-nr, file descriptor information
 22 | #     conntrack - information from the conntrack utility
 23 | declare -a exporters=(loadavg stat iostat filesystem memory netdev netstat uptime kernel file_nr)
 24 | # used to determine if netstat module should only report total amount of connections
 25 | # useful for servers like openvz hypervisors or generally busy servers, enabled by default
 26 | netstat_totalonly=1
 27 | # useful for limiting the amount of metrics - enable to report only
 28 | # slabs that are of non-zero size
 29 | slab_nonzeroonly=1
 30 | # optionally specify interfaces you'd like to pull netdev metrics for
 31 | # ie. eth0 eth1, empty by default
 32 | netdev_targets=""
 33 | # optionally specify filesystem mount points for df
 34 | # ie. / /mnt/otherfs, empty by default
 35 | filesystem_targets=""
 36 | # optionally specify block devices you'd like to pull iostat metrics for
 37 | # ie. sda sdb, empty by default
 38 | iostat_targets=""
 39 | # whether to ignore CPUs with IDs > number of processors in conntrack
 40 | conntrack_ignore_fake_cpuid=1
 41 | # high cardinality metric!
 42 | # when 0, will report conntrack stats for each cpu
 43 | # when 1, will report a sum of conntrack stats for all cpus
 44 | conntrack_sum_all_cpu=1
 45 | # defines the exporter name, prepended to every metric name
 46 | exporterName="monitoring"
 47 | # define path to interpreter, in case it's different than stock
 48 | bashPath="/bin/bash"
 49 | 
 50 | # END CONFIG
 51 | #
 52 | # do not edit below
 53 | output=""
 54 | 
 55 | # define some associative arrays
 56 | declare -A metricContent metricType metricHelp
 57 | 
 58 | #
 59 | #  this function builds the metric type and help data
 60 | #
 61 | function buildMetric {
 62 |     # 1 = metric name
 63 |     # 2 = metric type
 64 |     # 3 = metric help text
 65 |     metricType["${exporterName}_$1"]="$2"
 66 |     metricHelp["${exporterName}_$1"]="$3"
 67 |     # metricContent["${exporterName}_$1"]=""
 68 | }
 69 | 
 70 | #
 71 | #  this function adds actual metric content to metric
 72 | #
 73 | function addMetricContent {
 74 |     metricName="${exporterName}_$1"
 75 |     declare -a contents
 76 |     shift;
 77 |     # add everything
 78 |     for item in "$@"; do 
 79 |         if [ "${item:0:1}" != "{" ]; then
 80 |             item=" $item"
 81 |         fi
 82 |         # metricContent[$metricName]="${metricContent[$metricName]}$metricName$item\n"
 83 |         contents+=("$metricName$item")
 84 |     done 
 85 |     if [ ! -z "${metricContent[$metricName]}" ]; then 
 86 |         metricContent[$metricName]=$(join_by "\n" "${metricContent[$metricName]}" "${contents[*]}")
 87 |     else 
 88 |         metricContent[$metricName]=$(join_by "\n" "${contents[*]}")
 89 |     fi
 90 | }
 91 | 
 92 | #
 93 | #  this builds and prints final output
 94 | #
 95 | function buildOutput {
 96 |     #
 97 |     #  build output
 98 |     #
 99 |     for metricName in "${!metricContent[@]}"; do
100 |         output="$output# HELP $metricName ${metricHelp[$metricName]}\n# TYPE $metricName ${metricType[$metricName]}\n${metricContent[$metricName]}\n"
101 |     done
102 | 
103 |     #
104 |     #  print output into temp directory
105 |     #
106 |     echo -n "$output"
107 | 
108 |     exit
109 | }
110 | 
111 | #
112 | #  simulates array join
113 | #
114 | function join_by { local d=$1; shift; echo -n "$1"; shift; printf "%s" "${@/#/$d}"; }
115 | 
116 | #
117 | #  metric runners
118 | #  ideally define your exporters here as metrics_XXXXXX
119 | #
120 | 
121 | #
122 | # BEGIN uptime
123 | function metrics_slabinfo {
124 |     buildMetric "slabinfo_active_objs" "gauge" "The number of objects that are currently active (i.e., in use)"
125 |     buildMetric "slabinfo_num_objs" "gauge" "The total number of allocated objects (i.e., objects that are both in use and not in use)"
126 |     buildMetric "slabinfo_objsize_bytes" "gauge" "The size of objects in this slab, in bytes"
127 |     buildMetric "slabinfo_objperslab" "gauge" "The number of objects stored in each slab"
128 |     buildMetric "slabinfo_pagesperslab" "gauge" "The number of pages allocated for each slab"
129 | 
130 |     while read slabname active_objs num_objs objsize objperslab pagesperslab etc; do 
131 |         if [ "$slabname" = "#" ]; then 
132 |             continue
133 |         fi 
134 |         if [ "$slab_nonzeroonly" = "1" ]; then 
135 |             if [ "$objsize" = "0" ]; then 
136 |                 continue
137 |             fi 
138 |         fi 
139 |         addMetricContent "slabinfo_active_objs" "{slab=\"$slabname\"} $active_objs"
140 |         addMetricContent "slabinfo_num_objs" "{slab=\"$slabname\"} $num_objs"
141 |         addMetricContent "slabinfo_objsize_bytes" "{slab=\"$slabname\"} $objsize"
142 |         addMetricContent "slabinfo_objperslab" "{slab=\"$slabname\"} $objperslab"
143 |         addMetricContent "slabinfo_pagesperslab" "{slab=\"$slabname\"} $pagesperslab"
144 |     done < <(grep tunables /proc/slabinfo)
145 | 
146 |     buildOutput
147 | }
148 | # END uptime
149 | 
150 | #
151 | # BEGIN file_nr
152 | function metrics_file_nr {
153 |     file_nr_array=($(cat /proc/sys/fs/file-nr))
154 | 
155 |     buildMetric "file_nr_allocated" "gauge" "Total allocated file descriptors"
156 |     buildMetric "file_nr_free" "gauge" "Free allocated file descriptors"
157 |     buildMetric "file_nr_max" "gauge" "Max allowed open file descriptors"
158 | 
159 |     addMetricContent "file_nr_allocated" "${file_nr_array[0]}"
160 |     addMetricContent "file_nr_free" "${file_nr_array[1]}"
161 |     addMetricContent "file_nr_max" "${file_nr_array[2]}"
162 | 
163 |     buildOutput
164 | }
165 | # END file_nr
166 | 
167 | #
168 | # BEGIN uptime
169 | function metrics_kernel {
170 |     buildMetric "kernel_version" "gauge" "Current kernel version as label"
171 |     addMetricContent "kernel_version" "{version=\"$(uname -r)\"} 1"
172 | 
173 |     buildOutput
174 | }
175 | # END uptime
176 | 
177 | #
178 | # BEGIN uptime
179 | function metrics_uptime {
180 |     buildMetric "uptime_seconds_total" "counter" "System uptime"
181 |     addMetricContent "uptime_seconds_total" "$(awk '{print $1}' /proc/uptime)"
182 | 
183 |     buildOutput
184 | }
185 | # END uptime
186 | 
187 | #
188 | # BEGIN netstat (sorts netstat ESTABLISHED connections by port label)
189 | function metrics_netstat {
190 |     buildMetric "netstat_established_total" "gauge" "Total number of Established connections"
191 | 
192 |     read conn _ <<< $(netstat -s | grep 'connections established$')
193 |     addMetricContent "netstat_established_total" "$conn"
194 | 
195 |     if [ "$netstat_totalonly" = "1" ]; then 
196 |         buildOutput
197 |         return 
198 |     fi 
199 | 
200 |     conndump=$(netstat -tn | grep ESTABLISHED)
201 | 
202 |     buildMetric "netstat_established" "gauge" "Number of Established connections per port"
203 | 
204 |     while read -r count port; do 
205 |         addMetricContent "netstat_established" "{port=\"$port\"} $count"
206 |     done < <(echo "$conndump" | awk '{print $4}' | cut -d':' -f2 | sort | uniq -c | sed 's/^\s*//')
207 | 
208 |     buildOutput
209 | }
210 | # END netstat
211 | 
212 | #
213 | # BEGIN load average stuff
214 | function metrics_loadavg {
215 |     buildMetric "loadavg_load1" "gauge" "1-minute load average from /proc/loadavg"
216 |     buildMetric "loadavg_load5" "gauge" "10-minute load average from /proc/loadavg"
217 |     buildMetric "loadavg_load15" "gauge" "15-minute load average from /proc/loadavg"
218 |     buildMetric "loadavg_jobs_running" "gauge" "Running jobs from /proc/loadavg"
219 |     buildMetric "loadavg_jobs_background" "gauge" "Background jobs from /proc/loadavg"
220 |     buildMetric "loadavg_last_pid" "counter" "Last PID from /proc/loadavg"
221 | 
222 |     while read -r load1 load2 load3 procs lastpid; do
223 |         # sample input: 4.76 3.86 3.58 9/652 17232
224 |         addMetricContent "loadavg_load1" $load1
225 |         addMetricContent "loadavg_load5" $load2
226 |         addMetricContent "loadavg_load15" $load3
227 |         while IFS='/' read -r running background; do 
228 |             addMetricContent "loadavg_jobs_running" $running
229 |             addMetricContent "loadavg_jobs_background" $background
230 |         done <<< "$procs"
231 |         addMetricContent "loadavg_last_pid" $lastpid
232 |     done < /proc/loadavg
233 | 
234 |     buildOutput
235 | }
236 | # END load average stuff
237 | 
238 | #
239 | # BEGIN memory stuff
240 | function metrics_memory {
241 |     buffer=$(awk '{ print tolower($1), $2 }' /proc/meminfo)
242 |     # we export only what we need 
243 |     for item in memtotal memfree memavailable buffers cached slab swaptotal swapfree sreclaimable committed_as; do 
244 |         while read -r metric value; do
245 |             if [ ! -z "$metric" ]; then 
246 |                 buildMetric "meminfo_${item}_kbytes" "gauge" "From /proc/meminfo"
247 |                 addMetricContent "meminfo_${item}_kbytes" "$value"
248 |             fi 
249 |         done < <(grep "^$item:" <<< "$buffer")
250 |     done
251 | 
252 |     buildOutput
253 | }
254 | # END memory stuff
255 | 
256 | #
257 | # BEGIN filesystem usage
258 | function metrics_filesystem {
259 |     buffer=$(df -PT $filesystem_targets 2>/dev/null | grep -v '^Filesystem' | tr -s '\t' ' ')
260 |     
261 |     # Filesystem 1024-blocks Used Available Capacity Mounted on
262 |     # /dev/mapper/cl-home 99533328 37320564 62212764 38% /
263 |     buildMetric "filesystem_total_kbytes" "gauge" "From df"
264 |     buildMetric "filesystem_used_kbytes" "gauge" "From df"
265 |     buildMetric "filesystem_avail_kbytes" "gauge" "From df"
266 |     buildMetric "filesystem_capacity_percent" "gauge" "From df"
267 | 
268 |     while read -r device fstype total used avail percent target; do 
269 |         labels="{mountpoint=\"$target\",source=\"$device\",fstype=\"$fstype\"}"
270 |         addMetricContent "filesystem_total_kbytes" "$labels $total"
271 |         addMetricContent "filesystem_used_kbytes" "$labels $used"
272 |         addMetricContent "filesystem_avail_kbytes" "$labels $avail"
273 |         noPercentSign=$(echo "$percent" | sed 's,%,,g')
274 |         addMetricContent "filesystem_capacity_percent" "$labels $noPercentSign"
275 |     done <<< "$buffer"
276 | 
277 |     buildOutput
278 | }
279 | # END filesystem usage
280 | 
281 | #
282 | # BEGIN /proc/stat CPU stats
283 | function metrics_stat {
284 |     # /proc/stat has different column number in various kernel versions,
285 |     # so we instantiate a list of columns we're going to loop over later
286 |     declare -a cols=(user nice system idle iowait irq softirq steal guest guest_nice)
287 | 
288 |     # add metric
289 |     buildMetric "stat_cpu_seconds_total" "counter" "From /proc/stat"
290 | 
291 |     # fetch first (we'll use it to build "current" usage later)
292 |     cpu1=$(head -n 1 /proc/stat | tr -s '\t' ' ' | sed 's,cpu ,,g')
293 |     # sleep
294 |     sleep 1
295 |     # fetch second
296 |     cpu2=$(head -n 1 /proc/stat | tr -s '\t' ' ' | sed 's,cpu ,,g')
297 | 
298 |     # add metric contents
299 |     index=0
300 |     while read -r data; do 
301 |         for item in $data; do 
302 |             addMetricContent "stat_cpu_seconds_total" "{mode=\"${cols[$index]}\"} $item"
303 |             index=$(( index + 1 ))
304 |         done
305 |     done <<< "$cpu2"
306 | 
307 |     # create three temporary arrays
308 |     read -r -a cpudata1 <<< "$cpu1"
309 |     read -r -a cpudata2 <<< "$cpu2"
310 |     declare -a difference=()
311 | 
312 |     # iterate & compare /proc/stat output arrays
313 |     index=0
314 |     delta_total=0
315 |     for item in "${cpudata1[@]}"; do
316 |         delta=$(( ${cpudata2[$index]} - ${cpudata1[$index]} ))
317 | 
318 |         difference[$index]=$delta
319 |         delta_total=$(( $delta_total + $delta ))
320 | 
321 |         index=$(( $index + 1 ))
322 |     done 
323 | 
324 |     # total=$(( ${difference[0]} + ${difference[1]} + ${difference[2]} + ${difference[3]} + ${difference[4]} ))
325 |     idlePercentage="(( ${difference[3]} + ${difference[4]} ) / $delta_total) * 100.0"
326 |     iowaitPercentage="(( ${difference[4]} ) / $delta_total) * 100.0"
327 |     cpuutil=$(awk "BEGIN { printf \"%i\", 100.0 - $idlePercentage }" )
328 |     iowait=$(awk "BEGIN { printf \"%i\", $iowaitPercentage }")
329 | 
330 |     # build "current" usage metric
331 |     buildMetric "stat_cpu_current_usage_percent" "gauge" "Calculated instant CPU usage"
332 |     buildMetric "stat_cpu_current_iowait_percent" "gauge" "Calculated instant CPU iowait"
333 | 
334 |     # add content 
335 |     addMetricContent "stat_cpu_current_usage_percent" $cpuutil
336 |     addMetricContent "stat_cpu_current_iowait_percent" $iowait
337 | 
338 |     buildOutput
339 | }
340 | # END /proc/stat CPU stats
341 | 
342 | #
343 | # BEGIN extract data from iostat
344 | function metrics_iostat {
345 |     # OLD:buffer=$(iostat -xmy 1 1 | grep -v '^$\|^avg\-cpu\|^Linux\|^\s' | tr -s '\t' ' ')
346 |     # NEW:support for centos5 iostat
347 |     buffer=$(iostat -xm 1 2 | awk '/avg-cpu/{i++}i==2' | grep -v '^$\|^avg\-cpu\|^Linux\|^\s' | tr -s '\t' ' ')
348 |     # extract & make columns
349 |     declare -a cols=()
350 | 
351 |     # loop over column names excluding "Device"
352 |     for column in $(echo "$buffer" | grep '^Device'); do 
353 |         if [ ! -z "$(grep -i device <<< $column)" ]; then
354 |             continue
355 |         fi 
356 | 
357 |         # run some patterns against the column names
358 |         col=$(echo $column | sed 's,/s,,g; s,\-,_,g; s,%,,g' | tr '[:upper:]' '[:lower:]')
359 |         if [ "$col" = "util" ]; then 
360 |             col="util_percent"
361 |         fi 
362 | 
363 |         # finally add to column array
364 |         cols+=($col)
365 |     done 
366 | 
367 |     # create our iostat metrics
368 |     for column in "${cols[@]}"; do 
369 |         buildMetric "iostat_$column" "gauge" "From iostat"
370 |     done
371 | 
372 |     # device targets
373 |     declare -a only_devices=($iostat_targets)
374 | 
375 |     # finally add our metric contents
376 |     index=0
377 |     while read line; do 
378 |         while read -r device data; do
379 |             device="${device/\//_}" # replace / in device names with _
380 | 
381 |             if [ ! -z "$iostat_targets" ]; then
382 |                 if [[ ! "${only_devices[@]}" =~ "$device" ]]; then
383 |                     continue # not in list of targets
384 |                 fi
385 |             fi
386 | 
387 |             for item in $data; do 
388 |                 addMetricContent "iostat_${cols[$index]}" "{device=\"$device\"} $item"
389 |                 index=$(( index + 1 ))
390 |             done 
391 |         done <<< "$line"
392 |         index=0
393 |     done < <(echo "$buffer" | grep -v '^Device')
394 | 
395 |     buildOutput
396 | }
397 | # END extract data from iostat
398 | 
399 | #
400 | # BEGIN extract data from /proc/net/dev
401 | function metrics_netdev {
402 |     buffer="$(tr -s '\t' ' ' < /proc/net/dev)"
403 |     # Inter-|   Receive                                                |  Transmit
404 |     #  face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
405 |     #    lo: 8813597   72990    0    0    0     0          0         0  8813597   72990    0    0    0     0       0          0
406 | 
407 |     for type in rx tx; do
408 |         for counter in bytes packets errs drop fifo frame compressed multicast; do 
409 |             buildMetric "netdev_${type}_${counter}_total" "counter" "From /proc/net/dev"
410 |         done 
411 |     done 
412 | 
413 |     # device target selection
414 |     declare -a only_interfaces=($netdev_targets)
415 | 
416 |     # dev rxbytes rxpkts rxerrs rxdrop rxfifo rxframe rxcomp rxmulticast txbytes txpkts txerrs txdrop txfifo txframe txcomp txmulticast
417 |     while read line; do 
418 |         newline=$(echo "$line" | sed 's,:, ,g' | tr -s '\t' ' ')
419 |         array=($newline)
420 | 
421 |         if [ ! -z "$netdev_targets" ]; then
422 |             if [[ ! "${only_interfaces[@]}" =~ "${array[0]}" ]]; then
423 |                 continue # not in list of targets
424 |             fi
425 |         fi
426 |         
427 |         index=1
428 |         for type in rx tx; do 
429 |             for counter in bytes packets errs drop fifo frame compressed multicast; do 
430 |                 addMetricContent "netdev_${type}_${counter}_total" "{device=\"${array[0]}\"} ${array[$index]}"
431 |                 index=$(( index + 1 ))
432 |             done
433 |         done 
434 |     done < <(echo "$buffer" | grep ':')    
435 | 
436 |     buildOutput
437 | }
438 | #
439 | 
440 | #
441 | # BEGIN extract data from the conntrack utility
442 | function metrics_conntrack {
443 |     # # conntrack -S conntrack
444 |     # cpu=0           found=136 invalid=7864574 ignore=1238 insert=0 insert_failed=0 drop=0 early_drop=5904 error=193727 search_restart=12759417
445 |     # cpu=1           found=131 invalid=1455753 ignore=468 insert=0 insert_failed=0 drop=0 early_drop=81 error=3 search_restart=5313345
446 |     # cpu=2           found=128 invalid=1462495 ignore=552 insert=0 insert_failed=0 drop=0 early_drop=30 error=0 search_restart=5338620
447 |     # # conntrack -S expect
448 |     # cpu=0           expect_new=0 expect_create=0 expect_delete=0
449 |     # cpu=1           expect_new=0 expect_create=0 expect_delete=0
450 |     # cpu=2           expect_new=0 expect_create=0 expect_delete=0
451 | 
452 |     if [ "$conntrack_ignore_fake_cpuid" = "1" ]; then 
453 |         processors=$(($(nproc)-1))
454 |     fi 
455 | 
456 |     buildMetric "conntrack_counter" "gauge" "From conntrack -C"
457 |     buildMetric "conntrack_statistics" "gauge" "From conntrack -S"
458 | 
459 |     for table in conntrack expect; do 
460 |         addMetricContent "conntrack_counter" "{table=\"$table\"} $(conntrack -C $table)"
461 | 
462 |         # sum of all CPUs is way less cardinality
463 |         if [ "$conntrack_sum_all_cpu" = "1" ]; then
464 |             declare -A cols=()
465 | 
466 |             while read line; do
467 |                 read -r cpu_label counters <<< "$line"
468 |                 IFS="=" read -r cpu_label cpu_value <<< "$cpu_label"
469 | 
470 |                 if [ "$conntrack_ignore_fake_cpuid" = "1" ]; then 
471 |                     if [ "$cpu_value" -gt "$processors" ]; then 
472 |                         continue 
473 |                     fi 
474 |                 fi 
475 | 
476 |                 for counter in $counters; do 
477 |                     IFS="=" read -r metric value <<< "$counter"
478 |                     cols[$metric]=$(( ${cols[$metric]} + $value ))
479 |                 done 
480 |             done < <(conntrack -S $table)
481 | 
482 |             for counter in "${!cols[@]}"; do
483 |                 addMetricContent "conntrack_statistics" "{counter=\"$counter\",table=\"$table\"} ${cols[$counter]}"
484 |             done
485 |         else
486 |             while read line; do 
487 |                 read -r cpu_label counters <<< "$line"
488 |                 IFS="=" read -r cpu_label cpu_value <<< "$cpu_label"
489 | 
490 |                 if [ "$conntrack_ignore_fake_cpuid" = "1" ]; then 
491 |                     if [ "$cpu_value" -gt "$processors" ]; then 
492 |                         continue 
493 |                     fi 
494 |                 fi 
495 | 
496 |                 for counter in $counters; do 
497 |                     IFS="=" read -r metric value <<< "$counter"
498 |                     addMetricContent "conntrack_statistics" "{cpu=\"$cpu_value\",counter=\"$metric\",table=\"$table\"} $value"
499 |                 done 
500 |             done < <(conntrack -S $table)
501 |         fi
502 |     done 
503 | 
504 |     buildOutput
505 | }
506 | #
507 | 
508 | if [ "$1" = "get" ]; then 
509 |     metrics_$3 > "/tmp/$2/$3"
510 | else
511 |     run=$(join_by , ${exporters[*]})
512 | 
513 |     # make temp dir
514 |     mkdir -p /tmp/$$/
515 | 
516 |     # we're using xargs to run this script in multiple subprocesses
517 |     # xargs will handle collecting data for us
518 |     output=$(echo "$run" | xargs -n1 -P$workers -d',' -- "$bashPath" "$0" get $$)
519 | 
520 |     # the -e parameter transforms literal newlines into "actual" newlines
521 |     # echo -e "$output"
522 |     echo -e "$(cat /tmp/$$/*)"
523 |     
524 |     rm -rf /tmp/$$/
525 | fi
526 | 
527 | exit 
528 | 
529 | # BEGIN DEBUG SECTION
530 | # buildOutput
531 | # END DEBUG SECTION
532 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Monitoring Exporter
  2 | 
  3 | System metrics exporter for Prometheus. Written in Bash and served via Xinetd (and a small wrapper script that turns content HTTP-compliant). The exporter provides reasonably fast response times by offloading methods to subprocesses (see `workers=2`). However, the response time will always be at least 1 second, as the exporter calculates current CPU usage by measuring it over a second.
  4 | 
  5 | #### A note on monitoring CPU usage
  6 | 
  7 | This exporter provides you with two CPU usage metrics. One of them is taken from /proc/stat verbatim (`monitoring_stat_cpu_seconds_total`). The other is a computed CPU usage. The exporter computes CPU usage by measuring it over one second. **The CPU usage calculation ignores iowait - iowait is not CPU usage, the processor is happy to do other things while waiting on disk**.
  8 | 
  9 | ## Table of Contents
 10 | 
 11 | 1. [Installation](#installation)
 12 | 2. [Grafana Dashboard](#grafana)
 13 | 3. [Metrics and Configuration](#metrics-and-configuration)
 14 | 4. [Extending](#extending)
 15 | 
 16 | ## Installation
 17 | 
 18 | Tested on:
 19 | - Debian 7, 8, 9
 20 | - Ubuntu 14, 16, 18
 21 | - CentOS 5 (with a workaround), 6, 7
 22 | - CloudLinux 6, 7
 23 | 
 24 | Requires Bash >4, on CentOS 5 for example, you will need to install Bash 4 (perhaps into a separate binary ie. `/bin/bash42`) and edit `bashPath` to it in the exporter file. You may also need to change the shebang lines, depending on your system.
 25 | 
 26 | ###  Ansible
 27 | 
 28 | Playbook is included in `monitoring_exporter.yml`.
 29 | 
 30 | You will likely need to edit the IP address variable `ip_address` - this is the IP address the playbook will add to the appropriate xinetd.d files, as a makeshift whitelist if you have only one Prometheus instance that's going to be contacting the exporter.   
 31 | 
 32 | You can run the playbook with variables using the `-e` parameter: `ansible-playbook -l web_servers -e ip_address=123.123.123.123 monitoring_exporter.yml`
 33 | 
 34 | Or fetching your public IP dynamically: `ansible-playbook -l web_servers -e ip_address=$(curl -s icanhazip.com) monitoring_exporter.yml`
 35 | 
 36 | ### Installing without Ansible
 37 | 
 38 | - Install Xinetd
 39 | - "template" the `exporter_monitoring.xinetd` file into /etc/xinetd.d/ (you will need to change the IP address setting inside or remove it if you do not intend on using the Xinetd whitelist)
 40 | - `mkdir -p /opt/metrics.d/`
 41 | - Place `exporter_monitoring` in `/opt/metrics.d/exporter_monitoring`
 42 | - Place `httpwrapper` in `/opt/metrics.d/httpwrapper`
 43 | - `chmod +x /opt/metrics.d/*`
 44 | - Restart Xinetd
 45 | 
 46 | ## Grafana
 47 | 
 48 | A Grafana dashboard can be found here: https://grafana.com/grafana/dashboards/12095
 49 | 
 50 | ## No docker?!
 51 | 
 52 | This exporter is designed to run directly on the server. If you do make it work in Docker do let me know, though.
 53 | 
 54 | # Metrics and Configuration
 55 | 
 56 | ## Port 10100
 57 | 
 58 | Provides the following metrics, separated by functions. All module exports are prefixed with `monitoring_` in code.   
 59 | 
 60 | ## Configuring
 61 | 
 62 | You are free to select only metrics relevant to you by editing this line:   
 63 | `declare -a exporters=(loadavg stat iostat filesystem memory netdev netstat uptime kernel file_nr)`   
 64 | 
 65 | #### loadavg (default: enabled)
 66 | 
 67 | Exports data from `/proc/loadavg`
 68 | 
 69 | | name                               | description                                      | additional labels | units |
 70 | |------------------------------------|--------------------------------------------------|-------------------|-------|
 71 | | monitoring_loadavg_load1           | 1 minute load average, as seen in /proc/loadavg  | none              | N     |
 72 | | monitoring_loadavg_load5           | 5 minute load average, as seen in /proc/loadavg  | none              | N     |
 73 | | monitoring_loadavg_load15          | 15 minute load average, as seen in /proc/loadavg | none              | N     |
 74 | | monitoring_loadavg_jobs_running    | Number of running jobs                           | none              | N     |
 75 | | monitoring_loadavg_jobs_background | Number of background jobs                        | none              | N     |
 76 | 
 77 | #### stat (default: enabled)
 78 | 
 79 | Exports *some* data from `/proc/stat`
 80 | 
 81 | | name                                       | description                                                             | additional labels | units      |
 82 | |--------------------------------------------|-------------------------------------------------------------------------|-------------------|------------|
 83 | | monitoring_stat_cpu_seconds_total          | Total time in ticks (usually seconds) the CPU has spent in each mode    | mode              | Seconds    |
 84 | | monitoring_stat_cpu_current_usage_percent  | CPU usage at the moment of execution `1`                                  | none              | Percentage |
 85 | | monitoring_stat_cpu_current_iowait_percent | iowait at the moment of execution `2` - same you'd see when running "top" | none              | Percentage |
 86 | 
 87 | `1.` calculated by subtracting `%iowait`+`%idle` from 100 over 1 second: `100 - (idle + iowait)`   
 88 | `2.` calculated by subtracting `iowait` from `idle` over 1 second
 89 | 
 90 | #### filesystem (default: enabled)
 91 | 
 92 | Exports data from `df`
 93 | 
 94 | | name                                   | description                                  | additional labels              | units      |
 95 | |----------------------------------------|----------------------------------------------|--------------------------------|------------|
 96 | | monitoring_filesystem_total_kbytes     | Total amount of kbytes space in a filesystem | mountpoint, source, fstype `1` | kbytes     |
 97 | | monitoring_filesystem_used_kbytes      | Used kbytes in filesystem                    | mountpoint, source, fstype `1` | kbytes     |
 98 | | monitoring_filesystem_avail_kbytes     | Available kbytes in a filesystem             | mountpoint, source, fstype `1` | kbytes     |
 99 | | monitoring_filesystem_capacity_percent | Percentage usage of the filessystem          | mountpoint, source, fstype `1` | Percentage |
100 | 
101 | `1.` mountpoint: the mount point in local system   
102 | `1.` source: the block device   
103 | `1.` fstype: filesystem type
104 | 
105 | #### iostat (default: enabled)
106 | Exports data from `iostat -xm`
107 | 
108 | | name                       | description                                                                                                                                                                                   | additional labels | units      |
109 | |----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|------------|
110 | | monitoring_iostat_rrqm     | The number of read requests merged per second that were queued to the device                                                                                                                  | device*           | N          |
111 | | monitoring_iostat_wrqm     | The number of write requests merged per second that were queued to the device                                                                                                                 | device*           | N          |
112 | | monitoring_iostat_r        | The number (after merges) of read requests completed per second for the device                                                                                                                | device*           | N          |
113 | | monitoring_iostat_w        | The number (after merges) of write requests completed per second for the device                                                                                                               | device*           | N          |
114 | | monitoring_iostat_rmb      | The number of megabytes read from the device per second                                                                                                                                       | device*           | N          |
115 | | monitoring_iostat_wmb      | The number of megabytes written to the device per second                                                                                                                                      | device*           | N          |
116 | | monitoring_iostat_avgrq_sz | The average size (in sectors) of the requests that were issued to the device                                                                                                                  | device*           | N          |
117 | | monitoring_iostat_avgqu_sz | The average queue length of the requests that were issued to the device                                                                                                                       | device*           | N          |
118 | | monitoring_iostat_await    | The average time (in milliseconds) for I/O requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them                | device*           | ms         |
119 | | monitoring_iostat_r_await  | The average time (in milliseconds) for read requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them               | device*           | ms         |
120 | | monitoring_iostat_w_await  | The average time (in milliseconds) for write requests issued to the device to be served. This includes the time spent by the requests in queue and the time spent servicing them              | device*           | ms         |
121 | | monitoring_iostat_svctm    | The average service time (in milliseconds) for I/O requests that were issued to the device. Warning! Do not trust this field any more. This field will be removed in a future sysstat version | device*           | ms         |
122 | | monitoring_iostat_util     | Percentage of elapsed time during which I/O requests were issued to the device (bandwidth utilization for the device). Device saturation occurs when this value is close to 100%              | device*           | Percentage |
123 | 
124 | * device: the block device
125 | 
126 | Configuration variable `iostat_targets` can be used to specify a list of block devices to pull metrics for, space-separated.
127 | 
128 | #### meminfo (default: enabled)
129 | 
130 | Exports data from `/proc/meminfo`
131 | 
132 | | name                                   | description                                                                                                                | additional labels | units     |
133 | |----------------------------------------|----------------------------------------------------------------------------------------------------------------------------|-------------------|-----------|
134 | | monitoring_meminfo_committed_as_kbytes | The number of read requests merged per second that were queued to the device                                               | none              | kibibytes |
135 | | monitoring_meminfo_buffers_kbytes      | The amount, in kibibytes, of temporary storage for raw disk blocks                                                         | none              | kibibytes |
136 | | monitoring_meminfo_sreclaimable_kbytes | The part of Slab that can be reclaimed, such as caches                                                                     | none              | kibibytes |
137 | | monitoring_meminfo_slab_kbytes         | The total amount of memory, in kibibytes, used by the kernel to cache data structures for its own use                      | none              | kibibytes |
138 | | monitoring_meminfo_swaptotal_kbytes    | The total amount of swap available, in kibibytes                                                                           | none              | kibibytes |
139 | | monitoring_meminfo_memtotal_kbytes     | Total amount of usable RAM, in kibibytes, which is physical RAM minus a number of reserved bits and the kernel binary code | none              | kibibytes |
140 | | monitoring_meminfo_swapfree_kbytes     | The total amount of swap free, in kibibytes                                                                                | none              | kibibytes |
141 | | monitoring_meminfo_memfree_kbytes      | The amount of physical RAM, in kibibytes, left unused by the system                                                        | none              | kibibytes |
142 | | monitoring_meminfo_cached_kbytes       | The amount of physical RAM, in kibibytes, used as cache memory                                                             | none              | kibibytes |
143 | 
144 | **Additionally, systems with Kernel >3 may contain the following metrics:**
145 | 
146 | | name                                   | description                                                                                 | additional labels | units     |
147 | |----------------------------------------|---------------------------------------------------------------------------------------------|-------------------|-----------|
148 | | monitoring_meminfo_memavailable_kbytes | An estimate of how much memory is available for starting new applications, without swapping | none              | kibibytes |
149 | 
150 | #### netdev (default: enabled)
151 |    
152 | Exports data from `/proc/net/dev`
153 | 
154 | | name                                  | description                                                       | additional labels | units |
155 | |---------------------------------------|-------------------------------------------------------------------|-------------------|-------|
156 | | monitoring_netdev_rx_bytes_total      | The total number of bytes of data received by the interface       | device`1`         | N     |
157 | | monitoring_netdev_rx_packets_total    | The total number of packets received by the interface             | device`1`         | N     |
158 | | monitoring_netdev_rx_errs_total       | The total number of receive errors detected by the device driver  | device`1`         | N     |
159 | | monitoring_netdev_rx_drop_total       | The total number of packets dropped by the device driver          | device`1`         | N     |
160 | | monitoring_netdev_rx_fifo_total       | The number of FIFO buffer errors                                  | device`1`         | N     |
161 | | monitoring_netdev_rx_frame_total      | The number of packet framing errors                               | device`1`         | N     |
162 | | monitoring_netdev_rx_compressed_total | The number of compressed packets received by the device driver    | device`1`         | N     |
163 | | monitoring_netdev_rx_multicast_total  | The number of multicast frames received by the device driver      | device`1`         | N     |
164 | | monitoring_netdev_tx_bytes_total      | The total number of bytes of data transmitted by the interface    | device`1`         | N     |
165 | | monitoring_netdev_tx_packets_total    | The total number of packets transmittedby the interface           | device`1`         | N     |
166 | | monitoring_netdev_tx_errs_total       | The total number of transmit errors detected by the device driver | device`1`         | N     |
167 | | monitoring_netdev_tx_drop_total       | The total number of packets dropped by the device driver          | device`1`         | N     |
168 | | monitoring_netdev_tx_fifo_total       | The number of FIFO buffer errors                                  | device`1`         | N     |
169 | | monitoring_netdev_tx_frame_total      | The number of packet framing errors                               | device`1`         | N     |
170 | | monitoring_netdev_tx_compressed_total | The number of compressed packets transmitted by the device driver | device`1`         | N     |
171 | | monitoring_netdev_tx_multicast_total  | The number of multicast frames transmitted by the device driver   | device`1`         | N     |
172 | 
173 | `1` network device
174 | 
175 | Configuration variable `netdev_targets` can be used to specify a list of interfaces to pull metrics for, space-separated.
176 | 
177 | #### netstat (default: enabled)
178 | 
179 | | name                                  | description                                                       | additional labels | units |
180 | |---------------------------------------|-------------------------------------------------------------------|-------------------|-------|
181 | | monitoring_netstat_established_total  | The total number of established connections                       | none              | N     |
182 | | monitoring_netstat_established        | The number of established connections per port                    | port`1`           | N     |
183 | 
184 | NB: Some hosts (like OpenVZ/LXC hypervisors) may have trouble reporting all connections on time with Netstat. It is advised to disable per-port granularity inside monitoring_exporter, under `netstat_totalonly`, ie. set it to `1`.
185 | 
186 | `1` source port
187 | 
188 | #### file_nr (default: enabled)
189 | 
190 | Provides data from /proc/sys/fs/file-nr
191 | 
192 | | name                         | description                     | additional labels | units |
193 | |------------------------------|---------------------------------|-------------------|-------|
194 | | monitoring_file_nr_allocated | All allocated file descriptors  | none              | N     |
195 | | monitoring_file_nr_free      | Free allocated file descriptors | none              | N     |
196 | | monitoring_file_nr_max       | Maximum open file descriptors   | none              | N     |
197 | 
198 | #### slabinfo (default: disabled)
199 | 
200 | From /proc/slabinfo - usually lots of output, so disabled by default
201 | 
202 | By default, `slab_nonzeroonly` is set to `1` so that this function reports only non-zero slabs
203 | 
204 | | name                              | description                                                                           | additional labels | units |
205 | |-----------------------------------|---------------------------------------------------------------------------------------|-------------------|-------|
206 | | monitoring_slabinfo_active_objs   | Number of objects that are currently active (i.e., in use)                            | slab`1`           | N     |
207 | | monitoring_slabinfo_num_objs      | Total number of allocated objects (i.e., objects that are both in use and not in use) | slab`1`           | N     |
208 | | monitoring_slabinfo_objsize_bytes | Size of objects in this slab, in bytes                                                | slab`1`           | Bytes |
209 | | monitoring_slabinfo_objperslab    | Number of objects stored in each slab                                                 | slab`1`           | N     |
210 | | monitoring_slabinfo_pagesperslab  | Number of pages allocated for each slab                                               | slab`1`           | N     |
211 | 
212 | `1` slab identifier
213 | 
214 | #### conntrack (default: disabled)
215 | 
216 | | name                              | description                | additional labels    | units |
217 | |-----------------------------------|----------------------------|----------------------|-------|
218 | | monitoring_conntrack_counter      | Output from `conntrack -C` | table`1`             | N     |
219 | | monitoring_conntrack_statistics   | Output from `conntrack -S` | table`1`, counter`2`, cpu`3` | N     |
220 | 
221 | `1` conntrack table name   
222 | `2` conntrack counter name   
223 | `3` this label represents the CPU ID if `conntrack_sum_all_cpu` is set to `0`, otherwise the label is not present   
224 | 
225 | Configuration variable `conntrack_ignore_fake_cpuid` controls whether "fake cpuids" are reported. Default `1`.
226 | 
227 | ## Extending
228 | 
229 | Instead of modifying this exporter you may be interested in creating your own workflow following this post: [Exporting Prometheus Metrics with Bash Scripts](https://apawel.me/exporting-prometheus-metrics-with-bash-scripts/)
230 | 
231 | It is relatively simple to add your own metric functions. Take on this function as an example.
232 | 
233 | ```
234 | #
235 | # BEGIN filesystem usage
236 | function metrics_filesystem {
237 |     buffer=$(df -PT $filesystem_targets 2>/dev/null | grep -v '^Filesystem' | tr -s '\t' ' ')
238 |     
239 |     # Filesystem 1024-blocks Used Available Capacity Mounted on
240 |     # /dev/mapper/cl-home 99533328 37320564 62212764 38% /
241 |     buildMetric "filesystem_total_kbytes" "gauge" "From df"
242 |     buildMetric "filesystem_used_kbytes" "gauge" "From df"
243 |     buildMetric "filesystem_avail_kbytes" "gauge" "From df"
244 |     buildMetric "filesystem_capacity_percent" "gauge" "From df"
245 | 
246 |     while read -r device fstype total used avail percent target; do 
247 |         labels="{mountpoint=\"$target\",source=\"$device\",fstype=\"$fstype\"}"
248 |         addMetricContent "filesystem_total_kbytes" "$labels $total"
249 |         addMetricContent "filesystem_used_kbytes" "$labels $used"
250 |         addMetricContent "filesystem_avail_kbytes" "$labels $avail"
251 |         noPercentSign=$(echo "$percent" | sed 's,%,,g')
252 |         addMetricContent "filesystem_capacity_percent" "$labels $noPercentSign"
253 |     done <<< "$buffer"
254 | 
255 |     buildOutput
256 | }
257 | # END filesystem usage
258 | ```
259 | 
260 | The function **buildMetric** basically adds a metric to the collection, in this case the metric name is uptime_seconds_total, the type is "counter" and the help text is "System uptime".
261 | 
262 | The **addMetricContent** function is used to add the actual metric value and labels. Unfortunately there is no built-in handling for labels.
263 | 
264 | **buildOutput** is used to "convert" the metric arrays to Prometheus compatible output and is placed at the very end.
265 | 
266 | The function name name must be preceded by metrics_, ie. metrics_myfunction for it to be utilized. Once you have your function add it to the array at line 22:
267 | 
268 | `declare -a exporters=( myfunction ... )`
269 | 
270 | Note in the above array we are not using the "metrics_" prefix.
271 | 


--------------------------------------------------------------------------------