├── .gitmodules
├── README.md
├── activemq-status.sh
├── api_limit_ram_usage.py
├── cass_compaction.py
├── cassandra_pending_tpstats.py
├── centos_reboot_check.sh
├── check-mount.sh
├── check-mtime.sh
├── check_nodering.py
├── check_openmanage.sh
├── chef_node_checkin.py
├── cloud_queues.py
├── cloudbackup_mon.sh
├── clouddatabases_volume.py
├── cloudload_balancer.py
├── cman_nodes.rb
├── consul.py
├── content_check.py
├── curl.sh
├── curl_check.sh
├── dir_monitor.py
├── dir_stats.sh
├── directory.sh
├── dns_resolution.sh
├── docker_check.py
├── docker_stats_check.py
├── elasticsearch.py
├── etcd.py
├── examples
│   └── example.rb
├── file_info.py
├── galera.py
├── hadoop_hbase.py
├── hadoop_hdfs.py
├── hadoop_jobtracker.py
├── haproxy.rb
├── holland_mysqldump.py
├── jmx-gather.sh
├── latest_alarm_state.py
├── long_process.sh
├── lsyncd-status.sh
├── megaraid.sh
├── memcached_stats.py
├── mongodb_stats.py
├── murmur_monitor.py
├── mysql_ping.py
├── mysql_replication.py
├── nfs-status.sh
├── nginx_status_check.py
├── ntp_offset.sh
├── onmetal_v1_smart.py
├── open_files.py
├── pg_check.py
├── php-fpm_status_check.sh
├── ping.sh
├── port_check.py
├── port_check.sh
├── process_mon.sh
├── rabbitmq.py
├── redis_slave_count.sh
├── solrmon.py
├── ssl_cert_expiration.sh
├── ssl_protocols_check.sh
├── statsd_metric_emitter.py
├── systemctl_status.sh
├── ubuntu_updates_check.sh
├── uptime_reset_detector.sh
├── varnish.sh
├── varnish4.sh
├── windows
│   ├── get-counters.ps1
│   ├── ping.ps1
│   └── service_mon.ps1
└── yum_updates_check.sh
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "contrib/rpc-maas"]
2 | path = contrib/rpc-maas
3 | url = https://github.com/rcbops/rpc-maas.git
4 | [submodule "contrib/uptime-monitor"]
5 | path = contrib/uptime-monitor
6 | url = https://github.com/racker/uptime-monitor.git
7 | [submodule "contrib/conveyer"]
8 | path = contrib/conveyer
9 | url = https://github.com/sam-falvo/conveyer.git
10 | [submodule "contrib/rpc-openstack"]
11 | path = contrib/rpc-openstack
12 | url = https://github.com/rcbops/rpc-openstack.git
13 | [submodule "contrib/gardener"]
14 | path = contrib/gardener
15 | url = https://github.com/creationix/gardener.git
16 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Rackspace Monitoring Agent Custom plugins
2 |
3 | This repository contains contributed custom plugins for the Rackspace Cloud
4 | Monitoring agent. For details about installing plugins, see [agent plugin check documentation](https://developer.rackspace.com/docs/rackspace-monitoring/v1/tech-ref-info/check-type-reference/#agent-plugin).
5 |
6 | ## How to Contribute
7 |
8 | You can contribute your plugins by first forking the repo, committing your changes, and then opening a pull request through github. If you have any questions, feel free to reach out to us on #cloudmonitoring on freenode IRC.
9 |
10 | ## Plugin Requirements
11 |
12 | Each plugin must fulfill the following properties:
13 |
14 | * Output a status message to STDOUT
15 | * Output one or more metrics if it succeeds in obtaining them to STDOUT
16 | * Contain an appropriate license header
17 | * Contain example alarm criteria
18 |
19 | ## Status
20 |
21 | The status message should be of the form `status $status_string`. For example, it might be:
22 |
23 | status ok succeeded in obtaining metrics
24 |
25 | or
26 |
27 | status err failed to obtain metrics
28 |
29 | The status string should be a summary of the results, with actionable information if it fails.
30 |
31 | ## Metrics
32 |
33 | The metrics message should be of the form `metric $name $type $value [unit]`, for example:
34 |
35 | metric time int32 1 seconds
36 |
37 | The units are optional, and if present should be a string representing the units of the metric measurement. Units may not be provided on string metrics, and may not contain any spaces.
38 |
39 | The available types are:
40 |
41 | * string
42 | * float
43 | * double
44 | * int32
45 | * int64
46 | * uint32
47 | * uint64
48 | * gauge
49 |
50 | ## Alarm Criteria
51 |
52 | Each script should contain, just below the license header, in a comment, an example alarm criteria that can be used for the plugin. See the [Rackspace Cloud Monitoring Documentation](https://developer.rackspace.com/docs/rackspace-monitoring/v1/tech-ref-info/alert-triggers-and-alarms/) for how to write alarm criteria.
53 |
54 | ## Submodules
55 |
56 | Submodules of repositories are stored in the contrib folder in this repo.
57 | There are more plugins in that folder; some of these plugins have dependencies and their own readmes.
58 |
59 | The contrib directory contains submodules of more custom plugins that have been used by other teams, including those from OpenStack and Rackspace. These are older plugins for Icehouse/Juno;
60 | newer plugins for Kilo can be found at [rcbops/rpc-openstack](https://github.com/rcbops/rpc-openstack) or inside contrib/rpc-openstack/maas/plugins.
61 |
62 | You can pull the submodules with
63 | ```
64 | git pull --recurse-submodules
65 | git submodule update --recursive
66 | ```
67 |
68 |
69 | ## License Header
70 |
71 | The exact content will depend on your chosen license, but we recommend BSD, Apache 2.0, or MIT Licenses. Regardless of license choice the header should contain the author's (or authors') name(s) and email address(es).
72 |
--------------------------------------------------------------------------------
/activemq-status.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Description: Custom plugin which checks activemq status.
4 |
5 | # Copyright 2013 Ted Neykov
6 | #
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 | # Looks for 'CurrentStatus' line in output of `activemq-admin query` command.
20 | # Query command can return multiple lines containing 'CurrentStatus'.
21 | # Returns non-zero and 'status error' if any of the lines are not 'CurrentStatus = Good'.
22 |
23 | # Look for the activemq-admin script in /opt
# Keep only the first match so that a host with several ActiveMQ installs
# under /opt still yields a single executable path.
amq_bin=$(find /opt/ -name 'activemq-admin' | grep -E '/bin/activemq-admin' | head -n 1)

if [ -z "$amq_bin" ]; then
    echo "status error: Could not find activemq-admin."
    exit 1
fi

amq_query=$("$amq_bin" query)
curr_status=$(echo "$amq_query" | grep CurrentStatus)

# The loop runs in a pipeline subshell: `exit 1` only leaves that subshell and
# becomes the pipeline's exit status, which is inspected right after `done`.
echo "$curr_status" |
while read -r line; do
    line_status=$(echo "$line" | awk '{print $3}')
    if [ "$line_status" != 'Good' ]; then
        # Found non "Good" status or empty line
        exit 1
    fi
done

if [ $? -eq 0 ]; then
    echo "status good"
else
    echo "status error"
    exit 1
fi
51 |
--------------------------------------------------------------------------------
/api_limit_ram_usage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | '''Rackspace Cloud Servers RAM usage monitor'''
3 | '''Pulls metrics from API for Total RAM allowed for Cloud Servers and Total RAM currently used (for API limits) and returns percent_ram_used metric to Cloud Monitoring API. Threshold percentage of when to set status to warning is set at command line argument along with username and api key'''
4 | # Copyright 2013 Rackspace
5 |
6 | # All Rights Reserved.
7 | #
8 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
9 | # not use this file except in compliance with the License. You may obtain
10 | # a copy of the License at
11 | #
12 | # http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
16 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
17 | # License for the specific language governing permissions and limitations
18 | # under the License.
19 | import sys
20 | import pyrax
21 | import argparse
22 |
23 |
def main():
    '''Parse the command line, authenticate through pyrax and report usage.

    Required options are the Rackspace username, API key, region and the
    alert threshold (an integer from 1 to 99).  ``--human`` switches the
    report from Cloud Monitoring format to a human-readable one.

    Exits with status 2 when the threshold is not an integer in 1-99.
    '''
    parser = argparse.ArgumentParser(description='get percent of api limit '
                                     'of ram used')
    parser.add_argument('-u', '--username', help='Rackspace Username',
                        required=True)
    parser.add_argument('-a', '--apikey', help='Rackspace API Key',
                        required=True)
    parser.add_argument('-m', '--maxthreshold',
                        help='API Percent Used Threshold, integer between '
                        '1-99',
                        required=True)
    parser.add_argument('-r', '--region', help='Rackspace Regional Datacenter',
                        required=True)
    parser.add_argument('--human',
                        help='Format output for humans, not Cloud Monitoring',
                        action='store_true')
    args = parser.parse_args()

    # A non-numeric threshold must produce the same message as an
    # out-of-range one instead of an uncaught ValueError traceback.
    try:
        threshold = int(args.maxthreshold)
    except ValueError:
        threshold = None
    if threshold is None or threshold < 1 or threshold > 99:
        print("You must enter a valid integer from 1-99 for maxthreshold")
        sys.exit(2)
    pyrax.set_setting("identity_type", "rackspace")
    pyrax.set_credentials(args.username, args.apikey)

    (ram_used, ram_allowed) = getlimits(args.region)
    display_usage(ram_used, ram_allowed, args.maxthreshold, args.human)
51 |
52 |
def getlimits(region):
    '''Return ``(total_ram_used, max_ram_allowed)`` for *region*.

    Both values are read from the absolute Cloud Servers limits reported by
    pyrax ("totalRAMUsed" and "maxTotalRAMSize").  An IndexError is raised if
    either limit is absent from the response.
    '''
    # Materialise the generator once so it can be scanned for each limit.
    absolute_limits = list(pyrax.cloudservers.limits.get(region).absolute)

    def first_value(limit_name):
        matches = [lim.value for lim in absolute_limits
                   if lim.name == limit_name]
        return matches[0]

    ram_ceiling = first_value("maxTotalRAMSize")
    ram_in_use = first_value("totalRAMUsed")
    return (ram_in_use, ram_ceiling)
64 |
65 |
def display_usage(ram_used, ram_allowed, alert_percentage, human):
    '''Print RAM usage as a percentage of the allowed RAM.

    ram_used / ram_allowed -- numbers (or numeric strings) in MB.
    alert_percentage       -- threshold; usage at or above it is reported as
                              WARNING (human mode) or ``status err``.
    human                  -- True for a human-readable report, False for
                              Cloud Monitoring ``status`` / ``metric`` lines.

    When ram_allowed is zero no percentage can be computed; an error line is
    printed instead of raising ZeroDivisionError.
    '''
    if float(ram_allowed) == 0:
        if human:
            print("ERROR: Max RAM API Limit is 0MB; cannot compute usage")
        else:
            print("status err Max RAM API limit is 0")
        return

    percent_ram = (float(ram_used) / float(ram_allowed)) * 100
    # float() keeps the printed value in "NN.0" form on Python 3 as well,
    # where round() with one argument returns an int.
    percent_ram_used = float(round(float("%.2f" % percent_ram)))
    over_threshold = percent_ram_used >= float(alert_percentage)

    if human:
        print("Current RAM Usage: %sMB" % ram_used)
        print("Max RAM API Limit: %sMB" % ram_allowed)
        label = "WARNING" if over_threshold else "OK"
        print("%s: Percent of API Limit Used: %d%%" % (label,
                                                        percent_ram_used))
    else:
        # Cloud Monitoring-aware output
        state = "err" if over_threshold else "ok"
        print("status {0} Percent RAM Used {1}".format(state,
                                                       percent_ram_used))
        print("metric percent_ram_used float {0}".format(percent_ram_used))
87 |
88 |
89 | if __name__ == "__main__":
90 | main()
91 |
--------------------------------------------------------------------------------
/cass_compaction.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | #
3 | # Rackspace Cloud Monitoring Plug-In
4 | # Checks the number of compactions pending in a cassandra node.
5 | #
6 | # (c) 2017 Jim Wang
7 | # All Rights Reserved.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
10 | # not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | #
21 | # Usage:
22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
23 | #
24 | # It accepts three arguments, the path to the nodetool executable, the cassandra hostname
25 | # and the port on which to run on
26 | #
27 | # Returns the number of pending compactions on the cass node
28 | #
29 | #! /usr/bin/python
30 |
from subprocess import CalledProcessError, check_output
import os
import sys
import string
import argparse

parser = argparse.ArgumentParser(description='Run nodetool to check for inconsistent state')
parser.add_argument('-p', '--port', dest='portforcassandra', default='9080', help='port that cassandra is running on')
parser.add_argument('-t', '--tool', dest='pathtonodetool', default='/opt/cassandra/bin/', help='path to nodetool executable (ex /opt/cassandra/bin)')
parser.add_argument('-o', '--host', dest='cassandrahost', default='localhost', help='host cassandra is running on.')

args = parser.parse_args()

# os.path.join copes with the tool directory being given with or without a
# trailing slash (the --tool help text itself shows it without one).
nodetool = os.path.join(args.pathtonodetool, 'nodetool')

try:
    # universal_newlines=True makes check_output return text on both
    # Python 2 and Python 3, so the string split below works on either.
    node_tool_output = check_output(
        [nodetool, '-h', args.cassandrahost, '-p', args.portforcassandra,
         'compactionstats'],
        universal_newlines=True)
    # The count is taken from whatever follows the first ':' on the first
    # output line (presumably "pending tasks: <n>" -- verify against the
    # nodetool version in use).
    pending_compactions = int(node_tool_output.splitlines()[0].split(':')[1])
except (OSError, CalledProcessError, IndexError, ValueError) as err:
    # Report the failure as a plugin status line rather than a traceback.
    print('status err unable to read compaction stats: {0}'.format(err))
    sys.exit(1)

print('status ok pending compactions retrieved')
print('metric pending_compactions uint32 ' + str(pending_compactions))
48 |
--------------------------------------------------------------------------------
/cassandra_pending_tpstats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """
3 | Rackspace Cloud Monitoring plugin for pending cassandra tpstats (uses nodetool tpstats)
4 |
5 | Example:
6 | $ ./cassandra_pending_tpstats.py
7 |
8 | Example alarm criteria:
9 | if (metric['cassandra_pending_foo'] > 10) {
10 | return new AlarmStatus(CRITICAL, 'Over 10 pending connections, increase resources')
11 | }
12 |
13 | if (metric['cassandra_pending_foo'] < 5) {
14 | return new AlarmStatus(CRITICAL, 'Under 5 pending connections, decrease resources')
15 | }
16 |
17 | Copyright 2015 Rackspace
18 |
19 | Licensed under the Apache License, Version 2.0 (the "License");
20 | you may not use this file except in compliance with the License.
21 | You may obtain a copy of the License at
22 |
23 | http://www.apache.org/licenses/LICENSE-2.0
24 |
25 | Unless required by applicable law or agreed to in writing, software
26 | distributed under the License is distributed on an "AS IS" BASIS,
27 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28 | See the License for the specific language governing permissions and
29 | limitations under the License.
30 | """
31 | import re
32 | import socket
33 | import subprocess
34 |
def camel_to_underscore(name):
    """Convert a CamelCase identifier to lower-case snake_case.

    Done in two passes: first an underscore is placed before each
    capitalised word (capital followed by lower-case letters), then between
    any remaining lower-case-or-digit / capital pair.
    """
    word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', word_split)
    return fully_split.lower()
38 |
def parse_tpstats(output):
    """Return ``(name, second_number)`` string pairs found in *output*.

    A match is a run of letters/underscores followed by two
    whitespace-separated integers; the first integer is skipped and the
    second is captured.
    """
    pool_pattern = r'([A-Za-z_]+)\s+\d+\s+(\d+)'
    return re.findall(pool_pattern, output)
41 |
42 |
# Run `nodetool tpstats`, capturing stderr as well so a failure can be
# detected; success is judged by the exit code because stderr may carry
# warnings even when the command works.
try:
    process = subprocess.Popen(['nodetool', 'tpstats'],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)
    output, error = process.communicate()
    failed = process.returncode != 0
except OSError:
    # nodetool is missing or not executable.
    output, failed = '', True

if failed:
    print("status err")
else:
    # One status line for the whole check, then one metric per pool.
    print("status ok")
    for pool_name, count in parse_tpstats(output):
        print("metric cassandra_pending_{pool} int {count}".format(
            pool=camel_to_underscore(pool_name), count=count))
56 |
--------------------------------------------------------------------------------
/centos_reboot_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Description: Custom plugin returns if there is a kernel update pending on
4 | # CentOS. Useful for knowing when to reboot a server updated by yum-cron.
5 | # Author: Russell Obets - Adapted from ubuntu_updates_check.sh by Tomas Muraus.
6 | # License: MIT
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in
16 | # all copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 | # THE SOFTWARE.
25 |
26 |
# First entry listed by `rpm -q --last kernel`, reduced to its version string.
LAST_KERNEL=$(rpm -q --last kernel | perl -pe 's/^kernel-(\S+).*/$1/' | head -1)
CURRENT_KERNEL=$(uname -r)

# Without an rpm answer there is nothing to compare against; say so instead
# of letting the comparison below fail with a `[` syntax error.
if [ -z "$LAST_KERNEL" ]; then
    echo "status err Could not determine the latest installed kernel"
    exit 1
fi

REBOOT_REQUIRED="yes"

if [ "$LAST_KERNEL" = "$CURRENT_KERNEL" ]; then
    REBOOT_REQUIRED="no"
    echo "status ok Currently running latest installed kernel - ${CURRENT_KERNEL}"
else
    echo "status ok Pending kernel updates: running ${CURRENT_KERNEL}, available: ${LAST_KERNEL}"
fi

echo "metric reboot_required string ${REBOOT_REQUIRED}"
40 |
41 |
--------------------------------------------------------------------------------
/check-mount.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Rackspace Cloud Monitoring Plug-In
4 | # Check that a mounted filesystem is mounted
5 | #
6 | # (c) 2017 Teddy Caddy
7 | # All Rights Reserved.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
10 | # not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | #
21 | # Usage:
22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
23 | #
24 | # It accepts two arguments:
25 | # - path: the path of the mount point you want to check
26 | # - flag_file: (optional) check that a flag file exists and is readable
27 | #
28 | # Returns 5 metrics:
29 | # - path: the input path parameter
30 | # - flag_file: the input flag_file parameter
31 | # - mounted: returns 1 if mount point is mounted
32 | # - flag_file_exists: returns 1 if flag file exists
33 | # - flag_file_readable: returns 1 if flag file is readable
34 | #
35 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
36 | #
37 | # if (metric['mounted'] != 1) {
38 | # return new AlarmStatus(CRITICAL, 'The mount point is not mounted: #{path}');
39 | # }
40 |
41 | # if (metric['flag_file'] != '' && metric['flag_file_exists'] != 1) {
42 | # return new AlarmStatus(CRITICAL, 'The flag file does not exist: #{path}/#{flag_file}');
43 | # }
44 |
45 | # if (metric['flag_file'] != '' && metric['flag_file_readable'] != 1) {
46 | # return new AlarmStatus(CRITICAL, 'The flag file is not readable: #{path}/#{flag_file}');
47 | # }
48 |
49 | # return new AlarmStatus(OK, 'The mount point is OK: #{path}');
50 | #
path=$1
flag_file="$1/$2"

mounted=0
flag_file_exists=0
flag_file_readable=0

# /proc/mounts lists mount points without a trailing slash, so strip one from
# the argument before comparing ("/" itself is kept as-is).
mount_point="${path%/}"
if [ -z "$mount_point" ] && [ -n "$path" ]; then
    mount_point="/"
fi

if [ -n "$path" ] && [ -d "$path" ]; then
    # Compare against the mount-point column only; a substring search over
    # the whole file would also match device names and longer paths.
    while read -r _ mnt _; do
        if [ "$mnt" = "$mount_point" ]; then
            mounted=1
            break
        fi
    done < /proc/mounts

    # Only look for the flag file when one was actually requested.
    if [ "$mounted" -eq 1 ] && [ -n "${2// }" ]; then
        if [ -e "$flag_file" ]; then
            flag_file_exists=1
            if [ -r "$flag_file" ]; then
                flag_file_readable=1
            fi
        fi
    fi
fi

echo "metric path string $path"
echo "metric flag_file string $2"
echo "metric mounted int64 $mounted"
echo "metric flag_file_exists int64 $flag_file_exists"
echo "metric flag_file_readable int64 $flag_file_readable"
exit 0
79 |
--------------------------------------------------------------------------------
/check-mtime.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Rackspace Cloud Monitoring Plug-In
4 | # Check the mtime of a file and how long it has been since it has been modified
5 | #
6 | # (c) 2015 Justin Gallardo
7 | # All Rights Reserved.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
10 | # not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | #
21 | # Usage:
22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
23 | #
24 | # It accepts one argument, which should be the file you wish to check the mtime of.
25 | #
26 | # Returns 2 metrics:
27 | # - mtime: The time(unix epoch) the file was last modified
28 | # - age: The number of seconds that have elapsed since the file was modified
29 | #
30 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
31 | #
32 | # if (metric['age'] > 3600) {
33 | # return new AlarmStatus(CRITICAL, 'The file has not been modified in more than 1 hour. Last modified #{age} seconds ago');
34 | # }
35 | # return new AlarmStatus(OK, 'The file was last modified #{age} seconds ago.');
36 | #
file=$1

# Without an argument the tests below would pass vacuously and the age
# arithmetic would then fail, so reject it up front.
if [ -z "$file" ]; then
    echo "status critical no file specified"
    exit 1
fi

if [ ! -e "$file" ]; then
    echo "status critical \"$file\" does not exist"
    exit 1
fi

if [ ! -r "$file" ]; then
    echo "status critical \"$file\" is not readable"
    exit 1
fi

# Quoting keeps file names containing spaces intact; `--` protects names
# that start with a dash.
mtime=$(stat -c%Y -- "$file")
now=$(date '+%s')
age=$(( now - mtime ))

echo "status ok file statted"
echo "metric mtime uint64 $mtime"
echo "metric age uint64 $age seconds"
exit 0
57 |
--------------------------------------------------------------------------------
/check_nodering.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 | #
3 | # Rackspace Cloud Monitoring Plug-In
4 | # Check the status of a cassandra nodering and make sure none of the nodes
5 | # have a '?' as a status.
6 | #
7 | # (c) 2017 Jim Wang
8 | # All Rights Reserved.
9 | #
10 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
11 | # not use this file except in compliance with the License.
12 | # You may obtain a copy of the License at
13 | #
14 | # http://www.apache.org/licenses/LICENSE-2.0
15 | #
16 | # Unless required by applicable law or agreed to in writing, software
17 | # distributed under the License is distributed on an "AS IS" BASIS,
18 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 | # See the License for the specific language governing permissions and
20 | # limitations under the License.
21 | #
22 | # Usage:
23 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
24 | #
25 | # It accepts three arguments, the path to the nodetool executable, the cassandra hostname
26 | # and the port on which to run on
27 | #
28 | # Returns 1 metric, nodering_status:
29 | # 0 if the nodes don't have a '?' as a status
30 | # 1 if the nodes have a '?' as a status
31 | #
32 | #! /usr/bin/python
33 |
from subprocess import CalledProcessError, check_output
import sys
import argparse

parser = argparse.ArgumentParser(description='Run nodetool to check for inconsistent state')
parser.add_argument('-p', '--port', dest='portforcassandra', default='9080', help='port that cassandra is running on')
parser.add_argument('-t', '--tool', dest='pathtonodetool', default='/opt/cassandra/bin/', help='path to nodetool executable (ex /opt/cassandra/bin)')
parser.add_argument('-o', '--host', dest='cassandrahost', default='localhost', help='host cassandra is running on.')

args = parser.parse_args()

try:
    # universal_newlines=True yields text on both Python 2 and Python 3, so
    # the '?' membership test below works on either.
    nodetooloutput = check_output(
        [args.pathtonodetool + '/nodetool', '-h', args.cassandrahost,
         '-p', args.portforcassandra, 'ring'],
        universal_newlines=True)
except (OSError, CalledProcessError) as err:
    # nodetool missing or exited non-zero: report it as a status line
    # instead of dying with a traceback.
    print('status critical unable to run nodetool: {0}'.format(err))
    sys.exit(2)

# Any '?' in the ring listing marks the ring as inconsistent.
if '?' in nodetooloutput:
    print('status critical nodering not consistent')
    print('metric nodering_status uint32 1')
    sys.exit(2)
else:
    print('status ok nodering in consistent state')
    print('metric nodering_status uint32 0')
    sys.exit(0)
56 |
--------------------------------------------------------------------------------
/check_openmanage.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Rackspace Cloud Monitoring Plug-In
4 | # check_openmanage wrapper plugin to check health of Dell servers via OMSA
5 |
6 | # ----------------------------------------------------------------------------
7 | # "THE BEER-WARE LICENSE" (Revision 42):
8 | # wrote this file. As long as you retain this notice you
9 | # can do whatever you want with this stuff. If we meet some day, and you think
10 | # this stuff is worth it, you can buy me a beer in return.
11 | # ----------------------------------------------------------------------------
12 |
13 | # Depends on Dell's OMSA being installed along with Trond Hasle Amundsen's
14 | # wonderful check_openmanage plugin for Nagios (available via EPEL on RPM-based
15 | # distributions or directly from his web site):
16 | # http://folk.uio.no/trondham/software/check_openmanage.html
17 |
18 | # Usage:
19 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
20 | #
21 | # This plugin returns 1 metric:
22 | # - status : the exit status returned from the check_openmanage script
23 | #
24 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
25 | #
26 | # :set consecutiveCount=3
27 | #
28 | # if (metric['status'] >= 3) {
29 | # return new AlarmStatus(CRITICAL, '#{report}');
30 | # }
31 | #
32 | # if (metric['status'] == 2) {
33 | # return new AlarmStatus(CRITICAL, '#{report}');
34 | # }
35 | #
36 | # if (metric['status'] == 1) {
37 | # return new AlarmStatus(WARNING, '#{report}');
38 | # }
39 | #
40 | # return new AlarmStatus(OK, '#{report}');
41 | #
42 | # Things to keep in mind:
43 | # - this plugin will try to find check_openmanage in one of the "normal" distribution
44 | # managed paths where it might reside, or the actual shell path as a last resort.
45 | # - by default, we're ignoring uncertified drive warnings. Feel free to change that
46 | # if it's important in your environment.
47 |
48 | # Previous version of this wrapper used an opt_args argument. But given
49 | # you can set arguments to pass via the Monitoring API, that has been
50 | # replaced with $@ below. You might want to update your check with
51 | # something like the following:
52 | #
53 | # {
54 | # "details": {
55 | # "file": "check_openmanage.sh",
56 | # "args": [
57 | # "-b",
58 | # "pdisk_cert=all",
59 | # "-b",
60 | # "ctrl_fw=all"
61 | # ]
62 | # }
63 | # }
64 |
search=(/usr/lib64/nagios/plugins /usr/lib/nagios/plugins)

for i in "${search[@]}"; do
    if [[ -x "${i}/check_openmanage" ]]; then
        check_cmd="${i}/check_openmanage"
        break
    fi
done

# Fall back to whatever the shell's PATH resolves.
if [[ -z "${check_cmd}" ]]; then
    if ! check_cmd=$(command -v check_openmanage); then
        echo "status Could not find check_openmanage script!"
        exit 1
    fi
fi

# "$@" forwards each check argument intact, even ones containing spaces.
report=$("${check_cmd}" "$@")
status=$?

# Collapse the report onto a single line by splitting it on whitespace.
# Unlike an unquoted expansion, `read -a` performs no pathname expansion, so
# a literal `*` in the report is not replaced by file names.
read -r -d '' -a report_words <<< "${report}"

echo "status successfully ran check_openmanage wrapper plugin"
echo "metric status int32 ${status}"
echo "metric report string ${report_words[*]}"

exit 0
89 |
--------------------------------------------------------------------------------
/chef_node_checkin.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | from datetime import datetime
3 | import os
4 |
5 |
def PopulateMetrics(log):
    """Fill the module-level ``metrics`` dict from the chef client log.

    Scans *log* for "Chef Run complete" lines, records the duration of the
    last such run and the seconds elapsed since the most recent one, counts
    processes whose command line mentions chef-client, prints a ``status``
    line and returns the ``metrics`` dict.

    Raises:
        IOError: the log file cannot be opened or read.
        IndexError: the log contains no completed run.
    """
    # Function-scope import keeps this block self-contained.
    from time import mktime

    clientRuns = []
    with open(log, 'r') as logFile:
        for line in logFile:
            if 'INFO: Chef Run complete in' in line:
                # Fixed slices: characters 1-10 hold the date and 12-19 the
                # time of day at the start of the log line.
                day_parts = line[1:11].split('-')
                clock_parts = line[12:20].split(':')
                clientRuns.append(datetime(
                    int(day_parts[0]), int(day_parts[1]), int(day_parts[2]),
                    int(clock_parts[0]), int(clock_parts[1]),
                    int(clock_parts[2]), 0))
                metrics['checkInDuration'] = int(float(line.split()[6]))

    if not clientRuns:
        # Callers treat IndexError as "no parsable log yet".
        raise IndexError('no completed chef run found in log')

    # mktime() is the portable spelling of the glibc-only strftime('%s').
    metrics['timeSinceCheckIn'] = (
        int(mktime(datetime.now().timetuple())) -
        int(mktime(max(clientRuns).timetuple())))

    metrics['processesAmount'] = 0
    for pid in os.listdir('/proc'):
        if not pid.isdigit():
            continue
        try:
            with open(os.path.join('/proc', pid, 'cmdline'), 'rb') as cmdline:
                if b'chef-client' in cmdline.read():
                    metrics['processesAmount'] += 1
        except IOError:
            # The process went away between listdir() and open(); skip it.
            continue

    if metrics['timeSinceCheckIn'] > 86400:
        print("status Critcal node has not checked in for {0} seconds".format(
            metrics['timeSinceCheckIn']
        ))
    elif metrics['timeSinceCheckIn'] > 3600:
        print("status Warning node has not checked in for {0} seconds".format(
            metrics['timeSinceCheckIn']
        ))
    else:
        print("status OK node checked in {0} seconds ago".format(
            metrics['timeSinceCheckIn']
        ))
    return metrics
41 |
42 |
logfile = '/var/log/chef/client.log'
# Defaults reported when the log cannot be parsed.
metrics = {'timeSinceCheckIn': 0, 'checkInDuration': 0, 'processesAmount': 0}

try:
    metrics = PopulateMetrics(logfile)

# Anything OS related with file handling should warn and exit
except IOError as err:
    print("status Warning {0}".format(err))

# Handle the log regex not returning a populated array
except IndexError:
    print("status OK node has not generated a parsable log")

# Exception rather than a bare except, so SystemExit and KeyboardInterrupt
# still propagate.
except Exception:
    print("status Warning unhandled error executing script")

# Always print out metrics to allow REACH to report metrics
finally:
    print("metric timeSinceCheckIn int32 {0}".format(
        metrics['timeSinceCheckIn']))
    print("metric checkInDuration int32 {0}".format(
        metrics['checkInDuration']))
    print("metric numberOfClients int32 {0}".format(
        metrics['processesAmount']))
65 |
--------------------------------------------------------------------------------
/cloud_queues.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2.7
2 | """
3 | Copyright 2014 Rackspace
4 |
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 |
9 | http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | ----
17 |
18 | Rackspace Cloud Monitoring Plugin for Cloud Queues
19 |
20 | Retrieves Stats for number of unclaimed(free), claimed, and total messages in a given queue.
21 | Useful for triggering Autoscale webhooks based on number of messages in a Cloud Queue.
22 |
23 | Requires:
24 | pyrax - https://github.com/rackspace/pyrax
25 |
26 | Usage:
27 | Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
28 |
29 | Set up a Cloud Monitoring check of type agent.plugin to run
30 | python ./cloud_queues.py QUEUE_NAME
31 |
32 | E.g.
33 | python ./cloud_queues.py myQueue
34 |
35 | The following is an example 'criteria' for a Rackspace Monitoring Alarm:
36 |
37 | if (metric['queue.free'] >= 100) {
38 | return new AlarmStatus(CRITICAL, 'More than 100 unclaimed msgs'); }
39 | if (metric['queue.free'] >= 50) {
40 | return new AlarmStatus(WARNING, 'More than 50 unclaimed msgs');
41 | }
42 | return new AlarmStatus(OK, 'Less than 50 unclaimed msgs');
43 |
44 | Please note that you will need to adjust the thresholds based on what works for you.
45 |
46 | Available metrics are
47 | queue.claimed
48 | queue.total
49 | queue.free
50 |
51 | """
52 | import os
53 | import argparse
54 | import pyrax
55 |
def get_queue_stats(queueName):
    """Print a status line and the integer statistics of one Cloud Queue.

    Credentials are read from ~/.rackspace_cloud_credentials. Every
    statistic whose value is exactly of type int is printed as a
    "metric queue.<name> int <value>" line. On failure a "status err"
    line is printed and the function returns without emitting metrics.

    Args:
        queueName: Name of the queue whose statistics are requested.
    """
    credentials_path = os.path.expanduser("~/.rackspace_cloud_credentials")
    pyrax.settings.set('identity_type', 'rackspace')
    pyrax.set_credential_file(credentials_path)

    try:
        queue_client = pyrax.queues
    except pyrax.exceptions.PyraxException:
        print('status err Unable to get queue %s' % queueName)
        return

    try:
        queue_stats = queue_client.get_stats(queueName)
    except pyrax.exceptions.NotFound:
        print('status err Unable to get queue stats %s' % queueName)
        return
    else:
        print('status success')

    for stat_name, stat_value in queue_stats.items():
        # Only plain ints are reported; anything else is skipped.
        if type(stat_value) is not int:
            continue
        print('metric queue.%s int %s' % (stat_name, stat_value))
77 |
78 |
if __name__ == "__main__":
    # The queue name is the single, required positional argument.
    cli = argparse.ArgumentParser()
    cli.add_argument("queueName", help="Cloud Queue name")
    get_queue_stats(cli.parse_args().queueName)
84 |
85 |
86 |
--------------------------------------------------------------------------------
/cloudbackup_mon.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # cloudbackup_mon.sh
4 | # Rackspace Cloud Monitoring Plugin to help detect if there are
5 | # problems with Cloud Backups.
6 | #
7 | # Copyright (c) 2013, Stephen Lang
8 | # All rights reserved.
9 | #
10 | # Redistribution and use in source and binary forms, with or without
11 | # modification, are permitted provided that the following conditions are met:
12 | #
13 | # Redistributions of source code must retain the above copyright notice,
14 | # this list of conditions and the following disclaimer.
15 | #
16 | # Redistributions in binary form must reproduce the above copyright
17 | # notice, this list of conditions and the following disclaimer in the
18 | # documentation and/or other materials provided with the distribution.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
24 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 | # POSSIBILITY OF SUCH DAMAGE.
31 | #
32 | # Curl Command:
33 | # curl -i -X POST -H 'Host: monitoring.api.rackspacecloud.com' -H
34 | # 'Accept-Encoding: gzip,deflate' -H 'X-Auth-Token: YOUR_API_TOKEN'
35 | # -H 'Content-Type: application/json; charset=UTF-8' -H 'Accept: application/json'
36 | # --data-binary '{"label": "Cloud Backup Check", "type": "agent.plugin", "details":
37 | # {"args": ["YOUR_API_KEY"],"file": "cloudbackup_mon.sh"}}' --compress
38 | # 'https://monitoring.api.rackspacecloud.com:443/v1.0/YOUR_ACCOUNT/entities/YOUR_ENTITY/checks'
39 | #
40 | # Usage:
41 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
42 | #
43 | # It needs to be passed 2-3 params by the backup system:
44 | #
45 | # apikey datacenter [backupid]
46 | #
47 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
48 | #
49 | # if (metric['diagnostics'] != 'No errors') {
50 | # return new AlarmStatus(CRITICAL, 'Errors found during last Cloud Backup: #{diagnostics}');
51 | # }
52 | # if (metric['reason'] != 'Success') {
53 | # return new AlarmStatus(CRITICAL, 'The last Cloud Backup was not successful.');
54 | # }
55 | # if (metric['state'] != 'Completed') {
56 | # return new AlarmStatus(CRITICAL, 'The last Cloud Backup was not completed.');
57 | # }
58 | # if (metric['agent_running'] == 0) {
59 | # return new AlarmStatus(CRITICAL, 'The Cloud Backup Agent is not running.');
60 | # }
61 | # if (metric['age'] > 129600) {
62 | # return new AlarmStatus(CRITICAL, 'The last Cloud Backup is more than 36 hours old!');
63 | # }
64 | # return new AlarmStatus(OK, 'The last Cloud Backup was successful.');
65 |
66 | function help {
67 |
68 | cat <
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | """
18 |
19 | import os
20 | import sys
21 |
22 | import argparse
23 | import pyrax
24 |
25 |
def check_usage(instance_id, threshold, region):
    """Report volume usage of one Cloud Database instance.

    Prints a status line followed by three float metrics: the volume size,
    the used space and the percentage used (0-100).

    Args:
        instance_id: Id of the Cloud Database instance to inspect.
        threshold: Usage percentage (0-100) at or above which the status
            is reported as an error.
        region: Cloud region passed to pyrax, e.g. DFW or ORD.

    Exits with status 1 when the instance cannot be found or when its
    volume size is missing or zero.
    """
    pyrax.set_credential_file(
        os.path.expanduser("~/.rackspace_cloud_credentials"))
    cdb = pyrax.connect_to_cloud_databases(region=region)

    matched_instance = None
    for instance in cdb.list():
        if instance.id == instance_id:
            matched_instance = instance
            break
    if matched_instance is None:
        print('status err Unable to find instance %s' % instance_id)
        sys.exit(1)

    # Force usage lookup
    matched_instance.get()
    database_size = matched_instance.volume['size']
    database_usage = matched_instance.volume['used']

    if not database_size:
        # Avoid a ZeroDivisionError below when no size is reported.
        print('status err Unable to determine volume size for instance %s'
              % instance_id)
        sys.exit(1)

    # Scale to a real percentage so it is comparable with the threshold
    # (default 85.0); the 100.0 factor also forces float division on
    # Python 2.
    percentage_used = 100.0 * database_usage / database_size

    if percentage_used >= threshold:
        print('status err usage over threshold')
    else:
        print('status ok usage within threshold')

    print("metric database_GB_container_size float %s" % database_size)
    print("metric database_GB_used float %s" % database_usage)
    print("metric percentage_used float %s" % percentage_used)
53 |
if __name__ == "__main__":
    # Positional arguments: instance id, region and an optional threshold.
    cli = argparse.ArgumentParser()
    cli.add_argument("instance", help="Cloud Database instance id")
    cli.add_argument("region", help="Cloud region, e.g. DFW or ORD")
    cli.add_argument(
        "threshold",
        nargs='?',
        default=85.0,
        type=float,
        help="Storage threshold to alert on")
    options = cli.parse_args()
    check_usage(options.instance, options.threshold, options.region)
62 |
63 |
--------------------------------------------------------------------------------
/cloudload_balancer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Rackspace Cloud Monitoring plugin to provide cloud load balancer
4 |
5 | Requirement:
6 | pyrax - https://github.com/rackspace/pyrax
7 |
8 | Copyright 2013 Rackspace
9 |
10 | Licensed under the Apache License, Version 2.0 (the "License");
11 | you may not use this file except in compliance with the License.
12 | You may obtain a copy of the License at
13 |
14 | http://www.apache.org/licenses/LICENSE-2.0
15 |
16 | Unless required by applicable law or agreed to in writing, software
17 | distributed under the License is distributed on an "AS IS" BASIS,
18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 | See the License for the specific language governing permissions and
20 | limitations under the License.
21 | """
22 | import os
23 | import argparse
24 | import pyrax
25 |
# Transfer counters read from the most recent load balancer usage record.
# For every entry, 'key' is the suffix of the emitted metric name, 'ref' is
# the field looked up in the usage record and 'unit' is printed in the type
# position of the metric line.
USAGE_STATS = [
    {'key': 'incoming', 'ref': 'incomingTransfer', 'unit': 'int'},
    {'key': 'incoming_ssl', 'ref': 'incomingTransferSsl', 'unit': 'int'},
    {'key': 'outgoing', 'ref': 'outgoingTransfer', 'unit': 'int'},
    {'key': 'outgoing_ssl', 'ref': 'outgoingTransferSsl', 'unit': 'int'}
]


# Connection counters read from the load balancer stats; the entries use the
# same 'key' / 'ref' / 'unit' layout as USAGE_STATS.
STATS = [
    {'key': 'connect_timeout', 'ref': 'connectTimeOut', 'unit': 'int'},
    {'key': 'connect_error', 'ref': 'connectError', 'unit': 'int'},
    {'key': 'connect_failure', 'ref': 'connectFailure', 'unit': 'int'},
    {'key': 'data_timed_out', 'ref': 'dataTimedOut', 'unit': 'int'},
    {'key': 'keep_alive_timed_out', 'ref': 'keepAliveTimedOut', 'unit': 'int'},
    {'key': 'max_conns', 'ref': 'maxConn', 'unit': 'int'}
]
42 |
43 |
def check_usage(instance_id, region):
    """Print status and metrics for one Cloud Load Balancer.

    Emits, all prefixed with the normalised load balancer name: a status
    metric (100.0 when ACTIVE, otherwise 0.0), the transfer counters listed
    in USAGE_STATS, node counts and the connection counters listed in STATS.

    Args:
        instance_id: Id of the load balancer to inspect.
        region: Cloud region passed to pyrax, e.g. DFW or ORD.
    """
    pyrax.settings.set('identity_type', 'rackspace')
    pyrax.set_credential_file(
        os.path.expanduser("~/.rackspace_cloud_credentials"),
        region=region)

    clb = pyrax.cloud_loadbalancers

    try:
        instance = clb.get(instance_id)
    except pyrax.exceptions.NotFound:
        print('status err Unable to find instance %s' % instance_id)
        return
    print('status ok')

    mgr = instance.manager
    status = instance.status
    nodes = instance.nodes
    # Metric names use the lower-cased name with dashes replaced.
    name = instance.name.lower().replace('-', '_')

    # Only the most recent usage record is reported. The record list can be
    # empty, in which case every usage counter falls back to 0 instead of
    # raising IndexError.
    usage_records = mgr.get_usage(instance)['loadBalancerUsageRecords']
    usage = usage_records[-1] if usage_records else {}

    if status == 'ACTIVE':
        print('metric %s.status float 100.0' % (name))
    else:
        print('metric %s.status float 0.0' % (name))

    for stat in USAGE_STATS:
        print('metric %s.%s %s %s' %
              (name, stat['key'], stat['unit'], usage.get(stat['ref'], 0)))

    online_nodes = 0
    offline_nodes = 0
    draining_nodes = 0
    total_nodes = len(nodes)

    # The three checks are independent, so a node may be counted in more
    # than one bucket (e.g. OFFLINE status with DRAINING condition).
    for node in nodes:
        if node.status == 'ONLINE' and node.condition == 'ENABLED':
            online_nodes += 1
        if node.status == 'OFFLINE' or node.condition == 'DISABLED':
            offline_nodes += 1
        if node.status == 'DRAINING' or node.condition == 'DRAINING':
            draining_nodes += 1

    print('metric %s.total_nodes int %s' % (name, total_nodes))
    print('metric %s.online_nodes int %s' % (name, online_nodes))
    print('metric %s.offline_nodes int %s' % (name, offline_nodes))
    print('metric %s.draining_nodes int %s' % (name, draining_nodes))

    stats = mgr.get_stats(instance)
    for stat in STATS:
        print('metric %s.%s %s %s' %
              (name, stat['key'], stat['unit'], stats[stat['ref']]))
97 |
98 |
if __name__ == "__main__":
    # Positional arguments: load balancer id and region.
    cli = argparse.ArgumentParser()
    cli.add_argument("instance", help="Cloud Load Balancer instance id")
    cli.add_argument("region", help="Cloud region, e.g. DFW or ORD")
    options = cli.parse_args()
    # The region is upper-cased before it is handed to check_usage.
    region_name = options.region.upper()
    check_usage(options.instance, region_name)
105 |
--------------------------------------------------------------------------------
/cman_nodes.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | ## Rackspace Cloud Monitoring Plug-In
3 | ## CMAN nodes check
4 | #
5 | # Author: James Turnbull
6 | # Copyright (c) 2012 James Turnbull
7 | #
8 | # MIT License:
9 | # Permission is hereby granted, free of charge, to any person obtaining a copy
10 | # of this software and associated documentation files (the "Software"), to deal
11 | # in the Software without restriction, including without limitation the rights
12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 | # copies of the Software, and to permit persons to whom the Software is
14 | # furnished to do so, subject to the following conditions:
15 | #
16 | # The above copyright notice and this permission notice shall be included in
17 | # all copies or substantial portions of the Software.
18 | #
19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 | # THE SOFTWARE.
26 | #
27 | # Usage:
28 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
29 | #
30 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
31 | #
32 | # if (metric['node_status'] == 'X') {
33 | # return new AlarmStatus(CRITICAL, 'Host is not a member of the cluster!');
34 | # }
35 | #
36 | # if (metric['node_status'] == 'd') {
37 | # return new AlarmStatus(CRITICAL, 'Host is disallowed from cluster!');
38 | # }
39 | #
40 | # return new AlarmStatus(OK, 'Host is a member of the cluster.');
41 | #
42 |
# Abort the plugin: write a "status" line carrying the reason to stdout and
# terminate with exit code 1.
def fail(status="Unknown failure")
  $stdout.puts("status #{status}")
  exit(1)
end
48 |
# Record a metric in @metrics, keyed by name; nothing is printed until
# output_success runs.
def metric(name,type,value)
  entry = {}
  entry[:type] = type
  entry[:value] = value
  @metrics[name] = entry
end
56 |
# Print the success status line followed by one "metric" line for every
# entry collected in @metrics.
def output_success
  puts "status Cman node status for #{@hostname}"
  @metrics.each_pair do |metric_name, details|
    puts format("metric %s %s %s", metric_name, details[:type], details[:value])
  end
end
64 |
# Load the standard-library helpers. LoadError is not a StandardError, so it
# has to be named explicitly for the rescue to ever trigger.
begin
  require 'optparse'
  require 'shellwords'
rescue LoadError
  fail "Failed to load required ruby gems!"
end

@metrics = {}
options = {}

args = ARGV.dup

OptionParser.new do |o|
  o.banner = "Usage: #{$0} [options]"
  o.on('-h', '--hostname HOSTNAME', 'Check status of node lid option') do |h|
    options[:host] = h
  end
  # '-h' is already taken by --hostname above, so help is offered through
  # the long option only.
  o.on_tail('--help', 'Show this message') { puts o; exit }
  o.parse!(args)
end

# Default to the short name of the local host when no hostname was given.
@hostname = options[:host] || `hostname -s`.chomp

begin
  # Escape the hostname because the command line is run through a shell.
  node_status = `cman_tool nodes -n #{Shellwords.escape(@hostname)} -F type`.strip
  # Backticks do not raise on a non-zero exit, so check it explicitly.
  unless $?.success?
    fail "Problem running cman_tool plugin: exit status #{$?.exitstatus}"
  end
  metric("node_status","string",node_status)
rescue => e
  fail "Problem running cman_tool plugin: #{e.message}"
end

output_success
95 |
--------------------------------------------------------------------------------
/consul.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Script to return status and metrics for Consul
4 | #
5 | # Justin Phelps
6 | # All Rights Reserved.
7 | #
8 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
9 | # not use this file except in compliance with the License.
10 | # You may obtain a copy of the License at
11 | #
12 | # http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 | #
20 | # Example alarm criteria:
21 | #
22 | # if (metric['node_count'] < 3) {
23 | # return new AlarmStatus(CRITICAL, 'Node Count is below 3.');
24 | # }
25 | #
26 | # if (metric['node_count'] < 5) {
27 | # return new AlarmStatus(WARNING, 'Node Count is below 5.');
28 | # }
29 | #
30 | # return new AlarmStatus(OK, 'Node Count is within range.');
31 | #
32 |
33 | import psutil
34 | import json
35 | import urllib2
36 |
def check_process_name(name):
    """Return a status line telling whether a process called *name* exists.

    Processes that disappear while the process list is being walked are
    skipped.
    """
    for process in psutil.process_iter():
        try:
            process_info = process.as_dict(attrs=['name'])
        except psutil.NoSuchProcess:
            continue
        if process_info['name'] == name:
            return 'status ok consul is running'
    return 'status error consul is not running'
48 |
def consul_http2json(url):
    """Fetch *url* and return the decoded JSON body.

    Returns:
        The parsed JSON data, or None when the URL cannot be opened.

    Raises:
        ValueError: If the response body is not valid JSON.
    """
    try:
        response = urllib2.urlopen(url)
    except urllib2.URLError:
        # The endpoint is unreachable; None signals "no data" to the caller.
        return None
    try:
        return json.loads(response.read())
    finally:
        # Always release the connection, even when decoding fails.
        response.close()
59 |
def consul_agent_type():
    """Return the agent_type metric line for the local Consul agent.

    The value is "server" when the agent's Config.Server flag is True,
    "client" for any other value, and "unknown" when the agent information
    cannot be fetched or does not have the expected layout.
    """
    try:
        agent_info = consul_http2json("http://localhost:8500/v1/agent/self?pretty=1")
        is_server = agent_info['Config']['Server']
    # Exception (not a bare except) so that SystemExit and
    # KeyboardInterrupt still propagate.
    except Exception:
        return 'metric agent_type string unknown'

    if is_server is True:
        return 'metric agent_type string server'
    return 'metric agent_type string client'
72 |
def consul_node_count():
    """Return the node_count metric line as seen by the local Consul agent.

    The count is the length of the catalog node list. When the list cannot
    be fetched the metric is reported as the string "unknown".
    """
    try:
        nodes = consul_http2json("http://localhost:8500/v1/catalog/nodes?pretty=1")
        count = len(nodes)
    # Exception (not a bare except) so that SystemExit and
    # KeyboardInterrupt still propagate.
    except Exception:
        # NOTE(review): the failure line uses type "string" while the
        # success line uses "int32" - confirm the agent accepts a metric
        # whose type changes between runs.
        return 'metric node_count string unknown'

    return 'metric node_count int32 {0} nodes'.format(count)
82 |
def main():
    """Print the status line, then the agent_type and node_count metrics."""
    print(check_process_name("consul"))
    print(consul_agent_type())
    print(consul_node_count())


if __name__ == '__main__':
    main()
93 |
--------------------------------------------------------------------------------
/content_check.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | #
4 | # usage: content_check.py [-h] [--timeout TIMEOUT] url pattern
5 | #
6 | # Rackspace Monitoring plugin to check a URL for a regular expression. Useful if
7 | # the URL you need to check is not publicly accessible, but can be reached by
8 | # another entity. Returns the metric 'found' with a value of 'yes' or 'no'.
9 | #
10 | # positional arguments:
11 | # url url to check
12 | # pattern regex to check for
13 | #
14 | # optional arguments:
15 | # -h, --help show this help message and exit
16 | # --timeout TIMEOUT timeout in seconds (default 5)
17 | #
18 | #
19 | # content_check.py - Rackspace Cloud Monitoring plugin
20 | # Copyright (C) 2015 Carl George
21 | #
22 | # This program is free software: you can redistribute it and/or modify
23 | # it under the terms of the GNU General Public License as published by
24 | # the Free Software Foundation, either version 3 of the License, or
25 | # (at your option) any later version.
26 | #
27 | # This program is distributed in the hope that it will be useful,
28 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
29 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 | # GNU General Public License for more details.
31 | #
32 | # You should have received a copy of the GNU General Public License
33 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
34 |
35 |
36 | """Rackspace Monitoring plugin to check a URL for a regular expression. Useful
37 | if the URL you need to check is not publicly accessible, but can be reached by
38 | another entity. Returns the metric 'found' with a value of 'yes' or 'no'.
39 | """
40 |
41 |
42 | from __future__ import print_function
43 |
44 | import argparse
45 | import re
46 |
47 | try:
48 | from urllib.request import urlopen
49 | from urllib.error import HTTPError
50 | except ImportError:
51 | from urllib2 import urlopen, HTTPError
52 |
53 |
# Command line: the URL to fetch, the regex to look for and an optional
# timeout in seconds.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('url', help='url to check')
parser.add_argument('pattern', help='regex to check for')
parser.add_argument('--timeout', type=int, default=5, help='timeout in seconds (default 5)')
args = parser.parse_args()

# Allow the scheme to be omitted on the command line.
if not args.url.startswith('http'):
    args.url = 'http://{0}'.format(args.url)

try:
    response = urlopen(args.url, timeout=args.timeout)
    try:
        # Replace undecodable bytes instead of crashing on pages that are
        # not valid UTF-8.
        page = response.read().decode('utf-8', 'replace')
    finally:
        response.close()
except HTTPError as e:
    raise SystemExit('{0} {1} ({2})'.format(e.code, e.reason, args.url))
except IOError as e:
    # URLError and socket timeouts derive from IOError (OSError on
    # Python 3); exit with a short message instead of a traceback.
    raise SystemExit('{0} ({1})'.format(e, args.url))

m = re.search(args.pattern, page)

if m:
    print('status ok\nmetric found string yes')
else:
    print('status err\nmetric found string no')
75 |
--------------------------------------------------------------------------------
/curl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Rackspace Cloud Monitoring Plug-In
4 | # Simple curl request test that can be used to query internal hosts
5 | #
6 | # (C)2014 James Buchan
7 | # All Rights Reserved.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
10 | # not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | #
21 | # Usage:
22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
23 | #
24 | # It accepts one argument, which should be the site you wish to query.
25 | #
26 | # Returns 3 metrics:
27 | # - code: The final status code returned
28 | # - time_connect: The time, in seconds, it took from the start until the TCP
29 | # connect to the remote host (or proxy) was completed
30 | # - time_total: The total time, in seconds, that the full operation lasted
31 | #
32 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
33 | #
34 | # if (metric['code'] != "200") {
35 | # return new AlarmStatus(CRITICAL, '#{code} response received. Expected 200.');
36 | # }
37 | # return new AlarmStatus(OK, '200 response received');
38 | #
39 |
# The URL to query is the only argument.
if [ -z "$1" ]
then
    echo "status err missing URL argument"
    exit 1
fi

# Quote the URL so that characters such as '?' or '*' are not expanded by
# the shell. stderr is merged in so that curl's error text can be reported.
response=$(curl -sS -f -o /dev/null -I -w "%{response_code} %{time_connect} %{time_total}" "$1" 2>&1)

if [ $? -eq 0 ]
then
    echo "status ok connection made"
    echo "metric code string $(echo $response | awk '{print $1}')"
    echo "metric time_connect double $(echo $response | awk '{print $2}')"
    echo "metric time_total double $(echo $response | awk '{print $3}')"
    exit 0
else
    #remove statistics from our status line, only keep the error
    echo "status $(echo $response | awk -F'000 ' '{$0=$1}1' )"
fi

exit 1
55 |
--------------------------------------------------------------------------------
/curl_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Rackspace Cloud Monitoring Plug-In
4 | # Simple curl request test that can be used to query internal hosts
5 | #
6 | # (C)2014 James Buchan
7 | # All Rights Reserved.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
10 | # not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | #
21 | # Usage:
22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
23 | #
24 | # It accepts one argument, which should be the site you wish to query.
25 | #
26 | # Returns 4 metrics (plus etag, content_length and page_age when the matching response headers are present):
27 | # - code: The final status code returned
28 | # - time_connect: The time, in seconds, it took from the start until the TCP
29 | # connect to the remote host (or proxy) was completed
30 | # - time_total: The total time, in seconds, that the full operation lasted
31 | # - url: The last URL that was queried (if redirects occurred)
32 | #
33 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
34 | #
35 | # if (metric['code'] != '200') {
36 | # return new AlarmStatus(CRITICAL, '#{code} response received. Expected 200.');
37 | # }
38 | # return new AlarmStatus(OK, '200 response received');
39 | #
40 |
# extract_header TEXT NAME
# Print the value of the last "NAME: value" line found in TEXT, without a
# trailing newline. Prints nothing and returns non-zero when there is no
# such line or its value is empty.
function extract_header()
{
    local ret
    # Header names are case-insensitive, and the match is anchored so that
    # e.g. "X-ETag:" is not mistaken for "ETag:".
    ret=$(echo "$1" | grep -i "^$2:" | tail -1 | cut -d' ' -f 2- | tr -d '\n\r' )
    # printf with a quoted value avoids globbing and option parsing.
    [ -n "$ret" ] && printf '%s' "$ret"
}
46 |
# The URL to query is the only argument.
if [ -z "$1" ]
then
    echo "status err missing URL argument"
    exit 1
fi

# Quote the URL so that characters such as '?' or '*' are not expanded by
# the shell. The -w fields are written as pseudo headers so that they can be
# read back with extract_header.
response=$(curl -sS -L -f -I -w "Response-Code: %{response_code}\nTime-Connect: %{time_connect}\nTime-Total: %{time_total}\nURL-Effective: %{url_effective}\n" "$1" 2>&1)

if [ $? -eq 0 ]
then
    echo "status ok connection made"
    echo "metric code string $(extract_header "$response" Response-Code)"
    echo "metric time_connect double $(extract_header "$response" Time-Connect) seconds"
    echo "metric time_total double $(extract_header "$response" Time-Total) seconds"
    echo "metric url string $(extract_header "$response" URL-Effective)"

    # The remaining metrics are emitted only when the header is present.
    etag=$(extract_header "$response" ETag)
    [ -n "$etag" ] && echo "metric etag string $etag"

    length=$(extract_header "$response" Content-Length)
    [ -n "$length" ] && echo "metric content_length uint32 $length bytes"

    modified=$(extract_header "$response" Last-Modified)
    if [ -n "$modified" ]
    then
        # Skip the metric when the date cannot be parsed instead of failing
        # in the arithmetic below.
        if modified_seconds=$(date --date="$modified" +"%s" 2>/dev/null)
        then
            age=$(($(date +"%s") - modified_seconds))
            echo "metric page_age uint64 $age seconds"
        fi
    fi

    exit 0
else
    #remove statistics from our status line, only keep the error
    echo "status $(echo $response | awk -F'000 ' '{$0=$1}1' )"
fi

exit 1
78 |
--------------------------------------------------------------------------------
/dir_stats.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Gather stats on the directory size, number of files and oldest file name
4 |
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
set -e
# Directory to inspect (first argument).
TARGET="${1}"

if [ -z "${TARGET}" ]
then
    echo "status err missing target argument"
    exit 1
fi

# TARGET is quoted everywhere so that paths containing spaces work.
if [ ! -d "${TARGET}" ]
then
    echo "status err ${TARGET} does not exist or is not a directory"
    exit 1
fi

SIZE="$(du -sm "${TARGET}" | awk '{print $1}')"
NB_FILES="$(find "${TARGET}" -type f | wc -l)"
if [ "${NB_FILES}" -gt 0 ]
then
    # Each line is "<mtime> <path>"; the numerically smallest mtime is the
    # oldest file.
    OLDEST_FILE_STAT="$(find "${TARGET}" -type f -printf "%T@ %p\n" | sort -n | head -n1)"
    # Everything after the first space is the path, which keeps file names
    # containing spaces intact.
    OLDEST_FILE_NAME="${OLDEST_FILE_STAT#* }"
    OLDEST_FILE_MTIME="${OLDEST_FILE_STAT%% *}"
    # Drop the fractional part of the timestamp.
    OLDEST_FILE_MTIME="${OLDEST_FILE_MTIME%%.*}"
    OLDEST_FILE_AGE=$(( $(date +%s) - OLDEST_FILE_MTIME ))
else
    OLDEST_FILE_NAME='no_files'
    OLDEST_FILE_AGE=0
fi

echo "status ok target uses ${SIZE} MB in ${NB_FILES} files"
echo "metric total_size uint64 ${SIZE} megabytes"
echo "metric total_files uint64 ${NB_FILES} files"
echo "metric oldest_file_name string ${OLDEST_FILE_NAME}"
echo "metric oldest_file_age uint64 ${OLDEST_FILE_AGE} seconds"
50 |
--------------------------------------------------------------------------------
/directory.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # Copyright 2015 Rackspace
4 |
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 |
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | # ----
17 | # Custom check for a directory presence, files, oldest file name & age and size (in Mbytes)
18 | #
19 | # if (metric['total_size'] < 0) {
20 | # return new AlarmStatus(CRITICAL, 'Directory #{dir} check failed - no such directory?');
21 | # }
22 | # if (metric['total_size'] > 1000) {
23 | # return new AlarmStatus(CRITICAL, 'Directory #{dir} is #{total_size} Mbytes');
24 | # }
25 | # if (metric['total_size'] > 500) {
26 | # return new AlarmStatus(WARNING, 'Directory #{dir} is #{total_size} Mbytes');
27 | # }
28 | #
29 | # E.g.
30 | # ./Directory.sh DIRECTORY
31 |
set -e
# Directory to inspect (first argument).
TARGET="${1}"

if [ -z "${TARGET}" ]
then
    echo "status err missing target argument"
    echo "Usage: $0 DIRECTORY"
    exit 1
fi

# TARGET is quoted everywhere so that paths containing spaces work.
if [ ! -d "${TARGET}" ]
then
    echo "status err ${TARGET} does not exist or is not a directory"
    exit 1
fi

SIZE="$(du -sm "${TARGET}" | awk '{print $1}')"
NB_FILES="$(find "${TARGET}" -type f | wc -l)"
if [ "${NB_FILES}" -gt 0 ]
then
    # Each line is "<mtime> <path>"; the numerically smallest mtime is the
    # oldest file.
    OLDEST_FILE_STAT="$(find "${TARGET}" -type f -printf "%T@ %p\n" | sort -n | head -n1)"
    # Everything after the first space is the path, which keeps file names
    # containing spaces intact.
    OLDEST_FILE_NAME="${OLDEST_FILE_STAT#* }"
    OLDEST_FILE_MTIME="${OLDEST_FILE_STAT%% *}"
    # Drop the fractional part of the timestamp.
    OLDEST_FILE_MTIME="${OLDEST_FILE_MTIME%%.*}"
    OLDEST_FILE_AGE=$(( $(date +%s) - OLDEST_FILE_MTIME ))
else
    OLDEST_FILE_NAME='no_files'
    OLDEST_FILE_AGE=0
fi

echo "status ok target uses ${SIZE} MB in ${NB_FILES} files"
echo "metric dir string ${TARGET}"
echo "metric total_size uint64 ${SIZE} megabytes"
echo "metric total_files uint64 ${NB_FILES} files"
echo "metric oldest_file_name string ${OLDEST_FILE_NAME}"
echo "metric oldest_file_age uint64 ${OLDEST_FILE_AGE} seconds"
67 |
--------------------------------------------------------------------------------
/dns_resolution.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # dns-resolve.sh
4 | # Rackspace Cloud Monitoring Plugin to verify the current return status of DNS lookups.
5 | #
6 | # Copyright (c) 2014, Lindsey Anderson
7 | # Copyright (c) 2015, Michael Burns
8 | # All rights reserved.
9 | #
10 | # Redistribution and use in source and binary forms, with or without
11 | # modification, are permitted provided that the following conditions are met:
12 | #
13 | # Redistributions of source code must retain the above copyright notice,
14 | # this list of conditions and the following disclaimer.
15 | #
16 | # Redistributions in binary form must reproduce the above copyright
17 | # notice, this list of conditions and the following disclaimer in the
18 | # documentation and/or other materials provided with the distribution.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
24 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 | # POSSIBILITY OF SUCH DAMAGE.
31 | #
32 | # Example criteria:
33 | #
34 | #if (metric['dns_lookup'] != 'successful'){
35 | # return new AlarmStatus(CRITICAL, 'DNS Lookups are unavailable.');
36 | #}
37 | #return new AlarmStatus(OK, 'DNS Lookups from this server are responsive.');
38 |
39 |
# Name to look up and DNS record type to request; both are optional
# positional arguments.
RESOLVE=${1:-"example.com"}
TYPE=${2:-"A"}

# "+noall +answer" restricts dig to the ANSWER section. Lines starting with
# ';' are dig diagnostics rather than records (some dig versions print e.g.
# ";; connection timed out; no servers could be reached" on stdout), so they
# are dropped to avoid reporting a failed lookup as successful.
res=$(dig +noall +answer "${RESOLVE}" "${TYPE}" 2>/dev/null | grep -v '^;' | head -1)

if [ -z "$res" ]; then
    echo "status critical dns_lookup unsuccessful"
    echo "metric dns_lookup string failed"
else
    echo "status ok dns_lookup successful"
    echo "metric dns_lookup string successful"
fi
52 |
--------------------------------------------------------------------------------
/docker_check.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Rackspace Cloud Monitoring Plugin for Docker."""
3 |
4 | # Copyright 2015 Frank Ritchie
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | # -----
19 | #
20 | # This plugin monitors the Docker service via the 'docker info' command.
21 | # By default the monitor fails if the check does not complete successfully.
22 | # Metrics for:
23 | #
24 | # - the number of images
25 | # - the number of containers
26 | # - the number of go routines
27 | # - the driver used
28 | # - data space used
29 | # - total data space
30 | # - metadata space used
31 | # - total metadata space
32 | #
33 | # are also reported.
34 | #
35 | # Requires:
36 | # Python 2.6 or greater
37 | # docker-py: https://github.com/docker/docker-py
38 | #
39 | # Usage:
40 | # Place script in /usr/lib/rackspace-monitoring-agent/plugins.
41 | # Ensure file is executable (755).
42 | #
43 | # Set up a Cloud Monitoring Check of type agent.plugin to run
44 | #
45 | # docker_check.py -u URL
46 | #
47 | # The URL is optional and can be a TCP or Unix socket, e.g.
48 | #
49 | # docker_check.py -u tcp://0.0.0.0:2376
50 | # or
51 | # docker_check.py -u unix://var/run/docker.sock
52 | #
53 | # The default URL is unix://var/run/docker.sock.
54 | #
55 | # There is no need to define specific custom alert criteria.
56 | # As stated, the monitor fails if the stats cannot be collected.
57 | # It is possible to define custom alert criteria with the reported
58 | # metrics if desired.
59 | #
60 |
61 | import sys
62 | from docker import Client
63 | from optparse import OptionParser
64 |
65 |
class DockerService(object):
    """Report 'docker info' metrics for one Docker daemon.

    The daemon is assumed to be stopped until a query succeeds.
    """

    # (metric name, label used in the 'DriverStatus' list of 'docker info').
    # Not every storage driver reports these, so each one is optional.
    SPACE_METRICS = (
        ('data_space_used', 'Data Space Used'),
        ('data_space_total', 'Data Space Total'),
        ('metadata_space_used', 'Metadata Space Used'),
        ('metadata_space_total', 'Metadata Space Total'),
    )

    def __init__(self, url):
        """Remember the daemon URL (TCP or Unix socket)."""

        self.url = url
        self.docker_running = False

    @staticmethod
    def _space_metric_lines(driver_status):
        """Return the metric lines for the space figures in driver_status.

        driver_status maps a DriverStatus label to its value, expected to
        look like '<scalar> <unit>' (e.g. '1.5 GB'). Labels that are absent,
        or whose value does not have that shape, are skipped instead of
        aborting the whole check.
        """

        lines = []
        for metric_name, label in DockerService.SPACE_METRICS:
            parts = str(driver_status.get(label, '')).split()
            if len(parts) == 2:
                lines.append('metric %s float %s %s'
                             % (metric_name, parts[0], parts[1]))
        return lines

    def docker_stats(self):
        """Connect to the Docker daemon and print its stats.

        Prints 'status err ...' and exits with code 1 when the daemon
        cannot be queried.
        """

        docker_info = None
        try:
            # Building the client is inside the try as well, so a bad URL is
            # reported as a failed check rather than as a traceback.
            docker_info = Client(base_url=self.url).info()
            self.docker_running = True
        # Broad on purpose: any failure here means the check failed.
        except Exception:
            self.docker_running = False

        if not self.docker_running:
            print('status err failed to obtain docker stats.')
            sys.exit(1)

        # 'docker info' reports Driver Status as a list of [label, value]
        # pairs (it may be missing or empty); turn it into a dict.
        driver_status = dict(
            (entry[0], entry[1])
            for entry in (docker_info.get('DriverStatus') or []))

        print('metric images int64 %s' % (docker_info['Images'],))
        print('metric containers int64 %s' % (docker_info['Containers'],))
        print('metric go_routines int64 %s' % (docker_info['NGoroutines'],))
        print('metric driver string %s' % (docker_info['Driver'],))

        for line in self._space_metric_lines(driver_status):
            print(line)

        print('status ok succeeded in obtaining docker stats.')
121 |
122 |
def main():
    """Read the daemon URL from the command line and report its stats."""

    option_parser = OptionParser()
    option_parser.add_option(
        '-u', '--url',
        default='unix://var/run/docker.sock',
        help='URL for Docker service (Unix or TCP socket).')
    settings = option_parser.parse_args()[0]

    DockerService(settings.url).docker_stats()


if __name__ == '__main__':
    main()
136 |
--------------------------------------------------------------------------------
/docker_stats_check.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Rackspace Cloud Monitoring Plugin for Docker Stats."""
3 |
4 | # Copyright 2015 Nachiket Torwekar
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 | # -----
19 | #
20 | # This plugin monitors the Docker containers via the 'docker stats' command.
21 | # By default the monitor fails if the check does not complete successfully.
22 | # Metrics for:
23 | #
24 | # - cpu_total_usage
25 | # - cpu_system_usage
26 | # - cpu_kernel_mode_usage
27 | # - cpu_user_mode_usage
28 | # - pids_current
29 | # - memory_max_usage
30 | # - memory_total_cache
31 | # - network_rx_bytes
32 | # - network_rx_packets
33 | # - network_tx_bytes
34 | # - network_tx_packets
35 | #
36 | # are also reported.
37 | #
38 | # Requires:
39 | # Python 2.6 or greater
40 | # docker-py: https://github.com/docker/docker-py
41 | #
42 | # Usage:
43 | # Place script in /usr/lib/rackspace-monitoring-agent/plugins.
44 | # Ensure file is executable (755).
45 | #
46 | # Set up a Cloud Monitoring Check of type agent.plugin to run
47 | #
48 | # docker_stats_check.py -u URL -c CONTAINER
49 | #
50 | # The URL is optional and can be a TCP or Unix socket, e.g.
51 | #
52 | # docker_stats_check.py -u tcp://0.0.0.0:2376
53 | # or
54 | # docker_stats_check.py -u unix://var/run/docker.sock
55 | #
56 | # The default URL is unix://var/run/docker.sock.
57 | #
58 | # The container can be name or id
59 | # docker_stats_check.py -u unix://var/run/docker.sock -c agitated_leakey
60 | # or
61 | # docker_stats_check.py -u unix://var/run/docker.sock -c 1f3b3b8f0fcc
62 | #
63 | # There is no need to define specific custom alert criteria.
64 | # As stated, the monitor fails if the stats cannot be collected.
65 | # It is possible to define custom alert criteria with the reported
66 | # metrics if desired.
67 | #
68 |
69 | import sys
70 | from docker import Client
71 | from optparse import OptionParser
72 | from subprocess import call
73 | import json
74 |
class DockerService(object):
    """Report 'docker stats' metrics for one container of a Docker daemon.

    The daemon is assumed to be stopped until a query succeeds.
    """

    NETWORK_COUNTERS = ('rx_bytes', 'rx_packets', 'tx_bytes', 'tx_packets')

    def __init__(self, url, container):
        """Remember the daemon URL and the container name or id."""

        self.url = url
        self.container = container
        self.docker_running = False

    @staticmethod
    def _core_metric_lines(sample):
        """Return the CPU, memory and pids metric lines for one sample.

        Raises KeyError when a CPU or memory field is missing. The pids
        metric is only emitted when the sample contains 'pids_stats'.
        """

        cpu = sample['cpu_stats']
        usage = cpu['cpu_usage']
        memory = sample['memory_stats']
        lines = [
            'metric cpu_total_usage int64 %s' % (usage['total_usage'],),
            'metric cpu_system_usage int64 %s' % (cpu['system_cpu_usage'],),
            'metric cpu_kernel_mode_usage int64 %s'
            % (usage['usage_in_kernelmode'],),
            'metric cpu_user_mode_usage int64 %s'
            % (usage['usage_in_usermode'],),
            'metric memory_max_usage int64 %s' % (memory['max_usage'],),
            'metric memory_total_cache int64 %s'
            % (memory['stats']['total_cache'],),
        ]
        pids = sample.get('pids_stats') or {}
        if 'current' in pids:
            lines.append('metric pids_current int64 %s' % (pids['current'],))
        return lines

    @staticmethod
    def _sum_networks(networks):
        """Add up the rx/tx counters of every interface in networks."""

        totals = dict.fromkeys(DockerService.NETWORK_COUNTERS, 0)
        for counters in networks.values():
            for key in DockerService.NETWORK_COUNTERS:
                totals[key] += counters[key]
        return totals

    def docker_stats(self):
        """Fetch one stats sample for the container and print its metrics.

        Only the first sample of the stats stream is used. Exits with code 0
        on success; prints 'status err ...' and exits with code 1 when the
        sample cannot be fetched or parsed.
        """

        sample = None
        metric_lines = []
        try:
            # Connecting, reading and parsing all happen before anything is
            # printed, so a failure never follows a 'status ok' line.
            stream = Client(base_url=self.url).stats(self.container)
            sample = json.loads(next(iter(stream)))
            metric_lines = self._core_metric_lines(sample)
            self.docker_running = True
        # Broad on purpose: any failure here means the check failed.
        except Exception:
            self.docker_running = False

        if not self.docker_running:
            print('status err failed to obtain docker container stats.')
            sys.exit(1)

        print('status ok succeeded in obtaining docker container stats.')
        for line in metric_lines:
            print(line)

        if 'network' in sample:
            # Single set of counters for the whole container.
            print_network_stat(sample['network'])
        elif 'networks' in sample:
            # Counters per interface: report each one, then the sum.
            networks = sample['networks']
            for ifname in networks:
                print_network_stat(networks[ifname], suffix='_' + ifname)
            print_network_stat(self._sum_networks(networks))

        sys.exit(0)
123 |
def print_network_stat(n, suffix=''):
    """Print the four rx/tx counters in n, appending suffix to each name."""
    for counter in ('rx_bytes', 'rx_packets', 'tx_bytes', 'tx_packets'):
        print("metric network_%s%s int64 %d" % (counter, suffix, n[counter]))
129 |
130 |
def main():
    """Parse the command line and report stats for the chosen container."""

    cli = OptionParser()
    cli.add_option(
        '-u', '--url',
        default='unix://var/run/docker.sock',
        help='URL for Docker service (Unix or TCP socket).')
    cli.add_option(
        '-c', '--container',
        help='Name or Id of container that you want to monitor')
    settings = cli.parse_args()[0]

    # The container has no sensible default, so refuse to run without it.
    if settings.container is None:
        cli.error("options -c is mandatory")

    DockerService(settings.url, settings.container).docker_stats()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/elasticsearch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Rackspace Cloud Monitoring plugin for elasticsearch cluster health
4 | and node stats.
5 |
6 | There are some questionable choices in modules (urllib2 vs requests,
7 | optparse vs argparse). These questions can be answered by considering
8 | compatibility issues with older python versions like what we find stock
9 | on Red Hat Enterprise Linux systems.
10 |
11 | This plugin provides various groups of metrics.
12 | * cluster-health gives an overview of the cluster status
13 | * stats-store gives local node metrics about storing
14 | * stats-index gives local node metrics about indexing
15 | * stats-get gives local node metrics about gets
16 | * stats-search gives local node metrics about searches
17 | * stats-docs gives local node metrics about docs
18 |
19 | Examples:
20 | $ ./elasticsearch.py --stats-docs
21 | $ ./elasticsearch.py -H http://localhost:9200 --cluster-health
22 |
23 | This means you can call this plugin for up to 6 different checks for
24 | various metrics groups about your elasticsearch cluster.
25 |
26 | Copyright 2013 Victor Watkins
27 |
28 | Licensed under the Apache License, Version 2.0 (the "License");
29 | you may not use this file except in compliance with the License.
30 | You may obtain a copy of the License at
31 |
32 | http://www.apache.org/licenses/LICENSE-2.0
33 |
34 | Unless required by applicable law or agreed to in writing, software
35 | distributed under the License is distributed on an "AS IS" BASIS,
36 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
37 | See the License for the specific language governing permissions and
38 | limitations under the License.
39 | """
40 |
41 |
42 | import urllib2
43 | import json
44 |
45 | from sys import exit
46 | from optparse import OptionParser, OptionGroup
47 |
48 |
49 | STATUS_OK = "status Elasticsearch returned a response"
50 |
51 |
def bug_out(why):
    '''Report why the check failed as the status line, then exit with 1.'''

    print("status %s" % (why,))
    exit(1)
57 |
58 |
def call_to_cluster(host, path):
    '''Fetch host + path and return the decoded JSON body.

    Any failure to fetch or decode is handed to bug_out(), which ends the
    program.
    '''

    try:
        target = '{h}{p}'.format(h=host, p=path)
        reply = urllib2.urlopen(target)
    except (urllib2.URLError, ValueError) as err:
        bug_out(err)

    try:
        return json.loads(reply.read())
    except Exception as err:  # improve this...
        bug_out(err)
73 |
74 |
def get_stats(host, keyname):
    '''Return one group of index stats for the local node.

    The data comes from /_cluster/nodes/_local/stats; keyname selects the
    group and can be one of: docs, search, indexing, store, get.'''

    nodes = call_to_cluster(host, '/_cluster/nodes/_local/stats')['nodes']

    # The stats are keyed by node name; take the first key.
    first_node = list(nodes.keys())[0]

    return nodes[first_node]['indices'][keyname]
85 |
86 |
def cluster_health(option, opt, value, parser):
    '''optparse callback: print the fields of /_cluster/health as metrics.'''

    health = call_to_cluster(parser.values.host, '/_cluster/health')

    fields = (
        ('status', 'string'),
        ('number_of_nodes', 'uint32'),
        ('unassigned_shards', 'uint32'),
        ('timed_out', 'string'),
        ('active_primary_shards', 'uint32'),
        ('cluster_name', 'string'),
        ('relocating_shards', 'uint32'),
        ('active_shards', 'uint32'),
        ('initializing_shards', 'uint32'),
        ('number_of_data_nodes', 'uint32'),
    )

    print(STATUS_OK)
    for name, kind in fields:
        print("metric %s %s %s" % (name, kind, health[name]))
103 |
104 |
def stats_store(option, opt, value, parser):
    '''optparse callback: print the local node's "store" stats as metrics.'''

    store = get_stats(parser.values.host, 'store')

    print(STATUS_OK)
    for name, kind in (('size_in_bytes', 'uint64'),
                       ('throttle_time_in_millis', 'uint32')):
        print("metric %s %s %s" % (name, kind, store[name]))
113 |
114 |
def stats_indexing(option, opt, value, parser):
    '''optparse callback: print the local node's "indexing" stats as metrics.'''

    indexing = get_stats(parser.values.host, 'indexing')

    fields = (
        ('delete_time_in_millis', 'uint32'),
        ('delete_total', 'uint64'),
        ('delete_current', 'uint32'),
        ('index_time_in_millis', 'uint32'),
        ('index_total', 'uint64'),
        ('index_current', 'uint32'),
    )

    print(STATUS_OK)
    for name, kind in fields:
        print("metric %s %s %s" % (name, kind, indexing[name]))
127 |
128 |
def stats_get(option, opt, value, parser):
    '''optparse callback: print the local node's "get" stats as metrics.

    Each line has the form "metric <name> <type> <value>".
    '''

    s = get_stats(parser.values.host, 'get')

    # missing_time_in_millis and exists_time_in_millis used to be printed
    # without a type; they now use the same type as time_in_millis.
    fields = (
        ('missing_total', 'uint32'),
        ('exists_total', 'uint32'),
        ('current', 'uint32'),
        ('time_in_millis', 'uint32'),
        ('missing_time_in_millis', 'uint32'),
        ('exists_time_in_millis', 'uint32'),
        ('total', 'uint32'),
    )

    print(STATUS_OK)
    for name, kind in fields:
        print("metric %s %s %s" % (name, kind, s[name]))
142 |
143 |
def stats_search(option, opt, value, parser):
    '''optparse callback: print the local node's "search" stats as metrics.'''

    search = get_stats(parser.values.host, 'search')

    fields = (
        ('query_total', 'uint64'),
        ('fetch_time_in_millis', 'uint32'),
        ('fetch_total', 'uint64'),
        ('query_time_in_millis', 'uint32'),
        ('open_contexts', 'uint32'),
        ('fetch_current', 'uint32'),
        ('query_current', 'uint32'),
    )

    print(STATUS_OK)
    for name, kind in fields:
        print("metric %s %s %s" % (name, kind, search[name]))
157 |
158 |
def stats_docs(option, opt, value, parser):
    '''optparse callback: print the local node's "docs" stats as metrics.'''

    docs = get_stats(parser.values.host, 'docs')

    print(STATUS_OK)
    for name, kind in (('count', 'uint64'), ('deleted', 'uint32')):
        print("metric %s %s %s" % (name, kind, docs[name]))
167 |
168 |
if __name__ == "__main__":
    parser = OptionParser()

    parser.add_option("-H", "--host",
                      action="store", type="string", dest="host",
                      default="http://localhost:9200")

    # optparse runs a callback as soon as it meets the option. The metric
    # groups are therefore only recorded while parsing and run afterwards,
    # so that -H/--host is honoured wherever it appears on the command line.
    requested = []

    def remember(option, opt, value, parser, handler):
        '''optparse callback: queue a metric group handler for later.'''
        requested.append((handler, option, opt, value))

    mg = OptionGroup(parser, "Possible Metric Groups")
    for flag, handler in (("--cluster-health", cluster_health),
                          ("--stats-store", stats_store),
                          ("--stats-indexing", stats_indexing),
                          ("--stats-get", stats_get),
                          ("--stats-search", stats_search),
                          ("--stats-docs", stats_docs)):
        mg.add_option(flag, action="callback", callback=remember,
                      callback_args=(handler,))

    parser.add_option_group(mg)
    (options, args) = parser.parse_args()

    # Without a metric group nothing would be printed at all; report that
    # as a failure so the agent gets a status line.
    if not requested:
        bug_out("no metric group selected, see --help")

    for handler, option, opt, value in requested:
        handler(option, opt, value, parser)
192 |
--------------------------------------------------------------------------------
/etcd.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Rackspace Cloud Monitoring plugin for etcd node stats.
4 |
5 | Example:
6 | $ ./etcd.py --url http://localhost:4001
7 |
8 | Example alarm criteria:
9 |
10 | if (metric['state'] != 'follower' && metric['state'] != 'leader') {
11 | return new AlarmStatus(CRITICAL, 'Node is neither leader nor follower.');
12 | }
13 |
14 | if (metric['state'] == 'follower') {
15 | return new AlarmStatus(OK, 'Node is following #{leader}.');
16 | }
17 |
18 | if (metric['state'] == 'leader') {
19 | return new AlarmStatus(OK, 'Node is leading the cluster.');
20 | }
21 |
22 | Copyright 2014 Simon Vetter
23 |
24 | Based on Victor Watkins' elasticsearch plugin:
25 | Copyright 2013 Victor Watkins
26 |
27 | Licensed under the Apache License, Version 2.0 (the "License");
28 | you may not use this file except in compliance with the License.
29 | You may obtain a copy of the License at
30 |
31 | http://www.apache.org/licenses/LICENSE-2.0
32 |
33 | Unless required by applicable law or agreed to in writing, software
34 | distributed under the License is distributed on an "AS IS" BASIS,
35 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
36 | See the License for the specific language governing permissions and
37 | limitations under the License.
38 | """
39 | import urllib2
40 | import json
41 |
42 | from sys import exit
43 | from optparse import OptionParser, OptionGroup
44 |
45 |
46 | STATUS_OK = "status etcd returned a response"
47 |
48 |
def bug_out(why):
    '''Report why the check failed as the status line, then exit with 1.'''

    print("status %s" % (why,))
    exit(1)
54 |
55 |
def call_to_server(url, path):
    '''Fetch url + path and return the decoded JSON body.

    Any failure to fetch or decode is handed to bug_out(), which ends the
    program.
    '''

    try:
        target = '{u}{p}'.format(u=url, p=path)
        reply = urllib2.urlopen(target)
    except (urllib2.URLError, ValueError) as err:
        bug_out(err)

    try:
        return json.loads(reply.read())
    except Exception as err:  # improve this...
        bug_out(err)
70 |
71 |
def get_stats(url):
    '''Print the node stats from /v2/stats/self as metrics, then exit 0.'''

    s = call_to_server(url, '/v2/stats/self')

    # i've seen etcd return {"state":""}, so make sure the agent accepts it.
    # The leader gets the same treatment: an empty or missing value would
    # otherwise produce a metric line with no value.
    state = s.get('state') or "unknown"
    leader = (s.get('leaderInfo') or {}).get('leader') or "unknown"

    print(STATUS_OK)
    print("metric state string %s" % (state,))
    print("metric leader string %s" % (leader,))
    print("metric recvAppendRequestCnt uint64 %s"
          % (s.get('recvAppendRequestCnt', 0),))
    print("metric sendAppendRequestCnt uint64 %s"
          % (s.get('sendAppendRequestCnt', 0),))

    exit(0)
88 |
89 |
if __name__ == "__main__":
    # The only setting is the base URL of the etcd node to query.
    parser = OptionParser()
    parser.add_option(
        "--url",
        action="store", type="string", dest="url",
        default="http://localhost:4001")

    (options, args) = parser.parse_args()
    get_stats(options.url)
99 |
--------------------------------------------------------------------------------
/examples/example.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | ## Rackspace Cloud Monitoring Plug-In
3 | ## Example ruby plug-in
4 | #
5 | # (C)2013 Jay Faulkner
6 | # All Rights Reserved.
7 | #
8 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
9 | # not use this file except in compliance with the License.
10 | # You may obtain a copy of the License at
11 | #
12 | # http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 | #
20 | # example.rb
21 | # - A ruby example of a Rackspace Cloud Monitoring Agent plugin
22 |
# Report a failure: print the reason as the status line, then terminate
# with a nonzero exit code.
def fail(status="Unknown failure")
  $stdout.puts("status " + status.to_s)
  exit(1)
end
28 |
# Record a metric in @metrics; nothing is printed until output_success runs.
def metric(name,type,value)
  @metrics.store(name, { :type => type, :value => value })
end
36 |
# Print the status line followed by one "metric" line per recorded metric.
# Call this only once the script has finished without errors.
def output_success
  puts "status Your new plugin is reporting metrics!"
  @metrics.each_pair do |metric_name, details|
    puts format("metric %s %s %s", metric_name, details[:type], details[:value])
  end
end
44 |
begin
  require 'optparse'
rescue LoadError
  # A failed `require` raises LoadError, which is not a StandardError, so a
  # bare `rescue` would not catch it.
  fail "Failed to load required ruby gems!"
end

@metrics = {}
options = {}

# Parse a copy so ARGV itself is left untouched.
args = ARGV.dup

OptionParser.new do |o|
  o.banner = "Usage: #{$0} [options]"
  o.on('-o', '--my-option OPTION', 'Set OPTION to be a valid option') do |s|
    options[:option] = s
  end
  o.on_tail('-h', '--help', 'Show this message') { puts o; exit }
  o.parse!(args)
end

# Error handling by option/input validation and begin;rescue;end is recommended
if false
  fail "I checked to make sure this would succeed, and it didn't"
end

# Gather metrics using your own code here.
# Call metric(name,type,value) for every metric you want to record.

# Faking metrics for this example
metric("example","int64",40895)
metric("fake_http_code","string","500")

output_success
78 |
--------------------------------------------------------------------------------
/file_info.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Rackspace Cloud Monitoring plugin to provide file/directory information.
4 |
5 | The three metrics returned for the target:
6 | - age, calculated from ctime
7 | - size, in bytes
8 | - permissions, octal
9 |
10 | Copyright 2013 Steve Katen
11 |
12 | Licensed under the Apache License, Version 2.0 (the "License");
13 | you may not use this file except in compliance with the License.
14 | You may obtain a copy of the License at
15 |
16 | http://www.apache.org/licenses/LICENSE-2.0
17 |
18 | Unless required by applicable law or agreed to in writing, software
19 | distributed under the License is distributed on an "AS IS" BASIS,
20 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 | See the License for the specific language governing permissions and
22 | limitations under the License.
23 | """
24 | import sys
25 | import os
26 | import time
27 |
28 |
def main():
    """Print age, size and permission metrics for the path in sys.argv[1].

    Problems are reported to the agent as a "status err ..." line. The exit
    code stays 0 in every case.
    """
    if len(sys.argv) != 2:
        # Prefixed with "status err" so the agent receives a status line.
        print("status err Requires a full path to the target passed as an argument")
        sys.exit(0)

    path = sys.argv[1]
    if not os.path.exists(path):
        print("status err target does not exist")
        sys.exit(0)

    try:
        details = os.stat(path)
        age = int(time.time() - details.st_ctime)
        size = details.st_size
        # Built by hand because oct() returns '0644' on Python 2 but
        # '0o644' on Python 3; this keeps the Python 2 form on both.
        permissions = details.st_mode & 0o777
        mode = "0%o" % permissions if permissions else "0"
    except Exception as e:
        print("status err Exception discovered: {0}".format(str(e)))
        return

    print("status ok target exists")
    print("metric age int %d" % age)
    print("metric bytes int %d" % size)
    print("metric mode string %s" % mode)


if __name__ == '__main__':
    main()
55 |
--------------------------------------------------------------------------------
/hadoop_hbase.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Rackspace Cloud Monitoring Plugin to read HBase metrics.
4 | #
5 | # USAGE:
6 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
7 | # and run like this:
8 | #
9 | # hadoop_hbase.py [OPTIONS]
10 | #
11 | # OPTIONS
12 | # -b PATH Pass in the hbase binary path
13 | # -u user Set the Hadoop HBase user name envariable.
14 | #
15 | # Requires: Python 2.6+ May work on Python 3+
16 | #
17 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
18 | #
19 | #
20 | # if (metric['dead_regionservers'] > 0) {
21 | # return new AlarmStatus(WARNING, 'HBase has #{dead_regionservers} dead region servers');
22 | # }
23 | #
24 | # if (metric['dead_regionservers_percent'] > 20) {
25 | # return new AlarmStatus(CRITICAL, 'HBase has #{dead_regionservers_percent}% dead region servers');
26 | # }
27 | #
28 | # return new AlarmStatus(OK, 'HBase OK');
29 | #
30 | #
31 | # Copyright (c) 2014, Dave Beckett
32 | # All rights reserved.
33 | #
34 | # Redistribution and use in source and binary forms, with or without
35 | # modification, are permitted provided that the following conditions are met:
36 | #
37 | # Redistributions of source code must retain the above copyright notice,
38 | # this list of conditions and the following disclaimer.
39 | #
40 | # Redistributions in binary form must reproduce the above copyright
41 | # notice, this list of conditions and the following disclaimer in the
42 | # documentation and/or other materials provided with the distribution.
43 | #
44 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
45 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
46 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
47 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
48 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
49 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
50 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
51 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
52 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
53 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
54 | # POSSIBILITY OF SUCH DAMAGE.
55 | #
56 | #
57 |
58 | from __future__ import print_function
59 |
60 | import os
61 | import re
62 | import sys
63 | import subprocess
64 | from tempfile import NamedTemporaryFile
65 |
66 | try:
67 | from subprocess import DEVNULL # py3k
68 | except ImportError:
69 | import os
70 | DEVNULL = open(os.devnull, 'wb')
71 |
72 | import argparse
73 |
74 |
# Default location of the hbase command line tool.
HBASE = '/usr/bin/hbase'

# Constants
# Patterns for the lines of interest in the output of "status 'simple'".
# Each one is applied with .match(), i.e. anchored at the start of a line.
# ERROR_RE used to have the raw-string prefix inside the quotes
# ('r^ERROR: ...'), so it could never match.
ERROR_RE = re.compile(r'^ERROR: (.+)')
LIVE_RE = re.compile(r'^(\d+) live servers')
DEAD_RE = re.compile(r'^(\d+) dead servers')
LOAD_RE = re.compile(r'^Aggregate load: (\d+)')
REGIONS_RE = re.compile(r'^Aggregate load: \d+, regions: (\d+)')
83 |
84 |
def get_hbase_status_metrics(hbase):
    """ Get HBase status metrics

    Runs "status 'simple'" through the hbase shell and parses its output.

    :param hbase Path to 'hbase' command
    :return dict mapping a metric name to a (value, type string) tuple
    :raise Exception if the shell script cannot be written, the command
           cannot be run, or the shell reports an error
    """

    f = None
    try:
        f = NamedTemporaryFile(delete=False)
        # The file is opened in binary mode, so write bytes (needed on
        # Python 3, harmless on Python 2).
        f.write(b"status 'simple'\nexit\n")
        f.close()
    except Exception as e:
        raise Exception("write to {0} failed {1}".format(f.name if f else "", str(e)))

    # (pattern, metric name, whether the value counts as a region server)
    counters = (
        (LIVE_RE, 'live_regionservers', True),
        (DEAD_RE, 'dead_regionservers', True),
        (LOAD_RE, 'aggregate_load', False),
        (REGIONS_RE, 'regions', False),
    )

    # Call the hbase CLI command to get basic status
    cmd = [hbase, 'shell', f.name]
    metrics = {}
    total_rs = 0
    try:
        # Py2.7+ adds check_output so the DEVNULL line can be removed
        # result = subprocess.check_output(cmd, stdin=None, stderr=None)
        result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=DEVNULL).communicate()[0]
        # communicate() returns bytes on Python 3.
        if not isinstance(result, str):
            result = result.decode('utf-8', 'replace')

        for line in result.split('\n'):
            matches = ERROR_RE.match(line)
            if matches is not None:
                raise Exception("hbase shell returned error {0}".format(matches.group(1)))

            for pattern, name, is_regionserver in counters:
                matches = pattern.match(line)
                if matches is not None:
                    v = int(matches.group(1))
                    if is_regionserver:
                        total_rs += v
                    metrics[name] = (v, 'uint32')

    except Exception as e:
        raise Exception("command {0} failed {1}".format(str(cmd), str(e)))
    finally:
        # The script file was created with delete=False; remove it here.
        os.unlink(f.name)

    metrics['total_regionservers'] = (total_rs, 'uint32')

    for k in ['live_regionservers', 'dead_regionservers']:
        if k in metrics:
            # With no region servers at all the share is reported as 0
            # instead of dividing by zero.
            share = metrics[k][0] * 100.0 / total_rs if total_rs else 0.0
            metrics[k + '_percent'] = ("{0:.2f}".format(share), 'double')

    return metrics
147 |
148 |
def main():
    """Parse the options, collect the HBase metrics and print them.

    A failure is reported as a "status err ..." line; the exit code is 0
    in both cases.
    """

    parser = argparse.ArgumentParser(description='HBase status metrics')
    parser.add_argument('-b', '--hbase',
                        default=HBASE,
                        help='hbase command (default: {0})'.format(HBASE))
    parser.add_argument('-u', '--user',
                        default=None,
                        help='user')

    args = parser.parse_args()

    ######################################################################

    hbase = args.hbase
    user = args.user
    if user is not None:
        # Set in os.environ so the hbase child process inherits it.
        os.environ["HADOOP_USER_NAME"] = user

    try:
        metrics = get_hbase_status_metrics(hbase)
        print("status ok")
        # items() works on Python 2 and 3; sorted for a stable output order.
        for k, t in sorted(metrics.items()):
            if t is not None:
                (v, type_str) = t
                print("metric {0} {1} {2}".format(k, type_str, v))
    except Exception as e:
        print("status err exception {0}".format(str(e)))

    sys.exit(0)

if __name__ == '__main__':
    main()
183 |
--------------------------------------------------------------------------------
/hadoop_hdfs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Rackspace Cloud Monitoring Plugin to read HDFS metrics.
4 | #
5 | # USAGE;
6 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
7 | # and run like this:
8 | #
9 | # hadoop_hdfs.py [OPTIONS]
10 | #
11 | # OPTIONS
12 | # -H PATH Pass in the hadoop binary path
13 | # -u user Set the Hadoop HDFS user name envariable.
14 | #
15 | # Requires: Python 2.7+ and should work with Python 3
16 | #
17 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
18 | #
19 | #
20 | # if (metric['datanodes_dead'] > 2) {
21 | # return new AlarmStatus(CRITICAL, 'HDFS has #{datanodes_dead} dead datanodes');
22 | # }
23 | #
24 | # if (metric['datanodes_dead'] > 0) {
25 | # return new AlarmStatus(WARNING, 'HDFS has #{datanodes_dead} dead datanodes');
26 | # }
27 | #
28 | # if (metric['blocks_missing'] > 0) {
29 | # return new AlarmStatus(CRITICAL, 'HDFS has #{blocks_missing} missing blocks');
30 | # }
31 | #
32 | # if (metric['free_percent'] < 10) {
33 | # return new AlarmStatus(CRITICAL, 'HDFS has #{free_percent} free');
34 | # }
35 | #
36 | # if (metric['free_percent'] < 20) {
37 | # return new AlarmStatus(WARNING, 'HDFS has #{free_percent} free');
38 | # }
39 | #
40 | # return new AlarmStatus(OK, 'HDFS OK');
41 | #
42 | #
43 | # Copyright (c) 2014, Dave Beckett
44 | # All rights reserved.
45 | #
46 | # Redistribution and use in source and binary forms, with or without
47 | # modification, are permitted provided that the following conditions are met:
48 | #
49 | # Redistributions of source code must retain the above copyright notice,
50 | # this list of conditions and the following disclaimer.
51 | #
52 | # Redistributions in binary form must reproduce the above copyright
53 | # notice, this list of conditions and the following disclaimer in the
54 | # documentation and/or other materials provided with the distribution.
55 | #
56 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
57 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
60 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
61 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
62 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
63 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
64 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
65 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
66 | # POSSIBILITY OF SUCH DAMAGE.
67 | #
68 | #
69 |
70 | from __future__ import print_function
71 |
72 | import os
73 | import re
74 | import sys
75 | import subprocess
76 |
77 | try:
78 | from subprocess import DEVNULL # py3k
79 | except ImportError:
80 | import os
81 | DEVNULL = open(os.devnull, 'wb')
82 |
83 | # 2.7+
84 | import argparse
85 |
86 |
HADOOP = '/usr/bin/hadoop'

# Constants

# Maps a metric name to (regexp, type string).  The regexp is matched
# against the start of each line of the `dfsadmin -report` summary and its
# first group becomes the metric value.  Entries whose regexp is None are
# not parsed from the report.
METRIC_CONFIG = {
    # Bytes
    'total': (None, 'uint64'),  # calculated below
    'total_configured': (re.compile(r'Configured Capacity: (\d+)'), 'uint64'),
    'total_present': (re.compile(r'Present Capacity: (\d+)'), 'uint64'),
    'free': (re.compile(r'DFS Remaining: (\d+)'), 'uint64'),
    'free_percent': (None, 'double'),  # calculated below
    'used': (re.compile(r'DFS Used: (\d+)'), 'uint64'),
    # Capture the optional fraction as well: the metric is typed 'double'
    # and a plain (\d+) would truncate e.g. "72.22%" to "72".
    'used_percent': (re.compile(r'DFS Used%: (\d+(?:\.\d+)?)'), 'double'),

    # Blocks
    'blocks_under_replicated': (re.compile(r'Under replicated blocks: (\d+)'), 'uint64'),
    'blocks_missing': (re.compile(r'Missing blocks: (\d+)'), 'uint64'),
    'blocks_with_corrupt_replicas': (re.compile(r'Blocks with corrupt replicas: (\d+)'), 'uint64'),

    # Datanodes
    # These 4 are not calculated yet; they need datanode blocks parsing
    'used_non_dfs': (None, 'uint64'),
    'used_non_dfs_percent': (None, 'double'),
    'datanode_remaining_max': (None, 'uint32'),
    'datanode_remaining_min': (None, 'uint32'),

    'datanodes_available': (re.compile(r'Datanodes available: (\d+)'), 'uint32'),
    'datanodes_dead': (re.compile(r'Datanodes available: \d+ \(\d+ total, (\d+) dead'), 'uint32'),
    'datanodes_total': (re.compile(r'Datanodes available: \d+ \((\d+) total'), 'uint32'),
}


def get_hdfs_status_metrics(hadoop):
    """Run `<hadoop> dfsadmin -report` and parse its summary section.

    Args:
        hadoop: path of the hadoop command to execute.

    Returns:
        A dict mapping metric names to ``(value, type_str)`` tuples, where
        ``value`` is a string.  Besides the METRIC_CONFIG entries that
        matched, the dict holds ``capacity`` (a copy of
        ``total_configured``, when present) and ``free_percent``, which is
        None when it cannot be computed (value missing or capacity 0).

    Raises:
        Exception: if the command cannot be started or exits non-zero.
    """

    # Call the hdfs CLI command to get basic status
    cmd = [hadoop, 'dfsadmin', '-report']
    try:
        # universal_newlines=True yields text (not bytes) on Python 3 so
        # the str-based splitting and regexps below work on 2 and 3 alike.
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=DEVNULL,
                                universal_newlines=True)
        result = proc.communicate()[0]
    except Exception as e:
        raise Exception("command {0} failed {1}".format(str(cmd), str(e)))

    # A failed run would otherwise be reported as "ok" with no metrics.
    if proc.returncode != 0:
        raise Exception("command {0} failed with exit code {1}".format(
            str(cmd), proc.returncode))

    # Only entries that carry a regexp can be parsed from the report.
    patterns = [(name, regexp, type_str)
                for (name, (regexp, type_str)) in METRIC_CONFIG.items()
                if regexp is not None]

    metrics = {}
    for line in result.split('\n'):
        # Stop at the first 'Name: ' line so that lines after it cannot
        # overwrite the values taken from the summary above it.
        if line.startswith('Name: '):
            break
        for (name, regexp, type_str) in patterns:
            matches = regexp.match(line)
            if matches is not None:
                metrics[name] = (matches.group(1), type_str)

    # Calculate capacity
    if 'total_configured' in metrics:
        metrics['capacity'] = metrics['total_configured']

    # Calculate free_percent
    free_percent = None
    if 'free' in metrics and 'total_configured' in metrics:
        remaining = int(metrics['free'][0])
        capacity = int(metrics['total_configured'][0])
        # A capacity of 0 would otherwise raise ZeroDivisionError.
        if capacity > 0:
            free_percent = ("{0:.2f}".format(remaining * 100.0 / capacity),
                            'double')
    metrics['free_percent'] = free_percent

    return metrics
161 |
162 |
def main():
    """Parse the command line, collect the HDFS metrics and print them.

    Prints a ``status ok`` line followed by one
    ``metric <name> <type> <value>`` line for every non-None entry returned
    by get_hdfs_status_metrics().  If collecting or printing raises, a
    ``status err exception <message>`` line is printed instead.

    The process always exits with code 0; failures are reported through
    the status line only.
    """

    parser = argparse.ArgumentParser(description='HDFS status metrics')
    parser.add_argument('-H', '--hadoop',
                        default=HADOOP,
                        help='hadoop command (default: {0})'.format(HADOOP))
    parser.add_argument('-u', '--user',
                        default=None,
                        help='user')

    args = parser.parse_args()

    if args.user is not None:
        # Assigning through os.environ also calls putenv(), and unlike a
        # bare os.putenv() it keeps os.environ itself up to date.
        os.environ["HADOOP_USER_NAME"] = args.user

    try:
        metrics = get_hdfs_status_metrics(args.hadoop)
        print("status ok")
        # items() exists on both Python 2 and 3 (iteritems() does not).
        for name, entry in metrics.items():
            # Metrics that could not be computed are stored as None.
            if entry is not None:
                (value, type_str) = entry
                print("metric {0} {1} {2}".format(name, type_str, value))
    except Exception as e:
        # "except ... as" is valid from Python 2.6 onwards and on Python 3.
        print("status err exception {0}".format(str(e)))

    sys.exit(0)


if __name__ == '__main__':
    main()
197 |
--------------------------------------------------------------------------------
/hadoop_jobtracker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Rackspace Cloud Monitoring Plugin to read Map-Reduce Job Tracker metrics.
4 | #
5 | # USAGE;
6 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
7 | # and run like this:
8 | #
9 | # hadoop_jobtracker.py [OPTIONS]
10 | #
11 | # OPTIONS
12 | # -n host Set the namenode host (REQUIRED)
13 | # -p port Set the namenode port
14 | # -u user Set the Hadoop HDFS user name envariable.
15 | #
16 | #
17 | # Requires: Python 2.7+
18 | #
19 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
20 | #
21 | #
22 | # if (metric['dead_nodes'] > 2) {
23 | # return new AlarmStatus(CRITICAL, 'Map-Reduce has #{dead_nodes} dead nodes');
24 | # }
25 | #
26 | # if (metric['dead_nodes'] > 0) {
27 | # return new AlarmStatus(WARNING, 'Map-Reduce has #{dead_nodes} dead nodes');
28 | # }
29 | #
30 | # return new AlarmStatus(OK, 'Map-Reduce Job Tracker OK');
31 | #
32 | #
33 | # Copyright (c) 2014, Dave Beckett
34 | # All rights reserved.
35 | #
36 | # Redistribution and use in source and binary forms, with or without
37 | # modification, are permitted provided that the following conditions are met:
38 | #
39 | # Redistributions of source code must retain the above copyright notice,
40 | # this list of conditions and the following disclaimer.
41 | #
42 | # Redistributions in binary form must reproduce the above copyright
43 | # notice, this list of conditions and the following disclaimer in the
44 | # documentation and/or other materials provided with the distribution.
45 | #
46 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
47 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
48 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
49 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
50 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
51 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
52 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
53 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
54 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
55 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
56 | # POSSIBILITY OF SUCH DAMAGE.
57 | #
58 | #
59 |
60 | from __future__ import print_function
61 |
62 | import json
63 | import os
64 | import re
65 | import sys
66 |
67 | # 2.7+
68 | import argparse
69 |
70 | import urllib2
71 |
72 |
73 | # Constants
74 |
75 | # All types are uint32
76 | METRIC_TYPE = 'uint32'
77 |
78 | METRIC_NAMES = [
79 | 'reduce_slots',
80 | 'map_slots_used',
81 | 'total_nodes',
82 | 'dead_nodes',
83 | 'map_slots',
84 | 'alive_nodes',
85 | 'reduce_slots_used'
86 | ]
87 |
88 |
def get_namenode_bean_data(namenode_host, namenode_port=50030):
    """Fetch the JobTrackerInfo JMX bean over HTTP.

    Queries ``http://<host>:<port>/jmx`` for the bean named
    ``Hadoop:service=JobTracker,name=JobTrackerInfo``.

    Args:
        namenode_host: host name to query.
        namenode_port: HTTP port to query (default 50030).

    Returns:
        The first element of the ``beans`` list in the JSON reply.

    Raises:
        Exception: if the request fails, the reply is not valid JSON, or
            the reply holds no bean; the message includes the URL.
    """

    # JMX URI used to get the JobTrackerInfo
    url = "http://{0}:{1}/jmx?qry=Hadoop%3Aservice%3DJobTracker%2Cname%3DJobTrackerInfo".format(namenode_host, namenode_port)

    try:
        response = urllib2.urlopen(url)
        try:
            content = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        data = json.loads(content)
        return data['beans'][0]
    except Exception as e:
        # "except ... as" is valid from Python 2.6 onwards and on Python 3.
        raise Exception("Error reading {0} url JSON - {1}".format(url, str(e)))
105 |
def get_summary_metrics(d):
    """Build the metric dict from the decoded SummaryJson data.

    Args:
        d: dict with the keys ``nodes``, ``alive`` and ``slots``; ``slots``
            is itself a dict holding ``map_slots``, ``reduce_slots``,
            ``map_slots_used`` and ``reduce_slots_used``.

    Returns:
        A dict of metric name to value.  ``dead_nodes`` is derived as
        ``nodes - alive``.

    Raises:
        KeyError: if one of the expected keys is missing.
    """

    nodes_count = d['nodes']
    alive_count = d['alive']
    slots = d['slots']
    metrics = {
        'total_nodes': nodes_count,
        'alive_nodes': alive_count,
        'dead_nodes': nodes_count - alive_count,
        'map_slots': slots['map_slots'],
        'reduce_slots': slots['reduce_slots'],
        'map_slots_used': slots['map_slots_used'],
        'reduce_slots_used': slots['reduce_slots_used'],
    }

    return metrics


def get_job_tracker_metrics(beans):
    """Extract the job tracker metrics from a JobTrackerInfo bean.

    Args:
        beans: dict whose ``SummaryJson`` entry is a JSON-encoded string.

    Returns:
        The dict produced by get_summary_metrics() for the decoded summary.

    Raises:
        Exception: if ``SummaryJson`` is missing, decodes to null, or is
            not valid JSON.
    """

    summary_json = beans.get('SummaryJson', None)

    summary_data = None
    if summary_json is not None:
        try:
            summary_data = json.loads(summary_json)
        except Exception as e:
            raise Exception("Error reading summary JSON - {0}: {1}".format(str(e), summary_json))

    if summary_data is None:
        # The bean comes from a JSON document, not XML.
        raise Exception("No SummaryJson data in JMX bean")

    return get_summary_metrics(summary_data)
147 |
148 |
def main():
    """Parse the command line, fetch the job tracker metrics and print them.

    Prints a ``status ok`` line followed by one
    ``metric <name> uint32 <value>`` line per metric.  If fetching or
    parsing raises, a ``status err exception <message>`` line is printed
    instead and the process exits with code 0.

    Exits with code 1 when no host is given, or when the bean data or the
    metrics come back as None.
    """

    parser = argparse.ArgumentParser(description='Map-Reduce Job Tracker metrics')
    parser.add_argument('-n', '--namenode',
                        default=None,
                        help='namenode host')
    parser.add_argument('-p', '--port',
                        default=50030,
                        type=int,
                        help='namenode port (Default 50030)')
    parser.add_argument('-u', '--user',
                        default=None,
                        help='user')

    args = parser.parse_args()

    if args.user is not None:
        # Assigning through os.environ also calls putenv(), and unlike a
        # bare os.putenv() it keeps os.environ itself up to date.
        os.environ["HADOOP_USER_NAME"] = args.user

    namenode_host = args.namenode
    if namenode_host is None:
        print("Must give namenode host name")
        sys.exit(1)
    namenode_port = args.port

    try:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so the exits below are not swallowed by the handler.
        beans = get_namenode_bean_data(namenode_host, namenode_port)
        if beans is None:
            sys.exit(1)

        metrics = get_job_tracker_metrics(beans)
        if metrics is None:
            sys.exit(1)

        print("status ok")
        # items() exists on both Python 2 and 3 (iteritems() does not).
        for name, value in metrics.items():
            print("metric {0} {1} {2}".format(name, METRIC_TYPE, value))
    except Exception as e:
        # "except ... as" is valid from Python 2.6 onwards and on Python 3.
        print("status err exception {0}".format(str(e)))

    sys.exit(0)


if __name__ == '__main__':
    main()
195 |
--------------------------------------------------------------------------------
/haproxy.rb:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 | ## Rackspace Cloud Monitoring Plug-In
3 | ## HAProxy Stats
4 | #
5 | # ----------------------------------------------------------------------------
6 | # "THE BEER-WARE LICENSE" (Revision 42):
7 | # wrote this file. As long as you retain this notice you
8 | # can do whatever you want with this stuff. If we meet some day, and you think
9 | # this stuff is worth it, you can buy me a beer in return
10 | # ----------------------------------------------------------------------------
11 | #
12 | # haproxy.rb
13 | # - Takes HAProxy stats and grabs connections, rate, and check time
14 | # for every listener and every backend server, and prints it using
15 | # Rackspace Cloud Monitoring metric lines
16 | #
17 | # Usage:
18 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
19 | #
20 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
21 | #
22 | # if (metric['connections'] == 0) {
23 | # return new AlarmStatus(CRITICAL, 'No connections to your HAProxy!');
24 | # }
25 | # if (metric['connections'] < 10) {
26 | # return new AlarmStatus(WARNING, 'Less than 10 connections to your HAProxy!');
27 | # }
28 | #
29 | # return new AlarmStatus(OK, 'HAProxy connections normal');
30 | #
31 | # Please note that you will need to adjust the thresholds based on workload.
32 | # Also, there are other metrics this plugin reports you may find useful, but
33 | # the metricnames for these will vary based on your HAProxy cluster name.
34 | #
35 |
# Print a "status <message>" line and terminate the script with exit code 1.
def fail(status = 'Unknown failure')
  puts('status ' + status.to_s)
  exit(1)
end
40 |
# Record one metric in @metrics, keyed by name; a later call with the same
# name replaces the earlier entry.
def metric(name, type, value)
  entry = { :type => type, :value => value }
  @metrics[name] = entry
end
47 |
# Print the success status line followed by one "metric <name> <type> <value>"
# line for every entry recorded in @metrics.
def output_success
  puts 'status HAProxy is running and reporting metrics'
  @metrics.each_pair do |name, details|
    puts format('metric %s %s %s', name, details[:type], details[:value])
  end
end
54 |
begin
  require 'optparse'
  require 'socket'
rescue LoadError
  # A failed require raises LoadError, which is a ScriptError and therefore
  # not caught by a bare `rescue` (that only handles StandardError).
  fail 'Failed to load required ruby gems'
end

@metrics = {}
options = {
  :limit => 10
}

args = ARGV.dup

OptionParser.new do |o|
  o.banner = "Usage: #{$PROGRAM_NAME} [options]"
  o.on('-s', '--stats-socket SOCKET', 'Specify the HAProxy stats socket') do |s|
    options[:sock] = s
  end
  o.on('-l', '--limit BACKEND_COUNT', 'Specify a limit of how many backends to report. Default is 10.') do |l|
    options[:limit] = l.to_i
  end
  o.on_tail('-h', '--help', 'Show this message') { puts o; exit }
  o.parse!(args)
end

fail 'You must specify the haproxy stats socket' if options[:sock].nil?

# String#to_i returns 0 for empty output and 0 is truthy in Ruby, so the
# result has to be compared explicitly (`0 || fail(...)` never fails).
pid = `pidof haproxy`.chomp.to_i
fail 'HAProxy is not running' if pid.zero?

# get global frontend stats
begin
  # The block form closes the socket even when reading raises.
  UNIXSocket.open(options[:sock]) do |ctl|
    ctl.puts 'show info'

    while (line = ctl.gets)
      # Split into separate variables; `line` itself stays a String.
      key, value = line.split(':', 2)
      case key
      when 'CurrConns'
        metric('connections', 'int', value.to_i)
      when 'ConnRate'
        metric('connection_rate', 'int', value.to_i)
      end
    end
  end
rescue StandardError
  fail "Problem reading global stats from #{options[:sock]}"
end

# get per-backend stats
begin
  UNIXSocket.open(options[:sock]) do |ctl|
    ctl.puts 'show stat'

    reported = 0
    while (line = ctl.gets)
      # Skip the '#' header line and lines that do not start with a name.
      next unless line =~ /^[^#]\w+/
      # Nothing further is reported once the limit is reached.
      break if reported >= options[:limit]

      fields = line.split(',')
      host = "#{fields[0]}_#{fields[1]}".gsub(/[-.]/, '_')
      # NOTE(review): these column indexes are kept exactly as they were.
      # Against HAProxy's documented 0-based CSV layout (2 = qcur,
      # 38 = check_duration, 46 = req_rate, 48 = req_tot) they look off;
      # confirm against the deployed HAProxy version before changing.
      metric("#{host}_request_rate", 'int', fields[47].to_i)
      metric("#{host}_total_requests", 'gauge', fields[49].to_i)
      metric("#{host}_current_queue", 'int', fields[3].to_i)
      metric("#{host}_health_check_duration", 'int', fields[35].to_i)
      reported += 1
    end
  end
rescue StandardError
  fail "Problem reading backend stats from #{options[:sock]}"
end

output_success
129 |
--------------------------------------------------------------------------------
/jmx-gather.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # Rackspace Cloud Monitoring Plug-In
4 | # Gathers JMX MBean attribute values from a specified ObjectName via a remote JMX RMI endpoint.
5 | #
6 | # (c) 2017 Geoff Bourne
7 | # All Rights Reserved.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
10 | # not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | #
21 | # Usage:
22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
23 | #
24 | # It accepts three or more arguments:
25 | # * host:port of the JMX RMI endpoint to access, such as "localhost:9080"
26 | # * the MBean's ObjectName, such as "java.lang:type=Memory"
27 | # * an attribute name of the MBean, such as "HeapMemoryUsage"
28 | # * an attribute name...
29 | #
30 | # Returns a metric for each attribute that was found. Metrics are hardcoded to be typed as "gauge" since their
31 | # originating type can vary and is dictated by the MBean accessed.
32 | # NOTE:
33 | # * if the MBean was not found, the status is reported "ok" with no metrics reported
34 | # * if an attribute is not found on the given MBean that attribute's metric line is simply omitted
35 |
36 | # Cleanup and setup our JavaScript code to run through Java's jrunscript
37 | trap "rm -f /tmp/gather-mbean-$$.js" EXIT INT QUIT TSTP
38 |
39 | cat > /tmp/gather-mbean-$$.js <
8 | # Copyright (c) 2016, Horizon Discovery Plc.
9 | # All rights reserved.
10 | #
11 | # Redistribution and use in source and binary forms, with or without
12 | # modification, are permitted provided that the following conditions are met:
13 | #
14 | # Redistributions of source code must retain the above copyright notice,
15 | # this list of conditions and the following disclaimer.
16 | #
17 | # Redistributions in binary form must reproduce the above copyright
18 | # notice, this list of conditions and the following disclaimer in the
19 | # documentation and/or other materials provided with the distribution.
20 | #
21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
25 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 | # POSSIBILITY OF SUCH DAMAGE.
32 | #
33 | # Curl Command:
34 | # curl -i -X POST -H 'Host: monitoring.api.rackspacecloud.com' -H
35 | # 'Accept-Encoding: gzip,deflate' -H 'X-Auth-Token: YOUR_API_TOKEN' -H
36 | # 'Content-Type: application/json; charset=UTF-8' -H 'Accept:
37 | # application/json' --data-binary '{"label": "Long Process Check", "type":
38 | # "agent.plugin", "details": {"args": ["PROCESS_NAME", "TIMEOUT"],"file":
39 | # "long_process.sh"}}' --compress
40 | # 'https://monitoring.api.rackspacecloud.com:443/v1.0/YOUR_ACCOUNT/entities/YOUR_ENTITY/checks'
41 | #
42 | # Usage:
43 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
44 | #
45 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
46 | #
47 | # if (metric['numprocs'] > 0) {
48 | # return new AlarmStatus(CRITICAL, '#{numprocs} long running processes(s): #{pids}');
49 | # }
50 | #
51 | # return new AlarmStatus(OK, 'No long running processes.');
52 |
53 | if [ "$#" -ne 2 ]; then
54 | cat < /dev/null
85 |
86 | # Check for a failure in the pipe
87 | PIPEEXIT=$?
88 | if [ $PIPEEXIT -ne 0 ]; then
89 | echo "status Fail"
90 | exit $PIPEEXIT
91 | fi
92 |
93 | # Numeric metric to compare against (number of processes)
94 | # Convert to array and count
95 | NUMPROCS=($PIDS)
96 | NUMPROCS=${#NUMPROCS[@]}
97 |
98 | echo "status Success"
99 | echo "metric numprocs int $NUMPROCS"
100 | if [ $NUMPROCS -ne 0 ]; then
101 | echo "metric pids string $PIDS"
102 | else
103 | echo "metric pids string -"
104 | fi
105 |
--------------------------------------------------------------------------------
/lsyncd-status.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # lsyncd-status.sh
4 | # Rackspace Cloud Monitoring Plugin to verify the current return status of lsyncd.
5 | #
6 | # Copyright (c) 2013, Lindsey Anderson
7 | # All rights reserved.
8 | #
9 | # Redistribution and use in source and binary forms, with or without
10 | # modification, are permitted provided that the following conditions are met:
11 | #
12 | # Redistributions of source code must retain the above copyright notice,
13 | # this list of conditions and the following disclaimer.
14 | #
15 | # Redistributions in binary form must reproduce the above copyright
16 | # notice, this list of conditions and the following disclaimer in the
17 | # documentation and/or other materials provided with the distribution.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 | # POSSIBILITY OF SUCH DAMAGE.
30 | #
31 | #
32 | # Verify the current status of lsyncd
33 | # *****************
34 | # Note this data may take a few minutes to populate metric data at first
35 | # *****************
36 | #
37 | # Example criteria :
38 | #
39 | # if (metric['lsyncd_status'] != 'running') {
40 | # return new AlarmStatus(CRITICAL, 'Lsyncd Service is NOT running.');
41 | # }
42 | #
43 | # if (metric['lsyncd_status'] == 'running' && metric['percent_used_watches'] >= 80) {
44 | # return new AlarmStatus(WARNING, 'Lsyncd is running but the number of directories has reached 80% of notify watches.');
45 | # }
46 | #
47 | # if (metric['lsyncd_status'] == 'running' && metric['percent_used_watches'] >= 95) {
48 | # return new AlarmStatus(CRITICAL, 'Lsyncd is running but the number of directories has reached 95% of notify watches.');
49 | # }
50 | #
51 | # return new AlarmStatus(OK, 'Lsyncd Service is running.');
52 | #
53 | # REQUIRES 'bc' to be installed
54 |
SERVICE="lsyncd"

# Attempt to locate the lsyncd configuration file; the first existing
# candidate wins (same order as before).
lsyncd_conf_file=""
for candidate in /etc/lsyncd.lua /etc/lsyncd.conf /etc/lsyncd/lsyncd.conf.lua /etc/lsyncd/lsyncd.conf; do
    if [ -e "${candidate}" ]; then
        lsyncd_conf_file="${candidate}"
        break
    fi
done
if [ -z "${lsyncd_conf_file}" ]; then
    echo "status ${SERVICE} not installed"
    exit 1
fi

# Test if the service is running
RESULT=$(pgrep -x "${SERVICE}")
if [[ -z "${RESULT}" ]]; then
    echo "metric ${SERVICE}_status string notrunning"
else
    echo "metric ${SERVICE}_status string running"
fi

# Kernel limit on the number of inotify watches per user
max_inotify_watches=$(sysctl -n fs.inotify.max_user_watches 2>/dev/null)

# 2.1.x status file contains number of directories watched. Avoids I/O overhead of find command.
lsyncd_status_file=$(sed -n 's/.*statusFile\s*=\s*"\(.*\)",.*/\1/p' "${lsyncd_conf_file}")
# Start from a known value so a same-named variable inherited from the
# environment cannot be mistaken for a result.
current_directories_to_watch=""
if [ -e "${lsyncd_status_file}" ]; then
    current_directories_to_watch=$(sed -n "s/Inotify watching \([0-9][0-9]*\) directories/\1/p" "${lsyncd_status_file}")
fi

# Fall back to old method if current_directories_to_watch is not a number
if ! [[ "${current_directories_to_watch}" =~ ^[0-9]+$ ]]; then
    current_directories_to_watch=0
    # Take only the quoted source="/..." values from non-comment lines,
    # de-duplicated.  Reading line by line keeps paths containing spaces
    # intact and ignores other settings that share a line.
    while IFS= read -r source_dir; do
        dir_count=$(find "${source_dir}" -type d | wc -l)
        current_directories_to_watch=$((current_directories_to_watch + dir_count))
    done < <(grep -v '^[ ]*--' "${lsyncd_conf_file}" | grep -o 'source="/[^"]*"' | cut -d'"' -f2 | sort -u)
fi

# Calculate percentage of total.  Skip the metric when the limit could not
# be read (or is 0), since the division would fail.
if [[ "${max_inotify_watches}" =~ ^[0-9]+$ ]] && [ "${max_inotify_watches}" -gt 0 ]; then
    current_percentage=$(echo "${current_directories_to_watch}/${max_inotify_watches}" | bc -l | awk '{printf "%f", $1*100}')
    echo "metric percent_used_watches double ${current_percentage}"
else
    echo "unable to read fs.inotify.max_user_watches; percent_used_watches not reported" >&2
fi

110 |
111 |
--------------------------------------------------------------------------------
/megaraid.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Rackspace Cloud Monitoring Plug-In
3 | # megaraid plugin to query SMART status of drives attached to LSI megaraid or
4 | # DELL PERC {3,700} raid controllers.
5 | #
6 | # ----------------------------------------------------------------------------
7 | # "THE BEER-WARE LICENSE" (Revision 42):
8 | # wrote this file. As long as you retain this notice
9 | # you can do whatever you want with this stuff. If we meet some day, and you
10 | # think this stuff is worth it, you can buy me a beer in return
11 | # ----------------------------------------------------------------------------
12 | #
13 | # Usage:
14 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
15 | #
16 | # This plugin returns 5 metrics:
17 | # - failed : the number of drives in failed state,
18 | # - prefail : the number of drives in prefail state,
19 | # - unknown : the number of drives for which the smart state could not
20 | # be determined,
21 | # - ok : the number of drives in OK state,
22 | # - report : a string reporting the drive id, vendor, serial number
23 | # as well as the smart state for non-ok drives.
24 | # e.g. /dev/bus/0 -d megaraid,4 SEAGATE 6SL28GNF FAILED \
25 | # ^controller & drive ids ^vendor ^serial# ^state
26 | # ( HARDWARE IMPENDING FAILURE GENERAL HARD DRIVE FAILURE [asc=5d, ascq=10] )
27 | # ^SMART health status for this drive
28 | #
29 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
30 | #
31 | # if (metric['failed'] != 0) {
32 | # return new AlarmStatus(CRITICAL, '#{failed} failed drive(s): #{report}');
33 | # }
34 | #
35 | # if (metric['prefail'] != 0) {
36 | # return new AlarmStatus(WARNING, '#{prefail} prefail drive(s): #{report}');
37 | # }
38 | #
39 | # if (metric['unknown'] != 0) {
40 | # return new AlarmStatus(WARNING, '#{unknown} unknown drive(s): #{report}');
41 | # }
42 | #
43 | # return new AlarmStatus(OK, '#{ok} drive(s) OK');
44 | #
45 | # Things to keep in mind:
46 | # - this plugin needs a fairly recent version of smartmontools (tested OK with 6.2)
47 | # (apt-get install smartmontools) but does NOT need megacli.
48 | # - on big and loaded arrays, the plugin can take more than 10s (default agent plugin
49 | # timeout) to complete. Some disks are slower than others, not surprisingly.
50 | # - as of now, this plugin only checks individual drives and not the status of the
51 | # array as seen by the controller. I'd add it, but it seems hard to extract without
52 | # megacli which I'm trying to stay away from. If you know of a way, please let me
53 | # know.
54 | #
55 | #
SMARTCTL=$(which smartctl)

OK_CNT=0 PREFAIL_CNT=0 FAILED_CNT=0 UNKNOWN_CNT=0
REPORT=""

# Append one non-OK drive to the report string.
#   $1 = device arguments, $2 = drive id, $3 = state label, $4 = health status
add_to_report() {
    REPORT="${REPORT} ${1} ${2} ${3} (${4} ) "
}

# Discover all drives; give up if the scan itself fails.
if ! DEVLIST=$(${SMARTCTL} --scan 2>/dev/null); then
    echo status failed to perform drive discovery
    exit 1
fi

# Only /dev/bus devices are checked; /dev/sd* are logical disks and do not
# respond to any SMART command.  Everything after '#' on a scan line is
# dropped.
while read drive
do
    # ${drive} is left unquoted on purpose: a scan line carries the device
    # path plus its options and has to be split into separate arguments.
    smart_out=$(${SMARTCTL} ${drive} --info --health 2>/dev/null)
    smart_rc=$?
    health=$(grep -i 'smart health status:' <<< "${smart_out}" | cut -d':' -f2)
    drive_id=$(grep -iE '(vendor:|serial number:)' <<< "${smart_out}" | cut -d':' -f2 | xargs)

    if (( smart_rc & 8 )); then
        # Bit 3: SMART status check returned "DISK FAILING".
        FAILED_CNT=$((FAILED_CNT + 1))
        add_to_report "${drive}" "${drive_id}" "FAILED" "${health}"
    elif (( smart_rc & 48 )); then
        # Bit 4: prefail attributes <= threshold were found.
        # Bit 5: status is "DISK OK" but some (usage or prefail) attributes
        #        have been <= threshold at some time in the past.
        PREFAIL_CNT=$((PREFAIL_CNT + 1))
        add_to_report "${drive}" "${drive_id}" "PREFAIL" "${health}"
    elif (( smart_rc != 0 )); then
        # Any other non-zero exit status (drive open failed, smart command
        # failed, etc.) is counted as unknown.
        UNKNOWN_CNT=$((UNKNOWN_CNT + 1))
        add_to_report "${drive}" "${drive_id}" "UNKNOWN" "${health}"
    else
        OK_CNT=$((OK_CNT + 1))
    fi
done < <(echo "${DEVLIST}" | grep /dev/bus/ | cut -d'#' -f1)

if [ -z "${REPORT}" ]; then
    REPORT="all drives OK"
fi

printf '%s\n' \
    "status smart status retrieved" \
    "metric failed uint32 ${FAILED_CNT}" \
    "metric prefail uint32 ${PREFAIL_CNT}" \
    "metric unknown uint32 ${UNKNOWN_CNT}" \
    "metric ok uint32 ${OK_CNT}" \
    "metric report string ${REPORT}"

exit 0

112 |
--------------------------------------------------------------------------------
/memcached_stats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Rackspace Cloud Monitoring plugin to provide memcached statistics.
4 |
5 | Copyright 2013 Steve Katen
6 |
7 | Licensed under the Apache License, Version 2.0 (the "License");
8 | you may not use this file except in compliance with the License.
9 | You may obtain a copy of the License at
10 |
11 | http://www.apache.org/licenses/LICENSE-2.0
12 |
13 | Unless required by applicable law or agreed to in writing, software
14 | distributed under the License is distributed on an "AS IS" BASIS,
15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | See the License for the specific language governing permissions and
17 | limitations under the License.
18 |
19 |
20 | Minimal Example criteria:
21 |
22 | if (metric['legacy_state'] != 'ok') {
23 | return new AlarmStatus(CRITICAL, 'memcache is NOT running.');
24 | }
25 | return new AlarmStatus(OK, 'memcache is running.');
26 |
27 | """
28 | import sys
29 | import telnetlib
30 | import re
31 | import socket
32 |
33 |
def memcached_stats(host, port, timeout=10):
    """Return the output of memcached's ``stats`` command as a dict.

    A plain TCP socket is used, so the connection is always closed and a
    stalled server cannot block the plugin forever.

    Args:
        host: Host name or address of the memcached server.
        port: TCP port, as an int or a numeric string.
        timeout: Seconds to wait for the connection and for each read.

    Returns:
        A dict mapping stat names to their (string) values, or None when
        the server cannot be reached or stops answering.
    """
    stat_line = re.compile(r"STAT (.*) (.*)\r")
    try:
        conn = socket.create_connection((host, port), timeout)
    except socket.error:
        return None

    raw = b""
    try:
        conn.sendall(b"stats\n")
        # The reply is a list of "STAT <name> <value>" lines closed by "END".
        while b"END" not in raw:
            chunk = conn.recv(4096)
            if not chunk:
                break
            raw += chunk
    except socket.error:
        return None
    finally:
        conn.close()

    return dict(stat_line.findall(raw.decode("utf-8", "replace")))
43 |
44 |
def hit_percent(hits, misses):
    """Return hits as a percentage of all lookups (hits plus misses).

    Yields 0.0 when there were no lookups at all, so callers never divide
    by zero.
    """
    lookups = hits + misses
    if lookups <= 0:
        return 0.0
    return 100 * float(hits) / float(lookups)
51 |
52 |
def fill_percent(used, total):
    """Return ``used`` as a percentage of ``total``.

    Yields 0.0 when ``total`` is zero instead of raising
    ZeroDivisionError, mirroring how hit_percent treats an empty
    denominator.
    """
    if not total:
        return 0.0
    return 100 * float(used) / float(total)
55 |
56 |
def main():
    """Print memcached metrics in the monitoring agent's plugin format.

    Expects exactly two command line arguments, the host and the port.
    Stats that the server does not report are skipped rather than
    aborting the plugin with a traceback after "status ok" was printed.
    """
    if len(sys.argv) != 3:
        print("Usage: %s <host> <port>" % sys.argv[0])
        sys.exit(0)

    host = sys.argv[1]
    port = sys.argv[2]
    s = memcached_stats(host, port)

    if not s:
        print("status err unable to generate statistics")
        sys.exit(0)

    print("status ok memcached statistics generated")

    # Counters that are forwarded unchanged.
    for name in ('uptime', 'curr_connections', 'listen_disabled_num',
                 'curr_items', 'total_items', 'evictions'):
        if name in s:
            print("metric %s int %s" % (name, s[name]))

    # Derived percentages need both of their inputs.
    if 'get_hits' in s and 'get_misses' in s:
        print("metric hit_percent float %s"
              % hit_percent(int(s['get_hits']), int(s['get_misses'])))
    if 'bytes' in s and 'limit_maxbytes' in s:
        print("metric fill_percent float %s"
              % fill_percent(int(s['bytes']), int(s['limit_maxbytes'])))
81 |
82 | if __name__ == '__main__':
83 | main()
84 |
--------------------------------------------------------------------------------
/mongodb_stats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Rackspace Cloud Monitoring plugin to provide mongodb statistics.
4 |
5 | Requirement:
6 | pymongo - http://api.mongodb.org/python/current/
7 |
8 | Copyright 2013 Steve Katen
9 |
10 | Licensed under the Apache License, Version 2.0 (the "License");
11 | you may not use this file except in compliance with the License.
12 | You may obtain a copy of the License at
13 |
14 | http://www.apache.org/licenses/LICENSE-2.0
15 |
16 | Unless required by applicable law or agreed to in writing, software
17 | distributed under the License is distributed on an "AS IS" BASIS,
18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 | See the License for the specific language governing permissions and
20 | limitations under the License.
21 | """
22 | import sys
23 | try:
24 | from pymongo import MongoClient as Client
25 | except ImportError:
26 | from pymongo import Connection as Client
27 | from pymongo.errors import ConnectionFailure, AutoReconnect
28 |
29 |
def mongodb_stats(host, p, database, username, password):
    """Return the ``serverStatus`` document of a MongoDB server.

    Args:
        host: Host name or address of the server.
        p: Port number; anything ``int()`` accepts.
        database: Database named in the connection URI; may be empty.
        username: User name for authentication; may be empty.
        password: Password for authentication; may be empty.

    Returns:
        The dict returned by the ``serverStatus`` command, or None when the
        server cannot be reached.

    Raises:
        ValueError: if ``p`` is not a valid integer.
    """
    port = int(p)
    try:
        if username and password and database:
            c = Client("mongodb://" + username + ":" + password + "@" + host
                       + "/" + database, port)
        elif username and password:
            c = Client('mongodb://' + username + ':' + password + '@' + host
                       + '/', port)
        elif database:
            c = Client('mongodb://' + host + '/' + database, port)
        else:
            c = Client(host, port)
        # The command runs inside the try block so that a connection failure
        # raised while it executes is handled as well.
        return c.test.command("serverStatus")
    # The tuple is required: "except A, B" only catches A and binds it to B.
    except (ConnectionFailure, AutoReconnect):
        return None
45 |
46 |
def main():
    """Print MongoDB metrics in the monitoring agent's plugin format.

    Expects five command line arguments: host, port, database, username
    and password (empty strings are allowed for the last three).

    Sections that the server does not report are skipped, and the index
    metrics are emitted once, taken from the nested ``btree`` section when
    it is present and from ``indexCounters`` itself otherwise.
    """
    if len(sys.argv) != 6:
        print("Usage: %s <host> <port> <database> <username> <password>"
              % sys.argv[0])
        sys.exit(0)

    s = mongodb_stats(*sys.argv[1:])

    if not s:
        print("status err unable to generate statistics")
        sys.exit(0)

    print("status ok mongodb statistics generated")
    print("metric uptime float %s" % s['uptime'])

    current = s['connections']['current']
    available = s['connections']['available']
    print("metric conn_available int %s" % available)
    print("metric conn_current int %s" % current)
    # True division: the integer division used before truncated the ratio.
    # The ratio is undefined when no connection is available, so it is
    # left out in that case.
    if available:
        print("metric conn_percent float %s" % (float(current) / available))

    mapped = s.get('mem', {}).get('mapped')
    if mapped is not None:
        print("metric mem_mapped int %s" % mapped)

    index = s.get('indexCounters') or {}
    counters = index.get('btree') or index
    if 'hits' in counters and 'misses' in counters:
        hits = counters['hits']
        accesses = counters.get('accesses', 0)
        print("metric index_hits int %s" % hits)
        print("metric index_misses int %s" % counters['misses'])
        if accesses:
            index_ratio = float(hits) / accesses
        else:
            index_ratio = 0.0
        # Declared as float because the value is a fraction, not an integer.
        print("metric index_percent float %s" % index_ratio)

    if 'repl' in s:
        print("metric is_replicating string true")
        print("metric is_master string %s" % s['repl'].get('ismaster'))
        print("metric is_secondary string %s" % s['repl'].get('secondary'))
    else:
        print("metric is_replicating string false")
92 |
93 | if __name__ == '__main__':
94 | main()
95 |
--------------------------------------------------------------------------------
/murmur_monitor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | ## This requires ICE to be enabled on murmur, and the Murmur Mice package
4 | ## http://wiki.mumble.info/wiki/Mice
5 |
6 | ## This check returns number of active connections, how many people are able to receive
7 | ## a voice stream, how many are connected but deafened (cannot receive voice),
8 | ## and how many virtual servers are present.
9 |
import sys
import os
sys.path.append('/opt/murmur/ice')

# stdout is silenced while mice is imported so that whatever the import
# prints does not end up in the plugin output. It is restored in all cases,
# including a failed import.
_devnull = open(os.devnull, "w")
sys.stdout = _devnull
try:
    import mice
    _import_error = None
except ImportError as exc:
    _import_error = exc
finally:
    sys.stdout = sys.__stdout__
    _devnull.close()

if _import_error is not None:
    print("status err unable to import mice: %s" % _import_error)
    sys.exit(1)

# Work on the server objects themselves instead of guessing their ids:
# ids are not guaranteed to be the contiguous range 1..N.
servers = mice.m.getAllServers()
numberServers = len(servers)
usersOnline = 0
usersListening = 0
usersDeaf = 0

for server in servers:
    # NOTE(review): isRunning() is taken from the Murmur Ice interface;
    # a stopped virtual server has no users to report - confirm against
    # the deployed Murmur version.
    if not server.isRunning():
        continue
    for user in server.getUsers().values():
        usersOnline += 1
        if user.selfDeaf or user.deaf:
            usersDeaf += 1
        else:
            usersListening += 1

print("status ok")
print("metric servers int %s servers" % numberServers)
print("metric online int %s users" % usersOnline)
print("metric deaf int %s users" % usersDeaf)
print("metric listening int %s users" % usersListening)
43 |
44 |
--------------------------------------------------------------------------------
/mysql_ping.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Rackspace Cloud Monitoring Alert to verify MySQL server is running on system
4 |
5 | does a 'mysqladmin ping' to determine if service is running
6 | returns Status OK if service is alive, else Status ERROR.
7 |
8 | NOTE: must have a /root/.my.cnf file with access to mysql
9 |
10 | Example criteria :
11 |
12 | if (metric['legacy_state'] != 'ok') {
13 | return new AlarmStatus(CRITICAL, 'MySQL Server is NOT healthy.');
14 | }
15 |
16 | return new AlarmStatus(OK, 'MySQL Server is running.');
17 |
18 | """
import sys
import os
import subprocess

# Run mysqladmin directly (no shell) and collect its stdout; communicate()
# waits for the process and closes the pipe. stderr is left untouched.
try:
    stat = subprocess.Popen(
        ['mysqladmin', '--defaults-file=/root/.my.cnf', 'ping'],
        stdout=subprocess.PIPE,
        universal_newlines=True)
    report = stat.communicate()[0]
except OSError:
    # mysqladmin is missing or not executable: same outcome as no answer.
    report = ""

if report == "mysqld is alive\n":
    print("status ok ok")
    sys.exit(0)
else:
    print("status error")
    sys.exit(1)
31 |
--------------------------------------------------------------------------------
/mysql_replication.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | ## Rackspace Cloud Monitoring Plug-In
4 | ## MySQL Replication State Validation
5 | #
6 | # (C)2013 Chris Mecca
7 | # All Rights Reserved.
8 | #
9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
10 | # not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at
12 | #
13 | # http://www.apache.org/licenses/LICENSE-2.0
14 | #
15 | # Unless required by applicable law or agreed to in writing, software
16 | # distributed under the License is distributed on an "AS IS" BASIS,
17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | # See the License for the specific language governing permissions and
19 | # limitations under the License.
20 | #
21 | # Usage:
22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
23 | #
24 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
25 | #
26 | # if (metric['SLAVE_STATUS'] != 'ONLINE') {
27 | # return new AlarmStatus(CRITICAL, 'MySQL Replication is OFFLINE.');
28 | # }
29 | #
30 | # if (metric['SLAVE_STATUS'] == 'ONLINE' && metric['SECONDS_BEHIND_MASTER'] \
31 | # >= 120 && metric['SECONDS_BEHIND_MASTER'] < 300) {
32 | # return new AlarmStatus(WARNING, 'MySQL Replication ONLINE \
33 | # but Slave is more than 2 minutes behind Master.');
34 | # }
35 | #
36 | # if (metric['SLAVE_STATUS'] == 'ONLINE' && metric['SECONDS_BEHIND_MASTER'] \
37 | # >= 300) {
38 | # return new AlarmStatus(CRITICAL, 'MySQL Replication ONLINE \
39 | # but Slave is more than 5 minutes behind Master.');
40 | # }
41 | #
42 | # return new AlarmStatus(OK, 'MySQL Replication is ONLINE');
43 |
44 |
45 | import sys
46 | import subprocess
47 | import shlex
48 |
49 |
def mysql_repchk(arg):
    """Run a command line without a shell and capture its result.

    Args:
        arg: The full command line as one string; it is split into
            arguments with ``shlex.split``.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple with both streams as text.
        When the program cannot be started the tuple is
        ``(127, "", <error message>)`` instead of an exception.
    """
    try:
        proc = subprocess.Popen(shlex.split(arg),
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=False,
                                # text output on Python 2 and Python 3 alike
                                universal_newlines=True)
    except OSError as exc:
        return 127, "", str(exc)

    out, err = proc.communicate()
    return proc.returncode, out, err
59 |
# Ask the local server for its replication state; \G prints one
# "Field: value" pair per line.
RETCODE, OUTPUT, ERR = mysql_repchk(
    '/usr/bin/mysql '
    '--defaults-file=/root/.my.cnf '
    '-e "SHOW SLAVE STATUS\\G"')

if RETCODE:
    sys.stderr.write("There was an error (%d): \n\n" % RETCODE)
    sys.stderr.write("%s\n" % ERR)

if OUTPUT != "":
    # Drop the first line (row banner) and the trailing empty line.
    SHOW_STATUS_LIST = OUTPUT.split('\n')[1:-1]

    SLAVE_STATUS = {}
    for i in SHOW_STATUS_LIST:
        if ":" in i:
            # Split on the first colon only so values that contain a colon
            # themselves are kept whole.
            field, value = i.split(':', 1)
            SLAVE_STATUS[field.strip()] = value.strip()

    replicating = (SLAVE_STATUS.get("Slave_IO_Running") == "Yes" and
                   SLAVE_STATUS.get("Slave_SQL_Running") == "Yes" and
                   SLAVE_STATUS.get("Last_Errno") == "0")

    print("status OK")
    if replicating:
        print("metric SLAVE_STATUS string ONLINE")
    else:
        print("metric SLAVE_STATUS string OFFLINE")

    # The lag is "NULL" while replication is stopped; only a real number
    # is a valid value for an int metric.
    lag = SLAVE_STATUS.get("Seconds_Behind_Master", "")
    if lag.isdigit():
        print("metric SECONDS_BEHIND_MASTER int " + lag)

else:
    print("status ERROR\nmetric SLAVE_STATUS string NOT_CONFIGURED")
94 |
--------------------------------------------------------------------------------
/nfs-status.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # nfs-status.sh
4 | # Rackspace Cloud Monitoring Plugin to verify the current return status of nfs.
5 | #
6 | # Copyright (c) 2015, Philip Eatherington
7 | # All rights reserved.
8 | #
9 | # Redistribution and use in source and binary forms, with or without
10 | # modification, are permitted provided that the following conditions are met:
11 | #
12 | # Redistributions of source code must retain the above copyright notice,
13 | # this list of conditions and the following disclaimer.
14 | #
15 | # Redistributions in binary form must reproduce the above copyright
16 | # notice, this list of conditions and the following disclaimer in the
17 | # documentation and/or other materials provided with the distribution.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 | # POSSIBILITY OF SUCH DAMAGE.
30 | #
31 | #
32 | # Verify the current status of NFS shares
33 | #
34 | # Example criteria :
35 | #
36 | # if (metric['nfs_status'] != 'ok') {
37 | # return new AlarmStatus(CRITICAL, 'NFS Service is NOT healthy.');
38 | # }
39 | #
40 | # return new AlarmStatus(OK, 'NFS Service is running.');
41 | #
42 | # REQUIRES 'showmount' to be installed (part of NFS utils)
43 |
HOST=$1
DIR=$2

# Collapse all whitespace (including newlines) into single spaces so that a
# multi-line showmount message fits on one metric line. Done with tr/sed
# rather than an unquoted echo, which would also glob-expand any '*' found
# in the export list.
flatten() {
  tr -s '[:space:]' ' ' | sed -e 's/^ //' -e 's/ $//'
}

# HOST is only passed when it is set; with no host showmount uses its default.
OUTPUT=$(showmount -e ${HOST:+"${HOST}"} 2>&1)
RC=$?

if [[ $OUTPUT = *'Connection refused'* ]]
then
  state='Error: connection refused'
  error=$(printf '%s' "${OUTPUT}" | flatten)
elif [[ $OUTPUT = *'Program not registered'* ]]
then
  state='Error: NFS not running on host'
  error=$(printf '%s' "${OUTPUT}" | flatten)
# The share only counts when showmount itself succeeded; "$DIR" is quoted so
# it is matched literally instead of being interpreted as a glob pattern.
elif [[ $RC -eq 0 && $OUTPUT = *"$DIR"* ]]
then
  state='ok'
  error='ok'
else
  state='Error: No shares found'
  error=$(printf '%s' "${OUTPUT}" | flatten)
fi
echo "status ${state}"
echo "metric nfs_status string ${error}"
66 |
--------------------------------------------------------------------------------
/nginx_status_check.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Rackspace Cloud Monitoring Plugin for Nginx Status Page."""
3 | # Copyright 2014 Frank Ritchie
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | # -----
18 | #
19 | # This plugin monitors the metrics produced by the Nginx ngx_http_stub_status_module
20 | # module. The module generates an HTML page that contains basic status information.
21 | #
22 | # For more info see:
23 | #
24 | # http://nginx.org/en/docs/http/ngx_http_stub_status_module.html
25 | #
26 | # For advanced metrics the NGINX Plus product is required.
27 | #
28 | # By default the monitor fails if the check does not complete successfully.
29 | #
30 | # Metrics for:
31 | #
32 | # - Active connections
33 | # - Accepted connections
34 | # - Handled connections
35 | # - Number of requests
36 | # - Connections reading
37 | # - Connections writing
38 | # - Connections waiting
39 | #
40 | # are also reported.
41 | #
42 | # Requires:
43 | # Python 2.6 or greater
44 | # Nginx with ngx_http_stub_status_module enabled.
45 | #
46 | # In short, you will need to add a location block to the Nginx
47 | # server block, e.g.
48 | #
49 | # location /nginx_status {
50 | # stub_status on;
51 | # access_log off;
52 | # allow 127.0.0.1;
53 | # }
54 | #
55 | # Usage:
56 | # Place script in /usr/lib/rackspace-monitoring-agent/plugins.
57 | # Ensure file is executable (755).
58 | #
59 | # Set up a Cloud Monitoring Check of type agent.plugin to run
60 | #
61 | # nginx_status_check.py -u
62 | #
63 | # The URL is optional and defaults to:
64 | #
65 | # http://0.0.0.0/nginx_status
66 | #
67 | # There is no need to define specific custom alert criteria.
68 | # As stated, the monitor fails if the metrics cannot be collected.
69 | # It is possible to define custom alert criteria with the reported
70 | # metrics if desired.
71 | #
72 |
73 | import re
74 | import sys
75 | import urllib2
76 | from optparse import OptionParser
77 |
class NginxStatus(object):
    """Fetch an Nginx stub_status page and report its counters.

    The page is assumed to be unavailable until it has been fetched
    successfully; ``nginx_status_available`` records that.
    """

    # Patterns for the three parts of the status page.
    _ACTIVE_RE = re.compile(r'Active connections:\s+(\d+)')
    _TOTALS_RE = re.compile(r'\s*(\d+)\s+(\d+)\s+(\d+)')
    _STATES_RE = re.compile(r'Reading:\s*(\d+)\s*Writing:\s*(\d+)\s*'
                            r'Waiting:\s*(\d+)')

    def __init__(self, url):
        """Remember the status page URL; nothing is fetched yet."""
        self.url = url
        self.nginx_status_available = False

    @classmethod
    def parse_status(cls, data):
        """Extract the counters from the text of a status page.

        Args:
            data: Body of the status page, as text or bytes.

        Returns:
            A list of ``(metric_name, int_value)`` pairs in reporting
            order, or None when the text is not a status page.
        """
        if not isinstance(data, str):
            data = data.decode('utf-8', 'replace')

        active = cls._ACTIVE_RE.search(data)
        totals = cls._TOTALS_RE.search(data)
        states = cls._STATES_RE.search(data)
        if not (active and totals and states):
            return None

        return [
            ('active_connections', int(active.group(1))),
            ('accepted_connections', int(totals.group(1))),
            ('handled_connections', int(totals.group(2))),
            ('number_of_requests', int(totals.group(3))),
            ('connections_reading', int(states.group(1))),
            ('connections_writing', int(states.group(2))),
            ('connections_waiting', int(states.group(3))),
        ]

    def nginx_status_metrics(self):
        """Fetch the status page and print its metrics.

        Prints one ``metric`` line per counter followed by a ``status ok``
        line. On any failure a ``status err`` line is printed and the
        process exits with code 1.
        """
        try:
            nginx_status_conn = urllib2.urlopen(self.url)
            try:
                nginx_status_data = nginx_status_conn.read()
            finally:
                # Release the connection whether or not the read succeeded.
                nginx_status_conn.close()
            self.nginx_status_available = True
        except urllib2.URLError:
            print('status err URLError: check the URL and that Nginx running.')
            sys.exit(1)
        except Exception:
            print('status err failed to obtain nginx status metrics.')
            sys.exit(1)

        metrics = self.parse_status(nginx_status_data)
        if metrics is None:
            # The URL answered, but not with a stub_status page.
            print('status err failed to obtain nginx status metrics.')
            sys.exit(1)

        for name, value in metrics:
            print('metric %s int64 %d' % (name, value))
        print('status ok succeeded in obtaining nginx status metrics.')
117 |
118 |
def main():
    """Read the status URL from the command line and print its metrics."""
    option_parser = OptionParser()
    option_parser.add_option(
        '-u', '--url',
        default='http://0.0.0.0/nginx_status',
        help='URL for Nginx Status page.')
    # Positional arguments are accepted by the parser but not used.
    options = option_parser.parse_args()[0]

    NginxStatus(options.url).nginx_status_metrics()
129 |
130 | if __name__ == '__main__':
131 | main()
132 |
--------------------------------------------------------------------------------
/ntp_offset.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # ntp_offset.sh
4 | # Rackspace Cloud Monitoring Plugin to verify the time offset from ntp
5 | #
6 | # Copyright (c) 2013, Jordan Evans
7 | # Copyright (c) 2014, Simon Vetter
8 | # All rights reserved.
9 | #
10 | # Redistribution and use in source and binary forms, with or without
11 | # modification, are permitted provided that the following conditions are met:
12 | #
13 | # Redistributions of source code must retain the above copyright notice,
14 | # this list of conditions and the following disclaimer.
15 | #
16 | # Redistributions in binary form must reproduce the above copyright
17 | # notice, this list of conditions and the following disclaimer in the
18 | # documentation and/or other materials provided with the distribution.
19 | #
20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
24 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 | # POSSIBILITY OF SUCH DAMAGE.
31 | #
32 | #
33 | # This plugin expects to find ntpq and awk in the environment
34 | # it reports the average ntp offset from ntpq in milliseconds.
35 | #
36 | # Example alarm code:
37 | # :set consecutiveCount=3
38 | # if (metric['ntp_offset'] > 10000 || metric['ntp_offset'] < -10000) {
39 | # return new AlarmStatus(CRITICAL, 'ntp offset is too high.');
40 | # }
41 | # return new AlarmStatus(OK, 'ntp offset is fine.');
42 | #
43 | # if (metric['active_sources'] < 2) {
44 | # return new AlarmStatus(WARNING, 'ntpd is only using #{active_sources} sources');
45 | # }
46 | # return new AlarmStatus(OK, 'ntpd has #{active_sources} active sources');
47 | #
48 |
NTPQ_BIN=$(command -v ntpq)
AWK_BIN=$(command -v awk)

# Start from known values: unset variables would otherwise pick up whatever
# "sum" or "count" happen to be in the caller's environment.
sum=0
count=0

if [[ -x $NTPQ_BIN ]] && [[ -x $AWK_BIN ]]
then
  # only select line starting with * (system peer), + (candidate), # (selected), and o (pps sys peer)
  # cut keeps the integer part of each offset.
  OUTPUT=$("$NTPQ_BIN" -pn | "$AWK_BIN" '{ if ($1 ~ "^[\\*\\+#o].*" && $9 ~ /[0-9]/) print $9};' | cut -f 1 -d '.')

  for x in ${OUTPUT}
  do
    sum=$((sum + x))
    count=$((count + 1))
  done

  if [[ ${count} -gt 0 ]]; then
    avg=$((sum / count))
    echo "status ok got ntp stats"
    echo "metric ntp_offset int32 ${avg} milliseconds"
    echo "metric active_sources uint32 ${count} sources"
    exit 0
  else
    echo "status err could not compute ntp offset: no reachable or active source"
  fi
else
  echo "status err could not compute ntp offset: ntpq and/or awk could not be found"
fi

# Every path that reaches this point has reported an error.
exit 1
77 |
--------------------------------------------------------------------------------
/onmetal_v1_smart.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Script for monitoring remaining useful lifetime of OnMetal v1 SATADOM.
4 | #
5 | # Requires the following binaries installed & on path:
6 | # - smartctl
7 | # - lsblk
8 | #
9 | # Suggested alarm criteria:
10 | #
11 | # if (metric['percent_pe_cycles_used'] >= 1) {
12 | # return new AlarmStatus(CRITICAL, 'Drive is beyond expected life.');
13 | # }
14 | #
15 | # if (metric['percent_pe_cycles_used'] >= .8) {
16 | # return new AlarmStatus(WARNING, 'Drive >= 80% of its expected life.');
17 | # }
18 | #
19 | # return new AlarmStatus(OK, 'Drive less than 80% through its expected life.');
20 |
21 | import subprocess
22 | import sys
23 |
# Block device that smartctl and lsblk are run against.
DEVICE = "/dev/sda"

# Value used as the P/E-cycle maximum for each known model string, grouped
# by value. Keys must match the model string reported by lsblk exactly.
SATADOM_PE_MAX = dict.fromkeys(
    ('32G MLC SATADOM',
     '7 PIN  SATA FDM',
     'Fastable SD 131 7',
     'Fastable SD131 7'),
    3000)
SATADOM_PE_MAX.update(dict.fromkeys(
    ('SATADOM-SH TYPE',
     'SATADOM-SH TYPE C 3SE'),
    100000))
34 |
35 |
36 | def _fail(msg="Unknown Error"):
37 | print("status err {}".format(msg))
38 | sys.exit(1)
39 |
40 |
def _get_smartctl_attributes():
    """Return the SMART attribute table of DEVICE as a dict of dicts.

    Keys have the form ``"<ID#>-<ATTRIBUTE_NAME>"``; every value maps the
    remaining column headers (whatever follows the first two on the
    ``ID#`` header line) to that row's fields. An empty dict is returned
    when the output has no attribute table.

    Exits through _fail() when smartctl cannot be run or reports that the
    command itself failed.
    """
    try:
        out = subprocess.check_output(['smartctl', '--attributes', DEVICE])
    except subprocess.CalledProcessError as exc:
        # smartctl's exit status is a bitmask. Bits 0-2 mean the command
        # line, the device open or a SMART command failed; higher bits only
        # describe the disk's condition and the table is still printed.
        if exc.returncode & 0x07 or not exc.output:
            _fail("failed running smartctl")
        out = exc.output
    except OSError:
        _fail("failed running smartctl")

    if not isinstance(out, str):
        out = out.decode('utf-8', 'replace')

    header = None
    it = iter(out.split('\n'))
    for line in it:
        # note(JayF): skip forward until we get to the header and pull
        #             it out
        if line.strip().startswith('ID#'):
            header = line.split()
            break

    attributes = {}
    if header is None:
        return attributes

    for line in it:
        linelist = line.split()
        # Attribute rows start with a numeric id; blank lines and any
        # other text are ignored.
        if len(linelist) < 2 or not linelist[0].isdigit():
            continue
        # note(JayF): match up headers to values to generate a dict
        key = linelist[0] + '-' + linelist[1]
        attributes[key] = dict(zip(header[2:], linelist[2:]))

    return attributes
69 |
70 |
71 | def _calculate_pe_cycles(actual_value):
72 | return int(hex(int(actual_value))[-4:], 16)
73 |
74 |
75 | def _calculate_life_expectancy(pe_cycle_current, pe_cycle_max):
76 | # note(JayF): Force one of the values to a float to avoid int division
77 | return "{:f}".format(pe_cycle_current / float(pe_cycle_max))
78 |
79 |
def _get_satadom_model():
    """Return the model string lsblk reports for DEVICE.

    Exits through _fail() when lsblk cannot be run, prints no model row,
    or reports a model that has no entry in SATADOM_PE_MAX.
    """
    try:
        out = subprocess.check_output(['lsblk', '-oMODEL', DEVICE])
    except (OSError, subprocess.CalledProcessError):
        _fail("failed running lsblk")

    if not isinstance(out, str):
        out = out.decode('utf-8', 'replace')

    # First row is the column header, the second one belongs to DEVICE.
    rows = out.strip().split('\n')
    if len(rows) < 2:
        _fail("failed running lsblk")
    model = rows[1].strip()

    if model not in SATADOM_PE_MAX:
        _fail("UNKNOWN SATADOM MODEL")
    return model
92 |
93 |
attrs = _get_smartctl_attributes()

# Report a missing or non-numeric attribute as a plugin error instead of
# dying with a traceback and no status line.
try:
    pe_cycles = _calculate_pe_cycles(
        attrs['173-Unknown_Attribute']['RAW_VALUE'])
except (KeyError, ValueError):
    _fail("SMART attribute 173 raw value unavailable")

# Despite its name this holds the fraction of P/E cycles already used,
# which is what the metric below reports.
life_remaining = _calculate_life_expectancy(
    pe_cycles,
    SATADOM_PE_MAX[_get_satadom_model()])

print("status ok smart stats gathered successfully")
print("metric percent_pe_cycles_used float {}".format(life_remaining))
101 |
--------------------------------------------------------------------------------
/open_files.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2015 Brad Ison
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """
18 | Rackspace Cloud Monitoring agent plugin to count open files on Linux.
19 |
20 | This check records the number of file handles in use on a Linux
21 | system using the proc file system:
22 |
23 | https://www.kernel.org/doc/Documentation/sysctl/fs.txt
24 |
25 | Example alarm criteria:
26 |
27 | if (metric['open_files'] > 65535) {
28 | return new AlarmStatus(CRITICAL, "Too many open files!");
29 | }
30 |
31 | """
32 |
33 | import sys
34 |
35 |
# The first line of this file holds three numbers; the first two are used
# below to work out the handles in use.
PROC_FILE = "/proc/sys/fs/file-nr"


try:
    # The with-block closes the file; max_nr avoids shadowing builtin max.
    with open(PROC_FILE) as proc_file:
        open_nr, free_nr, max_nr = proc_file.readline().split()
    open_files = int(open_nr) - int(free_nr)
except Exception as e:
    print("status error {}".format(e))
    sys.exit(1)


print("status ok {} open files".format(open_files))
print("metric open_files uint32 {}".format(open_files))
49 |
--------------------------------------------------------------------------------
/pg_check.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Simple PostgreSQL status check for Rackspace Cloud Monitoring
4 | #
5 | # (C)2014 Christopher Coffey
6 | # All Rights Reserved.
7 | #
8 | # Licensed under the Apache License, Version 2.0 (the "License"); you may
9 | # not use this file except in compliance with the License.
10 | # You may obtain a copy of the License at
11 | #
12 | # http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 | #
20 | # Usage:
21 | # Place file in the /usr/lib/rackspace-monitoring-agent/plugins/ directory
22 | #
23 | # No need to define specific custom alert criteria, Status ok is only acceptable
24 | # response, All other responses trigger alert (default responses expected).
25 | #
26 | # SAMPLE monitoring-postgresql.yaml monitoring file to be placed in
27 | # /etc/rackspace-monitoring-agent.conf.d/
28 | # --------------------------------
29 | # type: agent.plugin
30 | # label: postgresql status
31 | # period: 300
32 | # timeout: 30
33 | # details:
34 | # file: pg_check.py
35 | #
36 |
import sys
import os
import subprocess

# Run pg_isready directly (no shell) and collect its stdout; communicate()
# waits for the process and closes the pipe.
try:
    stat = subprocess.Popen(['pg_isready'],
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    report = stat.communicate()[0]
except OSError:
    # pg_isready is missing or not executable: same outcome as no answer.
    report = ""

if "accepting connections" in report:
    print("status ok")
    sys.exit(0)
else:
    print("status error")
    sys.exit(1)
49 |
--------------------------------------------------------------------------------
/php-fpm_status_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2016 gustavo panizzo
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | # -----
17 | #
18 | # This plugin monitors the metrics produced by the PHP-FPM status page
19 | # pm.status_path needs to be enabled per pool you want to monitor
20 | #
21 | # pm.status_path = /status-for-php-fpm
22 | #
23 | # and you need the cgi-fcgi command
24 | # yum install fcgi
25 | # apt-get install libfcgi0ldbl
26 | #
27 | # For more info see:
28 | #
29 | # http://php.net/manual/en/install.fpm.configuration.php
30 | #
31 | # By default the monitor fails if the check does not complete successfully.
32 | #
33 | # Metrics for:
34 | #
35 | # accepted conn
36 | # listen queue
37 | # max listen queue
38 | # listen queue len
39 | # idle processes
40 | # active processes
41 | # total processes
42 | # max active processes
43 | # max children reached
44 | #
45 | # are also reported.
46 | #
47 | #
48 | # Usage:
49 | # Place script in /usr/lib/rackspace-monitoring-agent/plugins.
50 | # Ensure file is executable (755).
51 | #
52 | # Set up a Cloud Monitoring Check of type agent.plugin to run
53 | #
54 | # php-fpm_status_check.sh SOCKET_PATH STATUS_URL
55 | #
56 | # Both are optional and default to:
57 | #
58 | # /var/run/php-fpm/www.sock
59 | # /status-for-php-fpm
60 | #
61 | # There is no need to define specific custom alert criteria.
62 | # As stated, the monitor fails if the metrics cannot be collected.
63 | # It is possible to define custom alert criteria with the reported
64 | # metrics if desired.
65 | #
66 | # Example criteria :
67 | #
68 | #if (metric['max_children_reached'] > 0) {
69 | # return CRITICAL, "Max Children reached"
70 | #}
71 | #if (metric['legacy_state'] != 'ok') {
72 | # return CRITICAL, "PHP-FPM is not running correctly or misconfigured check"
73 | #}
74 | #
75 | #return OK, "PHP-FPM is running correctly"
76 |
77 |
# Emit the failure status expected by the agent and stop.
fail() {
    echo "status err failed to obtain metrics."
    exit 1
}

# Print column $2 of the status-page line that starts with "$1:".
get_metric() {
    grep "^$1:" "$OUTPUT" 2>/dev/null | awk -v col="$2" '{print $col}'
}

# The cgi-fcgi client is required to talk to the PHP-FPM socket.
CGIFCGI=$(command -v cgi-fcgi 2>/dev/null) || fail

SOCKET=${1-/var/run/php-fpm/www.sock}
STATUS_PATH=${2-/status-for-php-fpm}
OUTPUT=$(mktemp) || fail
# Remove the temporary file on every exit path, including the error ones.
trap 'rm -f "$OUTPUT"' EXIT

# Request the status page over FastCGI and store the response.
SCRIPT_NAME="${STATUS_PATH}" SCRIPT_FILENAME="${STATUS_PATH}" REQUEST_METHOD=GET \
    "$CGIFCGI" -bind -connect "${SOCKET}" 2>/dev/null > "$OUTPUT" || fail

# The column index depends on how many words the label has.
accepted_conn=$(get_metric "accepted conn" 3)
listen_queue=$(get_metric "listen queue" 3)
max_listen_queue=$(get_metric "max listen queue" 4)
listen_queue_len=$(get_metric "listen queue len" 4)
idle_processes=$(get_metric "idle processes" 3)
active_processes=$(get_metric "active processes" 3)
total_processes=$(get_metric "total processes" 3)
max_active_processes=$(get_metric "max active processes" 4)
max_children_reached=$(get_metric "max children reached" 4)

# A response without the expected fields (wrong status path, access denied,
# ...) is a failed check, not a successful one with empty metrics.
if [ -z "$accepted_conn" ]; then
    fail
fi

echo "status ok succeeded in obtaining metrics."
echo "metric accepted_conn uint32 $accepted_conn"
echo "metric listen_queue uint32 $listen_queue"
echo "metric max_listen_queue uint32 $max_listen_queue"
echo "metric listen_queue_len uint32 $listen_queue_len"
echo "metric idle_processes uint32 $idle_processes"
echo "metric active_processes uint32 $active_processes"
echo "metric total_processes uint32 $total_processes"
echo "metric max_active_processes uint32 $max_active_processes"
echo "metric max_children_reached uint32 $max_children_reached"

exit 0
120 |
--------------------------------------------------------------------------------
/ping.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
: <<'DESCRIPTION'

Rackspace Cloud Monitoring Plug-In
This is a plugin to monitor ICMP response times of hosts accessible by the server

----------------------------------------------------------------------------
"THE BEER-WARE LICENSE" (Revision 42):
wrote this file. As long as you retain this notice you
can do whatever you want with this stuff. If we meet some day, and you think
this stuff is worth it, you can buy me a beer in return.
----------------------------------------------------------------------------

Usage:
- Place plug-in in folder /usr/lib/rackspace-monitoring-agent/plugins
- Ensure that it is executable
    chmod +x ping.sh
- Configure Custom Plugin type check in Rackspace Intelligence
  Specify the script's name and the hostname/IP to ping, optionally followed
  by the count and the interval, e.g.:
    ping.sh 192.168.0.1
    ping.sh 192.168.0.1 10 1
  Count is the amount of ICMP probes sent in a single check, and interval is the
  number of seconds between them. They are both optional. Their default values
  are 5 pings with an interval of 2 seconds.
- Configure an Alert (optional, see example below).

This plugin returns 4 metrics:
- minimum, average, maximum: statistics returned by the GNU ping utility
  in the format "round-trip min/avg/max/stddev = 9.429/35.460/79.698/27.657 ms"
- lost_packets: the percentage of the packets lost out of the number of probes
  sent in this check run

Example alert:

--- start copying after this line ---

if (metric['average'] >= 30 ) {
  return new AlarmStatus(WARNING, 'Average round-trip took #{average}ms');
}
if (metric['lost_packets'] >= 40) {
  return new AlarmStatus(WARNING, 'Packet loss was #{lost_packets}%');
}
if (metric['legacy_state'] != "ok") {
  return new AlarmStatus(CRITICAL, 'Error: #{legacy_state}');
}
return new AlarmStatus(OK, 'All good');

--- stop copying before this line ---

DESCRIPTION

# $1 = host, $2 = probe count (default 5), $3 = interval in seconds (default 2).
# Only the last two lines of ping's output (the summary) are kept.
ping_stats=$(ping -i "${3:-2}" -q -w 30 -n -c "${2:-5}" "${1}" 2>&1 | tail -2)

# The unquoted expansion is deliberate: it joins the two summary lines into a
# single line so the sed/cut expressions below see both at once.
ping_summary=$(echo ${ping_stats})

# The number right after the last "= " is the minimum; avg and max are the
# 5th and 6th "/"-separated fields of the joined summary.
min_ping="$(echo "${ping_summary}" | sed -e 's#.\+= \([.0-9]\+\).\+#\1#g')"
avg_ping="$(echo "${ping_summary}" | cut -d'/' -f5)"
max_ping="$(echo "${ping_summary}" | cut -d'/' -f6)"
loss_percent="$(echo "${ping_summary}" | sed -e 's#.\+ \([0-9]\+\)%.\+#\1#')"

# A purely numeric average means ping produced round-trip statistics.
if [ -n "$(echo "${avg_ping}" | grep "^[.0-9]\+$" -)" ]
then
    echo "status ok"
    echo "metric minimum double ${min_ping} milliseconds"
    echo "metric average double ${avg_ping} milliseconds"
    echo "metric maximum double ${max_ping} milliseconds"
    echo "metric lost_packets int32 ${loss_percent} percent"
else
    echo "status error: ping probe fail"
    exit 1
fi
68 |
--------------------------------------------------------------------------------
/port_check.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Rackspace Cloud Monitoring plugin to check port, particularly
4 | useful for services that aren't accessible to a remote port check.
5 |
6 | Copyright 2013 Steve Katen
7 |
8 | Licensed under the Apache License, Version 2.0 (the "License");
9 | you may not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 |
12 | http://www.apache.org/licenses/LICENSE-2.0
13 |
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 | """
20 | import sys
21 | import socket
22 |
23 |
def socket_open(host, port):
    """
    Try a TCP connection to host:port.

    Returns "OPEN" when the connection (and the following shutdown) succeeds
    and "CLOSED" when a socket error occurs. A port that cannot be converted
    to an integer raises ValueError.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((host, int(port)))
        s.shutdown(socket.SHUT_RDWR)
    except socket.error:
        return "CLOSED"
    finally:
        # Release the descriptor on the failure path as well.
        s.close()
    return "OPEN"
34 |
35 |
def main():
    """
    Check the host/port given on the command line and print agent output.

    Expects exactly two arguments (host and port). The status line is always
    "status OK"; whether the port is reachable is reported through the
    "status" string metric ("OPEN" or "CLOSED"), because socket_open() always
    returns one of those two non-empty strings.
    """
    if len(sys.argv) != 3:
        print("Usage: %s <host> <port>" % sys.argv[0])
        sys.exit(0)

    host = sys.argv[1]
    port = sys.argv[2]
    p = socket_open(host, port)

    # Single-argument print() calls produce the same text on Python 2 and 3.
    print("status OK")
    print("metric port int %s" % port)
    print("metric status string %s" % p)


if __name__ == '__main__':
    main()
56 |
--------------------------------------------------------------------------------
/port_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Description: Custom plugin which checks that some service is listening on the
4 | # specified port.
5 | # Author: Tomaz Muraus
6 | # License: MIT
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in
16 | # all copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 | # THE SOFTWARE.
25 |
# Exactly three arguments are required: protocol, IP address and port.
if [ $# -ne 3 ]; then
    echo "Usage: $0 <protocol> <ip> <port>"
    exit 100
fi

PROTOCOL=$1
IP=$2
PORT=$3

OPTIONS=""

# nc probes TCP by default; -u switches it to UDP.
if [ "$PROTOCOL" = "udp" ]; then
    OPTIONS="-u"
fi

# OPTIONS is intentionally unquoted so that an empty value adds no argument.
nc ${OPTIONS} "${IP}" "${PORT}" < /dev/null > /dev/null 2>&1

if [ $? -ne 0 ]; then
    echo "status Nothing listening on port ${IP}:${PORT} (${PROTOCOL})"
    echo "metric listening string no"
else
    echo "status Service listening on ${IP}:${PORT} (${PROTOCOL})"
    echo "metric listening string yes"
fi
50 |
--------------------------------------------------------------------------------
/process_mon.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # process_mon.sh
4 | # Rackspace Cloud Monitoring Plugin to check if process is running.
5 | #
6 | # Copyright (c) 2013, Stephen Lang
7 | # All rights reserved.
8 | #
9 | # Redistribution and use in source and binary forms, with or without
10 | # modification, are permitted provided that the following conditions are met:
11 | #
12 | # Redistributions of source code must retain the above copyright notice,
13 | # this list of conditions and the following disclaimer.
14 | #
15 | # Redistributions in binary form must reproduce the above copyright
16 | # notice, this list of conditions and the following disclaimer in the
17 | # documentation and/or other materials provided with the distribution.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 | # POSSIBILITY OF SUCH DAMAGE.
30 | #
31 | # Curl Command:
32 | # curl -i -X POST -H 'Host: monitoring.api.rackspacecloud.com' -H
33 | # 'Accept-Encoding: gzip,deflate' -H 'X-Auth-Token: YOUR_API_TOKEN' -H
34 | # 'Content-Type: application/json; charset=UTF-8' -H 'Accept:
35 | # application/json' --data-binary '{"label": "Process Check", "type":
36 | # "agent.plugin", "details": {"args": ["PROCESS_NAME"],"file":
37 | # "process_mon.sh"}}' --compress
38 | # 'https://monitoring.api.rackspacecloud.com:443/v1.0/YOUR_ACCOUNT/entities/YOUR_ENTITY/checks'
39 | #
40 | # Usage:
41 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
42 | #
43 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm:
44 | #
45 | # if (metric['process_mon'] == 0) {
46 | # return new AlarmStatus(CRITICAL, 'Process not running.');
47 | # }
48 | #
49 | # return new AlarmStatus(OK, 'Process running normally.');
50 |
51 | function help {
52 |
53 | cat <
4 | # License: MIT
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the 'Software'), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in
14 | # all copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 | # THE SOFTWARE.
23 | #
24 | # Rackspace Cloud Monitoring plugin for monitoring RabbitMQ stats. This plugin
25 | # was adapted from the Cloudkick rabbitmq.py plugin by Tomaz Muraus:
26 | # https://github.com/cloudkick/agent-plugins/blob/master/rabbitmq.py
27 | #
28 | # Example usage (arguments which you pass in to the plugin):
29 | #
30 | # Monitor queue "bg_jobs" memory usage, number of consumers and number of
31 | # messages:
32 | #
33 | # --action list_queues --queue bg_jobs --parameters memory,consumers,messages
34 | #
35 | # Monitor exchange "amqp.direct" type, durability and auto_delete value
36 | #
37 | # --action list_exchanges --exchange amqp.direct --parameters type,durable,auto_delete
38 | #
39 | # Monitor queue "bg_jobs" memory usage, number of consumers, number of
40 | # messages and alert if messages over 100
41 | #
42 | # --action list_queues --queue bg_jobs --queue-length 100 --parameters memory,consumers,messages
43 | #
44 |
45 | import re
46 | import sys
47 | import subprocess
48 | import optparse
49 |
# Info items that may be requested per rabbitmqctl action, mapped to the
# metric type used when the value is reported to the monitoring agent.
METRIC_TYPES = {
    'list_queues': {
        'name': 'string',
        'durable': 'string',
        'auto_delete': 'string',
        'arguments': 'string',
        'pid': 'int',
        'owner_pid': 'int',
        'messages_ready': 'int',
        'messages_unacknowledged': 'int',
        'messages': 'int',
        'consumers': 'int',
        'memory': 'int',
    },

    'list_exchanges': {
        'name': 'string',
        'type': 'string',
        'durable': 'string',
        'auto_delete': 'string',
        'internal': 'string',
        # rabbitmqctl names this info item "arguments" (plural), exactly as
        # for queues.
        'arguments': 'string',
    },
}
74 |
def retrieve_stats(vhost, action, queue, exchange, parameters,
                   rabbitmqctl_path):
    """
    Run rabbitmqctl and return the stats of a single queue or exchange.

    vhost            -- virtual host passed to rabbitmqctl with -p
    action           -- 'list_queues' or 'list_exchanges' (key of METRIC_TYPES)
    queue, exchange  -- name of the object to look up (queue wins if both set)
    parameters       -- comma separated info items; unknown ones are dropped
    rabbitmqctl_path -- rabbitmqctl executable to run

    Returns a (stats, error) tuple: stats is a dict mapping info item to its
    string value and error is None on success; on failure stats is None and
    error is a message.
    """
    target = queue or exchange
    requested = [p.lower() for p in parameters.split(',')
                 if p.lower() in METRIC_TYPES[action]]
    columns = ['name'] + requested
    command = [rabbitmqctl_path, action, '-p', vhost] + columns

    try:
        process = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
    except OSError as exc:
        # e.g. rabbitmqctl is not installed or not executable
        return None, 'Unable to run %s: %s' % (rabbitmqctl_path, exc)

    output, _ = process.communicate()
    if not isinstance(output, str):
        # Python 3 returns bytes from the pipe
        output = output.decode('utf-8', 'replace')

    if process.returncode != 0:
        # Report rabbitmqctl's own message, collapsed onto a single line so
        # it fits into the status line.
        message = ' '.join(output.split())
        return None, message or 'rabbitmqctl exited with status %d' % (
            process.returncode)

    lines = [line for line in output.split('\n') if line]

    # Prefer the row whose name column equals the requested name, so that
    # e.g. "jobs" does not select "bg_jobs".
    row = next((line for line in lines if line.split()[:1] == [target]),
               None)
    if row is None:
        # Fall back to the first row that merely contains the name, which is
        # how rows were selected before exact matching was introduced.
        row = next((line for line in lines if target in line), None)

    if row is None:
        return None, 'Empty output'

    return parse_stats(columns, row), None
103 |
def parse_stats(parameters, data):
    """
    Map whitespace separated values of one rabbitmqctl row to their names.

    parameters -- info item names in the order they were requested
    data       -- one output row

    Returns a dict from info item name to its (string) value. Leading and
    trailing whitespace is ignored and, if the row holds fewer values than
    requested, the items without a value are simply left out.
    """
    values = data.split()
    return dict(zip(parameters, values))
112 |
def print_metrics(action, metrics):
    """
    Print one "metric <name> <type> <value>" line per known info item.

    action  -- key of METRIC_TYPES that supplies the metric types
    metrics -- dict of info item name to value; items that have no type
               registered for the action are skipped
    """
    # .items() and single-argument print() work on Python 2 and Python 3.
    for key, value in metrics.items():
        metric_type = METRIC_TYPES[action].get(key, None)

        if not metric_type:
            continue

        print('metric %s %s %s' % (key, metric_type, value))
121 |
if __name__ == '__main__':
    # Command line entry point: parse the options, validate them, query
    # rabbitmqctl and print the status/metric lines for the agent.
    parser = optparse.OptionParser()
    parser.add_option('--path', action='store', dest='rabbitmqctl_path',
                      default='rabbitmqctl',
                      help='Path to the rabbitmqctl binary (optional)')
    parser.add_option('--action', action='store', dest='action',
                      help='Action (list_queues or list_exchanges)')
    parser.add_option('--vhost', action='store', dest='vhost', default='/',
                      help='Vhost (optional)')
    parser.add_option('--queue', action='store', dest='queue',
                      help='Queue name')
    parser.add_option('--exchange', action='store', dest='exchange',
                      help='Exchange name')
    parser.add_option('--parameters', action='store', dest='parameters',
                      default='messages',
                      help='Comma separated list of parameters to retrieve (default = messages)')
    parser.add_option('--queue-length', type='int', action='store', dest='length',
                      help='Max messages in the queue before alert')

    (options, args) = parser.parse_args(sys.argv)

    rabbitmqctl_path = options.rabbitmqctl_path
    action = options.action
    vhost = options.vhost
    queue = options.queue
    exchange = options.exchange
    parameters = options.parameters
    length = options.length

    # Single-argument print() calls behave the same on Python 2 and 3.
    if not action:
        print('status err Missing required argument: action')
        sys.exit(1)

    if action == 'list_queues' and not queue:
        print('status err Missing required argument: queue')
        sys.exit(1)
    elif action == 'list_exchanges' and not exchange:
        print('status err Missing required argument: exchange')
        sys.exit(1)

    if action not in METRIC_TYPES:
        print('status err Invalid action: %s' % (action))
        sys.exit(1)

    if not parameters:
        print('status err Missing required argument: parameters')
        sys.exit(1)

    metrics, error = retrieve_stats(vhost, action, queue, exchange,
                                    parameters, rabbitmqctl_path)

    if error:
        print('status err %s' % (error))
        sys.exit(1)

    # Optional threshold: fail the check when the queue holds more messages
    # than --queue-length allows (only possible if 'messages' was requested).
    if length is not None and 'messages' in metrics:
        if int(metrics['messages']) > length:
            print('status err Message queue %s at %d and above threshold of %d' % (
                queue, int(metrics['messages']), length))
            sys.exit(1)

    print('status ok metrics successfully retrieved')
    print_metrics(action, metrics)
183 |
--------------------------------------------------------------------------------
/redis_slave_count.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Name: redis_slave_count.sh
4 | # Description: Custom plugin that returns number of slaves connected to redis.
5 |
6 | # Copyright 2014 Zachary Deptawa
7 | #
8 | # Licensed under the Apache License, Version 2.0 (the "License");
9 | # you may not use this file except in compliance with the License.
10 | # You may obtain a copy of the License at
11 | #
12 | # http://www.apache.org/licenses/LICENSE-2.0
13 | #
14 | # Unless required by applicable law or agreed to in writing, software
15 | # distributed under the License is distributed on an "AS IS" BASIS,
16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | # See the License for the specific language governing permissions and
18 | # limitations under the License.
19 |
20 | # SYNOPSIS: ./redis_slave_count.sh [host] [port] [password]...
21 | # USAGE EXAMPLE: ./redis_slave_count.sh 127.0.0.1 6379 abcdef12
22 | #
23 | # Note: If no host/port/password given, it will default to host 0.0.0.0, port 6379, no pass.
24 |
25 | # What the plugin does:
26 | # - Looks for 'connected_slaves' line in output of `redis-cli INFO` command and
27 | # returns that value as a metric.
28 | # - Returns non-zero and 'status error' if 'connected_slaves' not found.
29 |
30 | # Rackspace Cloud Monitoring Plugin Usage:
31 | # - Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins.
32 | # - Create a check that calls this plugin.
33 | # - Pass 'args' as 'host port' or 'host port pass' if needed.
34 | # NOTE: If you are unable to create the check, make sure you're
35 | # passing the args as an array!
36 | # - Create an alarm based on the criteria you're looking for.
37 | #
38 | # The following is an example 'criteria' for a Rackspace Cloud Monitoring Alarm:
39 | #
40 | # if (metric['connected_slaves'] == 0) {
41 | # return new AlarmStatus(CRITICAL, 'No slaves connected.');
42 | # }
43 | #
44 | # return new AlarmStatus(OK, 'Slaves are connected.');
45 |
46 |
# Host, port and password come from the arguments; an unset or empty host
# defaults to 0.0.0.0, the port to 6379 and the password to none.
HOST=${1:-0.0.0.0}
PORT=${2:-6379}
PASS=${3:-}

# Query redis once; -a is only passed when a password was given.
if [ -n "$PASS" ]; then
    INFO=$(redis-cli -h "$HOST" -p "$PORT" -a "$PASS" INFO)
else
    INFO=$(redis-cli -h "$HOST" -p "$PORT" INFO)
fi

# Take the value after the colon of the connected_slaves line. Carriage
# returns are stripped so a CRLF-terminated reply cannot leak a trailing
# "\r" into the metric value.
SLAVE_COUNT=$(printf '%s\n' "$INFO" | tr -d '\r' | grep connected_slaves | awk -F':' '{print $2}')

# If $SLAVE_COUNT, return metrics. Else fail.
if [ -n "$SLAVE_COUNT" ]; then
    echo "metric connected_slaves int $SLAVE_COUNT"
else
    echo "status error - unable to pull stats from redis INFO"
    exit 1
fi
81 |
--------------------------------------------------------------------------------
/solrmon.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | import socket
4 | import httplib
5 | import xml.etree.ElementTree
6 |
7 | from pprint import pprint
8 |
def solr_ok(uri="localhost:8983"):
    """
    Return True when GET /solr/admin/ping on `uri` answers with HTTP 200.

    uri -- "host:port" of the Solr instance

    Connection problems and malformed HTTP responses count as "not ok" and
    yield False.
    """
    c = httplib.HTTPConnection(uri)
    try:
        c.request("GET", "/solr/admin/ping")
        r = c.getresponse()
        return r.status == 200
    except (socket.error, httplib.HTTPException):
        return False
    finally:
        # Always release the socket, whatever the outcome.
        c.close()
20 |
21 |
def _collect_named_value(stats, uri, path, name):
    """Store the text of the first element with name=`name` from GET `path` in stats."""
    c = httplib.HTTPConnection(uri)
    try:
        c.request("GET", path)
        r = c.getresponse()
        if r.status != 200:
            return
        xmldoc = xml.etree.ElementTree.fromstring(r.read())
    except (socket.error, httplib.HTTPException,
            xml.etree.ElementTree.ParseError):
        # Best effort: an unreachable or malformed endpoint just yields no stat.
        return
    finally:
        c.close()

    elements = xmldoc.findall(".//*[@name='%s']" % name)
    if elements:
        stats[name] = elements[0].text


def solrstats(uri="localhost:8983"):
    """
    Collect basic statistics from the Solr admin handlers on `uri`.

    uri -- "host:port" of the Solr instance

    Returns a dict that may contain 'upTimeMS' (from /solr/admin/system) and
    'numDocs' (from /solr/admin/luke); a value that cannot be fetched or
    parsed is left out, so the dict may be empty.
    """
    solr_stats = {}
    _collect_named_value(solr_stats, uri, "/solr/admin/system", "upTimeMS")
    _collect_named_value(solr_stats, uri, "/solr/admin/luke", "numDocs")
    return solr_stats
53 |
54 |
if __name__ == '__main__':
    # Status line first: does the ping handler of the default Solr answer?
    # Single-argument print() calls behave the same on Python 2 and 3.
    if solr_ok():
        print("status OK solr responded to solr.PingRequestHandler query")
    else:
        print("status Critical solr failed to respond, or reported an error")

    # Then one int64 metric per statistic that could be collected.
    solr_stats = solrstats()
    for stat, value in solr_stats.items():
        print('metric %s int64 %s' % (stat, value))
64 |
65 |
--------------------------------------------------------------------------------
/ssl_cert_expiration.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Rackspace Cloud Monitoring plugin to check if a ssl cert is expired.
4 |
5 | # Example:
6 | # $ ./ssl_cert_expiration.sh
7 |
8 | # Example Alarm Criteria:
9 | # if (metric['cert_end_in'] <= 0) {
10 | # return new AlarmStatus(CRITICAL, 'Certificate has expired on host')
11 | # }
12 | # if (metric['cert_end_in'] < 604800) {
13 | # return new AlarmStatus(WARNING, 'Certificate expires in less than 1 week');
14 | # }
15 | # return new AlarmStatus(OK, 'Certificate valid for more than 1 week');
16 |
17 | # Copyright 2015 Rackspace
18 |
19 | # Licensed under the Apache License, Version 2.0 (the "License");
20 | # you may not use this file except in compliance with the License.
21 | # You may obtain a copy of the License at
22 |
23 | # http://www.apache.org/licenses/LICENSE-2.0
24 |
25 | # Unless required by applicable law or agreed to in writing, software
26 | # distributed under the License is distributed on an "AS IS" BASIS,
27 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28 | # See the License for the specific language governing permissions and
29 | # limitations under the License.
30 |
# Exactly two arguments are required: the host and the port to connect to.
if [ $# -ne 2 ]; then
    echo "Usage: $0 <host> <port>"
    exit 100
fi

HOST=$1
PORT=$2

# Fetch the server certificate and keep only the date of its notAfter field.
EXPIRATION_DATE=$(echo "" | openssl s_client -connect "$HOST:$PORT" 2>/dev/null \
    | openssl x509 -noout -enddate 2>/dev/null | sed 's/^not[^=]*=//')

# Without a certificate there is nothing to measure; reporting "ok" with a
# value computed from an empty date would be misleading.
if [ -z "$EXPIRATION_DATE" ]; then
    echo "status err unable to retrieve certificate from ${HOST}:${PORT}"
    exit 1
fi

EXPIRATION_EPOCH=$(date -u -d "$EXPIRATION_DATE" +%s 2>/dev/null)
if [ -z "$EXPIRATION_EPOCH" ]; then
    echo "status err unable to parse certificate end date: ${EXPIRATION_DATE}"
    exit 1
fi

REMAINING_SECONDS=$(( EXPIRATION_EPOCH - $(date +%s) ))

echo "status ok"
echo "metric cert_end_in int ${REMAINING_SECONDS}"
45 |
--------------------------------------------------------------------------------
/ssl_protocols_check.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Description: Agent plugin which detects supported SSL / TLS protocol versions.
4 | # Author: Tomaz Muraus
5 | # License: MIT
6 | #
7 | # Permission is hereby granted, free of charge, to any person obtaining a copy
8 | # of this software and associated documentation files (the "Software"), to deal
9 | # in the Software without restriction, including without limitation the rights
10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | # copies of the Software, and to permit persons to whom the Software is
12 | # furnished to do so, subject to the following conditions:
13 | #
14 | # The above copyright notice and this permission notice shall be included in
15 | # all copies or substantial portions of the Software.
16 | #
17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 | # THE SOFTWARE.
24 |
if [ $# -ne 1 ] && [ $# -ne 2 ]; then
    echo "Usage: $0 <ip> [port]"
    exit 100
fi

IP=$1

# The port is optional and defaults to 443.
if [ $# -eq 2 ]; then
    PORT=$2
else
    PORT=443
fi

SUPPORTED_PROTOCOLS=()

# Probe one protocol version and print its metric.
#   $1 - openssl s_client option without the dash (e.g. tls1_2)
#   $2 - metric name (e.g. tls_1_2)
#   $3 - optional note written to stderr when the local openssl does not
#        know the option
check_protocol() {
    local flag=$1
    local metric=$2
    local note=$3
    local output

    output=$(openssl s_client "-${flag}" -connect "${IP}:${PORT}" < /dev/null 2>&1)

    if grep -q "DONE" <<< "${output}"; then
        SUPPORTED_PROTOCOLS[${#SUPPORTED_PROTOCOLS[@]}]="${metric}"
        echo "metric ${metric} string yes"
    elif grep -q "wrong version number" <<< "${output}"; then
        echo "metric ${metric} string no"
    elif grep -q "unknown option" <<< "${output}"; then
        if [ -n "${note}" ]; then
            echo "${note}" >&2
        fi
        echo "metric ${metric} string unknown"
    fi
}

check_protocol ssl2 ssl_2_0 "openssl doesn't support SSL v2.0, probably using openssl >= 1.0.0"
check_protocol ssl3 ssl_3_0 ""
check_protocol tls1 tls_1_0 ""
check_protocol tls1_1 tls_1_1 "openssl doesn't support TLS v1.1, probably using openssl < 1.0.0"
check_protocol tls1_2 tls_1_2 "openssl doesn't support TLS v1.2, probably using openssl < 1.0.0"

# Join the supported protocol names with commas for the status line.
SUPPORTED_LIST=$(IFS=','; echo "${SUPPORTED_PROTOCOLS[*]}")
echo "status Supported protocols: ${SUPPORTED_LIST}"
100 |
--------------------------------------------------------------------------------
/statsd_metric_emitter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Copyright 2015 Rackspace
4 |
5 | Licensed under the Apache License, Version 2.0 (the "License");
6 | you may not use this file except in compliance with the License.
7 | You may obtain a copy of the License at
8 |
9 | http://www.apache.org/licenses/LICENSE-2.0
10 |
11 | Unless required by applicable law or agreed to in writing, software
12 | distributed under the License is distributed on an "AS IS" BASIS,
13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | See the License for the specific language governing permissions and
15 | limitations under the License.
16 | ----
17 |
18 | Rackspace cloud monitoring plugin for statsd metrics.
19 |
20 | Requires a directory path to watch and a list of metrics to filter out and return data about.
21 |
22 | E.g.
23 | python statsd_metric_emitter.py /foo/bar metric1 metric2 ... metricn
24 | """
25 |
26 | import os.path
27 | import sys
28 | import glob
29 | import json
30 |
# Output lines (status and metrics) collected for the agent.
ck_metrics = []
# Names of the metrics the caller asked for on the command line.
filtered_metrics = []

def output_check_status(status, message):
    """
    Record the check status line; on "err" print it and stop immediately.

    status  -- status word, e.g. "ok" or "err"
    message -- free text appended to the status line

    Raises SystemExit when status is "err".
    """
    ck_metrics.append("status %s %s" % (status, message))

    # Compare by value: identity of string literals is an implementation
    # detail.
    if status == "err":
        print("status %s %s" % (status, message))
        sys.exit()
40 |
def output_metrics(metrics):
    """
    Collect the requested metrics of one parsed flush for the agent.

    metrics -- dict decoded from one JSON line; its "counters", "timers" and
               "gauges" entries are inspected when present

    Names starting with "statsd." and names not listed in filtered_metrics
    are ignored. A dict value produces one "<name>.<key>" float metric per
    entry; a plain value produces a single "<name>" float metric. Values
    that cannot be converted to a float are skipped.
    """
    for metric_type in ("counters", "timers", "gauges"):
        metric = metrics.get(metric_type)
        if metric is None:
            continue
        # .items() works on both Python 2 and Python 3.
        for name, val in metric.items():
            if name.startswith('statsd.') or name not in filtered_metrics:
                continue
            if isinstance(val, dict):
                samples = [(name + '.' + k, v) for k, v in val.items()]
            else:
                samples = [(name, val)]
            for metric_name, v in samples:
                try:
                    number = float(v)
                except (TypeError, ValueError):
                    continue
                ck_metrics.append(
                    "metric %s %s %f" % (metric_name, 'float', number))
55 |
def parse_file(file_path, offset=0):
    """
    Opens a metrics file from statsd and parses its json.

    file_path -- path of the flush file, one JSON document per line
    offset    -- byte position to start reading from

    Returns the offset of what we last read so we can seek
    directly to it next time.
    """
    # Binary mode keeps seek()/tell() in plain byte offsets.
    with open(file_path, 'rb') as fd:
        fd.seek(offset)
        data = fd.read()
        # Capture the position while the file is still open.
        end_offset = fd.tell()

    # Decode before splitting: on Python 3 bytes cannot be split on a str
    # separator.
    for line in data.decode('utf-8').split("\n"):
        if line:
            output_metrics(json.loads(line))

    return end_offset
71 |
def find_latest_flush(files):
    """
    Picks the newest flush file and deletes all the others.

    files -- iterable of flush file paths; the path that sorts last is
             treated as the latest one

    Returns the latest path, or None when files is empty. Every other path
    passed in is removed from disk.
    """
    ordered = sorted(files)
    # Truthiness test instead of "len(s) is 0": identity comparison on
    # integers is an implementation detail and warns on newer interpreters.
    if not ordered:
        return None
    latest = ordered.pop()
    for stale in ordered:
        os.remove(stale)
    return latest
80 |
def main():
    """
    Command-line entry point.

    Expects the watch directory as first argument followed by at least one
    metric name. Reports on the newest flush file found in that directory
    and prints the collected status and metric lines.
    """
    argc = len(sys.argv)
    if argc < 2:
        print("status err: 500 Expected a watch directory as argument (quitting)")
        sys.exit(1)
    if argc < 3:
        print("status err: 500 At least one metric name is required for filtering (quitting)")
        sys.exit(2)

    watch_dir = sys.argv[1]
    # Everything after the directory is a metric name to report.
    filtered_metrics.extend(sys.argv[2:])

    pattern = os.path.join(watch_dir, '[0-9]*.json')
    latest = find_latest_flush(glob.glob(pattern))
    if latest is not None:
        parse_file(latest)
        output_check_status('ok', '200 OK')
    else:
        output_check_status('err', '204 NO CONTENT')
    print('\n'.join(ck_metrics))


if __name__ == "__main__":
    main()
102 |
103 |
--------------------------------------------------------------------------------
/systemctl_status.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Copyright (c) 2018 Shane F. Carr
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | #
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | #
23 | # # # # # # # #
24 | #
25 | # This check returns information on the health of systemctl services.
26 | # For more information on systemctl status strings, see:
27 | # https://www.freedesktop.org/software/systemd/man/systemctl.html#is-system-running
28 | #
29 | # Suggested alarm:
30 | #
31 | # if (metric['systemctl_status'] != "running" && metric['systemctl_status'] != "starting") {
32 | # return new AlarmStatus(CRITICAL, 'SystemCTL status is #{systemctl_status}! Details: #{systemctl_failed_units}');
33 | # }
34 | # return new AlarmStatus(OK, 'SystemCTL status is #{systemctl_status}');
35 |
# Overall state string printed by systemctl.
STATE=$(systemctl is-system-running)
# A metric line needs a value; fall back to "unknown" when nothing was printed
# (for example when systemctl is missing or failed before reporting a state).
STATE=${STATE:-unknown}
# Failed units, flattened onto one line so they fit into a single metric value.
DETAILS=$(systemctl list-units --state=failed --no-legend --no-pager | tr '\n' ' ')

echo "status ok succeeded in obtaining metrics"
echo "metric systemctl_status string $STATE"
if [ -z "$DETAILS" ]; then
  echo "metric systemctl_failed_units string (no failed units)"
else
  echo "metric systemctl_failed_units string $DETAILS"
fi
46 |
--------------------------------------------------------------------------------
/ubuntu_updates_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
# Description: Custom plugin returns number of pending security and other
# updates on an Ubuntu-based system.
5 | # Author: Tomaz Muraus
6 | # License: MIT
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in
16 | # all copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 | # THE SOFTWARE.
25 |
# apt-check output is parsed below as "<other>;<security>"; stderr is merged
# into the captured output.
OUTPUT=$(/usr/lib/update-notifier/apt-check 2>&1)

if [ $? -ne 0 ]; then
  # Report the failure as a status line so the agent sees an error state.
  echo "status err Failed to retrieve a number of pending updates"
  exit 100
fi

PENDING_OTHER=$(echo "${OUTPUT}" | cut -d ";" -f 1)
PENDING_SECURITY=$(echo "${OUTPUT}" | cut -d ";" -f 2)
REBOOT_REQUIRED="no"

if [ -f "/var/run/reboot-required" ]; then
  REBOOT_REQUIRED="yes"
fi

if [ $((PENDING_OTHER+PENDING_SECURITY)) -gt 0 ]; then
  # Comma-separated package names (the part before the "/" on each line).
  UPGRADABLE_PACKAGES=$(apt list --upgradable 2>/dev/null | grep -v Listing | awk -F'/' '{print $1}' | paste -sd ',' -)
fi
# Never emit a metric without a value, even if the listing came back empty.
UPGRADABLE_PACKAGES=${UPGRADABLE_PACKAGES:-none}

echo "status Pending updates: security ${PENDING_SECURITY}, other: ${PENDING_OTHER}"

echo "metric pending_security uint32 ${PENDING_SECURITY}"
echo "metric pending_other uint32 ${PENDING_OTHER}"
echo "metric reboot_required string ${REBOOT_REQUIRED}"
echo "metric upgradable_packages string ${UPGRADABLE_PACKAGES}"

exit 0
--------------------------------------------------------------------------------
/uptime_reset_detector.sh:
--------------------------------------------------------------------------------
1 | #!/bin/env bash
2 |
3 | # uptime_reset_detector.sh v 0.1.0a
# This script uses /dev/shm (a volatile ramdisk) to detect reboots.
5 | # Only works on Linux.
6 | #
7 | # Rackspace Cloud Monitoring Plugin to detect uptime resets.
8 | #
9 | # Copyright (c) 2017, Brian King
10 | # All rights reserved.
11 | #
12 | #
13 | # Licensed under the Apache License, Version 2.0 (the "License");
14 | # you may not use this file except in compliance with the License.
15 | # You may obtain a copy of the License at
16 | #
17 | # http://www.apache.org/licenses/LICENSE-2.0
18 | #
19 | # Unless required by applicable law or agreed to in writing, software
20 | # distributed under the License is distributed on an "AS IS" BASIS,
21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22 | # See the License for the specific language governing permissions and
23 | # limitations under the License.
24 |
25 | # Example criteria:
26 | #
27 | # if (metric['uptime_reset_detected'] == 'true'){
28 | # return new AlarmStatus(CRITICAL, 'Uptime reset detected.');
29 | # }
30 | # return new AlarmStatus(OK, 'Server has not rebooted since the last time we checked.');
31 |
32 |
# Marker file kept on /dev/shm; its absence is what signals an uptime reset.
MARKER=/dev/shm/.lastreboot

if [ ! -e "$MARKER" ]; then

  echo "status critical uptime_reset_detected true"
  echo "metric uptime_reset_detected string true just_rebooted"

  # Only the presence of the marker is checked for now; the boot time stored
  # in it could let a future version report the delta between reboots.
  uptime -s > "$MARKER"

  exit 1

fi

# Marker present: the reset was already reported on an earlier run.
echo "status ok uptime_reset_detected false"
echo "metric uptime_reset_detected string false just_rebooted"

exit 0
--------------------------------------------------------------------------------
/varnish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # varnish.sh
4 | # Rackspace Cloud Monitoring Plugin to collect metrics from varnishstat.
5 | #
6 | # Copyright (c) 2013, Rob Szumski
7 | # All rights reserved.
8 | #
9 | # Redistribution and use in source and binary forms, with or without
10 | # modification, are permitted provided that the following conditions are met:
11 | #
12 | # Redistributions of source code must retain the above copyright notice,
13 | # this list of conditions and the following disclaimer.
14 | #
15 | # Redistributions in binary form must reproduce the above copyright
16 | # notice, this list of conditions and the following disclaimer in the
17 | # documentation and/or other materials provided with the distribution.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 | # POSSIBILITY OF SUCH DAMAGE.
30 | #
31 | # See https://github.com/robszumski/rackspace-monitoring-varnish for a readme
32 | # and more information
33 | #
# this plugin can optionally print particular statistics, just pass them as args
35 | # varnish.sh cache_hit,cache_hitpass,cache_miss
36 | #
37 | #
38 | # Example Criteria
39 | # if (metric['healthy_backends'] < 1) {
40 | # return new AlarmStatus(CRITICAL, 'Varnish doesnt have any backends!');
41 | #}
42 | #
43 | #if (metric['healthy_backends'] < 2) {
44 | # return new AlarmStatus(WARNING, 'Varnish only has #{healthy_backends} healthy backend.');
45 | #}
46 | #
47 | # NOTE: if you are running Varnish < 4 comment out healthy backends metrics (they don't work)
48 | #
49 |
# Last line of the example criteria; it must stay commented out because bash
# cannot parse it and would abort the script here:
# return new AlarmStatus(OK, 'Varnish has #{healthy_backends} backends.');

# check if service is running
SERVICE=varnish
VARNISHSTAT=/usr/bin/varnishstat
VARNISHADM=/usr/bin/varnishadm

# The exit status of "pgrep | wc -l" is that of wc (always success), so the
# decision has to be made on the count itself.
P=$(pgrep "$SERVICE" | wc -l)
if [ "$P" -gt 0 ]
then
  echo "status $SERVICE is running ($P instances)"
else
  echo "status $SERVICE is not running"
fi

# output number of processes
echo "metric processes int32 $P"

# calculate hit percent
hits=$($VARNISHSTAT -1 -f cache_hit | awk '{print $2}')
connections=$($VARNISHSTAT -1 -f client_req | awk '{print $2}')
# Guard the division: with no requests yet (or no varnishstat output) bc would
# fail and the metric would be emitted without a value.
if [ "${connections:-0}" -gt 0 ] 2>/dev/null; then
  hit_percent=$(echo "scale=8;(${hits:-0}/$connections)" | bc | awk '{printf "%f", $1*100}')
else
  hit_percent="0.000000"
fi
echo "metric hit_percent double $hit_percent"

# calculate # of healthy backends
healthy=$($VARNISHADM backend.list | grep -c "Healthy")
echo "metric healthy_backends int32 $healthy"

# Optional extra statistics, passed as the first argument.
# An if block (rather than "[ ... ] && ...") keeps the exit status at 0 when
# no argument was given.
if [ -n "$1" ]; then
  $VARNISHSTAT -1 -f "$1" | awk ' { print "metric " $1 " gauge " $2 } '
fi
78 |
--------------------------------------------------------------------------------
/varnish4.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # varnish4.sh
4 | # Rackspace Cloud Monitoring Plugin to collect metrics from varnishstat.
5 | #
6 | # Copyright (c) 2013, Rob Szumski
7 | # All rights reserved.
8 | #
9 | # Redistribution and use in source and binary forms, with or without
10 | # modification, are permitted provided that the following conditions are met:
11 | #
12 | # Redistributions of source code must retain the above copyright notice,
13 | # this list of conditions and the following disclaimer.
14 | #
15 | # Redistributions in binary form must reproduce the above copyright
16 | # notice, this list of conditions and the following disclaimer in the
17 | # documentation and/or other materials provided with the distribution.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 | # POSSIBILITY OF SUCH DAMAGE.
30 |
31 | # check if service is running
SERVICE=varnish
VARNISHSTAT=/usr/bin/varnishstat
VARNISHADM=/usr/bin/varnishadm
HITSNAME=MAIN.cache_hit
MISSESNAME=MAIN.cache_miss

# The exit status of "pgrep | wc -l" is that of wc (always success), so the
# decision has to be made on the count itself.
P=$(pgrep "$SERVICE" | wc -l)
if [ "$P" -gt 0 ]
then
  echo "status success"
else
  echo "status down"
fi

# output number of processes
echo "metric processes int32 $P"

# calculate hit rate
# cache_hit/(cache_hit + cache_miss)
hits=$($VARNISHSTAT -1 -f $HITSNAME | awk '{print $2}')
misses=$($VARNISHSTAT -1 -f $MISSESNAME | awk '{print $2}')
hits=${hits:-0}
misses=${misses:-0}
# Guard the division: with no lookups yet (or no varnishstat output) bc would
# fail and the metric would be emitted without a value.
if [ "$((hits + misses))" -gt 0 ] 2>/dev/null; then
  hit_rate=$(echo "scale=8;($hits/($hits + $misses))" | bc | awk '{printf "%f", $1*100}')
else
  hit_rate="0.000000"
fi
echo "metric hit_rate double $hit_rate"

# calculate # of healthy backends
healthy=$($VARNISHADM backend.list | grep -c "Healthy")
echo "metric healthy_backends int32 $healthy"

# Optional extra statistics, passed as the first argument.
# An if block (rather than "[ ... ] && ...") keeps the exit status at 0 when
# no argument was given.
if [ -n "$1" ]; then
  $VARNISHSTAT -1 -f "$1" | awk ' { print "metric " $1 " gauge " $2 } '
fi
60 |
--------------------------------------------------------------------------------
/windows/get-counters.ps1:
--------------------------------------------------------------------------------
1 | <#
2 | Rackspace Cloud Monitoring Plug-In
3 | This is a plugin to gather Windows performance counters for use
4 | in Rackspace Monitoring checks.
5 |
6 | (c) 2018 Rackspace US, Inc
7 |
8 | All Rights Reserved.
9 | Licensed under the Apache License, Version 2.0 (the "License"); you may
10 | not use this file except in compliance with the License.
11 | You may obtain a copy of the License at
12 |
13 | http://www.apache.org/licenses/LICENSE-2.0
14 |
15 | Unless required by applicable law or agreed to in writing, software
16 | distributed under the License is distributed on an "AS IS" BASIS,
17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | See the License for the specific language governing permissions and
19 | limitations under the License.
20 |
21 | Usage:
22 | Place plug-in in C:\Program Files\Rackspace Monitoring\plugins
23 |
24 | It accepts a single argument which is the CounterPath as used in perfmon.
25 | For example, this will gather all counters for the logical disk C:
26 |
27 | '\LogicalDisk(c:)\*'
28 |
29 | This plugin returns a metric for each counter gathered where the metric
30 | name is normalized into
31 |
32 | $object.$instance.$counter
33 |
34 | For example:
35 |
36 | logicaldisk.c.pct_free_space
37 | logicaldisk.c.free_megabytes
38 | logicaldisk.c.current_disk_queue_length
39 | logicaldisk.c.pct_disk_time
40 | logicaldisk.c.avg_disk_queue_length
41 | logicaldisk.c.pct_disk_read_time
42 | logicaldisk.c.avg_disk_read_queue_length
43 |
44 | #>
45 |
# Emits one "metric <name> double <value>" line per sample found under
# $CounterPath, followed by a status line.
function CM-GetCounters($CounterPath) {
    $results = Get-Counter -Counter $CounterPath
    # No samples means the path could not be read; report that instead of a
    # misleading "ok" with zero metrics.
    if (-not $results -or -not $results.CounterSamples) {
        Write-Output "status err No counter samples returned for: $CounterPath"
        return
    }
    $results.CounterSamples | ForEach-Object {
        $path = $_.Path
        $val = $_.CookedValue
        # Normalise the counter path into a metric name: drop the leading
        # \\machine\ prefix, spell % as pct, turn \ and ( into dots, / into
        # " per ", remove ) and :, collapse whitespace to _, lowercase, and
        # replace any remaining character outside a-z 0-9 : . with _.
        $metric = ($path -replace '\\\\.*?\\','' -replace '%','pct' -replace '\\','.' -replace '/',' per ' -replace '\(','.' -replace '[):]','' -replace '\.\s+','_' -replace '\s+','_').ToLower() -replace '[^a-z0-9:\.]','_'
        Write-Output "metric $metric double $val"
    }
    Write-Output "status ok success"
}

# The counter path is the only (required) argument.
if($args.Count -lt 1) {
    Write-Output "status err Missing required parameter: CounterPath"
    exit
}

CM-GetCounters $args[0]
--------------------------------------------------------------------------------
/windows/ping.ps1:
--------------------------------------------------------------------------------
1 | <#
2 |
3 | Rackspace Cloud Monitoring Plug-In
4 |
5 | This is a plugin to monitor ICMP response times of hosts accessible by the server
6 |
7 | ----------------------------------------------------------------------------
8 | "THE BEER-WARE LICENSE" (Revision 42):
9 | wrote this file. As long as you retain this notice you
10 | can do whatever you want with this stuff. If we meet some day, and you think
11 | this stuff is worth it, you can buy me a beer in return.
12 | ----------------------------------------------------------------------------
13 |
14 | Usage:
15 | - Place plug-in in folder C:\Program Files\Rackspace Monitoring\plugins
16 | - Configure Custom Plugin type check in Rackspace Intelligence
17 | Specify only the script's name and the hostname/IP to ping, e.g.:
18 | ping.ps1 192.168.0.1
Count is the number of ICMP probes sent in a single check, and interval is the
20 | number of seconds between them. They are both optional. Their default values
21 | are 5 pings with an interval of 2 seconds.
22 | - Configure an Alert (optional, see example below).
23 |
24 | This plugin returns 4 metrics:
25 | - minimum, average, maximum: statistics returned by the Windows ping utility
26 | in the format "Minimum = 0ms, Maximum = 17ms, Average = 4ms
27 | - lost_packets: the percentage of the packets lost out of the number of probes
28 | sent
29 |
30 | Example alert:
31 |
32 | --- start copying after this line ---
33 |
34 | if (metric['average'] >= 30 ) {
35 | return new AlarmStatus(WARNING, 'Average round-trip took #{average}ms');
36 | }
37 | if (metric['lost_packets'] >= 40) {
38 | return new AlarmStatus(WARNING, 'Packet loss was #{lost_packets}%');
39 | }
40 | if (metric['legacy_state'] != "ok") {
41 | return new AlarmStatus(CRITICAL, 'Error: #{legacy_state}');
42 | }
43 | return new AlarmStatus(OK, 'All good');
44 |
45 | --- stop copying before this line ---
46 |
47 | #>
48 |
# Sends $count single-probe pings to $TargetHost and reports minimum, average
# and maximum round-trip time plus the percentage of lost probes.
#   $count    - number of probes (default 5)
#   $interval - seconds to wait after each successful probe (default 2)
function CM-Ping($TargetHost, $count, $interval) {
    $lost_packets = 0
    if (-not $count) { $count = 5 }
    if (-not $interval ) { $interval = 2 }
    # Initialise each list on its own so every one is a typed array and +=
    # appends a sample instead of adding numbers together.
    [int[]] $ping_min = @()
    [int[]] $ping_max = @()
    [int[]] $ping_avg = @()
    for ($i = 0; $i -lt $count; $i++) {
        # Call ping directly rather than through iex on a built string, so the
        # host argument is never re-parsed as PowerShell code.
        $lines = @(ping -n 1 -w 30 $TargetHost | Select-String "loss|average")
        # Both the loss line and the statistics line are needed to record a sample.
        if ((0 -eq $LASTEXITCODE) -and ($lines.Count -ge 2)) {
            $stats_loss = $lines[0]
            $stats_ping = $lines[1]
            if ([int]"$stats_loss".split("(")[1].split("%")[0] -gt 0) {
                $lost_packets++
            }
            # Statistics line looks like "Minimum = 0ms, Maximum = 17ms, Average = 4ms".
            $result_ping = Foreach ($metric in "$stats_ping".split(",")) {
                $metric.Replace(" Minimum = ", "").Replace(" Maximum = ", "").Replace(" Average = ", "").Replace("ms", "")
            }
            $ping_min += [int]$result_ping[0]
            $ping_max += [int]$result_ping[1]
            $ping_avg += [int]$result_ping[2]
            sleep $interval
        }
        else {
            $lost_packets++
        }
    }
    # Only report metrics when at least one probe produced statistics;
    # otherwise the metric lines would carry no value.
    if ($ping_min.Count -gt 0) {
        Write-Output "metric minimum int32 $(($ping_min | measure -Minimum).Minimum) milliseconds"
        Write-Output "metric average double $(($ping_avg | measure -Average).Average) milliseconds"
        Write-Output "metric maximum int32 $(($ping_max | measure -Maximum).Maximum) milliseconds"
        Write-Output "metric lost_packets int32 $([int](([int]$lost_packets / [int]$count) * 100)) percent"
        Write-Output "status ok"
    }
    else {
        Write-Output "status err $TargetHost could not be reached"
    }
}

# The target host is required; count and interval are optional.
if($args.Count -lt 1) {
    Write-Output "status err Missing required parameter"
    exit
}

CM-Ping -TargetHost $args[0] $args[1] $args[2]
92 |
--------------------------------------------------------------------------------
/windows/service_mon.ps1:
--------------------------------------------------------------------------------
1 | <#
2 |
3 | Script to return status of a Windows service.
4 |
5 | Teddy Schmitz
6 | All Rights Reserved.
7 |
8 | Licensed under the Apache License, Version 2.0 (the "License"); you may
9 | not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 |
12 | http://www.apache.org/licenses/LICENSE-2.0
13 |
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 |
20 | Configuration:
21 |
22 | You must supply the name of the service in the arguments section of the agent config.
For example, to monitor the Plug and Play service, your check config should look like this:
24 |
25 | "details": {
26 | "args": [
27 | "Plug and Play"
28 | ],
29 | "file": "service_mon.ps1"
30 | }
31 |
32 |
33 | Example alarm criteria:
34 |
35 | if (metric['service_status'] != 'running') {
36 | return new AlarmStatus(CRITICAL, 'Service is NOT running.');
37 | }
38 |
39 |
40 | #>
41 |
42 |
# Reports whether the given Windows service is running.
#   $ServiceName - service name or display name (e.g. "Plug and Play")
# Emits "metric service_status string running|notrunning" followed by a
# status line, or "status err ..." and exits when the service is not found.
function FuncCheckService{
    param($ServiceName)
    # Look the service up by name first, then fall back to the display name
    # so values such as "Plug and Play" are found as well.
    $arrService = Get-Service -Name $ServiceName -ErrorAction SilentlyContinue
    if (-not $arrService) {
        $arrService = Get-Service -DisplayName $ServiceName -ErrorAction SilentlyContinue
    }
    # A lookup may match several services; report on the first one.
    $arrService = $arrService | Select-Object -First 1
    if (-not $arrService) {
        Write-Output "status err $ServiceName service not found"
        exit
    }
    if ($arrService.Status -eq "Running") {
        Write-Output "metric service_status string running"
    }
    else {
        Write-Output "metric service_status string notrunning"
    }
    Write-Output "status ok found service"
}

# The service name is the only (required) argument.
if($args.Count -lt 1){
    Write-Output "status err no service specified"
    exit
}
FuncCheckService -ServiceName $args[0]
--------------------------------------------------------------------------------
/yum_updates_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Description: Custom plugin returns number of pending updates on a
4 | # yum-based system.
5 | # Author: Andrew Regner
6 | # License: MIT
7 | #
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in
16 | # all copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 | # THE SOFTWARE.
25 |
# Capture the output first so the exit status of yum can be inspected; in a
# pipeline only the status of awk would be visible.
OUTPUT=$(yum check-update)
RC=$?

# yum check-update exits 0 when nothing is pending and 100 when updates are
# available; any other code is treated as a failure.
if [ $RC -ne 0 ] && [ $RC -ne 100 ]; then
  echo "status err yum check-update failed with exit code $RC"
  exit $RC
fi

echo "$OUTPUT" | awk '
# Lines below this header would otherwise match the pattern and be counted
# as pending updates too; stop reading and jump to END.
/^Obsoleting Packages/ { exit }

# Package lines have a version (starting with a digit) in column 2 and the
# repository in column 3.
$2 ~ /^[0-9]/ {
  count[$3] += 1;
  total += 1;
}

END {
  if(total > 0)
    printf("status pending updates: %d\n", total);
  else
    printf("status no updates\n");

  printf("metric total_updates uint32 %d\n", total);
  for(repo in count)
    printf("metric pending_%s uint32 %d\n", repo, count[repo]);
}'
42 |
--------------------------------------------------------------------------------