├── .gitmodules ├── README.md ├── activemq-status.sh ├── api_limit_ram_usage.py ├── cass_compaction.py ├── cassandra_pending_tpstats.py ├── centos_reboot_check.sh ├── check-mount.sh ├── check-mtime.sh ├── check_nodering.py ├── check_openmanage.sh ├── chef_node_checkin.py ├── cloud_queues.py ├── cloudbackup_mon.sh ├── clouddatabases_volume.py ├── cloudload_balancer.py ├── cman_nodes.rb ├── consul.py ├── content_check.py ├── curl.sh ├── curl_check.sh ├── dir_monitor.py ├── dir_stats.sh ├── directory.sh ├── dns_resolution.sh ├── docker_check.py ├── docker_stats_check.py ├── elasticsearch.py ├── etcd.py ├── examples └── example.rb ├── file_info.py ├── galera.py ├── hadoop_hbase.py ├── hadoop_hdfs.py ├── hadoop_jobtracker.py ├── haproxy.rb ├── holland_mysqldump.py ├── jmx-gather.sh ├── latest_alarm_state.py ├── long_process.sh ├── lsyncd-status.sh ├── megaraid.sh ├── memcached_stats.py ├── mongodb_stats.py ├── murmur_monitor.py ├── mysql_ping.py ├── mysql_replication.py ├── nfs-status.sh ├── nginx_status_check.py ├── ntp_offset.sh ├── onmetal_v1_smart.py ├── open_files.py ├── pg_check.py ├── php-fpm_status_check.sh ├── ping.sh ├── port_check.py ├── port_check.sh ├── process_mon.sh ├── rabbitmq.py ├── redis_slave_count.sh ├── solrmon.py ├── ssl_cert_expiration.sh ├── ssl_protocols_check.sh ├── statsd_metric_emitter.py ├── systemctl_status.sh ├── ubuntu_updates_check.sh ├── uptime_reset_detector.sh ├── varnish.sh ├── varnish4.sh ├── windows ├── get-counters.ps1 ├── ping.ps1 └── service_mon.ps1 └── yum_updates_check.sh /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "contrib/rpc-maas"] 2 | path = contrib/rpc-maas 3 | url = https://github.com/rcbops/rpc-maas.git 4 | [submodule "contrib/uptime-monitor"] 5 | path = contrib/uptime-monitor 6 | url = https://github.com/racker/uptime-monitor.git 7 | [submodule "contrib/conveyer"] 8 | path = contrib/conveyer 9 | url = https://github.com/sam-falvo/conveyer.git 10 | 
[submodule "contrib/rpc-openstack"] 11 | path = contrib/rpc-openstack 12 | url = https://github.com/rcbops/rpc-openstack.git 13 | [submodule "contrib/gardener"] 14 | path = contrib/gardener 15 | url = https://github.com/creationix/gardener.git 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rackspace Monitoring Agent Custom plugins 2 | 3 | This repository contains contributed custom plugins for the Rackspace Cloud 4 | Monitoring agent. For details about installing plugins, see [agent plugin check documentation](https://developer.rackspace.com/docs/rackspace-monitoring/v1/tech-ref-info/check-type-reference/#agent-plugin). 5 | 6 | ## How to Contribute 7 | 8 | You can contribute your plugins by first forking the repo, committing your changes, and then opening a pull request through GitHub. If you have any questions, feel free to reach out to us on #cloudmonitoring on freenode IRC. 9 | 10 | ## Plugin Requirements 11 | 12 | Each plugin must fulfill the following properties: 13 | 14 | * Output a status message to STDOUT 15 | * Output one or more metrics to STDOUT if it succeeds in obtaining them 16 | * Contain an appropriate license header 17 | * Contain example alarm criteria 18 | 19 | ## Status 20 | 21 | The status message should be of the form `status $status_string`. For example, it might be: 22 | 23 | status ok succeeded in obtaining metrics 24 | 25 | or 26 | 27 | status err failed to obtain metrics 28 | 29 | The status string should be a summary of the results, with actionable information if it fails. 30 | 31 | ## Metrics 32 | 33 | The metrics message should be of the form `metric $name $type $value [unit]`, for example: 34 | 35 | metric time int32 1 seconds 36 | 37 | The units are optional, and if present should be a string representing the units of the metric measurement. 
Units may not be provided on string metrics, and may not contain any spaces. 38 | 39 | The available types are: 40 | 41 | * string 42 | * float 43 | * double 44 | * int32 45 | * int64 46 | * uint32 47 | * uint64 48 | * gauge 49 | 50 | ## Alarm Criteria 51 | 52 | Each script should contain, just below the license header, in a comment, an example alarm criteria that can be used for the plugin. See the [Rackspace Cloud Monitoring Documentation](https://developer.rackspace.com/docs/rackspace-monitoring/v1/tech-ref-info/alert-triggers-and-alarms/) for how to write alarm criteria. 53 | 54 | ## Submodules 55 | 56 | Submodules of repositories are stored in the contrib folder in this repo. 57 | There are more plugins in that folder, some of these plugins have dependencies and their own readmes. 58 | 59 | The contrib directory contains submodules of more custom plugins that have been used by other teams, including those from openstack and rackspace. These are older plugins for Icehouse/Juno, 60 | newer plugins for Kilo can be found at [rcbops/rpc-openstack](https://github.com/rcbops/rpc-openstack) or inside contrib/rpc-openstack/maas/plugins. 61 | 62 | You can pull the submodules with 63 | ``` 64 | git pull --recurse-submodules 65 | git submodule update --recursive 66 | ``` 67 | 68 | 69 | ## License Header 70 | 71 | The exact content will depend on your chosen license, but we recommend BSD, Apache 2.0, or MIT Licenses. Regardless of license choice the header should contain the author's (or authors') name(s) and email address(es). 72 | -------------------------------------------------------------------------------- /activemq-status.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Description: Custom plugin which checks activemq status. 
4 | 5 | # Copyright 2013 Ted Neykov 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | 19 | # Looks for 'CurrentStatus' line in output of `activemq-admin query` command. 20 | # Query command can return multiple lines containing 'CurrentStatus'. 21 | # Returns non-zero and 'status error' if any of the lines are not 'CurrentStatus = Good'. 22 | 23 | # Look for the activemq-admin script in /opt 24 | amq_bin=$(find /opt/ -name 'activemq-admin' | egrep '/bin/activemq-admin') 25 | 26 | if [ -z $amq_bin ]; then 27 | echo "status error: Could not find activemq-admin." 28 | exit 1 29 | fi 30 | 31 | amq_query=`"$amq_bin" query` 32 | curr_status=`echo "$amq_query" | grep CurrentStatus` 33 | 34 | echo "$curr_status" | 35 | while read -r line; do 36 | line_status=`echo "$line" | awk '{print $3}'` 37 | if [ "$line_status" == 'Good' ]; then 38 | : 39 | else 40 | # Found non "Good" status or empty line 41 | exit 1 42 | fi 43 | done 44 | 45 | if [ $? 
-eq 0 ]; then 46 | echo "status good" 47 | else 48 | echo "status error" 49 | exit 1 50 | fi 51 | -------------------------------------------------------------------------------- /api_limit_ram_usage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | '''Rackspace Cloud Servers RAM usage monitor''' 3 | '''Pulls metrics from API for Total RAM allowed for Cloud Servers and Total RAM currently used (for API limits) and returns percent_ram_used metric to Cloud Monitoring API. Threshold percentage of when to set status to warning is set at command line argument along with username and api key''' 4 | # Copyright 2013 Rackspace 5 | 6 | # All Rights Reserved. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 9 | # not use this file except in compliance with the License. You may obtain 10 | # a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 16 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 17 | # License for the specific language governing permissions and limitations 18 | # under the License. 
19 | import sys 20 | import pyrax 21 | import argparse 22 | 23 | 24 | def main(): 25 | '''Script execution''' 26 | parser = argparse.ArgumentParser(description='get percent of api limit ' 27 | 'of ram used') 28 | parser.add_argument('-u', '--username', help='Rackspace Username', 29 | required=True) 30 | parser.add_argument('-a', '--apikey', help='Rackspace API Key', 31 | required=True) 32 | parser.add_argument('-m', '--maxthreshold', 33 | help='API Percent Used Threshold, integer between ' 34 | '1-99', 35 | required=True) 36 | parser.add_argument('-r', '--region', help='Rackspace Regional Datacenter', 37 | required=True) 38 | parser.add_argument('--human', 39 | help='Format output for humans, not Cloud Monitoring', 40 | action='store_true') 41 | args = parser.parse_args() 42 | 43 | if int(args.maxthreshold) < 1 or int(args.maxthreshold) > 99: 44 | print "You must enter a valid integer from 1-99 for maxthreshold" 45 | sys.exit(2) 46 | pyrax.set_setting("identity_type", "rackspace") 47 | pyrax.set_credentials(args.username, args.apikey) 48 | 49 | (ram_used, ram_allowed) = getlimits(args.region) 50 | display_usage(ram_used, ram_allowed, args.maxthreshold, args.human) 51 | 52 | 53 | def getlimits(region): 54 | '''Returns the RAM usage and limits''' 55 | compute = pyrax.cloudservers 56 | cslimits = compute.limits.get(region) 57 | # Convert the generator to a list 58 | cslimits_list = [rate for rate in cslimits.absolute] 59 | # Pull out max_ram api limit and total used ram from list 60 | max_ram = [ 61 | x.value for x in cslimits_list if x.name == "maxTotalRAMSize"][0] 62 | total_ram = [x.value for x in cslimits_list if x.name == "totalRAMUsed"][0] 63 | return (total_ram, max_ram) 64 | 65 | 66 | def display_usage(ram_used, ram_allowed, alert_percentage, human): 67 | '''Print RAM usage information''' 68 | percent_ram = (float(ram_used) / float(ram_allowed)) * 100 69 | percent_ram_used = round(float(("%.2f" % percent_ram))) 70 | 71 | if human: 72 | print "Current RAM 
Usage: %sMB" % ram_used 73 | print "Max RAM API Limit: %sMB" % ram_allowed 74 | if percent_ram_used >= float(alert_percentage): 75 | print "WARNING: Percent of API Limit Used: %d%%" % ( 76 | percent_ram_used) 77 | else: 78 | print "OK: Percent of API Limit Used: %0d%%" % percent_ram_used 79 | else: 80 | # Cloud Monitoring-aware output 81 | if percent_ram_used < float(alert_percentage): 82 | print "status ok Percent RAM Used", percent_ram_used 83 | print "metric percent_ram_used float", percent_ram_used 84 | else: 85 | print "status err Percent RAM Used", percent_ram_used 86 | print "metric percent_ram_used float", percent_ram_used 87 | 88 | 89 | if __name__ == "__main__": 90 | main() 91 | -------------------------------------------------------------------------------- /cass_compaction.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # 3 | # Rackspace Cloud Monitoring Plug-In 4 | # Checks the number of compactions pending in a cassandra node. 5 | # 6 | # (c) 2017 Jim Wang 7 | # All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 10 | # not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # 21 | # Usage: 22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 23 | # 24 | # It accepts three arguments, the path to the nodetool executable, the cassandra hostname 25 | # and the port on which to run on 26 | # 27 | # Returns the number of pending compactions on the cass node 28 | # 29 | #! 
/usr/bin/python 30 | 31 | from subprocess import check_output 32 | import sys 33 | import string 34 | import argparse 35 | 36 | parser = argparse.ArgumentParser(description='Run nodetool to check for inconsistent state') 37 | parser.add_argument('-p', '--port', dest='portforcassandra', default='9080', help='port that cassandra is running on') 38 | parser.add_argument('-t', '--tool', dest='pathtonodetool', default='/opt/cassandra/bin/', help='path to nodetool executable (ex /opt/cassandra/bin)') 39 | parser.add_argument('-o', '--host', dest='cassandrahost', default='localhost', help='host cassandra is running on.') 40 | 41 | args = parser.parse_args(); 42 | 43 | 44 | node_tool_output = check_output([args.pathtonodetool + 'nodetool', '-h', 45 | args.cassandrahost, '-p', args.portforcassandra, 'compactionstats']) 46 | pending_compactions = int(node_tool_output.splitlines()[0].split(':')[1]) 47 | print 'metric pending_compactions uint32 ' + str(pending_compactions) 48 | -------------------------------------------------------------------------------- /cassandra_pending_tpstats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Rackspace Cloud Monitoring plugin for pending cassandra tpstats (uses nodetool tpstats) 4 | 5 | Example: 6 | $ ./cassandra_pending_tpstats.py 7 | 8 | Example alarm criteria: 9 | if (metric['cassandra_pending_foo'] > 10) { 10 | return new AlarmStatus(CRITICAL, 'Over 10 pending connections, increase resources') 11 | } 12 | 13 | if (metric['cassandra_pending_foo'] < 5) { 14 | return new AlarmStatus(CRITICAL, 'Under 5 pending connections, decrease resources') 15 | } 16 | 17 | Copyright 2015 Rackspace 18 | 19 | Licensed under the Apache License, Version 2.0 (the "License"); 20 | you may not use this file except in compliance with the License. 
21 | You may obtain a copy of the License at 22 | 23 | http://www.apache.org/licenses/LICENSE-2.0 24 | 25 | Unless required by applicable law or agreed to in writing, software 26 | distributed under the License is distributed on an "AS IS" BASIS, 27 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 28 | See the License for the specific language governing permissions and 29 | limitations under the License. 30 | """ 31 | import re 32 | import socket 33 | import subprocess 34 | 35 | def camel_to_underscore(name): 36 | s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 37 | return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() 38 | 39 | def parse_tpstats(output): 40 | return re.compile(r'([A-Za-z_]+)\s+\d+\s+(\d+)').findall(output) 41 | 42 | 43 | statsd = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 44 | hostname = socket.gethostname().replace(".", "_") 45 | 46 | output, error = subprocess.Popen(['nodetool','tpstats'], stdout=subprocess.PIPE).communicate() 47 | 48 | if not error: 49 | for pool_name, count in parse_tpstats(output): 50 | pool_name = camel_to_underscore(pool_name) 51 | 52 | print "status ok" 53 | print "metric cassandra_pending_{pool} int {count}".format(pool=pool_name, count=count) 54 | else: 55 | print "status err" 56 | -------------------------------------------------------------------------------- /centos_reboot_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Description: Custom plugin returns if there is a kernel update pending on 4 | # CentOS. Useful for knowing when to reboot a server updated by yum-cron. 5 | # Author: Russell Obets - Adapted from ubuntu_updates_check.sh by Tomas Muraus. 
6 | # License: MIT 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | # THE SOFTWARE. 
25 | 26 | 27 | LAST_KERNEL=$(rpm -q --last kernel | perl -pe 's/^kernel-(\S+).*/$1/' | head -1) 28 | CURRENT_KERNEL=$(uname -r) 29 | 30 | REBOOT_REQUIRED="yes" 31 | 32 | if [ $LAST_KERNEL = $CURRENT_KERNEL ]; then 33 | REBOOT_REQUIRED="no" 34 | echo "status ok Currently running latest installed kernel - ${CURRENT_KERNEL}" 35 | else 36 | echo "status ok Pending kernel updates: running ${CURRENT_KERNEL}, available: ${LAST_KERNEL}" 37 | fi 38 | 39 | echo "metric reboot_required string ${REBOOT_REQUIRED}" 40 | 41 | -------------------------------------------------------------------------------- /check-mount.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Rackspace Cloud Monitoring Plug-In 4 | # Check that a mounted filesystem is mounted 5 | # 6 | # (c) 2017 Teddy Caddy 7 | # All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 10 | # not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | # 21 | # Usage: 22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 23 | # 24 | # It accepts two arguments: 25 | # - path: the path of the mount point you want to check 26 | # - flag_file: (optional) check that a flag file exists and is readable 27 | # 28 | # Returns 5 metrics: 29 | # - path: the input path parameter 30 | # - flag_file: the input flag_file parameter 31 | # - mounted: returns 1 if mount point is mounted 32 | # - flag_file_exists: returns 1 if flag file exists 33 | # - flag_file_readable: returns 1 if flag file is readable 34 | # 35 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 36 | # 37 | # if (metric['mounted'] != 1) { 38 | # return new AlarmStatus(CRITICAL, 'The mount point is not mounted: #{path}'); 39 | # } 40 | 41 | # if (metric['flag_file'] != '' && metric['flag_file_exists'] != 1) { 42 | # return new AlarmStatus(CRITICAL, 'The flag file does not exist: #{path}/#{flag_file}'); 43 | # } 44 | 45 | # if (metric['flag_file'] != '' && metric['flag_file_readable'] != 1) { 46 | # return new AlarmStatus(CRITICAL, 'The flag file is not readable: #{path}/#{flag_file}'); 47 | # } 48 | 49 | # return new AlarmStatus(OK, 'The mount point is OK: #{path}'); 50 | # 51 | path=$1 52 | flag_file="$1/$2" 53 | 54 | mounted=0 55 | flag_file_exists=0 56 | flag_file_readable=0 57 | 58 | if [ -d $path ]; then 59 | mounts=$(cat /proc/mounts) 60 | if [[ $mounts == *"$path"* ]]; then 61 | mounted=1 62 | if [ ! 
-z "${2// }" ]; then 63 | if [ -e $flag_file ]; then 64 | flag_file_exists=1 65 | if [ -r $flag_file ]; then 66 | flag_file_readable=1 67 | fi 68 | fi 69 | fi 70 | fi 71 | fi 72 | 73 | echo "metric path string $path" 74 | echo "metric flag_file string $2" 75 | echo "metric mounted int64 $mounted" 76 | echo "metric flag_file_exists int64 $flag_file_exists" 77 | echo "metric flag_file_readable int64 $flag_file_readable" 78 | exit 0 79 | -------------------------------------------------------------------------------- /check-mtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Rackspace Cloud Monitoring Plug-In 4 | # Check the mtime of a file and how long it has been since it has been modified 5 | # 6 | # (c) 2015 Justin Gallardo 7 | # All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 10 | # not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # 21 | # Usage: 22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 23 | # 24 | # It accepts one argument, which should be the file you wish to check the mtime of. 
25 | # 26 | # Returns 2 metrics: 27 | # - mtime: The time(unix epoch) the file was last modified 28 | # - age: The number of seconds that have elapsed since the file was modified 29 | # 30 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 31 | # 32 | # if (metric['age'] > 3600) { 33 | # return new AlarmStatus(CRITICAL, 'The file has not been modified in more than 1 hour. Last modified #{age} seconds ago'); 34 | # } 35 | # return new AlarmStatus(OK, 'The file was last modified #{age} seconds ago.'); 36 | # 37 | file=$1 38 | 39 | if [ ! -e $file ]; then 40 | echo "status critical \"$file\" does not exist" 41 | exit 1 42 | fi 43 | 44 | if [ ! -r $file ]; then 45 | echo "status critical \"$file\" is not readable" 46 | exit 1 47 | fi 48 | 49 | mtime=$(stat -c%Y $file) 50 | now=$(date '+%s') 51 | age=$(( $now - $mtime )) 52 | 53 | echo "status ok file statted" 54 | echo "metric mtime uint64 $mtime" 55 | echo "metric age uint64 $age seconds" 56 | exit 0 57 | -------------------------------------------------------------------------------- /check_nodering.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | # 3 | # Rackspace Cloud Monitoring Plug-In 4 | # Check the status of a cassandra nodering and make sure none of the nodes 5 | # have a '?' as a status. 6 | # 7 | # (c) 2017 Jim Wang 8 | # All Rights Reserved. 9 | # 10 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 11 | # not use this file except in compliance with the License. 12 | # You may obtain a copy of the License at 13 | # 14 | # http://www.apache.org/licenses/LICENSE-2.0 15 | # 16 | # Unless required by applicable law or agreed to in writing, software 17 | # distributed under the License is distributed on an "AS IS" BASIS, 18 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
19 | # See the License for the specific language governing permissions and 20 | # limitations under the License. 21 | # 22 | # Usage: 23 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 24 | # 25 | # It accepts three arguments: the path to the nodetool executable, the cassandra hostname 26 | # and the port that cassandra is running on 27 | # 28 | # Returns 1 metric, nodering_status: 29 | # 0 if the nodes don't have a '?' as a status 30 | # 1 if the nodes have a '?' as a status 31 | # 32 | #! /usr/bin/python 33 | 34 | from subprocess import check_output 35 | import sys 36 | import argparse 37 | 38 | parser = argparse.ArgumentParser(description='Run nodetool to check for inconsistent state') 39 | parser.add_argument('-p', '--port', dest='portforcassandra', default='9080', help='port that cassandra is running on') 40 | parser.add_argument('-t', '--tool', dest='pathtonodetool', default='/opt/cassandra/bin/', help='path to nodetool executable (ex /opt/cassandra/bin)') 41 | parser.add_argument('-o', '--host', dest='cassandrahost', default='localhost', help='host cassandra is running on.') 42 | 43 | args = parser.parse_args(); 44 | 45 | nodetooloutput = check_output([args.pathtonodetool + '/nodetool', '-h', 46 | args.cassandrahost, '-p', args.portforcassandra, 'ring']) 47 | 48 | if nodetooloutput.find('?') >= 0 : 49 | print 'status critical nodering not consistent' 50 | print 'metric nodering_status uint32 1' 51 | sys.exit(2) 52 | else : 53 | print 'status ok nodering in consistent state' 54 | print 'metric nodering_status uint32 0' 55 | sys.exit(0) 56 | -------------------------------------------------------------------------------- /check_openmanage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Rackspace Cloud Monitoring Plug-In 4 | # check_openmanage wrapper plugin to check health of Dell servers via OMSA 5 | 6 | # ---------------------------------------------------------------------------- 7 | # 
"THE BEER-WARE LICENSE" (Revision 42): 8 | # wrote this file. As long as you retain this notice you 9 | # can do whatever you want with this stuff. If we meet some day, and you think 10 | # this stuff is worth it, you can buy me a beer in return. 11 | # ---------------------------------------------------------------------------- 12 | 13 | # Depends on Dell's OMSA being installed along with Trond Hasle Amundsen's 14 | # wonderful check_openmanage plugin for Nagios (available via EPEL on RPM-based 15 | # distributions or directly from his web site): 16 | # http://folk.uio.no/trondham/software/check_openmanage.html 17 | 18 | # Usage: 19 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 20 | # 21 | # This plugin returns 2 metrics: 22 | # - status : the exit status returned from the check_openmanage script 23 | # - report : the first line of the check_openmanage output 24 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 25 | # 26 | # :set consecutiveCount=3 27 | # 28 | # if (metric['status'] >= 3) { 29 | # return new AlarmStatus(CRITICAL, '#{report}'); 30 | # } 31 | # 32 | # if (metric['status'] == 2) { 33 | # return new AlarmStatus(CRITICAL, '#{report}'); 34 | # } 35 | # 36 | # if (metric['status'] == 1) { 37 | # return new AlarmStatus(WARNING, '#{report}'); 38 | # } 39 | # 40 | # return new AlarmStatus(OK, '#{report}'); 41 | # 42 | # Things to keep in mind: 43 | # - this plugin will try to find check_openmanage in one of the "normal" distribution 44 | # managed paths where it might reside, or the actual shell path as a last resort. 45 | # - by default, we're ignoring uncertified drive warnings. Feel free to change that 46 | # if it's important in your environment. 47 | 48 | # Previous version of this wrapper used an opt_args argument. But given 49 | # you can set arguments to pass via the Monitoring API, that has been 50 | # replaced with $@ below. 
You might want to update your check with 51 | # something like the following: 52 | # 53 | # { 54 | # "details": { 55 | # "file": "check_openmanage.sh", 56 | # "args": [ 57 | # "-b", 58 | # "pdisk_cert=all", 59 | # "-b", 60 | # "ctrl_fw=all" 61 | # ] 62 | # } 63 | # } 64 | 65 | search=(/usr/lib64/nagios/plugins /usr/lib/nagios/plugins) 66 | 67 | for i in ${search[@]}; do 68 | if [[ -x ${i}/check_openmanage ]]; then 69 | check_cmd="${i}/check_openmanage" 70 | break 71 | fi 72 | done 73 | 74 | if [[ -z "${check_cmd}" ]]; then 75 | if ! check_cmd=$(which check_openmanage); then 76 | echo "status Could not find check_openmanage script!" 77 | exit 1 78 | fi 79 | fi 80 | 81 | report=$(${check_cmd} ${@}) 82 | status=$? 83 | 84 | echo "status successfully ran check_openmanage wrapper plugin" 85 | echo "metric status int32 ${status}" 86 | echo "metric report string $(echo -E ${report} | head -1)" 87 | 88 | exit 0 89 | -------------------------------------------------------------------------------- /chef_node_checkin.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | from datetime import datetime 3 | import os 4 | 5 | 6 | def PopulateMetrics(log): 7 | clientRuns = [] 8 | logFile = open(log, 'r') 9 | for line in logFile: 10 | if 'INFO: Chef Run complete in' in line: 11 | date = line[1:11].split('-') 12 | time = line[12:20].split(':') 13 | clientRuns.append(datetime(int(date[0]), int(date[1]), int( 14 | date[2]), int(time[0]), int(time[1]), int(time[2]), 0)) 15 | metrics['checkInDuration'] = int(float(line.split()[6])) 16 | 17 | metrics['timeSinceCheckIn'] = int( 18 | datetime.now().strftime('%s')) - int( 19 | sorted(clientRuns)[-1].strftime('%s') 20 | ) 21 | 22 | metrics['processesAmount'] = 0 23 | pids = [pid for pid in os.listdir('/proc') if pid.isdigit()] 24 | for pid in pids: 25 | if 'chef-client' in open(os.path.join('/proc', pid, 'cmdline'), 'rb').read(): 26 | metrics['processesAmount'] += 1 27 | 28 | if 
metrics['timeSinceCheckIn'] > 86400: 29 | print "status Critcal node has not checked in for {0} seconds".format( 30 | metrics['timeSinceCheckIn'] 31 | ) 32 | elif metrics['timeSinceCheckIn'] > 3600: 33 | print "status Warning node has not checked in for {0} seconds".format( 34 | metrics['timeSinceCheckIn'] 35 | ) 36 | else: 37 | print "status OK node checked in {0} seconds ago".format( 38 | metrics['timeSinceCheckIn'] 39 | ) 40 | return metrics 41 | 42 | 43 | logfile = '/var/log/chef/client.log' 44 | metrics = {'timeSinceCheckIn': 0, 'checkInDuration': 0, 'processesAmount': 0} 45 | 46 | try: 47 | metrics = PopulateMetrics(logfile) 48 | 49 | # Anything OS related with file handling should warn and exit 50 | except IOError as err: 51 | print "status Warning {0}".format(err) 52 | 53 | # Handle the log regex not returning a poplated array 54 | except IndexError: 55 | print "status OK node has not generated a parsable log" 56 | 57 | except: 58 | print "status Warning unhandled error executing script" 59 | 60 | # Always print out metrics to allow REACH to report metrics 61 | finally: 62 | print "metric timeSinceCheckIn int32", metrics['timeSinceCheckIn'] 63 | print "metric checkInDuration int32", metrics['checkInDuration'] 64 | print "metric numberOfClients int32", metrics['processesAmount'] 65 | -------------------------------------------------------------------------------- /cloud_queues.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | """ 3 | Copyright 2014 Rackspace 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 
7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 16 | ---- 17 | 18 | Rackspace Cloud Monitoring Plugin for Cloud Queues 19 | 20 | Retrieves stats for the number of unclaimed (free), claimed, and total messages in a given queue. 21 | Useful for triggering Autoscale webhooks based on the number of messages in a Cloud Queue. 22 | 23 | Requires: 24 | pyrax - https://github.com/rackspace/pyrax 25 | 26 | Usage: 27 | Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 28 | 29 | Set up a Cloud Monitoring Check of type agent.plugin to run 30 | python ./cloud_queues.py QUEUE_NAME 31 | 32 | E.g. 33 | python ./cloud_queues.py myQueue 34 | 35 | The following is an example 'criteria' for a Rackspace Monitoring Alarm: 36 | 37 | if (metric['queue.free'] >= 100) { 38 | return new AlarmStatus(CRITICAL, '100 or more unclaimed msgs'); } 39 | if (metric['queue.free'] >= 50) { 40 | return new AlarmStatus(WARNING, '50 or more unclaimed msgs'); 41 | } 42 | return new AlarmStatus(OK, 'Less than 50 unclaimed msgs'); 43 | 44 | Please note that you will need to adjust the thresholds based on what works for you. 
45 | 46 | Available metrics are 47 | queue.claimed 48 | queue.total 49 | queue.free 50 | 51 | """ 52 | import os 53 | import argparse 54 | import pyrax 55 | 56 | def get_queue_stats(queueName): 57 | 58 | pyrax.settings.set('identity_type', 'rackspace') 59 | pyrax.set_credential_file(os.path.expanduser("~/.rackspace_cloud_credentials")) 60 | 61 | try: 62 | cq = pyrax.queues 63 | except pyrax.exceptions.PyraxException: 64 | print 'status err Unable to get queue', queueName 65 | return 66 | 67 | try: 68 | stats = cq.get_stats(queueName) 69 | print 'status success' 70 | except pyrax.exceptions.NotFound: 71 | print 'status err Unable to get queue stats', queueName 72 | return 73 | 74 | for key,value in stats.items(): 75 | if type(value) is int: 76 | print 'metric queue.%s int %s' % (key, value) 77 | 78 | 79 | if __name__ == "__main__": 80 | parser = argparse.ArgumentParser() 81 | parser.add_argument("queueName", help="Cloud Queue name") 82 | args = parser.parse_args() 83 | get_queue_stats(args.queueName) 84 | 85 | 86 | -------------------------------------------------------------------------------- /cloudbackup_mon.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # cloudbackup_mon.sh 4 | # Rackspace Cloud Monitoring Plugin to help detect if there are 5 | # problems with Cloud Backups. 6 | # 7 | # Copyright (c) 2013, Stephen Lang 8 | # All rights reserved. 9 | # 10 | # Redistribution and use in source and binary forms, with or without 11 | # modification, are permitted provided that the following conditions are met: 12 | # 13 | # Redistributions of source code must retain the above copyright notice, 14 | # this list of conditions and the following disclaimer. 15 | # 16 | # Redistributions in binary form must reproduce the above copyright 17 | # notice, this list of conditions and the following disclaimer in the 18 | # documentation and/or other materials provided with the distribution. 
19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 24 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | # POSSIBILITY OF SUCH DAMAGE. 31 | # 32 | # Curl Command: 33 | # curl -i -X POST -H 'Host: monitoring.api.rackspacecloud.com' -H 34 | # 'Accept-Encoding: gzip,deflate' -H 'X-Auth-Token: YOUR_API_TOKEN' 35 | # -H 'Content-Type: application/json; charset=UTF-8' -H 'Accept: application/json' 36 | # --data-binary '{"label": "Cloud Backup Check", "type": "agent.plugin", "details": 37 | # {"args": ["YOUR_API_KEY"],"file": "cloudbackup_mon.sh"}}' --compress 38 | # 'https://monitoring.api.rackspacecloud.com:443/v1.0/YOUR_ACCOUNT/entities/YOUR_ENTITY/checks' 39 | # 40 | # Usage: 41 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 42 | # 43 | # It needs to be passed 2-3 params by the backup system: 44 | # 45 | # apikey datacenter [backupid] 46 | # 47 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 48 | # 49 | # if (metric['diagnostics'] != 'No errors') { 50 | # return new AlarmStatus(CRITICAL, 'Errors found during last Cloud Backup: #{diagnostics}'); 51 | # } 52 | # if (metric['reason'] != 'Success') { 53 | # return new AlarmStatus(CRITICAL, 'The last Cloud Backup was not successful.'); 54 | # } 55 | # if (metric['state'] != 
'Completed') { 56 | # return new AlarmStatus(CRITICAL, 'The last Cloud Backup was not completed.'); 57 | # } 58 | # if (metric['agent_running'] == 0) { 59 | # return new AlarmStatus(CRITICAL, 'The Cloud Backup Agent is not running.'); 60 | # } 61 | # if (metric['age'] > 129600) { 62 | # return new AlarmStatus(CRITICAL, 'The last Cloud Backup is more than 36 hours old!'); 63 | # } 64 | # return new AlarmStatus(OK, 'The last Cloud Backup was successful.'); 65 | 66 | function help { 67 | 68 | cat < 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | """ 18 | 19 | import os 20 | import sys 21 | 22 | import argparse 23 | import pyrax 24 | 25 | 26 | def check_usage(instance_id, threshold, region): 27 | pyrax.set_credential_file( 28 | os.path.expanduser("~/.rackspace_cloud_credentials")) 29 | cdb = pyrax.connect_to_cloud_databases(region=region) 30 | 31 | matched_instance = None 32 | for instance in cdb.list(): 33 | if instance.id == instance_id: 34 | matched_instance = instance 35 | if not matched_instance: 36 | print 'status err Unable to find instance', instance_id 37 | sys.exit(1) 38 | 39 | # Force usage lookup 40 | matched_instance.get() 41 | database_size = matched_instance.volume['size'] 42 | database_usage = matched_instance.volume['used'] 43 | percentage_used = database_usage / database_size 44 | 45 | if percentage_used >= threshold: 46 | print 'status err usage over threshold' 47 | else: 48 | print 'status ok usage within threshold' 49 | 50 | print "metric database_GB_container_size float", database_size 51 | print "metric database_GB_used float", database_usage 52 | print "metric percentage_used float", percentage_used 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | parser.add_argument("instance", help="Cloud Database instance id") 57 | parser.add_argument("region", help="Cloud region, e.g. 
DFW or ORD") 58 | parser.add_argument("threshold", nargs='?', default=85.0, type=float, 59 | help="Storage threshold to alert on") 60 | args = parser.parse_args() 61 | check_usage(args.instance, args.threshold, args.region) 62 | 63 | -------------------------------------------------------------------------------- /cloudload_balancer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Rackspace Cloud Monitoring plugin to provide cloud load balancer 4 | 5 | Requirement: 6 | pyrax - https://github.com/rackspace/pyrax 7 | 8 | Copyright 2013 Rackspace 9 | 10 | Licensed under the Apache License, Version 2.0 (the "License"); 11 | you may not use this file except in compliance with the License. 12 | You may obtain a copy of the License at 13 | 14 | http://www.apache.org/licenses/LICENSE-2.0 15 | 16 | Unless required by applicable law or agreed to in writing, software 17 | distributed under the License is distributed on an "AS IS" BASIS, 18 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 | See the License for the specific language governing permissions and 20 | limitations under the License. 
21 | """ 22 | import os 23 | import argparse 24 | import pyrax 25 | 26 | USAGE_STATS = [ 27 | {'key': 'incoming', 'ref': 'incomingTransfer', 'unit': 'int'}, 28 | {'key': 'incoming_ssl', 'ref': 'incomingTransferSsl', 'unit': 'int'}, 29 | {'key': 'outgoing', 'ref': 'outgoingTransfer', 'unit': 'int'}, 30 | {'key': 'outgoing_ssl', 'ref': 'outgoingTransferSsl', 'unit': 'int'} 31 | ] 32 | 33 | 34 | STATS = [ 35 | {'key': 'connect_timeout', 'ref': 'connectTimeOut', 'unit': 'int'}, 36 | {'key': 'connect_error', 'ref': 'connectError', 'unit': 'int'}, 37 | {'key': 'connect_failure', 'ref': 'connectFailure', 'unit': 'int'}, 38 | {'key': 'data_timed_out', 'ref': 'dataTimedOut', 'unit': 'int'}, 39 | {'key': 'keep_alive_timed_out', 'ref': 'keepAliveTimedOut', 'unit': 'int'}, 40 | {'key': 'max_conns', 'ref': 'maxConn', 'unit': 'int'} 41 | ] 42 | 43 | 44 | def check_usage(instance_id, region): 45 | pyrax.settings.set('identity_type', 'rackspace') 46 | pyrax.set_credential_file( 47 | os.path.expanduser("~/.rackspace_cloud_credentials"), 48 | region=region) 49 | 50 | clb = pyrax.cloud_loadbalancers 51 | 52 | try: 53 | instance = clb.get(instance_id) 54 | print 'status ok' 55 | except pyrax.exceptions.NotFound: 56 | print 'status err Unable to find instance', instance_id 57 | return 58 | 59 | mgr = instance.manager 60 | status = instance.status 61 | nodes = instance.nodes 62 | name = instance.name.lower().replace('-', '_') 63 | usage = mgr.get_usage(instance) 64 | usage = usage['loadBalancerUsageRecords'][-1] 65 | 66 | if status == 'ACTIVE': 67 | print 'metric %s.status float 100.0' % (name) 68 | else: 69 | print 'metric %s.status float 0.0' % (name) 70 | 71 | for stat in USAGE_STATS: 72 | print 'metric %s.%s %s %s' % \ 73 | (name, stat['key'], stat['unit'], usage[stat['ref']]) 74 | 75 | online_nodes = 0 76 | offline_nodes = 0 77 | draining_nodes = 0 78 | total_nodes = len(nodes) 79 | 80 | for node in nodes: 81 | if node.status == 'ONLINE' and node.condition == 'ENABLED': 82 | 
online_nodes = online_nodes + 1 83 | if node.status == 'OFFLINE' or node.condition == 'DISABLED': 84 | offline_nodes = offline_nodes + 1 85 | if node.status == 'DRAINING' or node.condition == 'DRAINING': 86 | draining_nodes = draining_nodes + 1 87 | 88 | print 'metric %s.total_nodes int %s' % (name, total_nodes) 89 | print 'metric %s.online_nodes int %s' % (name, online_nodes) 90 | print 'metric %s.offline_nodes int %s' % (name, offline_nodes) 91 | print 'metric %s.draining_nodes int %s' % (name, draining_nodes) 92 | 93 | stats = mgr.get_stats(instance) 94 | for stat in STATS: 95 | print 'metric %s.%s %s %s' % \ 96 | (name, stat['key'], stat['unit'], stats[stat['ref']]) 97 | 98 | 99 | if __name__ == "__main__": 100 | parser = argparse.ArgumentParser() 101 | parser.add_argument("instance", help="Cloud Load Balancer instance id") 102 | parser.add_argument("region", help="Cloud region, e.g. DFW or ORD") 103 | args = parser.parse_args() 104 | check_usage(args.instance, args.region.upper()) 105 | -------------------------------------------------------------------------------- /cman_nodes.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | ## Rackspace Cloud Monitoring Plug-In 3 | ## CMAN nodes check 4 | # 5 | # Author: James Turnbull 6 | # Copyright (c) 2012 James Turnbull 7 | # 8 | # MIT License: 9 | # Permission is hereby granted, free of charge, to any person obtaining a copy 10 | # of this software and associated documentation files (the "Software"), to deal 11 | # in the Software without restriction, including without limitation the rights 12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | # copies of the Software, and to permit persons to whom the Software is 14 | # furnished to do so, subject to the following conditions: 15 | # 16 | # The above copyright notice and this permission notice shall be included in 17 | # all copies or substantial portions of the Software. 
18 | # 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 25 | # THE SOFTWARE. 26 | # 27 | # Usage: 28 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 29 | # 30 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 31 | # 32 | # if (metric['NODE_STATUS'] == 'X') { 33 | # return new AlarmStatus(CRITICAL, 'Host is not a member of the cluster!'); 34 | # } 35 | # 36 | # if (metric['NODE_STATUS'] < 'd') { 37 | # return new AlarmStatus(CRITICAL, 'Host is disallowed from cluster!'); 38 | # } 39 | # 40 | # return new AlarmStatus(OK, 'Host is a member of the cluster.'); 41 | # 42 | 43 | # If the plugin fails in any way, print why and exit nonzero. 44 | def fail(status="Unknown failure") 45 | puts "status #{status}" 46 | exit 1 47 | end 48 | 49 | # Store metrics in a hash and don't print them until we've completed 50 | def metric(name,type,value) 51 | @metrics[name] = { 52 | :type => type, 53 | :value => value 54 | } 55 | end 56 | 57 | # Once the script has succeeded without errors, print metrics lines. 58 | def output_success 59 | puts "status Cman node status for #{@hostname}" 60 | @metrics.each do |name,v| 61 | puts "metric #{name} #{v[:type]} #{v[:value]}" 62 | end 63 | end 64 | 65 | begin 66 | require 'optparse' 67 | rescue 68 | fail "Failed to load required ruby gems!" 
69 | end 70 | 71 | @metrics = {} 72 | options = {} 73 | 74 | args = ARGV.dup 75 | 76 | OptionParser.new do |o| 77 | o.banner = "Usage: #{$0} [options]" 78 | o.on('-h', '--hostname HOSTNAME', 'Check status of node lid option') do |h| 79 | options[:host] = h 80 | end 81 | o.on_tail('-h', '--help', 'Show this message') { puts o; exit } 82 | o.parse!(args) 83 | end 84 | 85 | @hostname = options[:host] || `hostname -s`.chomp 86 | 87 | begin 88 | node_status = `cman_tool nodes -n #{@hostname} -F type` 89 | metric("node_status","string","#{node_status}") 90 | rescue => e 91 | fail "Problem running cman_tool plugin: #{e.message}" 92 | end 93 | 94 | output_success 95 | -------------------------------------------------------------------------------- /consul.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Script to return status and metrics for Consul 4 | # 5 | # Justin Phelps 6 | # All Rights Reserved. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 9 | # not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
19 | # 20 | # Example alarm criteria: 21 | # 22 | # if (metric['node_count'] < 5) { 23 | # return new AlarmStatus(WARNING, 'Node Count is below 5.'); 24 | # } 25 | # 26 | # if (metric['node_count'] < 3) { 27 | # return new AlarmStatus(CRITICAL, 'Node Count is below 3.'); 28 | # } 29 | # 30 | # return new AlarmStatus(OK, 'Node Count is within range.'); 31 | # 32 | 33 | import psutil 34 | import json 35 | import urllib2 36 | 37 | def check_process_name(name): 38 | """Returns status of given process.""" 39 | for proc in psutil.process_iter(): 40 | try: 41 | pinfo = proc.as_dict(attrs=['name']) 42 | except psutil.NoSuchProcess: 43 | pass 44 | else: 45 | if pinfo['name'] == name: 46 | return 'status ok consul is running' 47 | return 'status error consul is not running' 48 | 49 | def consul_http2json(url): 50 | """Returns data from the HTTP interface as a dict.""" 51 | try: 52 | response = urllib2.urlopen(url) 53 | except urllib2.URLError: 54 | pass 55 | else: 56 | html = response.read() 57 | data = json.loads(html) 58 | return data 59 | 60 | def consul_agent_type(): 61 | """Returns the type of agent that is running.""" 62 | try: 63 | agent_info = consul_http2json("http://localhost:8500/v1/agent/self?pretty=1") 64 | agent_type = agent_info['Config']['Server'] 65 | except: 66 | return 'metric agent_type string unknown' 67 | else: 68 | if agent_type is True: 69 | return 'metric agent_type string server' 70 | else: 71 | return 'metric agent_type string client' 72 | 73 | def consul_node_count(): 74 | """Returns the number of consul nodes running as seen by this specific node.""" 75 | try: 76 | nodes = consul_http2json("http://localhost:8500/v1/catalog/nodes?pretty=1") 77 | count = len(nodes) 78 | except: 79 | return 'metric node_count string unknown' 80 | else: 81 | return 'metric node_count int32 {0} nodes'.format(count) 82 | 83 | def main(): 84 | status = check_process_name("consul") 85 | print(status) 86 | agent_type = consul_agent_type() 87 | print(agent_type) 88 | 
node_count = consul_node_count() 89 | print(node_count) 90 | 91 | if __name__ == '__main__': 92 | main() 93 | -------------------------------------------------------------------------------- /content_check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # 4 | # usage: content_check.py [-h] [--timeout TIMEOUT] url pattern 5 | # 6 | # Rackspace Monitoring plugin to check a URL for a regular expression. Useful if 7 | # the URL you need to check is not publicly accessible, but can be reached by 8 | # another entity. Returns the metric 'found' with a value of 'yes' or 'no'. 9 | # 10 | # positional arguments: 11 | # url url to check 12 | # pattern regex to check for 13 | # 14 | # optional arguments: 15 | # -h, --help show this help message and exit 16 | # --timeout TIMEOUT timeout in seconds (default 5) 17 | # 18 | # 19 | # content_check.py - Rackspace Cloud Monitoring plugin 20 | # Copyright (C) 2015 Carl George 21 | # 22 | # This program is free software: you can redistribute it and/or modify 23 | # it under the terms of the GNU General Public License as published by 24 | # the Free Software Foundation, either version 3 of the License, or 25 | # (at your option) any later version. 26 | # 27 | # This program is distributed in the hope that it will be useful, 28 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 29 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 | # GNU General Public License for more details. 31 | # 32 | # You should have received a copy of the GNU General Public License 33 | # along with this program. If not, see . 34 | 35 | 36 | """Rackspace Monitoring plugin to check a URL for a regular expression. Useful 37 | if the URL you need to check is not publicly accessible, but can be reached by 38 | another entity. Returns the metric 'found' with a value of 'yes' or 'no'. 
39 | """ 40 | 41 | 42 | from __future__ import print_function 43 | 44 | import argparse 45 | import re 46 | 47 | try: 48 | from urllib.request import urlopen 49 | from urllib.error import HTTPError 50 | except ImportError: 51 | from urllib2 import urlopen, HTTPError 52 | 53 | 54 | parser = argparse.ArgumentParser(description=__doc__) 55 | parser.add_argument('url', help='url to check') 56 | parser.add_argument('pattern', help='regex to check for') 57 | parser.add_argument('--timeout', type=int, default=5, help='timeout in seconds (default 5)') 58 | args = parser.parse_args() 59 | 60 | if not args.url.startswith('http'): 61 | args.url = 'http://{0}'.format(args.url) 62 | 63 | try: 64 | request = urlopen(args.url, timeout=args.timeout) 65 | page = request.read().decode('utf-8') 66 | except HTTPError as e: 67 | raise SystemExit('{0} {1} ({2})'.format(e.code, e.reason, args.url)) 68 | 69 | m = re.search(args.pattern, page) 70 | 71 | if m: 72 | print('status ok\nmetric found string yes') 73 | else: 74 | print('status err\nmetric found string no') 75 | -------------------------------------------------------------------------------- /curl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Rackspace Cloud Monitoring Plug-In 4 | # Simple curl request test that can be used to query internal hosts 5 | # 6 | # (C)2014 James Buchan 7 | # All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 10 | # not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage:
# Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
#
# It accepts one argument, which should be the site you wish to query.
#
# Returns 3 metrics:
# - code: The final status code returned
# - time_connect: The time, in seconds, it took from the start until the TCP
#   connect to the remote host (or proxy) was completed
# - time_total: The total time, in seconds, that the full operation lasted
#
# The following is an example 'criteria' for a Rackspace Monitoring Alarm:
#
# if (metric['code'] != "200") {
#   return new AlarmStatus(CRITICAL, '#{code} response received. Expected 200.');
# }
# return new AlarmStatus(OK, '200 response received');
#

if [ -z "$1" ]
then
  echo "status err missing URL argument"
  exit 1
fi

# HEAD request only (-I); the body is discarded and the three statistics are
# appended to the output by -w.  The URL is quoted so that characters such as
# '&' or spaces are passed to curl unchanged.
response=$(curl -sS -f -o /dev/null -I -w "%{response_code} %{time_connect} %{time_total}" "$1" 2>&1)

if [ $? -eq 0 ]
then
  # On success the output is exactly the three -w fields.
  read -r code time_connect time_total <<< "$response"
  echo "status ok connection made"
  echo "metric code string $code"
  echo "metric time_connect double $time_connect"
  echo "metric time_total double $time_total"
  exit 0
else
  #remove statistics from our status line, only keep the error
  echo "status $(echo $response | awk -F'000 ' '{$0=$1}1' )"
fi

exit 1

-------------------------------------------------------------------------------- /curl_check.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Rackspace Cloud Monitoring Plug-In 4 | # Simple curl request test that can be used to query internal hosts 5 | # 6 | # (C)2014 James Buchan 7 | # All Rights Reserved. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); you may 10 | # not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # 21 | # Usage: 22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 23 | # 24 | # It accepts one argument, which should be the site you wish to query. 25 | # 26 | # Returns 4 metrics: 27 | # - code: The final status code returned 28 | # - time_connect: The total time, in seconds, that the full operation lasted 29 | # - time_total: The time, in seconds, it took from the start until the TCP 30 | # connect to the remote host (or proxy) was completed 31 | # - url: The last URL that was queried (if redirects occurred) 32 | # 33 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 34 | # 35 | # if (metric['code'] != '200') { 36 | # return new AlarmStatus(CRITICAL, '#{code} response received. Expected 200.'); 37 | # } 38 | # return new AlarmStatus(OK, '200 response received'); 39 | # 40 | 41 | function extract_header() 42 | { 43 | ret=$(echo "$1" | grep "$2:" | tail -1 | cut -d' ' -f 2- | tr -d '\n\r' ) 44 | [ -n "$ret" ] && echo -n $ret 45 | } 46 | 47 | response=$(curl -sS -L -f -I -w "Response-Code: %{response_code}\nTime-Connect: %{time_connect}\nTime-Total: %{time_total}\nURL-Effective: %{url_effective}\n" $1 2>&1) 48 | 49 | if [ $? 
-eq 0 ] 50 | then 51 | echo "status ok connection made" 52 | echo "metric code string $(extract_header "$response" Response-Code)" 53 | echo "metric time_connect double $(extract_header "$response" Time-Connect) seconds" 54 | echo "metric time_total double $(extract_header "$response" Time-Total) seconds" 55 | echo "metric url string $(extract_header "$response" URL-Effective)" 56 | 57 | etag=$(extract_header "$response" ETag) 58 | [ -n "$etag" ] && echo "metric etag string $etag" 59 | 60 | length=$(extract_header "$response" Content-Length) 61 | [ -n "$length" ] && echo "metric content_length uint32 $length bytes" 62 | 63 | modified=$(extract_header "$response" Last-Modified) 64 | if [ -n "$modified" ] 65 | then 66 | modified_seconds=$(date --date="$modified" +"%s") 67 | age=$(($(date +"%s") - $modified_seconds)) 68 | echo "metric page_age uint64 $age seconds" 69 | fi 70 | 71 | exit 0 72 | else 73 | #remove statistics from our status line, only keep the error 74 | echo "status $(echo $response | awk -F'000 ' '{$0=$1}1' )" 75 | fi 76 | 77 | exit 1 78 | -------------------------------------------------------------------------------- /dir_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Gather stats on the directory size, number of files and oldest file name 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | set -e 18 | TARGET="${1}" 19 | 20 | if [ "z" = "z${TARGET}" ] 21 | then 22 | echo "status err missing target argument" 23 | exit 1 24 | fi 25 | 26 | if [ ! -d ${TARGET} ] 27 | then 28 | echo "status err ${TARGET} does not exist or is not a directory" 29 | exit 1 30 | fi 31 | 32 | SIZE="$(du -sm ${TARGET} | awk '{print $1}')" 33 | NB_FILES="$(find ${TARGET} -type f | wc -l)" 34 | if [ ${NB_FILES} -gt 0 ] 35 | then 36 | OLDEST_FILE_STAT="$(find ${TARGET} -type f -printf "%T@ %p\n" | sort -n | head -n1)" 37 | OLDEST_FILE_NAME="$(echo ${OLDEST_FILE_STAT} | cut -d ' ' -f2)" 38 | OLDEST_FILE_MTIME="$(echo ${OLDEST_FILE_STAT} | cut -d ' ' -f1 | cut -d '.' -f1)" 39 | OLDEST_FILE_AGE=$((`date +%s`-${OLDEST_FILE_MTIME})) 40 | else 41 | OLDEST_FILE_NAME='no_files' 42 | OLDEST_FILE_AGE=0 43 | fi 44 | 45 | echo "status ok target uses ${SIZE} MB in ${NB_FILES} files" 46 | echo "metric total_size uint64 ${SIZE} megabytes" 47 | echo "metric total_files uint64 ${NB_FILES} files" 48 | echo "metric oldest_file_name string ${OLDEST_FILE_NAME}" 49 | echo "metric oldest_file_age uint64 ${OLDEST_FILE_AGE} seconds" 50 | -------------------------------------------------------------------------------- /directory.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Copyright 2015 Rackspace 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ---- 17 | # Custom check for a directory presence, files, oldest file name & age and size (in Mbytes) 18 | # 19 | # if (metric['size'] < 0) { 20 | # return new AlarmStatus(CRITICAL, 'Directory #{dir} check failed - no such directory?'); 21 | # } 22 | # if (metric['size'] > 500) { 23 | # return new AlarmStatus(WARNING, 'Directory #{dir} is #{size} Mbytes'); 24 | # } 25 | # if (metric['size'] > 1000) { 26 | # return new AlarmStatus(CRITICAL, 'Directory #{dir} is #{size} Mbytes'); 27 | # } 28 | # 29 | # E.g. 30 | # ./Directory.sh DIRECTORY 31 | 32 | set -e 33 | TARGET="${1}" 34 | 35 | if [ "z" = "z${TARGET}" ] 36 | then 37 | echo "status err missing target argument" 38 | echo "Usage: $0 DIRECTORY" 39 | exit 1 40 | fi 41 | 42 | if [ ! -d ${TARGET} ] 43 | then 44 | echo "status err ${TARGET} does not exist or is not a directory" 45 | exit 1 46 | fi 47 | 48 | SIZE="$(du -sm ${TARGET} | awk '{print $1}')" 49 | NB_FILES="$(find ${TARGET} -type f | wc -l)" 50 | if [ ${NB_FILES} -gt 0 ] 51 | then 52 | OLDEST_FILE_STAT="$(find ${TARGET} -type f -printf "%T@ %p\n" | sort -n | head -n1)" 53 | OLDEST_FILE_NAME="$(echo ${OLDEST_FILE_STAT} | cut -d ' ' -f2)" 54 | OLDEST_FILE_MTIME="$(echo ${OLDEST_FILE_STAT} | cut -d ' ' -f1 | cut -d '.' 
-f1)" 55 | OLDEST_FILE_AGE=$((`date +%s`-${OLDEST_FILE_MTIME})) 56 | else 57 | OLDEST_FILE_NAME='no_files' 58 | OLDEST_FILE_AGE=0 59 | fi 60 | 61 | echo "status ok target uses ${SIZE} MB in ${NB_FILES} files" 62 | echo "metric dir string ${TARGET}" 63 | echo "metric total_size uint64 ${SIZE} megabytes" 64 | echo "metric total_files uint64 ${NB_FILES} files" 65 | echo "metric oldest_file_name string ${OLDEST_FILE_NAME}" 66 | echo "metric oldest_file_age uint64 ${OLDEST_FILE_AGE} seconds" 67 | -------------------------------------------------------------------------------- /dns_resolution.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # dns-resolve.sh 4 | # Rackspace Cloud Monitoring Plugin to verify the current return status of DNS lookups. 5 | # 6 | # Copyright (c) 2014, Lindsey Anderson 7 | # Copyright (c) 2015, Michael Burns 8 | # All rights reserved. 9 | # 10 | # Redistribution and use in source and binary forms, with or without 11 | # modification, are permitted provided that the following conditions are met: 12 | # 13 | # Redistributions of source code must retain the above copyright notice, 14 | # this list of conditions and the following disclaimer. 15 | # 16 | # Redistributions in binary form must reproduce the above copyright 17 | # notice, this list of conditions and the following disclaimer in the 18 | # documentation and/or other materials provided with the distribution. 19 | # 20 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | # ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 24 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 25 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 26 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 27 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 28 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 30 | # POSSIBILITY OF SUCH DAMAGE.
#
# Usage: dns_resolution.sh [NAME] [TYPE]
#   NAME defaults to example.com, TYPE defaults to A.
#
# Example criteria:
#
#if (metric['dns_lookup'] != 'successful'){
#  return new AlarmStatus(CRITICAL, 'DNS Lookups are unavailable.');
#}
#return new AlarmStatus(OK, 'DNS Lookups from this server are responsive.');


RESOLVE=${1:-"example.com"}
TYPE=${2:-"A"}

# Keep only the first answer record.  Lines starting with ';' are dig
# comments/diagnostics rather than answer records, so they are dropped
# before the emptiness test below.
# NOTE(review): dig is believed to print resolver failures such as
# ";; connection timed out" on stdout - confirm on the target platform.
res=$(dig +noall +answer "${RESOLVE}" "${TYPE}" 2>/dev/null | grep -v '^;' | head -1)

if [ -z "$res" ]; then
  echo "status critical dns_lookup unsuccessful"
  echo "metric dns_lookup string failed"
else
  echo "status ok dns_lookup successful"
  echo "metric dns_lookup string successful"
fi

-------------------------------------------------------------------------------- /docker_check.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Rackspace Cloud Monitoring Plugin for Docker.""" 3 | 4 | # Copyright 2015 Frank Ritchie 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
class DockerService(object):
    """Report 'docker info' metrics for one Docker daemon.

    The daemon is assumed to be stopped until a call to info() succeeds.
    """

    # (metric name, 'DriverStatus' key) pairs. Each value is expected to look
    # like "<number> <unit>". Only some storage drivers report these keys, so
    # every one of them is treated as optional.
    SPACE_METRICS = (
        ('data_space_used', 'Data Space Used'),
        ('data_space_total', 'Data Space Total'),
        ('metadata_space_used', 'Metadata Space Used'),
        ('metadata_space_total', 'Metadata Space Total'),
    )

    def __init__(self, url):
        """Remember the daemon URL; no connection is made here."""

        self.url = url
        self.docker_running = False

    def docker_stats(self):
        """Fetch 'docker info' and print agent metric and status lines.

        On success the metric lines are printed followed by a 'status ok'
        line. If the daemon cannot be queried a 'status err' line is printed
        and the process exits with status 1.
        """

        try:
            # Constructing the client is inside the guarded region as well,
            # so a failure there is reported as a status line, not a traceback.
            docker_info = Client(base_url=self.url).info()
            self.docker_running = True
        # Deliberately broad: any failure to talk to the daemon means "down".
        except Exception:
            self.docker_running = False

        if not self.docker_running:
            print('status err failed to obtain docker stats.')
            sys.exit(1)

        print('metric images int64 %s' % (docker_info['Images'],))
        print('metric containers int64 %s' % (docker_info['Containers'],))
        print('metric go_routines int64 %s' % (docker_info['NGoroutines'],))
        print('metric driver string %s' % (docker_info['Driver'],))

        # 'DriverStatus' is a list of [key, value] pairs; turn it into a dict.
        # A missing or empty list simply yields no space metrics.
        driver_status = dict(
            (entry[0], entry[1])
            for entry in (docker_info.get('DriverStatus') or []))

        for metric_name, status_key in self.SPACE_METRICS:
            raw_value = driver_status.get(status_key)
            if not raw_value:
                continue
            parts = raw_value.split()
            # Skip values that are not in "<scalar> <unit>" form instead of
            # failing on the unpack.
            if len(parts) != 2:
                continue
            print('metric %s float %s %s' % (metric_name, parts[0], parts[1]))

        print('status ok succeeded in obtaining docker stats.')


def main():
    """Parse the command line and report stats for the selected daemon."""

    parser = OptionParser()
    parser.add_option('-u', '--url', default='unix://var/run/docker.sock',
                      help='URL for Docker service (Unix or TCP socket).')
    (opts, _args) = parser.parse_args()

    DockerService(opts.url).docker_stats()


if __name__ == '__main__':
    main()
def _lookup(mapping, path):
    """Return the value found by following *path* through nested dicts.

    Returns None as soon as one of the keys is missing.
    """
    current = mapping
    for key in path:
        if not isinstance(current, dict) or key not in current:
            return None
        current = current[key]
    return current


class DockerService(object):
    """Report resource usage metrics for one container of a Docker daemon.

    The daemon is assumed to be stopped until a stats sample has been read.
    """

    # (metric name, key path inside one stats sample). Every entry is
    # optional: a path that is absent from the sample is skipped.
    CONTAINER_METRICS = (
        ('cpu_total_usage', ('cpu_stats', 'cpu_usage', 'total_usage')),
        ('cpu_system_usage', ('cpu_stats', 'system_cpu_usage')),
        ('cpu_kernel_mode_usage',
         ('cpu_stats', 'cpu_usage', 'usage_in_kernelmode')),
        ('cpu_user_mode_usage',
         ('cpu_stats', 'cpu_usage', 'usage_in_usermode')),
        ('memory_max_usage', ('memory_stats', 'max_usage')),
        ('memory_total_cache', ('memory_stats', 'stats', 'total_cache')),
        ('pids_current', ('pids_stats', 'current')),
    )

    def __init__(self, url, container):
        """Remember the daemon URL and the container name or id."""

        self.url = url
        self.container = container
        self.docker_running = False

    def docker_stats(self):
        """Read one stats sample for the container and print agent lines.

        Prints 'status ok' followed by the metric lines and exits with
        status 0. If no sample can be obtained or decoded, prints
        'status err' and exits with status 1.
        """

        sample = None
        try:
            stream = Client(base_url=self.url).stats(self.container)
            # Only the first sample is used. Reading and decoding it inside
            # the guarded region means a failure is reported as 'status err'
            # rather than as a traceback after 'status ok' was printed.
            first = next(iter(stream), None)
            if isinstance(first, dict):
                sample = first
            elif first is not None:
                if isinstance(first, bytes):
                    first = first.decode('utf-8')
                sample = json.loads(first)
            self.docker_running = isinstance(sample, dict)
        # Deliberately broad: any failure means the stats are unavailable.
        except Exception:
            self.docker_running = False

        if not self.docker_running:
            print('status err failed to obtain docker container stats.')
            sys.exit(1)

        print('status ok succeeded in obtaining docker container stats.')
        for metric_name, path in self.CONTAINER_METRICS:
            value = _lookup(sample, path)
            if value is not None:
                print('metric %s int64 %s' % (metric_name, value))

        if 'network' in sample:
            # Single aggregate network section.
            print_network_stat(sample['network'])
        elif 'networks' in sample:
            # One section per interface: report each one with an interface
            # suffix, then the sum over all interfaces without a suffix.
            totals = {"rx_bytes": 0, "rx_packets": 0,
                      "tx_bytes": 0, "tx_packets": 0}
            for ifname in sample['networks']:
                counters = sample['networks'][ifname]
                for counter_name in totals:
                    totals[counter_name] += counters[counter_name]
                print_network_stat(counters, suffix='_' + ifname)
            print_network_stat(totals)

        sys.exit(0)


def print_network_stat(n, suffix=''):
    """Print the four network counter metrics held in mapping *n*.

    *suffix* is appended to each metric name (used for per-interface lines).
    """
    print("metric network_rx_bytes%s int64 %d" % (suffix, n['rx_bytes']))
    print("metric network_rx_packets%s int64 %d" % (suffix, n['rx_packets']))
    print("metric network_tx_bytes%s int64 %d" % (suffix, n['tx_bytes']))
    print("metric network_tx_packets%s int64 %d" % (suffix, n['tx_packets']))


def main():
    """Parse the command line and report stats for the chosen container."""

    parser = OptionParser()
    parser.add_option('-u', '--url', default='unix://var/run/docker.sock',
                      help='URL for Docker service (Unix or TCP socket).')
    parser.add_option('-c', '--container',
                      help='Name or Id of container that you want to monitor')
    (opts, _args) = parser.parse_args()
    if opts.container is None:
        parser.error("options -c is mandatory")

    DockerService(opts.url, opts.container).docker_stats()


if __name__ == '__main__':
    main()
STATUS_OK = "status Elasticsearch returned a response"


def bug_out(why):
    '''Something went wrong. Tell the agent what, then die (exit status 1).'''

    print("status %s" % (why,))
    exit(1)


def call_to_cluster(host, path):
    '''Fetch host + path and return the decoded JSON document.

    Any failure to open the URL or to decode the body is reported through
    bug_out(), which terminates the process.
    '''

    # urllib2 exists only on Python 2; fall back to its Python 3 equivalents
    # so the same call works on either interpreter.
    try:
        from urllib2 import urlopen, URLError
    except ImportError:
        from urllib.request import urlopen
        from urllib.error import URLError

    try:
        r = urlopen('{h}{p}'.format(h=host, p=path))
    except (URLError, ValueError) as e:
        bug_out(e)

    try:
        try:
            body = r.read()
        finally:
            # Always release the connection, even if the read fails.
            r.close()
        if isinstance(body, bytes):
            body = body.decode('utf-8')
        response = json.loads(body)
    except Exception as e:  # improve this...
        bug_out(e)

    return response


def get_stats(host, keyname):
    '''Return a dict of stats from /_cluster/nodes/_local/stats.
    Keyname can be one of: docs, search, indexing, store, get

    An unexpected response shape is reported through bug_out().'''

    h = call_to_cluster(host, '/_cluster/nodes/_local/stats')

    try:
        nodes = h['nodes']
        # The first node listed in the response is the one reported on.
        node_name = next(iter(nodes))
        return nodes[node_name]['indices'][keyname]
    except (KeyError, StopIteration, TypeError) as e:
        bug_out("unexpected node stats response: %r" % (e,))


def _print_metrics(values, specs):
    '''Print STATUS_OK, then one "metric <name> <type> <value>" line for
    each (name, type) pair in specs, reading the value from values.

    All lines are built before anything is printed, so a missing key is
    reported through bug_out() instead of leaving partial output.'''

    try:
        lines = ["metric %s %s %s" % (name, kind, values[name])
                 for name, kind in specs]
    except KeyError as e:
        bug_out("response is missing metric %s" % (e,))

    print(STATUS_OK)
    for line in lines:
        print(line)


def cluster_health(option, opt, value, parser):
    '''Print metrics about /_cluster/health.'''

    h = call_to_cluster(parser.values.host, '/_cluster/health')

    _print_metrics(h, (
        ('status', 'string'),
        ('number_of_nodes', 'uint32'),
        ('unassigned_shards', 'uint32'),
        ('timed_out', 'string'),
        ('active_primary_shards', 'uint32'),
        ('cluster_name', 'string'),
        ('relocating_shards', 'uint32'),
        ('active_shards', 'uint32'),
        ('initializing_shards', 'uint32'),
        ('number_of_data_nodes', 'uint32'),
    ))


def stats_store(option, opt, value, parser):
    '''Print store metrics from /_cluster/nodes/_local/stats.'''

    s = get_stats(parser.values.host, 'store')

    _print_metrics(s, (
        ('size_in_bytes', 'uint64'),
        ('throttle_time_in_millis', 'uint32'),
    ))


def stats_indexing(option, opt, value, parser):
    '''Print indexing metrics from /_cluster/nodes/_local/stats.'''

    s = get_stats(parser.values.host, 'indexing')

    _print_metrics(s, (
        ('delete_time_in_millis', 'uint32'),
        ('delete_total', 'uint64'),
        ('delete_current', 'uint32'),
        ('index_time_in_millis', 'uint32'),
        ('index_total', 'uint64'),
        ('index_current', 'uint32'),
    ))


def stats_get(option, opt, value, parser):
    '''Print GET metrics from /_cluster/nodes/_local/stats.'''

    s = get_stats(parser.values.host, 'get')

    # Every line needs a type field; the two *_time_in_millis metrics use
    # the same uint32 type as time_in_millis.
    _print_metrics(s, (
        ('missing_total', 'uint32'),
        ('exists_total', 'uint32'),
        ('current', 'uint32'),
        ('time_in_millis', 'uint32'),
        ('missing_time_in_millis', 'uint32'),
        ('exists_time_in_millis', 'uint32'),
        ('total', 'uint32'),
    ))


def stats_search(option, opt, value, parser):
    '''Print search metrics from /_cluster/nodes/_local/stats.'''

    s = get_stats(parser.values.host, 'search')

    _print_metrics(s, (
        ('query_total', 'uint64'),
        ('fetch_time_in_millis', 'uint32'),
        ('fetch_total', 'uint64'),
        ('query_time_in_millis', 'uint32'),
        ('open_contexts', 'uint32'),
        ('fetch_current', 'uint32'),
        ('query_current', 'uint32'),
    ))


def stats_docs(option, opt, value, parser):
    '''Print doc metrics from /_cluster/nodes/_local/stats.'''

    s = get_stats(parser.values.host, 'docs')

    _print_metrics(s, (
        ('count', 'uint64'),
        ('deleted', 'uint32'),
    ))


if __name__ == "__main__":
    parser = OptionParser()

    parser.add_option("-H", "--host",
                      action="store", type="string", dest="host",
                      default="http://localhost:9200")

    mg = OptionGroup(parser, "Possible Metric Groups")
    # The requested groups are only recorded while parsing and are run once
    # parsing has finished, so that -H takes effect wherever it appears on
    # the command line.
    for flag, handler in (("--cluster-health", cluster_health),
                          ("--stats-store", stats_store),
                          ("--stats-indexing", stats_indexing),
                          ("--stats-get", stats_get),
                          ("--stats-search", stats_search),
                          ("--stats-docs", stats_docs)):
        mg.add_option(flag, action="append_const", dest="metric_groups",
                      const=(flag, handler))

    parser.add_option_group(mg)
    (options, args) = parser.parse_args()

    for flag, handler in (options.metric_groups or ()):
        handler(None, flag, None, parser)
STATUS_OK = "status etcd returned a response"


def bug_out(why):
    '''Something went wrong. Tell the agent what, then die (exit status 1).'''

    print("status %s" % (why,))
    exit(1)


def call_to_server(url, path):
    '''Fetch url + path and return the decoded JSON document.

    Any failure to open the URL or to decode the body is reported through
    bug_out(), which terminates the process.
    '''

    # urllib2 exists only on Python 2; fall back to its Python 3 equivalents
    # so the same call works on either interpreter.
    try:
        from urllib2 import urlopen, URLError
    except ImportError:
        from urllib.request import urlopen
        from urllib.error import URLError

    try:
        r = urlopen('{u}{p}'.format(u=url, p=path))
    except (URLError, ValueError) as e:
        bug_out(e)

    try:
        try:
            body = r.read()
        finally:
            # Always release the connection, even if the read fails.
            r.close()
        if isinstance(body, bytes):
            body = body.decode('utf-8')
        response = json.loads(body)
    except Exception as e:  # improve this...
        bug_out(e)

    return response


def get_stats(url):
    '''Print the metrics read from /v2/stats/self, then exit with status 0.'''

    s = call_to_server(url, '/v2/stats/self')
    if not isinstance(s, dict):
        bug_out("unexpected response from /v2/stats/self")

    # i've seen etcd return {"state":""}, so make sure the agent accepts it.
    # A string metric with no value is not a valid line, so the same
    # fallback is applied to a missing or empty leader.
    state = s.get('state') or "unknown"
    leader = (s.get('leaderInfo') or {}).get('leader') or "unknown"

    print(STATUS_OK)
    print("metric state string %s" % (state,))
    print("metric leader string %s" % (leader,))
    # A counter that is absent from the response is reported as 0.
    print("metric recvAppendRequestCnt uint64 %s"
          % (s.get('recvAppendRequestCnt', 0),))
    print("metric sendAppendRequestCnt uint64 %s"
          % (s.get('sendAppendRequestCnt', 0),))

    exit(0)


if __name__ == "__main__":
    parser = OptionParser()

    parser.add_option("--url",
                      action="store", type="string", dest="url",
                      default="http://localhost:4001")

    (options, args) = parser.parse_args()
    get_stats(options.url)
# If the plugin fails in any way, print why and exit nonzero.
# Defining this at top level shadows Kernel#fail for the rest of the script,
# so a call to `fail "message"` prints a status line instead of raising.
def fail(status="Unknown failure")
  puts "status #{status}"
  exit 1
end

# Store metrics in a hash and don't print them until we've completed.
# Recording the same name twice keeps only the most recent type and value.
def metric(name,type,value)
  @metrics[name] = {
    :type => type,
    :value => value
  }
end

# Once the script has succeeded without errors, print the status line
# followed by one "metric <name> <type> <value>" line per recorded metric.
def output_success
  puts "status Your new plugin is reporting metrics!"
  @metrics.each do |name,v|
    puts "metric #{name} #{v[:type]} #{v[:value]}"
  end
end

begin
  require 'optparse'
# LoadError is not a StandardError, so a bare `rescue` would never catch a
# failed require; it has to be named explicitly.
rescue LoadError
  fail "Failed to load required ruby gems!"
end

@metrics = {}
options = {}

# Parse a copy so ARGV itself is left untouched.
args = ARGV.dup

OptionParser.new do |o|
  o.banner = "Usage: #{$0} [options]"
  o.on('-o', '--my-option OPTION', 'Set OPTION to be a valid option') do |s|
    options[:option] = s
  end
  o.on_tail('-h', '--help', 'Show this message') { puts o; exit }
  o.parse!(args)
end

# Error handling by option/input validation and begin;rescue;end is recommended
if false
  fail "I checked to make sure this would succeed, and it didn't"
end

# Gather metrics using your own code here.
# Call metric(name,type,value) for every metric you want to record.

# Faking metrics for this example
metric("example","int64",40895)
metric("fake_http_code","string","500")

output_success
def main():
    """Print status and age/size/mode metrics for the path in sys.argv[1].

    Exactly one command-line argument (the target path) is required. A
    missing argument or a missing target is reported as a 'status err' line
    and the process exits with status 0. Any error while inspecting the
    target is reported as a 'status err' line as well.
    """
    if len(sys.argv) != 2:
        # Reported as a status line so the agent receives a parseable result.
        print("status err Requires a full path to the target passed as an "
              "argument")
        sys.exit(0)

    path = sys.argv[1]
    if not os.path.exists(path):
        print("status err target does not exist")
        sys.exit(0)

    try:
        details = os.stat(path)
        # Age is measured from st_ctime, in whole seconds.
        age = int(time.time() - details.st_ctime)
        size = details.st_size
        # oct() renders as '0644' on Python 2 but '0o644' on Python 3, so
        # the permission bits are formatted explicitly in the Python 2 form.
        perms = details.st_mode & 0o777
        mode = '0%o' % perms if perms else '0'

        print("status ok target exists")
        print("metric age int %s" % (age,))
        print("metric bytes int %s" % (size,))
        print("metric mode string %s" % (mode,))
    except Exception as e:
        print("status err Exception discovered: {0}".format(str(e)))


if __name__ == '__main__':
    main()
4 | # 5 | # USAGE; 6 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 7 | # and run like this: 8 | # 9 | # hadoop_hbase.py [OPTIONS] 10 | # 11 | # OPTIONS 12 | # -b PATH Pass in the hbase binary path 13 | # -u user Set the Hadoop HBase user name envariable. 14 | # 15 | # Requires: Python 2.6+ May work on Python 3+ 16 | # 17 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 18 | # 19 | # 20 | # if (metric['dead_regionservers'] > 0) { 21 | # return new AlarmStatus(WARNING, 'HBase has #{dead_regionservers} dead region servers'); 22 | # } 23 | # 24 | # if (metric['dead_regionservers_percent'] > 20) { 25 | # return new AlarmStatus(CRITICAL, 'HBase has #{dead_regionservers_percent}% dead region servers'); 26 | # } 27 | # 28 | # return new AlarmStatus(OK, 'HBase OK'); 29 | # 30 | # 31 | # Copyright (c) 2014, Dave Beckett 32 | # All rights reserved. 33 | # 34 | # Redistribution and use in source and binary forms, with or without 35 | # modification, are permitted provided that the following conditions are met: 36 | # 37 | # Redistributions of source code must retain the above copyright notice, 38 | # this list of conditions and the following disclaimer. 39 | # 40 | # Redistributions in binary form must reproduce the above copyright 41 | # notice, this list of conditions and the following disclaimer in the 42 | # documentation and/or other materials provided with the distribution. 43 | # 44 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 45 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 46 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 47 | # ARE DISCLAIMED. 
HBASE = '/usr/bin/hbase'

# Constants: patterns matched against each line of the hbase shell output.
ERROR_RE = re.compile(r'^ERROR: (.+)')
LIVE_RE = re.compile(r'^(\d+) live servers')
DEAD_RE = re.compile(r'^(\d+) dead servers')
LOAD_RE = re.compile(r'^Aggregate load: (\d+)')
REGIONS_RE = re.compile(r'^Aggregate load: \d+, regions: (\d+)')


def _parse_status_output(text):
    """Turn hbase shell output into a {name: (value, type)} metrics dict.

    Raises Exception if the output contains an 'ERROR: ...' line.
    """
    metrics = {}
    total_rs = 0

    for line in text.split('\n'):
        matches = ERROR_RE.match(line)
        if matches is not None:
            raise Exception(
                "hbase shell returned error {0}".format(matches.group(1)))

        matches = LIVE_RE.match(line)
        if matches is not None:
            v = int(matches.group(1))
            total_rs += v
            metrics['live_regionservers'] = (v, 'uint32')

        matches = DEAD_RE.match(line)
        if matches is not None:
            v = int(matches.group(1))
            total_rs += v
            metrics['dead_regionservers'] = (v, 'uint32')

        matches = LOAD_RE.match(line)
        if matches is not None:
            metrics['aggregate_load'] = (int(matches.group(1)), 'uint32')

        matches = REGIONS_RE.match(line)
        if matches is not None:
            metrics['regions'] = (int(matches.group(1)), 'uint32')

    metrics['total_regionservers'] = (total_rs, 'uint32')

    for k in ['live_regionservers', 'dead_regionservers']:
        if k in metrics:
            # With no region servers at all there is nothing to divide by;
            # report 0.00 rather than failing.
            share = metrics[k][0] * 100.0 / total_rs if total_rs else 0.0
            metrics[k + '_percent'] = ("{0:.2f}".format(share), 'double')

    return metrics


def get_hbase_status_metrics(hbase):
    """ Get HBase status metrics

    Writes a two-line script (status 'simple', exit) to a temporary file,
    runs it with '<hbase> shell <file>' and parses the command's standard
    output. The temporary file is always removed afterwards.

    :param hbase Path to 'hbase' command
    :returns dict mapping metric name to a (value, type string) tuple
    :raises Exception if the script cannot be written, the command cannot
        be run, or the shell output contains an 'ERROR: ...' line
    """

    f = None
    try:
        f = NamedTemporaryFile(delete=False)
        # The temporary file is opened in binary mode, so write bytes.
        f.write(b"status 'simple'\nexit\n")
        f.close()
    except Exception as e:
        if f is not None:
            # Do not leave a half-written script behind.
            try:
                f.close()
                os.unlink(f.name)
            except OSError:
                pass
        raise Exception(
            "write to {0} failed {1}".format(f.name if f else "", str(e)))

    # Call the hbase CLI command to get basic status
    cmd = [hbase, 'shell', f.name]
    try:
        result = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                  stderr=DEVNULL).communicate()[0]
        # On Python 3 the pipe yields bytes; the parser works on text.
        if not isinstance(result, str):
            result = result.decode('utf-8', 'replace')
        metrics = _parse_status_output(result)
    except Exception as e:
        raise Exception("command {0} failed {1}".format(str(cmd), str(e)))
    finally:
        os.unlink(f.name)

    return metrics


def main():
    """Parse the command line, collect the metrics and print agent lines.

    Prints 'status ok' and one 'metric' line per metric, or a 'status err'
    line if collection failed. Always exits with status 0.
    """

    parser = argparse.ArgumentParser(description='HBase status metrics')
    parser.add_argument('-b', '--hbase',
                        default = HBASE,
                        help = 'hbase command (default: {0})'.format(HBASE))
    parser.add_argument('-u', '--user',
                        default = None,
                        help = 'user')

    args = parser.parse_args()

    ######################################################################

    hbase = args.hbase
    user = args.user
    if user is not None:
        os.putenv("HADOOP_USER_NAME", user)

    try:
        metrics = get_hbase_status_metrics(hbase)
        print("status ok")
        for k, t in metrics.items():
            if t is not None:
                (v, type_str) = t
                print("metric {0} {1} {2}".format(k, type_str, v))
    except Exception as e:
        print("status err exception {0}".format(str(e)))

    sys.exit(0)


if __name__ == '__main__':
    main()
45 | # 46 | # Redistribution and use in source and binary forms, with or without 47 | # modification, are permitted provided that the following conditions are met: 48 | # 49 | # Redistributions of source code must retain the above copyright notice, 50 | # this list of conditions and the following disclaimer. 51 | # 52 | # Redistributions in binary form must reproduce the above copyright 53 | # notice, this list of conditions and the following disclaimer in the 54 | # documentation and/or other materials provided with the distribution. 55 | # 56 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 57 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 58 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 59 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 60 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 61 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 62 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 63 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 64 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 65 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 66 | # POSSIBILITY OF SUCH DAMAGE. 
#
#

from __future__ import print_function

import os
import re
import sys
import subprocess

try:
    from subprocess import DEVNULL # py3k
except ImportError:
    import os
    DEVNULL = open(os.devnull, 'wb')

# 2.7+
import argparse


HADOOP = '/usr/bin/hadoop'

# Constants

# Metric name -> (regexp applied to each summary line of the report or None,
# agent metric type).  Entries whose regexp is None are never parsed from the
# report.
METRIC_CONFIG = {
    # Bytes
    'total': (None, 'uint64'),  # calculated below
    'total_configured': (re.compile(r'Configured Capacity: (\d+)'), 'uint64'),
    'total_present': (re.compile(r'Present Capacity: (\d+)'), 'uint64'),
    'free': (re.compile(r'DFS Remaining: (\d+)'), 'uint64'),
    'free_percent': (None, 'double'),  # calculated below
    'used': (re.compile(r'DFS Used: (\d+)'), 'uint64'),
    # The report prints a fractional percentage (e.g. "72.22%"); keep the
    # decimals instead of truncating at the dot.
    'used_percent': (re.compile(r'DFS Used%: ([\d.]+)'), 'double'),

    # Blocks
    'blocks_under_replicated': (re.compile(r'Under replicated blocks: (\d+)'), 'uint64'),
    'blocks_missing': (re.compile(r'Missing blocks: (\d+)'), 'uint64'),
    'blocks_with_corrupt_replicas': (re.compile(r'Blocks with corrupt replicas: (\d+)'), 'uint64'),

    # Datanodes
    # These 4 are not calculated yet; they need datanode blocks parsing
    'used_non_dfs': (None, 'uint64'),
    'used_non_dfs_percent': (None, 'double'),
    'datanode_remaining_max': (None, 'uint32'),
    'datanode_remaining_min': (None, 'uint32'),

    'datanodes_available': (re.compile(r'Datanodes available: (\d+)'), 'uint32'),
    'datanodes_dead': (re.compile(r'Datanodes available: \d+ \(\d+ total, (\d+) dead'), 'uint32'),
    'datanodes_total': (re.compile(r'Datanodes available: \d+ \((\d+) total'), 'uint32'),
}


def _parse_dfsadmin_report(report):
    """Turn the text of a dfsadmin report into a metrics dictionary.

    Only the cluster summary is read: parsing stops at the first line that
    starts with 'Name: ', which is where the per-datanode sections begin.

    Returns a dict mapping metric name to a (value string, type string)
    tuple.  'free_percent' is always present and is None when it cannot be
    calculated.
    """
    metrics = {}
    for line in report.splitlines():
        if line.startswith('Name: '):
            break
        for name, (regexp, type_str) in METRIC_CONFIG.items():
            if regexp is None:
                continue
            matches = regexp.match(line)
            if matches is not None:
                metrics[name] = (matches.group(1), type_str)

    # Calculate capacity
    if 'total_configured' in metrics:
        metrics['capacity'] = metrics['total_configured']

    # Calculate free_percent; a zero capacity must not raise
    # ZeroDivisionError, it simply leaves the metric unset.
    metrics['free_percent'] = None
    if 'free' in metrics and 'total_configured' in metrics:
        remaining = int(metrics['free'][0])
        capacity = int(metrics['total_configured'][0])
        if capacity > 0:
            v = "{0:.2f}".format(remaining * 100.0 / capacity)
            metrics['free_percent'] = (v, 'double')

    return metrics


def get_hdfs_status_metrics(hadoop):
    """ Get HDFS status metrics

    Runs '<hadoop> dfsadmin -report' and parses its summary section.

    hadoop -- path of the hadoop command to execute.

    Returns a dict of metric name -> (value, type) tuples; an entry may be
    None when the metric could not be calculated.

    Raises Exception if the command cannot be started or exits with a
    non-zero status.

    """

    # Call the hdfs CLI command to get basic status
    cmd = [hadoop, 'dfsadmin', '-report']
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=DEVNULL)
        result = proc.communicate()[0]
    except Exception as e:
        raise Exception("command {0} failed {1}".format(str(cmd), str(e)))

    # A failed command would otherwise be reported as "status ok" with no
    # metrics at all.
    if proc.returncode != 0:
        raise Exception("command {0} failed with exit status {1}".format(
            str(cmd), proc.returncode))

    # Python 3 hands back bytes; Python 2 already returns str.
    if not isinstance(result, str):
        result = result.decode('utf-8', 'replace')

    return _parse_dfsadmin_report(result)


def main():
    """Main method"""

    parser = argparse.ArgumentParser(description='HDFS status metrics')
    parser.add_argument('-H', '--hadoop',
                        default=HADOOP,
                        help='hadoop command (default: {0})'.format(HADOOP))
    parser.add_argument('-u', '--user',
                        default=None,
                        help='user')

    args = parser.parse_args()

    ######################################################################

    hadoop = args.hadoop
    user = args.user
    if user is not None:
        os.putenv("HADOOP_USER_NAME", user)

    try:
        metrics = get_hdfs_status_metrics(hadoop)
        print("status ok")
        for k, t in metrics.items():
            if t is not None:
                (v, type_str) = t
                print("metric {0} {1} {2}".format(k, type_str, v))
    except Exception as e:
        print("status err exception {0}".format(str(e)))

    # The agent reads the status line, so the exit code stays 0.
    sys.exit(0)

if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /hadoop_jobtracker.py:
# --------------------------------------------------------------------------------
#!/usr/bin/python
#
# Rackspace Cloud Monitoring Plugin to read HDFS metrics.
#
# USAGE;
# Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
# and run like this:
#
# hadoop_jobtracker.py [OPTIONS]
#
# OPTIONS
# -n host Set the namenode host (REQUIRED)
# -p port Set the namenode port
# -u user Set the Hadoop HDFS user name envariable.
#
#
# Requires: Python 2.7+
#
# The following is an example 'criteria' for a Rackspace Monitoring Alarm:
#
#
# if (metric['dead_nodes'] > 0) {
# return new AlarmStatus(CRITICAL, 'Map-Reduce has #{dead_nodes} dead nodes');
# }
#
# if (metric['dead_nodes'] > 2) {
# return new AlarmStatus(CRITICAL, 'Map-Reduce has #{dead_nodes} dead nodes');
# }
#
# return new AlarmStatus(OK, 'Map-Reduce Job Tracker OK');
#
#
# Copyright (c) 2014, Dave Beckett
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
#

from __future__ import print_function

import json
import os
import re
import sys

# 2.7+
import argparse

# Keep the historical module name bound on both major Python versions.
try:
    import urllib2
except ImportError:
    import urllib.request as urllib2


# Constants

# All types are uint32
METRIC_TYPE = 'uint32'

METRIC_NAMES = [
    'reduce_slots',
    'map_slots_used',
    'total_nodes',
    'dead_nodes',
    'map_slots',
    'alive_nodes',
    'reduce_slots_used'
]


def get_namenode_bean_data(namenode_host, namenode_port=50030, timeout=10):
    """ Get the namenode bean data for namenode

    Fetches the JobTrackerInfo bean over HTTP from the JMX servlet.

    namenode_host -- host name to query.
    namenode_port -- HTTP port of the servlet (default 50030).
    timeout       -- seconds to wait for the HTTP request, so a dead host
                     cannot hang the plugin.

    Returns the first entry of the 'beans' list of the JSON response.

    Raises Exception if the URL cannot be read or decoded.
    """

    # JMX URI for the hadoop namenode to get the JobTrackerInfo
    url = "http://{0}:{1}/jmx?qry=Hadoop%3Aservice%3DJobTracker%2Cname%3DJobTrackerInfo".format(namenode_host, namenode_port)

    try:
        response = urllib2.urlopen(url, timeout=timeout)
        try:
            content = response.read()
        finally:
            response.close()
        # Python 3 returns bytes from read(); decode before parsing.
        if not isinstance(content, str):
            content = content.decode('utf-8', 'replace')
        data = json.loads(content)
        beans = data['beans'][0]
    except Exception as e:
        raise Exception("Error reading {0} url JSON - {1}".format(url, str(e)))

    return beans


def get_summary_metrics(d):
    """Get metrics for the AliveNodesInfoJson

    d -- decoded summary dictionary with 'nodes', 'alive' and 'slots' keys.

    Returns a dict of metric name -> integer value; 'dead_nodes' is derived
    as nodes minus alive.
    """

    nodes_count = d['nodes']
    alive_count = d['alive']
    slots = d['slots']
    metrics = {
        'total_nodes': nodes_count,
        'alive_nodes': alive_count,
        'dead_nodes': nodes_count - alive_count,
        'map_slots': slots['map_slots'],
        'reduce_slots': slots['reduce_slots'],
        'map_slots_used': slots['map_slots_used'],
        'reduce_slots_used': slots['reduce_slots_used']
    }

    return metrics


def get_job_tracker_metrics(beans):
    """Extract the job tracker metrics from the JobTrackerInfo bean.

    beans -- bean dictionary as returned by get_namenode_bean_data().

    Returns the dict built by get_summary_metrics().

    Raises Exception when the bean has no 'SummaryJson' attribute or when
    that attribute is not valid JSON.
    """

    summaryJson = beans.get('SummaryJson', None)
    if summaryJson is None:
        raise Exception("No SummaryJson data in XML")

    try:
        summaryData = json.loads(summaryJson)
    except Exception as e:
        raise Exception("Error reading summary JSON - {0}: {1}".format(str(e), summaryJson))

    # A literal JSON "null" decodes to None and carries no data either.
    if summaryData is None:
        raise Exception("No SummaryJson data in XML")

    return get_summary_metrics(summaryData)


def main():
    """Main method"""

    parser = argparse.ArgumentParser(description='HDFS status metrics')
    parser.add_argument('-n', '--namenode',
                        default=None,
                        help='namenode host')
    parser.add_argument('-p', '--port',
                        default=50030,
                        type=int,
                        help='namenode port (Default 50030)')
    parser.add_argument('-u', '--user',
                        default=None,
                        help='user')

    args = parser.parse_args()

    ######################################################################

    user = args.user
    if user is not None:
        os.putenv("HADOOP_USER_NAME", user)
    namenode_host = args.namenode
    if namenode_host is None:
        # Prefix with "status err" so the agent gets a well-formed status line.
        print("status err must give namenode host name")
        sys.exit(1)
    namenode_port = args.port

    try:
        beans = get_namenode_bean_data(namenode_host, namenode_port)
        metrics = get_job_tracker_metrics(beans)

        print("status ok")
        for k, v in metrics.items():
            print("metric {0} {1} {2}".format(k, METRIC_TYPE, v))
    except Exception as e:
        print("status err exception {0}".format(str(e)))

    sys.exit(0)

if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /haproxy.rb:
# --------------------------------------------------------------------------------
#!/usr/bin/env ruby
## Rackspace Cloud Monitoring Plug-In
## HAProxy Stats
#
# ----------------------------------------------------------------------------
# "THE BEER-WARE LICENSE" (Revision 42):
# wrote this file. As long as you retain this notice you
# can do whatever you want with this stuff. If we meet some day, and you think
# this stuff is worth it, you can buy me a beer in return
# ----------------------------------------------------------------------------
#
# haproxy.rb
# - Takes HAProxy stats and grabs connections, rate, and check time
# for every listener and every backend server, and prints it using
# Rackspace Cloud Montioring metric lines
#
# Usage:
# Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
#
# The following is an example 'criteria' for a Rackspace Monitoring Alarm:
#
# if (metric['connections'] == 0) {
# return new AlarmStatus(CRITICAL, 'No connections to your HAProxy!
#
# if (metric['connections'] < '10') {
# return new AlarmStatus(WARNING, 'Less than 10 connections to your HAProxy!');
# }
#
# return new AlarmStatus(OK, 'HAProxy connections normal');
#
# Please note that you will need to adjust the thresholds based on workload.
# Also, there are other metrics this plugin reports you may find useful, but
# the metricnames for these will vary based on your HAProxy cluster name.
#

# Print a status line and abort the check with exit code 1.
# NOTE: intentionally shadows Kernel#fail for the rest of this script.
def fail(status = 'Unknown failure')
  puts "status #{status}"
  exit 1
end

# Record one metric; a later call with the same name replaces the earlier one.
def metric(name, type, value)
  @metrics[name] = {
    :type => type,
    :value => value
  }
end

# Print the success status line followed by every recorded metric.
def output_success
  puts 'status HAProxy is running and reporting metrics'
  @metrics.each do |name, v|
    puts "metric #{name} #{v[:type]} #{v[:value]}"
  end
end

begin
  require 'optparse'
  require 'socket'
rescue LoadError
  # LoadError is a ScriptError, not a StandardError, so a bare `rescue`
  # would never catch it.
  fail 'Failed to load required ruby gems'
end

@metrics = {}
options = {
  :limit => 10
}

args = ARGV.dup

OptionParser.new do |o|
  o.banner = "Usage: #{$PROGRAM_NAME} [options]"
  o.on('-s', '--stats-socket SOCKET', 'Specify the HAProxy stats socket') do |s|
    options[:sock] = s
  end
  o.on('-l', '--limit BACKEND_COUNT', 'Specify a limit of how many backends to report. Default is 10.') do |l|
    options[:limit] = l.to_i
  end
  o.on_tail('-h', '--help', 'Show this message') { puts o; exit }
  o.parse!(args)
end

fail 'You must specify the haproxy stats socket' if options[:sock].nil?

# `pidof` prints nothing when no process matches and String#to_i then gives
# 0.  0 is truthy in Ruby, so `to_i || fail(...)` could never fail; test the
# value explicitly instead.
pid = `pidof haproxy`.chomp.to_i
fail 'HAProxy is not running' if pid.zero?

# get global frontend stats
begin
  # The block form closes the socket even when reading raises.
  UNIXSocket.open(options[:sock]) do |ctl|
    ctl.puts 'show info'

    while (line = ctl.gets)
      # Split into separate variables: reassigning `line` to an Array and
      # then matching it with =~ raises NoMethodError on Ruby >= 3.2.
      key, value = line.split(':', 2)
      case key
      when 'CurrConns'
        metric('connections', 'int', value.to_i)
      when 'ConnRate'
        metric('connection_rate', 'int', value.to_i)
      end
    end
  end
rescue StandardError
  fail "Problem reading global stats from #{options[:sock]}"
end

# get per-backend stats
begin
  UNIXSocket.open(options[:sock]) do |ctl|
    ctl.puts 'show stat'

    reported = 0
    while (line = ctl.gets)
      # skip the "# pxname,svname,..." header and blank lines
      next unless line =~ /^[^#]\w+/
      # once the limit is reached the remaining rows are read and discarded
      next unless reported < options[:limit]

      fields = line.split(',')
      host = "#{fields[0]}_#{fields[1]}".tr('-', '_').tr('.', '_')
      metric("#{host}_request_rate", 'int', fields[47].to_i)
      metric("#{host}_total_requests", 'gauge', fields[49].to_i)
      metric("#{host}_current_queue", 'int', fields[3].to_i)
      metric("#{host}_health_check_duration", 'int', fields[35].to_i)
      reported += 1
    end
  end
rescue StandardError
  fail "Problem reading backend stats from #{options[:sock]}"
end

output_success

# --------------------------------------------------------------------------------
# /jmx-gather.sh:
# --------------------------------------------------------------------------------
#!/bin/sh
#
# Rackspace Cloud Monitoring Plug-In
# Gathers JMX MBean attribute values from a specified ObjectName via a remote JMX RMI endpoint.
#
# (c) 2017 Geoff Bourne
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | # 21 | # Usage: 22 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 23 | # 24 | # It accepts three or more arguments: 25 | # * host:port of the JMX RMI endpoint to access, such as "localhost:9080" 26 | # * the MBean's ObjectName, such as "java.lang:type=Memory" 27 | # * an attribute name of the MBean, such as "HeapMemoryUsage" 28 | # * an attribute name... 29 | # 30 | # Returns a metric for each attribute that was found. Metrics are hardcoded to be typed as "gauge" since their 31 | # originating type can vary and is dictated by the MBean accessed. 32 | # NOTE: 33 | # * if the MBean was not found, the status is reported "ok" with no metrics reported 34 | # * if an attribute is not found on the given MBean that attribute's metric line is simply omitted 35 | 36 | # Cleanup and setup our JavaScript code to run through Java's jrunscript 37 | trap "rm -f /tmp/gather-mbean-$$.js" EXIT INT QUIT TSTP 38 | 39 | cat > /tmp/gather-mbean-$$.js < 8 | # Copyright (c) 2016, Horizon Discovery Plc. 9 | # All rights reserved. 10 | # 11 | # Redistribution and use in source and binary forms, with or without 12 | # modification, are permitted provided that the following conditions are met: 13 | # 14 | # Redistributions of source code must retain the above copyright notice, 15 | # this list of conditions and the following disclaimer. 
16 | # 17 | # Redistributions in binary form must reproduce the above copyright 18 | # notice, this list of conditions and the following disclaimer in the 19 | # documentation and/or other materials provided with the distribution. 20 | # 21 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 25 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 | # POSSIBILITY OF SUCH DAMAGE. 
32 | # 33 | # Curl Command: 34 | # curl -i -X POST -H 'Host: monitoring.api.rackspacecloud.com' -H 35 | # 'Accept-Encoding: gzip,deflate' -H 'X-Auth-Token: YOUR_API_TOKEN' -H 36 | # 'Content-Type: application/json; charset=UTF-8' -H 'Accept: 37 | # application/json' --data-binary '{"label": "Long Process Check", "type": 38 | # "agent.plugin", "details": {"args": ["PROCESS_NAME", "TIMEOUT"],"file": 39 | # "long_process.sh"}}' --compress 40 | # 'https://monitoring.api.rackspacecloud.com:443/v1.0/YOUR_ACCOUNT/entities/YOUR_ENTITY/checks' 41 | # 42 | # Usage: 43 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 44 | # 45 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 46 | # 47 | # if (metric['numprocs'] > 0) { 48 | # return new AlarmStatus(CRITICAL, '#{numprocs} long running processes(s): #{pids}'); 49 | # } 50 | # 51 | # return new AlarmStatus(OK, 'No long running processes.'); 52 | 53 | if [ "$#" -ne 2 ]; then 54 | cat < /dev/null 85 | 86 | # Check for a failure in the pipe 87 | PIPEEXIT=$? 88 | if [ $PIPEEXIT -ne 0 ]; then 89 | echo "status Fail" 90 | exit $PIPEEXIT 91 | fi 92 | 93 | # Numeric metric to compare against (number of processes) 94 | # Convert to array and count 95 | NUMPROCS=($PIDS) 96 | NUMPROCS=${#NUMPROCS[@]} 97 | 98 | echo "status Success" 99 | echo "metric numprocs int $NUMPROCS" 100 | if [ $NUMPROCS -ne 0 ]; then 101 | echo "metric pids string $PIDS" 102 | else 103 | echo "metric pids string -" 104 | fi 105 | -------------------------------------------------------------------------------- /lsyncd-status.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # lsyncd-status.sh 4 | # Rackspace Cloud Monitoring Plugin to verify the current return status of lsyncd. 5 | # 6 | # Copyright (c) 2013, Lindsey Anderson 7 | # All rights reserved. 
8 | # 9 | # Redistribution and use in source and binary forms, with or without 10 | # modification, are permitted provided that the following conditions are met: 11 | # 12 | # Redistributions of source code must retain the above copyright notice, 13 | # this list of conditions and the following disclaimer. 14 | # 15 | # Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 
#
#
# Verify the current status of lsyncd
# *****************
# Note this data may take a few minutes to populate metric data at first
# *****************
#
# Example criteria :
#
# if (metric['lsyncd_status'] != 'running') {
# return new AlarmStatus(CRITICAL, 'Lsyncd Service is NOT running.');
# }
#
# if (metric['lsyncd_status'] == 'running' && metric['percent_used_watches'] >= 80) {
# return new AlarmStatus(WARNING, 'Lsyncd is running but the number of directories has reached 80% of notify watches.');
# }
#
# if (metric['lsyncd_status'] == 'running' && metric['percent_used_watches'] >= 95) {
# return new AlarmStatus(CRITICAL, 'Lsyncd is running but the number of directories has reached 95% of notify watches.');
# }
#
# return new AlarmStatus(OK, 'Lsyncd Service is running.');
#
# Requires awk, pgrep, sed and sysctl ('bc' is no longer needed)

SERVICE="lsyncd"

# Attempt to locate lsyncd configuration file (first match wins)
lsyncd_conf_file=""
for candidate in /etc/lsyncd.lua /etc/lsyncd.conf /etc/lsyncd/lsyncd.conf.lua /etc/lsyncd/lsyncd.conf; do
    if [ -e "${candidate}" ]; then
        lsyncd_conf_file="${candidate}"
        break
    fi
done

if [ -z "${lsyncd_conf_file}" ]; then
    echo "status ${SERVICE} not installed"
    exit 1
fi

# Every plugin must print a status line; alarms are driven by the metrics below.
echo "status ok ${SERVICE} metrics collected"

# Test if the service is running
if pgrep -x "${SERVICE}" > /dev/null; then
    echo "metric ${SERVICE}_status string running"
else
    echo "metric ${SERVICE}_status string notrunning"
fi

# Maximum number of inotify watches a user may hold
max_inotify_watches=$(sysctl -n fs.inotify.max_user_watches 2>/dev/null)

# 2.1.x status file contains number of directories watched. Avoids I/O overhead of find command.
lsyncd_status_file=$(sed -n 's/.*statusFile\s*=\s*"\(.*\)",.*/\1/p' "${lsyncd_conf_file}")
current_directories_to_watch=""
if [ -e "${lsyncd_status_file}" ]; then
    current_directories_to_watch=$(sed -n "s/Inotify watching \([0-9][0-9]*\) directories/\1/p" "${lsyncd_status_file}")
fi

# Fall back to old method if current_directories_to_watch is not a number
if ! [[ "${current_directories_to_watch}" =~ ^[0-9]+$ ]]; then
    # Collect the source="/..." values from the non-commented config lines
    watch_list=()
    for dir_watch in $(grep "source=\"/" "${lsyncd_conf_file}" | grep -ve '^[ ]*--'); do
        # The matching lines are split into words; ignore every word that is
        # not the source=... assignment itself.
        case "${dir_watch}" in
            source=*) ;;
            *) continue ;;
        esac
        current_dir=$(echo "${dir_watch}" | cut -d'=' -f2 | sed -e "s/\"//g" -e "s/,//g")
        watch_list+=("${current_dir}")
    done

    # Count directories below each unique source - not calculating for
    # multiple (nested) directories
    current_directories_to_watch=0
    while read -r source_dir; do
        [ -n "${source_dir}" ] || continue
        dir_count=$(find "${source_dir}" -type d 2>/dev/null | wc -l)
        current_directories_to_watch=$((current_directories_to_watch + dir_count))
    done < <(printf '%s\n' "${watch_list[@]}" | sort -u)
fi

# Calculate percentage of total; skipped when the watch limit is unknown or
# zero so the division can never fail.
if [[ "${max_inotify_watches}" =~ ^[0-9]+$ ]] && [ "${max_inotify_watches}" -gt 0 ]; then
    current_percentage=$(awk -v used="${current_directories_to_watch}" -v limit="${max_inotify_watches}" \
        'BEGIN { printf "%f", used / limit * 100 }')
    echo "metric percent_used_watches double ${current_percentage}"
fi

# --------------------------------------------------------------------------------
# /megaraid.sh:
# --------------------------------------------------------------------------------
#!/bin/bash
# Rackspace Cloud Monitoring Plug-In
# megaraid plugin to query SMART status of drives attached to LSI megaraid or
# DELL PERC {3,700} raid controllers.
5 | # 6 | # ---------------------------------------------------------------------------- 7 | # "THE BEER-WARE LICENSE" (Revision 42): 8 | # wrote this file. As long as you retain this notice 9 | # you can do whatever you want with this stuff. If we meet some day, and you 10 | # think this stuff is worth it, you can buy me a beer in return 11 | # ---------------------------------------------------------------------------- 12 | # 13 | # Usage: 14 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 15 | # 16 | # This plugin returns 5 metrics: 17 | # - failed : the number of drives in failed state, 18 | # - prefail : the number of drives in prefail state, 19 | # - unknown : the number of drives for which the smart state could not 20 | # be determined, 21 | # - ok : the number of drives in OK state, 22 | # - report : a string reporting the drive id, vendor, serial number 23 | # as well as the smart state for non-ok drives. 24 | # e.g. /dev/bus/0 -d megaraid,4 SEAGATE 6SL28GNF FAILED \ 25 | # ^controller & drive ids ^vendor ^serial# ^state 26 | # ( HARDWARE IMPENDING FAILURE GENERAL HARD DRIVE FAILURE [asc=5d, ascq=10] ) 27 | # ^SMART health status for this drive 28 | # 29 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 30 | # 31 | # if (metric['failed'] != 0) { 32 | # return new AlarmStatus(CRITICAL, '#{failed} failed drive(s): #{report}'); 33 | # } 34 | # 35 | # if (metric['prefail'] != 0) { 36 | # return new AlarmStatus(WARNING, '#{prefail} prefail drive(s): #{report}'); 37 | # } 38 | # 39 | # if (metric['unknown'] != 0) { 40 | # return new AlarmStatus(WARNING, '#{unknown} unknown drive(s): #{report}'); 41 | # } 42 | # 43 | # return new AlarmStatus(OK, '#{ok} drive(s) OK'); 44 | # 45 | # Things to keep in mind: 46 | # - this plugin needs a fairly recent version of smartmontools (tested OK with 6.2) 47 | # (apt-get install smartmontools) but does NOT need megacli. 
# - on big and loaded arrays, the plugin can take more than 10s (default agent plugin
# timeout) to complete. Some disks are slower than others, not surprisingly.
# - as of now, this plugin only checks individual drives and not the status of the
# array as seen by the controller. I'd add it, but it seems hard to extract without
# megacli which I'm trying to stay away from. If you know of a way, please let me
# know.
#
#
SMARTCTL=$(which smartctl)

# Per-state drive counters and the free-text report of every non-OK drive
OK_CNT=0
PREFAIL_CNT=0
FAILED_CNT=0
UNKNOWN_CNT=0
REPORT=""

# discover all drives
if ! DEVLIST=$(${SMARTCTL} --scan 2>/dev/null); then
  echo status failed to perform drive discovery
  exit 1
fi

# DEV is left unquoted on purpose: a scan entry carries the device path plus
# its "-d megaraid,N" option and must expand to several arguments.
while read DEV
do
  HEALTH_OUT=$(${SMARTCTL} ${DEV} --info --health 2>/dev/null)
  RC=$?
  SHS=$(echo "${HEALTH_OUT}" | grep -i 'smart health status:' | cut -d':' -f2)
  DRIVE_ID=$(echo "${HEALTH_OUT}" | grep -iE '(vendor:|serial number:)' | cut -d':' -f2 | xargs)

  if (( RC & 8 )); then
    # Bit 3: SMART status check returned "DISK FAILING".
    FAILED_CNT=$((FAILED_CNT + 1))
    STATE="FAILED"
  elif (( RC & 48 )); then
    # Bit 4: We found prefail Attributes <= threshold.
    # Bit 5: SMART status check returned "DISK OK" but we found that some (usage or prefail)
    # attributes have been <= threshold at some time in the past.
    PREFAIL_CNT=$((PREFAIL_CNT + 1))
    STATE="PREFAIL"
  elif (( RC != 0 )); then
    # Anything else (drive open failed, smart command failed, etc.) counts as unknown
    UNKNOWN_CNT=$((UNKNOWN_CNT + 1))
    STATE="UNKNOWN"
  else
    OK_CNT=$((OK_CNT + 1))
    continue
  fi

  REPORT="${REPORT} ${DEV} ${DRIVE_ID} ${STATE} (${SHS} ) "
# only care for /dev/bus devices. /dev/sd* are logical disks
# and do not respond to any SMART command.
done < <(echo "${DEVLIST}" | grep /dev/bus/ | cut -d'#' -f1)

if [ -z "${REPORT}" ]; then
  REPORT="all drives OK"
fi

echo "status smart status retrieved"
echo "metric failed uint32 ${FAILED_CNT}"
echo "metric prefail uint32 ${PREFAIL_CNT}"
echo "metric unknown uint32 ${UNKNOWN_CNT}"
echo "metric ok uint32 ${OK_CNT}"
echo "metric report string ${REPORT}"

exit 0

# --------------------------------------------------------------------------------
# /memcached_stats.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# """
# Rackspace Cloud Monitoring plugin to provide memcached statistics.
#
# Copyright 2013 Steve Katen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Minimal Example criteria:

if (metric['legacy_state'] != 'ok') {
 return new AlarmStatus(CRITICAL, 'memcache is NOT running.');
}
return new AlarmStatus(OK, 'memcache is running.');

"""
import sys
import re
import socket

# telnetlib was removed from the standard library in Python 3.13; the plain
# socket module is enough to send one command and read the reply.

# One "STAT <name> <value>" line of the stats reply.
STAT_RE = re.compile(r"STAT (\S+) (.*)\r")

# Counters printed verbatim as int metrics, in output order.
INT_METRICS = (
    'uptime',
    'curr_connections',
    'listen_disabled_num',
    'curr_items',
    'total_items',
    'evictions',
)


def memcached_stats(host, port, timeout=10):
    """Return the 'stats' reply of a memcached server as a dict.

    host    -- host name or address of the server.
    port    -- TCP port (an int or a numeric string).
    timeout -- seconds to wait for the connection and for each read.

    Returns a dict mapping stat name to its value (both strings), or None
    when the server cannot be reached or stops answering.
    """
    try:
        conn = socket.create_connection((host, port), timeout)
    except socket.error:
        return None

    reply = b""
    try:
        conn.sendall(b"stats\r\n")
        # The reply is terminated by a line holding just "END".
        while not reply.endswith(b"END\r\n"):
            chunk = conn.recv(4096)
            if not chunk:
                break
            reply += chunk
    except socket.error:
        return None
    finally:
        conn.close()

    return dict(STAT_RE.findall(reply.decode("ascii", "replace")))


def hit_percent(hits, misses):
    """Return the cache hit rate in percent, 0.0 when there were no gets."""
    total = hits + misses
    if total > 0:
        return 100 * float(hits) / float(total)
    else:
        return 0.0


def fill_percent(used, total):
    """Return used/total in percent, 0.0 when total is not positive."""
    if total > 0:
        return 100 * float(used) / float(total)
    return 0.0


def main():
    """Print the status line and the memcached metrics for <host> <port>."""
    if len(sys.argv) != 3:
        print("Usage: %s <host> <port>" % sys.argv[0])
        sys.exit(0)

    host = sys.argv[1]
    port = sys.argv[2]
    s = memcached_stats(host, port)

    if not s:
        print("status err unable to generate statistics")
        sys.exit(0)

    print("status ok memcached statistics generated")
    for name in INT_METRICS:
        # Skip counters this server version does not report instead of
        # dying with a KeyError after "status ok" was already printed.
        if name in s:
            print("metric %s int %s" % (name, s[name]))
    print("metric hit_percent float %s" % hit_percent(
        int(s.get('get_hits', 0)), int(s.get('get_misses', 0))))
    print("metric fill_percent float %s" % fill_percent(
        int(s.get('bytes', 0)), int(s.get('limit_maxbytes', 0))))

if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /mongodb_stats.py:
# --------------------------------------------------------------------------------
#!/usr/bin/env python
# """
# Rackspace
# Rackspace Cloud Monitoring plugin to provide mongodb statistics.
#
# Requirement:
#   pymongo - http://api.mongodb.org/python/current/
#
# Copyright 2013 Steve Katen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys


def safe_ratio(part, whole):
    """Return part / whole as a float, or 0.0 when whole is zero."""
    try:
        return float(part) / float(whole)
    except ZeroDivisionError:
        return 0.0


def mongodb_stats(host, p, database, username, password):
    """Return the ``serverStatus`` document of a MongoDB server.

    Args:
        host: Host name or address of the server.
        p: TCP port (string or int).
        database: Database to put in the connection URI; may be empty.
        username: User name; may be empty.
        password: Password; may be empty.

    Returns:
        The serverStatus dict, or None when pymongo is not installed or the
        server cannot be reached or queried.

    Raises:
        ValueError: if ``p`` is not a valid integer.
    """
    port = int(p)

    # Imported here so that a missing driver produces a "status err" line
    # instead of an import-time traceback.
    try:
        try:
            from pymongo import MongoClient as Client
        except ImportError:
            from pymongo import Connection as Client
        from pymongo.errors import PyMongoError
    except ImportError:
        return None
    try:
        from urllib.parse import quote_plus
    except ImportError:  # Python 2
        from urllib import quote_plus

    if username and password:
        # Escape the credentials so that characters such as '@' or ':' in a
        # password do not corrupt the URI.
        credentials = "%s:%s@" % (quote_plus(username), quote_plus(password))
    else:
        credentials = ""

    try:
        if credentials or database:
            uri = "mongodb://%s%s/%s" % (credentials, host, database or "")
            c = Client(uri, port)
        else:
            c = Client(host, port)
        # Recent drivers connect lazily, so the command must sit inside the
        # try block for connection errors to be caught.
        return c.test.command("serverStatus")
    except PyMongoError:
        return None


def build_metrics(s):
    """Translate a serverStatus document into agent ``metric`` lines.

    Args:
        s: serverStatus dict; must contain 'uptime' and 'connections'.

    Returns:
        A list of "metric <name> <type> <value>" strings. Sections the
        server does not report ('mem.mapped', 'indexCounters') are skipped.
        conn_percent and index_percent are ratios (current / available and
        hits / accesses), computed with true division.
    """
    connections = s['connections']
    lines = [
        "metric uptime float %s" % s['uptime'],
        "metric conn_available int %s" % connections['available'],
        "metric conn_current int %s" % connections['current'],
        "metric conn_percent float %s" % safe_ratio(connections['current'],
                                                    connections['available']),
    ]

    mem = s.get('mem') or {}
    if 'mapped' in mem:
        lines.append("metric mem_mapped int %s" % mem['mapped'])

    counters = s.get('indexCounters')
    if counters:
        # Use the 'btree' sub-document when the server reports one,
        # otherwise the counters at the top level of 'indexCounters'.
        counters = counters.get('btree') or counters
        hits = counters.get('hits', 0)
        lines.append("metric index_hits int %s" % hits)
        lines.append("metric index_misses int %s" % counters.get('misses', 0))
        lines.append("metric index_percent float %s"
                     % safe_ratio(hits, counters.get('accesses', 0)))

    if 'repl' in s:
        lines.append("metric is_replicating string true")
        lines.append("metric is_master string %s" % s['repl']['ismaster'])
        lines.append("metric is_secondary string %s" % s['repl']['secondary'])
    else:
        lines.append("metric is_replicating string false")
    return lines


def main():
    """Print the status line and the mongodb metrics for the agent."""
    if len(sys.argv) != 6:
        print("Usage: %s <host> <port> <database> <username> <password>"
              % sys.argv[0])
        sys.exit(0)

    s = mongodb_stats(*sys.argv[1:])

    if not s:
        print("status err unable to generate statistics")
        sys.exit(0)

    print("status ok mongodb statistics generated")
    for line in build_metrics(s):
        print(line)


if __name__ == '__main__':
    main()
## Murmur (Mumble server) user statistics.
## Requires ICE to be enabled on murmur, and the Murmur Mice package:
## http://wiki.mumble.info/wiki/Mice
##
## Reports the number of virtual servers, the number of connected users, how
## many of them can receive a voice stream and how many are deafened.

import sys
import os


def load_mice():
    """Import and return the ``mice`` module with its stdout output hidden.

    stdout is redirected to the null device during the import so that
    nothing printed while importing ``mice`` reaches the agent. stdout is
    restored and the null-device handle closed even if the import fails.
    """
    sys.path.append('/opt/murmur/ice')
    devnull = open(os.devnull, "w")
    sys.stdout = devnull
    try:
        import mice
    finally:
        sys.stdout = sys.__stdout__
        devnull.close()
    return mice


def count_users(servers):
    """Count the users connected to the given virtual servers.

    Args:
        servers: Iterable of server objects offering ``isRunning()`` and
            ``getUsers()`` (a mapping whose values carry ``selfDeaf`` and
            ``deaf`` flags).

    Returns:
        A tuple ``(online, listening, deaf)``. Servers that are not running
        are skipped.
    """
    online = 0
    listening = 0
    deaf = 0
    for server in servers:
        # NOTE(review): assumes getUsers() is only valid on a running
        # server, per the Murmur Ice interface - confirm.
        if not server.isRunning():
            continue
        for user in server.getUsers().values():
            online += 1
            if user.selfDeaf or user.deaf:
                deaf += 1
            else:
                listening += 1
    return online, listening, deaf


def main():
    """Print the status line and the murmur metrics for the agent."""
    mice = load_mice()
    # Iterate the server objects themselves instead of assuming that the
    # server ids are exactly 1..N.
    servers = mice.m.getAllServers()
    online, listening, deaf = count_users(servers)

    print("status ok")
    print("metric servers int %d servers" % len(servers))
    print("metric online int %d users" % online)
    print("metric deaf int %d users" % deaf)
    print("metric listening int %d users" % listening)


if __name__ == '__main__':
    main()
# NOTE: must have a /root/.my.cnf file with access to mysql
#
# Example criteria :
#
# if (metric['legacy_state'] != 'ok') {
#     return new AlarmStatus(CRITICAL, 'MySQL Server is NOT healthy.');
# }
#
# return new AlarmStatus(OK, 'MySQL Server is running.');

import sys
import os
import subprocess

# Command whose output tells whether the server answers.
PING_COMMAND = ('mysqladmin', '--defaults-file=/root/.my.cnf', 'ping')


def mysql_ping(command=PING_COMMAND):
    """Run the ping command and return what it wrote to stdout.

    Args:
        command: Argument sequence to execute (no shell is involved).

    Returns:
        The command's stdout as text, or "" when it cannot be started.
    """
    try:
        proc = subprocess.Popen(list(command), stdout=subprocess.PIPE,
                                universal_newlines=True)
    except OSError:
        return ""
    # communicate() reads to EOF, closes the pipe and reaps the child.
    out, _ = proc.communicate()
    return out


def is_alive(report):
    """Return True when ``report`` is mysqladmin's "alive" answer."""
    return report == "mysqld is alive\n"


def main():
    """Print the agent status line; exit 0 when alive, 1 otherwise."""
    if is_alive(mysql_ping()):
        print("status ok ok")
        sys.exit(0)
    else:
        print("status error")
        sys.exit(1)


if __name__ == '__main__':
    main()
#
# Usage:
# Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins
#
# The following is an example 'criteria' for a Rackspace Monitoring Alarm:
#
# if (metric['SLAVE_STATUS'] != 'ONLINE') {
#     return new AlarmStatus(CRITICAL, 'MySQL Replication is OFFLINE.');
# }
#
# if (metric['SLAVE_STATUS'] == 'ONLINE' && metric['SECONDS_BEHIND_MASTER'] \
#     >= 120 && metric['SECONDS_BEHIND_MASTER'] < 300) {
#     return new AlarmStatus(WARNING, 'MySQL Replication ONLINE \
#     but Slave is more than 2 minutes behind Master.');
# }
#
# if (metric['SLAVE_STATUS'] == 'ONLINE' && metric['SECONDS_BEHIND_MASTER'] \
#     >= 300) {
#     return new AlarmStatus(CRITICAL, 'MySQL Replication ONLINE \
#     but Slave is more than 5 minutes behind Master.');
# }
#
# return new AlarmStatus(OK, 'MySQL Replication is ONLINE');


import sys
import subprocess
import shlex

STATUS_COMMAND = ('/usr/bin/mysql --defaults-file=/root/.my.cnf '
                  '-e "SHOW SLAVE STATUS\\G"')


def mysql_repchk(arg):
    """Run a command line and capture its result.

    Args:
        arg: Command line as one string; split with shlex, run without a
            shell.

    Returns:
        A tuple ``(returncode, stdout, stderr)`` with both streams as text.

    Raises:
        OSError: if the program cannot be started.
    """
    proc = subprocess.Popen(shlex.split(arg),
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            shell=False,
                            universal_newlines=True)

    out, err = proc.communicate()
    ret = proc.returncode
    return ret, out, err


def parse_slave_status(output):
    """Parse ``SHOW SLAVE STATUS\\G`` output into a dict of stripped strings.

    Only the first colon of a line separates key from value, so values that
    contain colons themselves (host:port, timestamps) stay intact. Lines
    without a colon, such as the row banner, are ignored.
    """
    status = {}
    for line in output.splitlines():
        key, sep, value = line.partition(':')
        if sep:
            status[key.strip()] = value.strip()
    return status


def build_report(slave_status):
    """Build the agent output for a parsed slave status.

    Args:
        slave_status: Dict as returned by parse_slave_status().

    Returns:
        The status and metric lines joined by newlines. An empty dict
        yields the NOT_CONFIGURED report. SECONDS_BEHIND_MASTER is emitted
        only when it is a number, because MySQL reports NULL while the
        slave is stopped and that is not a valid int metric.
    """
    if not slave_status:
        return "status ERROR\nmetric SLAVE_STATUS string NOT_CONFIGURED"

    online = (slave_status.get("Slave_IO_Running") == "Yes" and
              slave_status.get("Slave_SQL_Running") == "Yes" and
              slave_status.get("Last_Errno") == "0")

    lines = ["status OK"]
    if online:
        lines.append("metric SLAVE_STATUS string ONLINE")
    else:
        lines.append("metric SLAVE_STATUS string OFFLINE")

    lag = slave_status.get("Seconds_Behind_Master", "")
    if lag.isdigit():
        lines.append("metric SECONDS_BEHIND_MASTER int " + lag)
    return "\n".join(lines)


def main():
    """Query the local MySQL server and print the replication report."""
    try:
        retcode, output, err = mysql_repchk(STATUS_COMMAND)
    except OSError as exc:
        # mysql client missing or not executable.
        retcode, output, err = 127, "", str(exc)

    if retcode:
        sys.stderr.write("There was an error (%d): \n\n" % retcode)
        sys.stderr.write("%s\n" % err)

    print(build_report(parse_slave_status(output)))


if __name__ == '__main__':
    main()
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
#
# Verify the current status of NFS shares
#
# Example criteria :
#
# if (metric['nfs_status'] != 'ok') {
#   return new AlarmStatus(CRITICAL, 'NFS Service is NOT healthy.');
# }
#
# return new AlarmStatus(OK, 'NFS Service is running.');
#
# REQUIRES 'showmount' to be installed (part of NFS utils)
#
# Arguments:
#   $1  NFS server to query (showmount's default host when omitted)
#   $2  export directory that must appear in the server's export list

HOST=$1
DIR=$2

# ${HOST:+...} passes the host as a single quoted word when one was given and
# passes nothing at all when it was omitted.
OUTPUT=$(showmount -e ${HOST:+"${HOST}"} 2>&1)

if [[ ${OUTPUT} = *'Connection refused'* ]]
then
    state='Error: connection refused'
    error=${OUTPUT}
elif [[ ${OUTPUT} = *'Program not registered'* ]]
then
    state='Error: NFS not running on host'
    error=${OUTPUT}
# "${DIR}" is quoted so the directory is matched literally, not as a pattern.
elif [[ ${OUTPUT} = *"${DIR}"* ]]
then
    state='ok'
    error='ok'
else
    state='Error: No shares found'
    error=${OUTPUT}
fi

# Collapse the (possibly multi-line) showmount output onto one line without
# letting the shell glob-expand a '*' client wildcard it may contain.
error=$(printf '%s' "${error}" | tr -s '[:space:]' ' ')

echo "status ${state}"
echo "metric nfs_status string ${error}"
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # ----- 18 | # 19 | # This plugin monitors the metrics produced by the Nginx ngx_http_stub_status_module 20 | # module. The module generates an HTML page that contains basic status information. 21 | # 22 | # For more info see: 23 | # 24 | # http://nginx.org/en/docs/http/ngx_http_stub_status_module.html 25 | # 26 | # For advanced metrics the NGINX Plus product is required. 27 | # 28 | # By default the monitor fails if the check does not complete successfully. 29 | # 30 | # Metrics for: 31 | # 32 | # - Active connections 33 | # - Accepted connections 34 | # - Handled connections 35 | # - Number of requests 36 | # - Connections reading 37 | # - Connections writing 38 | # - Connections waiting 39 | # 40 | # are also reported. 41 | # 42 | # Requires: 43 | # Python 2.6 or greater 44 | # Nginx with ngx_http_stub_status_module enabled. 45 | # 46 | # In short, you will need to add a location block to the Nginx 47 | # server block, e.g. 48 | # 49 | # location /nginx_status { 50 | # stub_status on; 51 | # access_log off; 52 | # allow 127.0.0.1; 53 | # } 54 | # 55 | # Usage: 56 | # Place script in /usr/lib/rackspace-monitoring-agent/plugins. 57 | # Ensure file is executable (755). 58 | # 59 | # Set up a Cloud Monitoring Check of type agent.plugin to run 60 | # 61 | # nginx_status_check.py -u 62 | # 63 | # The URL is optional and defaults to: 64 | # 65 | # http://0.0.0.0/nginx_status 66 | # 67 | # There is no need to define specific custom alert criteria.
# As stated, the monitor fails if the metrics cannot be collected.
# It is possible to define custom alert criteria with the reported
# metrics if desired.
#

import re
import sys
try:
    import urllib2
except ImportError:  # Python 3: urllib.request offers urlopen and URLError
    import urllib.request as urllib2
from optparse import OptionParser

# Patterns for the three data lines of the stub_status page.
ACTIVE_RE = re.compile(r'Active connections:\s+(\d+)')
TOTALS_RE = re.compile(r'\s*(\d+)\s+(\d+)\s+(\d+)')
STATES_RE = re.compile(r'Reading:\s*(\d+)\s*Writing:\s*(\d+)\s*'
                       r'Waiting:\s*(\d+)')


class NginxStatus(object):
    """Create an object for an Nginx Status URL. Assume URL is not available."""

    def __init__(self, url, timeout=10):
        """Remember the status URL and the request timeout in seconds."""
        self.url = url
        self.timeout = timeout
        self.nginx_status_available = False

    @staticmethod
    def parse(data):
        """Extract the metrics from the text of a stub_status page.

        Args:
            data: Page body as text.

        Returns:
            A list of ``(metric_name, int_value)`` pairs in output order,
            or None when the text does not look like a stub_status page.
        """
        active = ACTIVE_RE.search(data)
        totals = TOTALS_RE.search(data)
        states = STATES_RE.search(data)
        if not (active and totals and states):
            return None
        return [
            ('active_connections', int(active.group(1))),
            ('accepted_connections', int(totals.group(1))),
            ('handled_connections', int(totals.group(2))),
            ('number_of_requests', int(totals.group(3))),
            ('connections_reading', int(states.group(1))),
            ('connections_writing', int(states.group(2))),
            ('connections_waiting', int(states.group(3))),
        ]

    def nginx_status_metrics(self):
        """Connect to the Nginx Status URL object. Error out on failure.

        Prints the metric lines followed by the status line. Exits with
        code 1 after printing a ``status err`` line when the page cannot be
        fetched or does not contain the expected counters.
        """
        try:
            nginx_status_conn = urllib2.urlopen(self.url, timeout=self.timeout)
            try:
                nginx_status_data = nginx_status_conn.read()
            finally:
                nginx_status_conn.close()
        except urllib2.URLError:
            print('status err URLError: check the URL and that Nginx running.')
            sys.exit(1)
        except Exception:
            print('status err failed to obtain nginx status metrics.')
            sys.exit(1)
        self.nginx_status_available = True

        metrics = self.parse(nginx_status_data.decode('utf-8', 'replace'))
        if metrics is None:
            # Reachable URL that is not a stub_status page: report a clean
            # error before any metric line has been printed.
            print('status err failed to obtain nginx status metrics.')
            sys.exit(1)

        for name, value in metrics:
            print('metric %s int64 %d' % (name, value))
        print('status ok succeeded in obtaining nginx status metrics.')


def main():
    """Instantiate an NginxStatus object and collect stats."""

    parser = OptionParser()
    parser.add_option('-u', '--url', default='http://0.0.0.0/nginx_status',
                      help='URL for Nginx Status page.')
    (opts, args) = parser.parse_args()

    nginx_status = NginxStatus(opts.url)
    nginx_status.nginx_status_metrics()


if __name__ == '__main__':
    main()
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
#
# This plugin expects to find ntpq and awk in the environment
# it reports the average ntp offset from ntpq in milliseconds.
#
# Example alarm code:
# :set consecutiveCount=3
# if (metric['ntp_offset'] > 10000 || metric['ntp_offset'] < -10000) {
#   return new AlarmStatus(CRITICAL, 'ntp offset is too high.');
# }
# return new AlarmStatus(OK, 'ntp offset is fine.');
#
# if (metric['active_sources'] < 2) {
#   return new AlarmStatus(WARNING, 'ntpd is only using #{active_sources} sources');
# }
# return new AlarmStatus(OK, 'ntpd has #{active_sources} active sources');
#

NTPQ_BIN=$(command -v ntpq)
AWK_BIN=$(command -v awk)

if [[ -x ${NTPQ_BIN} ]] && [[ -x ${AWK_BIN} ]]
then
    # only select line starting with * (system peer), + (candidate), # (selected), and o (pps sys peer)
    # and keep the integer part of the offset column (field 9)
    OUTPUT=$("${NTPQ_BIN}" -pn | "${AWK_BIN}" '{ if ($1 ~ "^[\\*\\+#o].*" && $9 ~ /[0-9]/) print $9};' | cut -f 1 -d '.')

    # Start from zero explicitly so that variables named sum/count inherited
    # from the environment cannot skew the result.
    sum=0
    count=0
    for x in ${OUTPUT}
    do
        sum=$((sum + x))
        count=$((count + 1))
    done

    if [[ ${count} -gt 0 ]]; then
        avg=$((sum / count))
        echo "status ok got ntp stats"
        echo "metric ntp_offset int32 ${avg} milliseconds"
        echo "metric active_sources uint32 ${count} sources"
        exit 0
    else
        echo "status err could not compute ntp offset: no reachable or active source"
    fi
else
    echo "status err could not compute ntp offset: ntpq and/or awk could not be found"
fi

exit 1
#!/usr/bin/env python
#
# Script for monitoring remaining useful lifetime of OnMetal v1 SATADOM.
#
# Requires the following binaries installed & on path:
#  - smartctl
#  - lsblk
#
# Suggested alarm criteria:
#
# if (metric['percent_pe_cycles_used'] >= 1) {
#   return new AlarmStatus(CRITICAL, 'Drive is beyond expected life.');
# }
#
# if (metric['percent_pe_cycles_used'] >= .8) {
#   return new AlarmStatus(WARNING, 'Drive >= 80% of its expected life.');
# }
#
# return new AlarmStatus(OK, 'Drive less than 80% through its expected life.');

import subprocess
import sys

DEVICE = "/dev/sda"

# Maximum program/erase cycles per SATADOM model, keyed by lsblk model name.
SATADOM_PE_MAX = {
    '32G MLC SATADOM': 3000,
    '7 PIN  SATA FDM': 3000,
    'Fastable SD 131 7': 3000,
    'Fastable SD131 7': 3000,
    'SATADOM-SH TYPE': 100000,
    'SATADOM-SH TYPE C 3SE': 100000,
}

# Key (as built by the parser) of the SMART attribute read for P/E cycles.
PE_CYCLE_ATTRIBUTE = '173-Unknown_Attribute'


def _fail(msg="Unknown Error"):
    """Print a ``status err`` line for the agent and exit with code 1."""
    print("status err {}".format(msg))
    sys.exit(1)


def _parse_smartctl_attributes(out):
    """Parse the attribute table printed by ``smartctl --attributes``.

    Args:
        out: smartctl output as text.

    Returns:
        A dict keyed by "<ID>-<ATTRIBUTE_NAME>"; each value maps the
        remaining column headers (FLAG, VALUE, ..., RAW_VALUE) to the
        row's fields.
    """
    lines = iter(out.split('\n'))

    header = None
    for line in lines:
        # note(JayF): skip forward until we get to the header and pull
        # it out
        if line.strip().startswith('ID#'):
            header = line.split()
            break
    if header is None:
        _fail("unexpected smartctl output")

    attributes = {}
    # note(JayF): All lines at this point contain metrics or are blank.
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue
        # note(JayF): match up headers to values to generate a dict
        attributes[fields[0] + '-' + fields[1]] = dict(
            zip(header[2:], fields[2:]))

    return attributes


def _get_smartctl_attributes():
    """Run smartctl on DEVICE and return its parsed attribute table."""
    try:
        out = subprocess.check_output(['smartctl', '--attributes', DEVICE],
                                      universal_newlines=True)
    except (OSError, subprocess.CalledProcessError):
        _fail("failed running smartctl")

    return _parse_smartctl_attributes(out)


def _calculate_pe_cycles(actual_value):
    """Return the low 16 bits of the attribute's raw value.

    Masking replaces the previous slicing of ``hex()`` output, which raised
    ValueError for values from 256 to 4095 (the slice started at the 'x' of
    '0x...') and for Python 2 longs (trailing 'L').
    """
    return int(actual_value) & 0xFFFF


def _calculate_life_expectancy(pe_cycle_current, pe_cycle_max):
    """Return used/maximum P/E cycles as a string with six decimals."""
    # note(JayF): Force one of the values to a float to avoid int division
    return "{:f}".format(pe_cycle_current / float(pe_cycle_max))


def _get_satadom_model():
    """Return DEVICE's model name as reported by lsblk.

    Exits through _fail() when lsblk cannot be run or the model has no
    entry in SATADOM_PE_MAX.
    """
    try:
        model = subprocess.check_output(
            ['lsblk', '-oMODEL', DEVICE],
            universal_newlines=True).strip().split('\n')[1]
    except (OSError, subprocess.CalledProcessError, IndexError):
        _fail("failed running lsblk")

    if model not in SATADOM_PE_MAX:
        _fail("UNKNOWN SATADOM MODEL")
    return model


def main():
    """Gather the SMART data and print status and metric for the agent."""
    attrs = _get_smartctl_attributes()
    try:
        raw_value = attrs[PE_CYCLE_ATTRIBUTE]['RAW_VALUE']
    except KeyError:
        _fail("SMART attribute {} not reported".format(PE_CYCLE_ATTRIBUTE))

    life_used = _calculate_life_expectancy(
        _calculate_pe_cycles(raw_value),
        SATADOM_PE_MAX[_get_satadom_model()])

    print("status ok smart stats gathered successfully")
    print("metric percent_pe_cycles_used float {}".format(life_used))


if __name__ == '__main__':
    main()
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Rackspace Cloud Monitoring agent plugin to count open files on Linux.

This check records the number of file handles in use on a Linux
system using the proc file system:

  https://www.kernel.org/doc/Documentation/sysctl/fs.txt

Example alarm criteria:

  if (metric['open_files'] > 65535) {
    return new AlarmStatus(CRITICAL, "Too many open files!");
  }

"""

import sys


PROC_FILE = "/proc/sys/fs/file-nr"


def parse_file_nr(line):
    """Return the number of file handles in use from a file-nr line.

    Args:
        line: Content of /proc/sys/fs/file-nr: allocated handles, free
            handles and the maximum, separated by whitespace.

    Returns:
        Allocated minus free handles, as an int.

    Raises:
        ValueError: if the line has fewer than two fields or they are not
            integers.
    """
    allocated, unused = line.split()[:2]
    return int(allocated) - int(unused)


def count_open_files(path=PROC_FILE):
    """Read ``path`` and return the number of file handles in use."""
    with open(path) as handle:
        return parse_file_nr(handle.readline())


def main():
    """Print the status line and the open_files metric for the agent."""
    try:
        open_files = count_open_files()
    except Exception as e:
        # Top-level boundary: any failure becomes a status line.
        print("status error {}".format(e))
        sys.exit(1)

    print("status ok {} open files".format(open_files))
    print("metric open_files uint32 {}".format(open_files))


if __name__ == '__main__':
    main()
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage:
# Place file in the /usr/lib/rackspace-monitoring-agent/plugins/ directory
#
# No need to define specific custom alert criteria, Status ok is only acceptable
# response, All other responses trigger alert (default responses expected).
#
# SAMPLE monitoring-postgresql.yaml monitoring file to be placed in
# /etc/rackspace-monitoring-agent.conf.d/
# --------------------------------
# type: agent.plugin
# label: postgresql status
# period: 300
# timeout: 30
# details:
#   file: pg_check.py
#

import sys
import os
import subprocess


def pg_is_ready(command=('pg_isready',)):
    """Return True when the PostgreSQL server accepts connections.

    The decision is based on the exit status of ``pg_isready`` (0 means
    "accepting connections") rather than on its message text, which can be
    translated by the system locale.

    Args:
        command: Argument sequence to execute (no shell is involved).

    Returns:
        True if the command exits with status 0; False for any other
        status or when the command cannot be started.
    """
    try:
        with open(os.devnull, 'w') as devnull:
            return subprocess.call(list(command),
                                   stdout=devnull, stderr=devnull) == 0
    except OSError:
        return False


def main():
    """Print the agent status line; exit 0 when ready, 1 otherwise."""
    if pg_is_ready():
        print("status ok")
        sys.exit(0)
    else:
        print("status error")
        sys.exit(1)


if __name__ == '__main__':
    main()
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # ----- 17 | # 18 | # This plugin monitors the metrics produced by the PHP-FPM status page 19 | # pm.status_path needs to be enabled per pool you want to monitor 20 | # 21 | # pm.status_path = /status-for-php-fpm 22 | # 23 | # and you need the cgi-fcgi command 24 | # yum install fcgi 25 | # apt-get install libfcgi0ldbl 26 | # 27 | # For more info see: 28 | # 29 | # http://php.net/manual/en/install.fpm.configuration.php 30 | # 31 | # By default the monitor fails if the check does not complete successfully. 32 | # 33 | # Metrics for: 34 | # 35 | # accepted conn 36 | # listen queue 37 | # max listen queue 38 | # listen queue len 39 | # idle processes 40 | # active processes 41 | # total processes 42 | # max active processes 43 | # max children reached 44 | # 45 | # are also reported. 46 | # 47 | # 48 | # Usage: 49 | # Place script in /usr/lib/rackspace-monitoring-agent/plugins. 50 | # Ensure file is executable (755). 51 | # 52 | # Set up a Cloud Monitoring Check of type agent.plugin to run 53 | # 54 | # php-fpm_status_check.sh SOCKET_PATH STATUS_URL 55 | # 56 | # Both are optional and default to: 57 | # 58 | # /var/run/php-fpm/www.sock 59 | # /status-for-php-fpm 60 | # 61 | # There is no need to define specific custom alert criteria. 62 | # As stated, the monitor fails if the metrics cannot be collected. 63 | # It is possible to define custom alert criteria with the reported 64 | # metrics if desired. 
#
# Example criteria :
#
#if (metric['max_children_reached'] > 0) {
#  return CRITICAL, "Max Children reached"
#}
#if (metric['legacy_state'] != 'ok') {
#  return CRITICAL, "PHP-FPM is not running correctly or misconfigured check"
#}
#
#return OK, "PHP-FPM is running correctly"


CGIFCGI=$(command -v cgi-fcgi 2>/dev/null)
if [ $? != 0 ]; then
    echo "status err failed to obtain metrics."
    exit 1
fi

SOCKET=${1-/var/run/php-fpm/www.sock}
STATUS_PATH=${2-/status-for-php-fpm}
OUTPUT=$(mktemp)
if [ $? != 0 ]; then
    echo "status err failed to obtain metrics."
    exit 1
fi
# Remove the temporary file on every exit path, including the early error
# exit below.
trap 'rm -f "${OUTPUT}"' EXIT

SCRIPT_NAME="${STATUS_PATH}" SCRIPT_FILENAME="${STATUS_PATH}" REQUEST_METHOD=GET "${CGIFCGI}" -bind -connect "${SOCKET}" 2>/dev/null > "${OUTPUT}"
if [ $? != 0 ]; then
    echo "status err failed to obtain metrics."
    exit 1
fi

# status_field LABEL COLUMN
# Print whitespace-separated column COLUMN of the status line that starts
# with "LABEL:".
status_field() {
    grep "^$1:" "${OUTPUT}" 2>/dev/null | awk -v col="$2" '{print $col}'
}

accepted_conn=$(status_field "accepted conn" 3)
listen_queue=$(status_field "listen queue" 3)
max_listen_queue=$(status_field "max listen queue" 4)
listen_queue_len=$(status_field "listen queue len" 4)
idle_processes=$(status_field "idle processes" 3)
active_processes=$(status_field "active processes" 3)
total_processes=$(status_field "total processes" 3)
max_active_processes=$(status_field "max active processes" 4)
max_children_reached=$(status_field "max children reached" 4)

echo "status ok succeeded in obtaining metrics."
echo "metric accepted_conn uint32 $accepted_conn"
echo "metric listen_queue uint32 $listen_queue"
echo "metric max_listen_queue uint32 $max_listen_queue"
echo "metric listen_queue_len uint32 $listen_queue_len"
echo "metric idle_processes uint32 $idle_processes"
echo "metric active_processes uint32 $active_processes"
echo "metric total_processes uint32 $total_processes"
echo "metric max_active_processes uint32 $max_active_processes"
echo "metric max_children_reached uint32 $max_children_reached"

exit 0
# ping.sh - report ICMP round-trip statistics for a host.
#
# Arguments: $1 host to ping (required)
#            $2 number of probes (default 5)
#            $3 seconds between probes (default 2)
#
# This plugin returns 4 metrics:
#   - minimum, average, maximum: statistics returned by the GNU ping utility
#     in the format "round-trip min/avg/max/stddev = 9.429/35.460/79.698/27.657 ms"
#   - lost_packets: the percentage of the packets lost out of the number of probes
#     sent in this check run
#
# Example alert:
#
# --- start copying after this line ---
#
# if (metric['average'] >= 30 ) {
#   return new AlarmStatus(WARNING, 'Average round-trip took #{average}ms');
# }
# if (metric['lost_packets'] >= 40) {
#   return new AlarmStatus(WARNING, 'Packet loss was #{lost_packets}%');
# }
# if (metric['legacy_state'] != "ok") {
#   return new AlarmStatus(CRITICAL, 'Error: #{legacy_state}');
# }
# return new AlarmStatus(OK, 'All good');
#
# --- stop copying before this line ---

# Without a host, ping only prints its usage text; fail with a clear message.
if [ -z "${1}" ]
then
  echo "status error: no host given"
  exit 1
fi

# -q limits output to the summary; its last two lines are the packet-loss line
# and the rtt min/avg/max line.
ping_stats=$(ping -i "${3:-2}" -q -w 30 -n -c "${2:-5}" "${1}" 2>&1 | tail -2)

# ${ping_stats} is deliberately left unquoted below: word splitting joins the
# two summary lines into one so the sed/cut expressions see a single line.
min_ping="$(echo ${ping_stats} | sed -e 's#.\+= \([.0-9]\+\).\+#\1#g')"
avg_ping="$(echo ${ping_stats} | cut -d'/' -f5)"
max_ping="$(echo ${ping_stats} | cut -d'/' -f6)"
loss_percent="$(echo ${ping_stats} | sed -e 's#.\+ \([0-9]\+\)%.\+#\1#')"

# A numeric average means ping produced a round-trip summary.
if echo "${avg_ping}" | grep -q "^[.0-9]\+$"
then
  echo "status ok"
  echo "metric minimum double ${min_ping} milliseconds"
  echo "metric average double ${avg_ping} milliseconds"
  echo "metric maximum double ${max_ping} milliseconds"
  echo "metric lost_packets int32 ${loss_percent} percent"
else
  echo "status error: ping probe fail"
  exit 1
fi
# Copyright 2013 Steve Katen
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import socket


def socket_open(host, port):
    """Report whether a TCP connection to host:port can be established.

    Args:
        host: hostname or IPv4 address to connect to.
        port: port number, as an int or a numeric string.

    Returns:
        "OPEN" when the connection succeeds, "CLOSED" on any socket error.

    Raises:
        ValueError: if port cannot be converted to an int.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((host, int(port)))
        s.shutdown(socket.SHUT_RDWR)
    except socket.error:
        return "CLOSED"
    else:
        return "OPEN"
    finally:
        # Release the descriptor on the failure path as well.
        s.close()


def main():
    """Check the host/port given on the command line and print agent output."""
    if len(sys.argv) != 3:
        print("Usage: %s <host> <port>" % sys.argv[0])
        sys.exit(0)

    host = sys.argv[1]
    port = sys.argv[2]

    if not port.isdigit():
        # Report a usable status instead of a ValueError traceback.
        print("status err invalid port: %s" % port)
        sys.exit(1)

    state = socket_open(host, port)

    # A closed port is a successful check; alarms key off the status metric.
    print("status OK")
    print("metric port int %s" % port)
    print("metric status string %s" % state)


if __name__ == '__main__':
    main()
# Author: Tomaz Muraus
# License: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Arguments: protocol ("udp" selects UDP, anything else TCP), IP/host, port.
if [ $# -ne 3 ]; then
    echo "Usage: $0 <protocol> <ip> <port>"
    exit 100
fi

PROTOCOL=$1
IP=$2
PORT=$3

# Extra nc options are kept in an array so that an empty set expands to nothing.
OPTIONS=()

if [ "${PROTOCOL}" = "udp" ]; then
    OPTIONS+=(-u)
fi

# stdin comes from /dev/null so nc returns once the connection attempt is done.
if nc "${OPTIONS[@]}" "${IP}" "${PORT}" < /dev/null > /dev/null 2>&1; then
    echo "status Service listening on ${IP}:${PORT} (${PROTOCOL})"
    echo "metric listening string yes"
else
    echo "status Nothing listening on port ${IP}:${PORT} (${PROTOCOL})"
    echo "metric listening string no"
fi
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 30 | # 31 | # Curl Command: 32 | # curl -i -X POST -H 'Host: monitoring.api.rackspacecloud.com' -H 33 | # 'Accept-Encoding: gzip,deflate' -H 'X-Auth-Token: YOUR_API_TOKEN' -H 34 | # 'Content-Type: application/json; charset=UTF-8' -H 'Accept: 35 | # application/json' --data-binary '{"label": "Process Check", "type": 36 | # "agent.plugin", "details": {"args": ["PROCESS_NAME"],"file": 37 | # "process_mon.sh"}}' --compress 38 | # 'https://monitoring.api.rackspacecloud.com:443/v1.0/YOUR_ACCOUNT/entities/YOUR_ENTITY/checks' 39 | # 40 | # Usage: 41 | # Place plug-in in /usr/lib/rackspace-monitoring-agent/plugins 42 | # 43 | # The following is an example 'criteria' for a Rackspace Monitoring Alarm: 44 | # 45 | # if (metric['process_mon'] == 0) { 46 | # return new AlarmStatus(CRITICAL, 'Process not running.'); 47 | # } 48 | # 49 | # return new AlarmStatus(OK, 'Process running normally.'); 50 | 51 | function help { 52 | 53 | cat < 4 | # License: MIT 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the 'Software'), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | 
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# Rackspace Cloud Monitoring plugin for monitoring RabbitMQ stats. This plugin
# was adapted from the Cloudkick rabbitmq.py plugin by Tomaz Muraus:
# https://github.com/cloudkick/agent-plugins/blob/master/rabbitmq.py
#
# Example usage (arguments which you pass in to the plugin):
#
# Monitor queue "bg_jobs" memory usage, number of consumers and number of
# messages:
#
# --action list_queues --queue bg_jobs --parameters memory,consumers,messages
#
# Monitor exchange "amqp.direct" type, durability and auto_delete value
#
# --action list_exchanges --exchange amqp.direct --parameters type,durable,auto_delete
#
# Monitor queue "bg_jobs" memory usage, number of consumers, number of
# messages and alert if messages over 100
#
# --action list_queues --queue bg_jobs --queue-length 100 --parameters memory,consumers,messages
#

import re
import sys
import subprocess
import optparse

# Supported rabbitmqctl info items per action, mapped to the agent metric type.
METRIC_TYPES = {
    'list_queues': {
        'name': 'string',
        'durable': 'string',
        'auto_delete': 'string',
        'arguments': 'string',
        'pid': 'int',
        'owner_pid': 'int',
        'messages_ready': 'int',
        'messages_unacknowledged': 'int',
        'messages': 'int',
        'consumers': 'int',
        'memory': 'int'
    },

    'list_exchanges': {
        'name': 'string',
        'type': 'string',
        'durable': 'string',
        'auto_delete': 'string',
        'internal': 'string',
        # rabbitmqctl names this info item "arguments" (was "argument").
        'arguments': 'string'
    }
}


def retrieve_stats(vhost, action, queue, exchange, parameters,
                   rabbitmqctl_path):
    """Run rabbitmqctl and return the stats row of one queue or exchange.

    Args:
        vhost: virtual host passed to rabbitmqctl with -p.
        action: 'list_queues' or 'list_exchanges' (a key of METRIC_TYPES).
        queue: queue name to look up (used when set).
        exchange: exchange name to look up (used when queue is empty).
        parameters: comma separated info items; unknown items are ignored.
        rabbitmqctl_path: path or name of the rabbitmqctl binary.

    Returns:
        (stats, None) on success, where stats maps column name to its string
        value, or (None, error_message) on failure.
    """
    value = queue or exchange
    known = METRIC_TYPES[action]

    # 'name' always comes first; it identifies the row we are looking for.
    columns = ['name']
    for parameter in parameters.split(','):
        parameter = parameter.strip().lower()
        if parameter in known and parameter not in columns:
            columns.append(parameter)

    command = [rabbitmqctl_path, action, '-p', vhost] + columns
    try:
        process = subprocess.Popen(command, stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   universal_newlines=True)
    except OSError as exc:
        return None, 'Unable to run %s: %s' % (rabbitmqctl_path, exc)
    output, _ = process.communicate()

    if process.returncode != 0:
        # Collapse the output so the status message stays on one line.
        detail = ' '.join(output.split())
        if not detail:
            detail = 'exit status %d' % process.returncode
        return None, 'rabbitmqctl failed: %s' % detail

    # Match the name column exactly; substring matching would confuse
    # e.g. "jobs" with "bg_jobs".
    for line in output.splitlines():
        fields = re.split(r'\s+', line.strip())
        if len(fields) >= len(columns) and fields[0] == value:
            return parse_stats(columns, line), None

    return None, 'Empty output'


def parse_stats(parameters, data):
    """Pair whitespace separated values in data with the parameter names."""
    values = re.split(r'\s+', data.strip())
    return dict(zip(parameters, values))


def print_metrics(action, metrics):
    """Print one agent 'metric' line for every known item in metrics."""
    types = METRIC_TYPES[action]
    for key, value in metrics.items():
        metric_type = types.get(key)

        if not metric_type:
            continue

        print('metric %s %s %s' % (key, metric_type, value))


def main():
    """Parse the command line, query rabbitmqctl and print agent output."""
    parser = optparse.OptionParser()
    parser.add_option('--path', action='store', dest='rabbitmqctl_path',
                      default='rabbitmqctl',
                      help='Path to the rabbitmqctl binary (optional)')
    parser.add_option('--action', action='store', dest='action',
                      help='Action (list_queues or list_exchanges)')
    parser.add_option('--vhost', action='store', dest='vhost', default='/',
                      help='Vhost (optional)')
    parser.add_option('--queue', action='store', dest='queue',
                      help='Queue name')
    parser.add_option('--exchange', action='store', dest='exchange',
                      help='Exchange name')
    parser.add_option('--parameters', action='store', dest='parameters',
                      default='messages',
                      help='Comma separated list of parameters to retrieve (default = messages)')
    parser.add_option('--queue-length', type='int', action='store', dest='length',
                      help='Max messages in the queue before alert')

    options, _ = parser.parse_args()

    action = options.action
    queue = options.queue
    exchange = options.exchange
    parameters = options.parameters
    length = options.length

    if not action:
        print('status err Missing required argument: action')
        sys.exit(1)

    if action == 'list_queues' and not queue:
        print('status err Missing required argument: queue')
        sys.exit(1)
    elif action == 'list_exchanges' and not exchange:
        print('status err Missing required argument: exchange')
        sys.exit(1)

    if action not in METRIC_TYPES:
        print('status err Invalid action: %s' % (action))
        sys.exit(1)

    if not parameters:
        print('status err Missing required argument: parameters')
        sys.exit(1)

    metrics, error = retrieve_stats(options.vhost, action, queue, exchange,
                                    parameters, options.rabbitmqctl_path)

    if error:
        print('status err %s' % (error))
        sys.exit(1)

    if length is not None and 'messages' in metrics:
        if int(metrics['messages']) > length:
            print('status err Message queue %s at %d and above threshold of %d' % (
                queue, int(metrics['messages']), length))
            sys.exit(1)

    print('status ok metrics successfully retrieved')
    print_metrics(action, metrics)


if __name__ == '__main__':
    main()
# - Create an alarm based on the criteria you're looking for.
#
# The following is an example 'criteria' for a Rackspace Cloud Monitoring Alarm:
#
# if (metric['connected_slaves'] == 0) {
#   return new AlarmStatus(CRITICAL, 'No slaves connected.');
# }
#
# return new AlarmStatus(OK, 'Slaves are connected.');


# Optional positional arguments: host, port, password.
# Defaults: host 0.0.0.0, port 6379, no password.
HOST=${1:-0.0.0.0}
PORT=${2:-6379}
PASS=${3:-}

# Query INFO once; pass -a only when a password was supplied.
if [ -n "$PASS" ]; then
    INFO=$(redis-cli -h "$HOST" -p "$PORT" -a "$PASS" INFO)
else
    INFO=$(redis-cli -h "$HOST" -p "$PORT" INFO)
fi

# INFO lines end in CRLF; strip the carriage return so the metric value is a
# clean integer.
SLAVE_COUNT=$(printf '%s\n' "$INFO" | tr -d '\r' | awk -F':' '$1 == "connected_slaves" { print $2; exit }')

# If $SLAVE_COUNT, return metrics. Else fail.
if [ -n "$SLAVE_COUNT" ]; then
    echo "status ok"
    echo "metric connected_slaves int $SLAVE_COUNT"
else
    echo "status error - unable to pull stats from redis INFO"
    exit 1
fi
#!/usr/bin/env python
"""Rackspace Cloud Monitoring plugin that pings Solr and reports basic stats."""

import socket
import xml.etree.ElementTree

try:
    import httplib
except ImportError:  # Python 3 renamed the module
    import http.client as httplib

from pprint import pprint

# Handlers queried by solrstats(), with the element name extracted from each.
_STAT_SOURCES = (
    ("/solr/admin/system", "upTimeMS"),
    ("/solr/admin/luke", "numDocs"),
)


def _http_get(uri, path, timeout):
    """GET path from uri ("host:port"); return (status, body), closing the connection."""
    conn = httplib.HTTPConnection(uri, timeout=timeout)
    try:
        conn.request("GET", path)
        response = conn.getresponse()
        return response.status, response.read()
    finally:
        conn.close()


def solr_ok(uri="localhost:8983", timeout=10):
    """Return True when the Solr ping handler at uri answers with HTTP 200.

    Args:
        uri: "host:port" of the Solr instance.
        timeout: socket timeout in seconds.
    """
    try:
        status, _ = _http_get(uri, "/solr/admin/ping", timeout)
    except (socket.error, httplib.HTTPException):
        return False
    return status == 200


def solrstats(uri="localhost:8983", timeout=10):
    """Collect upTimeMS and numDocs from the Solr admin handlers.

    Args:
        uri: "host:port" of the Solr instance.
        timeout: socket timeout in seconds, applied to each request.

    Returns:
        A dict with the text of each stat that could be retrieved. Stats whose
        request fails or whose response cannot be parsed are left out.
    """
    solr_stats = {}
    for path, name in _STAT_SOURCES:
        try:
            status, body = _http_get(uri, path, timeout)
            if status != 200:
                continue
            xmldoc = xml.etree.ElementTree.fromstring(body)
        except (socket.error, httplib.HTTPException,
                xml.etree.ElementTree.ParseError):
            # Best effort: a failing handler must not hide the other stat.
            continue
        elements = xmldoc.findall(".//*[@name='%s']" % name)
        if elements:
            solr_stats[name] = elements[0].text

    return solr_stats


if __name__ == '__main__':
    if solr_ok():
        print("status OK solr responded to solr.PingRequestHandler query")
    else:
        print("status Critical solr failed to respond, or reported an error")

    solr_stats = solrstats()
    for stat in solr_stats:
        print('metric %s int64 %s' % (stat, solr_stats[stat]))
# Example:
# $ ./ssl_cert_expiration.sh example.com 443

# Example Alarm Criteria:
# if (metric['cert_end_in'] <= 0) {
#   return new AlarmStatus(CRITICAL, 'Certificate has expired on host');
# }
# if (metric['cert_end_in'] < 604800) {
#   return new AlarmStatus(WARNING, 'Certificate expires in less than 1 week');
# }
# return new AlarmStatus(OK, 'Certificate valid for more than 1 week');

# Copyright 2015 Rackspace

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 2 ]; then
    echo "Usage: $0 <host> <port>"
    exit 100
fi

HOST=$1
PORT=$2

# Fetch the peer certificate and keep only the date after "notAfter=".
EXPIRATION_DATE=$(echo "" | openssl s_client -connect "${HOST}:${PORT}" 2>/dev/null \
    | openssl x509 -noout -enddate 2>/dev/null | sed 's/^not.*=//')

# An empty date means the connection or the certificate parsing failed. Do not
# report ok: date -d "" would resolve to today and yield a bogus value.
if [ -z "$EXPIRATION_DATE" ]; then
    echo "status err unable to retrieve certificate from ${HOST}:${PORT}"
    exit 1
fi

if ! EXPIRATION_EPOCH=$(date -u -d "$EXPIRATION_DATE" +%s 2>/dev/null); then
    echo "status err unable to parse certificate end date: ${EXPIRATION_DATE}"
    exit 1
fi

REMAINING_SECONDS=$(( EXPIRATION_EPOCH - $(date +%s) ))

echo "status ok"
echo "metric cert_end_in int ${REMAINING_SECONDS}"
4 | # Author: Tomaz Muraus 5 | # License: MIT 6 | # 7 | # Permission is hereby granted, free of charge, to any person obtaining a copy 8 | # of this software and associated documentation files (the "Software"), to deal 9 | # in the Software without restriction, including without limitation the rights 10 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | # copies of the Software, and to permit persons to whom the Software is 12 | # furnished to do so, subject to the following conditions: 13 | # 14 | # The above copyright notice and this permission notice shall be included in 15 | # all copies or substantial portions of the Software. 16 | # 17 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | # THE SOFTWARE. 
if [ $# -ne 1 ] && [ $# -ne 2 ]; then
    echo "Usage: $0 <ip> [port]"
    exit 100
fi

IP=$1
PORT=${2:-443}

SUPPORTED_PROTOCOLS=()

# check_protocol <s_client flag> <metric name> <label>
#
# Attempt a handshake restricted to one protocol version and print exactly one
# "metric <name> string yes|no|unknown" line. Protocols that connect are
# appended to SUPPORTED_PROTOCOLS.
check_protocol() {
    local flag=$1 metric=$2 label=$3 output

    output=$(openssl s_client "-${flag}" -connect "${IP}:${PORT}" < /dev/null 2>&1)

    if grep -q "DONE" <<< "${output}"; then
        # s_client prints DONE when it closes an established session.
        SUPPORTED_PROTOCOLS+=("${metric}")
        echo "metric ${metric} string yes"
    elif grep -qE "wrong version number|handshake failure|protocol version" <<< "${output}"; then
        # The server refused this protocol version.
        echo "metric ${metric} string no"
    elif grep -qi "unknown option" <<< "${output}"; then
        # The local openssl binary was built without this protocol.
        echo "openssl doesn't support ${label}" >&2
        echo "metric ${metric} string unknown"
    else
        # Connection errors and anything unrecognised: still emit the metric.
        echo "metric ${metric} string unknown"
    fi
}

check_protocol ssl2   ssl_2_0 "SSL v2.0"
check_protocol ssl3   ssl_3_0 "SSL v3.0"
check_protocol tls1   tls_1_0 "TLS v1.0"
check_protocol tls1_1 tls_1_1 "TLS v1.1"
check_protocol tls1_2 tls_1_2 "TLS v1.2"

# Join the array with commas for the status line.
SUPPORTED_LIST=$(IFS=','; echo "${SUPPORTED_PROTOCOLS[*]}")
echo "status Supported protocols: ${SUPPORTED_LIST}"
# Rackspace cloud monitoring plugin for statsd metrics.
#
# Usage: python statsd_metric_emitter.py <watch_dir> metric1 [metric2 ...]

import os.path
import sys
import glob
import json
import numbers

# Output lines accumulated for the agent; printed by main().
ck_metrics = []
# Metric names requested on the command line.
filtered_metrics = []


def output_check_status(status, message):
    """Record the check status line; on "err" print it and exit.

    Args:
        status: agent status word, e.g. "ok" or "err".
        message: text that follows the status word.

    Raises:
        SystemExit: with exit code 1 when status is "err".
    """
    line = "status %s %s" % (status, message)
    ck_metrics.append(line)

    if status == "err":
        print(line)
        sys.exit(1)


def output_metrics(metrics):
    """
    Outputs the parsed metrics to the agent.

    Only names listed in filtered_metrics are emitted; statsd's own
    "statsd." prefixed entries are skipped. A dict value produces one
    "<name>.<key>" metric per numeric entry, a plain numeric value produces a
    single "<name>" metric. Non-numeric values are ignored.
    """
    for metric_type in ("counters", "timers", "gauges"):
        group = metrics.get(metric_type)
        if group is None:
            continue
        for name, val in group.items():
            if name.startswith('statsd.') or name not in filtered_metrics:
                continue
            if isinstance(val, dict):
                samples = [(name + '.' + k, v) for k, v in val.items()]
            else:
                samples = [(name, val)]
            for metric_name, v in samples:
                if isinstance(v, numbers.Number):
                    ck_metrics.append(
                        "metric %s %s %f" % (metric_name, 'float', v))


def parse_file(file_path, offset=0):
    """
    Opens a metrics file from statsd and parses its json.

    Each non-empty line read from offset onwards is decoded as one JSON
    document and passed to output_metrics().

    Returns the offset of what we last read so we can seek
    directly to it next time.
    """
    with open(file_path, 'rb') as fd:
        fd.seek(offset)
        data = fd.read()
        # Read the position while the file is still open.
        end_offset = fd.tell()

    for line in data.decode('utf-8').splitlines():
        if line.strip():
            output_metrics(json.loads(line))

    return end_offset


def find_latest_flush(files):
    """Return the last path in sorted order and delete all the others.

    Returns None when files is empty.
    """
    ordered = sorted(files)
    if not ordered:
        return None
    current_file = ordered.pop()
    # Older flushes are superseded by the newest one; remove them.
    for stale in ordered:
        os.remove(stale)
    return current_file


def main():
    """Read the newest flush file in the watch directory and print metrics."""
    if len(sys.argv) < 2:
        print("status err: 500 Expected a watch directory as argument (quitting)")
        sys.exit(1)
    if len(sys.argv) < 3:
        print("status err: 500 At least one metric name is required for filtering (quitting)")
        sys.exit(2)
    watch_dir = sys.argv[1]
    filtered_metrics.extend(sys.argv[2:])
    files = glob.glob(os.path.join(watch_dir, '[0-9]*.json'))
    current_file = find_latest_flush(files)
    if current_file is None:
        output_check_status('err', '204 NO CONTENT')
    else:
        parse_file(current_file)
        output_check_status('ok', '200 OK')
        print('\n'.join(ck_metrics))


if __name__ == "__main__":
    main()
Carr 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | # 23 | # # # # # # # # 24 | # 25 | # This check returns information on the health of systemctl services. 26 | # For more information on systemctl status strings, see: 27 | # https://www.freedesktop.org/software/systemd/man/systemctl.html#is-system-running 28 | # 29 | # Suggested alarm: 30 | # 31 | # if (metric['systemctl_status'] != "running" && metric['systemctl_status'] != "starting") { 32 | # return new AlarmStatus(CRITICAL, 'SystemCTL status is #{systemctl_status}! 
#     Details: #{systemctl_failed_units}');
# }
# return new AlarmStatus(OK, 'SystemCTL status is #{systemctl_status}');

# Overall system state word as printed by "systemctl is-system-running".
STATE=$(systemctl is-system-running)

# Failed units, flattened onto a single line for use as a metric value.
DETAILS=$(systemctl list-units --state=failed --no-legend --no-pager | tr '\n' ' ')

printf '%s\n' "status ok succeeded in obtaining metrics"
printf '%s\n' "metric systemctl_status string $STATE"
# Substitute a placeholder when there is nothing to list.
printf '%s\n' "metric systemctl_failed_units string ${DETAILS:-(no failed units)}"
IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | # THE SOFTWARE. 25 | 26 | OUTPUT=$(/usr/lib/update-notifier/apt-check 2>&1) 27 | 28 | if [ $? -ne 0 ]; then 29 | echo "Failed to retrieve a number of pending updates" 30 | exit 100 31 | fi 32 | 33 | PENDING_OTHER=$(echo "${OUTPUT}" | cut -d ";" -f 1) 34 | PENDING_SECURITY=$(echo "${OUTPUT}" | cut -d ";" -f 2) 35 | REBOOT_REQUIRED="no" 36 | 37 | if [ -f "/var/run/reboot-required" ]; then 38 | REBOOT_REQUIRED="yes" 39 | fi 40 | 41 | if [ $((PENDING_OTHER+PENDING_SECURITY)) -gt 0 ]; then 42 | UPGRADABLE_PACKAGES=$(apt list --upgradable 2>/dev/null | grep -v Listing | awk -F'/' '{print $1}' | paste -sd ',' -) 43 | else 44 | UPGRADABLE_PACKAGES="none" 45 | fi 46 | 47 | echo "status Pending updates: security ${PENDING_SECURITY}, other: ${PENDING_OTHER}" 48 | 49 | echo "metric pending_security uint32 ${PENDING_SECURITY}" 50 | echo "metric pending_other uint32 ${PENDING_OTHER}" 51 | echo "metric reboot_required string ${REBOOT_REQUIRED}" 52 | echo "metric upgradable_packages string ${UPGRADABLE_PACKAGES}" 53 | 54 | exit 0 -------------------------------------------------------------------------------- /uptime_reset_detector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | 3 | # uptime_reset_detector.sh v 0.1.0a 4 | # This script uses /dev/shm (volatile ramdisk )to detect reboots. 5 | # Only works on Linux. 6 | # 7 | # Rackspace Cloud Monitoring Plugin to detect uptime resets. 8 | # 9 | # Copyright (c) 2017, Brian King 10 | # All rights reserved. 11 | # 12 | # 13 | # Licensed under the Apache License, Version 2.0 (the "License"); 14 | # you may not use this file except in compliance with the License. 
15 | # You may obtain a copy of the License at 16 | # 17 | # http://www.apache.org/licenses/LICENSE-2.0 18 | # 19 | # Unless required by applicable law or agreed to in writing, software 20 | # distributed under the License is distributed on an "AS IS" BASIS, 21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | # See the License for the specific language governing permissions and 23 | # limitations under the License. 24 | 25 | # Example criteria: 26 | # 27 | # if (metric['uptime_reset_detected'] == 'true'){ 28 | # return new AlarmStatus(CRITICAL, 'Uptime reset detected.'); 29 | # } 30 | # return new AlarmStatus(OK, 'Server has not rebooted since the last time we checked.'); 31 | 32 | 33 | if [ -e /dev/shm/.lastreboot ]; then 34 | 35 | echo "status ok uptime_reset_detected false" 36 | echo "metric uptime_reset_detected string false just_rebooted" 37 | 38 | exit 0 39 | 40 | else 41 | 42 | echo "status critical uptime_reset_detected true" 43 | echo "metric uptime_reset_detected string true just_rebooted" 44 | 45 | #We're not doing anything but checking for the presence of the file yet, but 46 | # future versions could capture and report the delta between reboots 47 | 48 | uptime -s > /dev/shm/.lastreboot 49 | 50 | exit 1 51 | 52 | fi -------------------------------------------------------------------------------- /varnish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # varnish.sh 4 | # Rackspace Cloud Monitoring Plugin to collect metrics from varnishstat. 5 | # 6 | # Copyright (c) 2013, Rob Szumski 7 | # All rights reserved. 8 | # 9 | # Redistribution and use in source and binary forms, with or without 10 | # modification, are permitted provided that the following conditions are met: 11 | # 12 | # Redistributions of source code must retain the above copyright notice, 13 | # this list of conditions and the following disclaimer. 
14 | # 15 | # Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 
30 | # 31 | # See https://github.com/robszumski/rackspace-monitoring-varnish for a readme 32 | # and more information 33 | # 34 | # this plugin can optinally print particular statistics, just pass them as args 35 | # varnish.sh cache_hit,cache_hitpass,cache_miss 36 | # 37 | # 38 | # Example Criteria 39 | # if (metric['healthy_backends'] < 1) { 40 | # return new AlarmStatus(CRITICAL, 'Varnish doesnt have any backends!'); 41 | #} 42 | # 43 | #if (metric['healthy_backends'] < 2) { 44 | # return new AlarmStatus(WARNING, 'Varnish only has #{healthy_backends} healthy backend.'); 45 | #} 46 | # 47 | # NOTE: if you are running Varnish < 4 comment out healthy backends metrics (they don't work) 48 | # 49 | 50 | return new AlarmStatus(OK, 'Varnish has \#{healthy_backends} backends.'); 51 | 52 | # check if service is running 53 | SERVICE=varnish 54 | VARNISHSTAT=/usr/bin/varnishstat 55 | VARNISHADM=/usr/bin/varnishadm 56 | 57 | if P=$(pgrep $SERVICE | wc -l) 58 | then 59 | echo "status $SERVICE is running ($P instances)" 60 | else 61 | echo "status $SERVICE is not running" 62 | fi 63 | 64 | # output number of processes 65 | echo "metric processes int32 $P" 66 | 67 | # calculate hit percent 68 | hits=$($VARNISHSTAT -1 -f cache_hit | awk '{print $2}') 69 | connections=$($VARNISHSTAT -1 -f client_req| awk '{print $2}') 70 | hit_percent=$(echo "scale=8;($hits/$connections)" | bc | awk '{printf "%f", $1*100}') 71 | echo "metric hit_percent double "$hit_percent 72 | 73 | # calculate # of healthy backends 74 | healthy=$($VARNISHADM backend.list | grep -c "Healthy") 75 | echo "metric healthy_backends int32" $healthy 76 | 77 | [ ! 
-z $1 ] && $VARNISHSTAT -1 -f $1 | awk ' { print "metric " $1 " gauge " $2 } ' 78 | -------------------------------------------------------------------------------- /varnish4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # varnish4.sh 4 | # Rackspace Cloud Monitoring Plugin to collect metrics from varnishstat. 5 | # 6 | # Copyright (c) 2013, Rob Szumski 7 | # All rights reserved. 8 | # 9 | # Redistribution and use in source and binary forms, with or without 10 | # modification, are permitted provided that the following conditions are met: 11 | # 12 | # Redistributions of source code must retain the above copyright notice, 13 | # this list of conditions and the following disclaimer. 14 | # 15 | # Redistributions in binary form must reproduce the above copyright 16 | # notice, this list of conditions and the following disclaimer in the 17 | # documentation and/or other materials provided with the distribution. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 29 | # POSSIBILITY OF SUCH DAMAGE. 
30 | 31 | # check if service is running 32 | SERVICE=varnish 33 | VARNISHSTAT=/usr/bin/varnishstat 34 | VARNISHADM=/usr/bin/varnishadm 35 | HITSNAME=MAIN.cache_hit 36 | MISSESNAME=MAIN.cache_miss 37 | 38 | if P=$(pgrep $SERVICE | wc -l) 39 | then 40 | echo "status success" 41 | else 42 | echo "status down" 43 | fi 44 | 45 | # output number of processes 46 | echo "metric processes int32 $P" 47 | 48 | # calculate hit rate 49 | # cache_hit/(cache_hit + cache_miss) 50 | hits=$($VARNISHSTAT -1 -f $HITSNAME | awk '{print $2}') 51 | misses=$($VARNISHSTAT -1 -f $MISSESNAME | awk '{print $2}') 52 | hit_rate=$(echo "scale=8;($hits/($hits + $misses))" | bc | awk '{printf "%f", $1*100}') 53 | echo "metric hit_rate double" $hit_rate 54 | 55 | # calculate # of healthy backends 56 | healthy=$($VARNISHADM backend.list | grep -c "Healthy") 57 | echo "metric healthy_backends int32" $healthy 58 | 59 | [ ! -z $1 ] && $VARNISHSTAT -1 -f $1 | awk ' { print "metric " $1 " gauge " $2 } ' 60 | -------------------------------------------------------------------------------- /windows/get-counters.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Rackspace Cloud Monitoring Plug-In 3 | This is a plugin to gather Windows performance counters for use 4 | in Rackspace Monitoring checks. 5 | 6 | (c) 2018 Rackspace US, Inc 7 | 8 | All Rights Reserved. 9 | Licensed under the Apache License, Version 2.0 (the "License"); you may 10 | not use this file except in compliance with the License. 11 | You may obtain a copy of the License at 12 | 13 | http://www.apache.org/licenses/LICENSE-2.0 14 | 15 | Unless required by applicable law or agreed to in writing, software 16 | distributed under the License is distributed on an "AS IS" BASIS, 17 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | See the License for the specific language governing permissions and 19 | limitations under the License. 
20 |
21 | Usage:
22 | Place plug-in in C:\Program Files\Rackspace Monitoring\plugins
23 |
24 | It accepts a single argument which is the CounterPath as used in perfmon.
25 | For example, this will gather all counters for the logical disk C:
26 |
27 |     '\LogicalDisk(c:)\*'
28 |
29 | This plugin returns a metric for each counter gathered where the metric
30 | name is normalized into
31 |
32 |     $object.$instance.$counter
33 |
34 | For example:
35 |
36 |     logicaldisk.c.pct_free_space
37 |     logicaldisk.c.free_megabytes
38 |     logicaldisk.c.current_disk_queue_length
39 |     logicaldisk.c.pct_disk_time
40 |     logicaldisk.c.avg_disk_queue_length
41 |     logicaldisk.c.pct_disk_read_time
42 |     logicaldisk.c.avg_disk_read_queue_length
43 |
44 | #>
45 |
46 | # Prints one "metric <name> double <value>" line per sample returned for
47 | # $CounterPath, followed by a status line.
48 | function CM-GetCounters($CounterPath) {
49 |   $results = Get-Counter -Counter $CounterPath
50 |   # With no samples there is nothing to report (presumably the counter path was
51 |   # not valid), so say so instead of claiming success with zero metrics.
52 |   if (-not $results -or -not $results.CounterSamples) {
53 |     Write-Output "status err No counter samples returned for $CounterPath"
54 |     return
55 |   }
56 |   $results.CounterSamples | ForEach-Object {
57 |     $path = $_.Path
58 |     $val = $_.CookedValue
59 |     # Normalize the counter path into a metric name: drop the leading \\host\
60 |     # part, spell out % and /, turn separators into dots/underscores, lower-case.
61 |     $metric = ($path -replace '\\\\.*?\\','' -replace '%','pct' -replace '\\','.' -replace '/',' per ' -replace '\(','.' -replace '[):]','' -replace '\.\s+','_' -replace '\s+','_').ToLower() -replace '[^a-z0-9:\.]','_'
62 |     Write-Output "metric $metric double $val"
63 |   }
64 |   Write-Output "status ok success"
65 | }
66 |
67 | if($args.Count -lt 1) {
68 |   Write-Output "status err Missing required parameter: CounterPath"
69 |   exit
70 | }
71 |
72 | CM-GetCounters $args[0]
--------------------------------------------------------------------------------
/windows/ping.ps1:
--------------------------------------------------------------------------------
1 | <#
2 |
3 | Rackspace Cloud Monitoring Plug-In
4 |
5 | This is a plugin to monitor ICMP response times of hosts accessible by the server
6 |
7 | ----------------------------------------------------------------------------
8 | "THE BEER-WARE LICENSE" (Revision 42):
9 | wrote this file. As long as you retain this notice you
10 | can do whatever you want with this stuff.
If we meet some day, and you think
11 | this stuff is worth it, you can buy me a beer in return.
12 | ----------------------------------------------------------------------------
13 |
14 | Usage:
15 | - Place plug-in in folder C:\Program Files\Rackspace Monitoring\plugins
16 | - Configure Custom Plugin type check in Rackspace Intelligence
17 |   Specify only the script's name and the hostname/IP to ping, e.g.:
18 |   ping.ps1 192.168.0.1
19 |   Count is the amount of ICMP probes sent in a single check, and interval is the
20 |   number of seconds between them. They are both optional. Their default values
21 |   are 5 pings with an interval of 2 seconds.
22 | - Configure an Alert (optional, see example below).
23 |
24 | This plugin returns 4 metrics:
25 | - minimum, average, maximum: statistics returned by the Windows ping utility
26 |   in the format "Minimum = 0ms, Maximum = 17ms, Average = 4ms"
27 | - lost_packets: the percentage of the packets lost out of the number of probes
28 |   sent
29 |
30 | Example alert:
31 |
32 | --- start copying after this line ---
33 |
34 | if (metric['average'] >= 30 ) {
35 |   return new AlarmStatus(WARNING, 'Average round-trip took #{average}ms');
36 | }
37 | if (metric['lost_packets'] >= 40) {
38 |   return new AlarmStatus(WARNING, 'Packet loss was #{lost_packets}%');
39 | }
40 | if (metric['legacy_state'] != "ok") {
41 |   return new AlarmStatus(CRITICAL, 'Error: #{legacy_state}');
42 | }
43 | return new AlarmStatus(OK, 'All good');
44 |
45 | --- stop copying before this line ---
46 |
47 | #>
48 |
49 | # Sends $count single-probe pings (default 5) to $TargetHost, sleeping $interval
50 | # seconds (default 2) after each probe whose ping call exited with code 0, then
51 | # prints the minimum/average/maximum round-trip and lost_packets metrics.
52 | function CM-Ping($TargetHost, $count, $interval) {
53 |   $lost_packets = 0
54 |   if (-not $count) { $count = 5 }
55 |   if (-not $interval ) { $interval = 2 }
56 |   # Each list is initialised on its own so that "+=" appends to an int array.
57 |   [int[]] $ping_min = @()
58 |   [int[]] $ping_max = @()
59 |   [int[]] $ping_avg = @()
60 |   for ($i=0; $i -lt $count; $i++) {
61 |     # ping is invoked directly instead of through Invoke-Expression so the host
62 |     # argument is passed as data and never re-parsed as PowerShell code.
63 |     $lines = ping.exe -n 1 -w 30 $TargetHost | select-string "loss|average"
64 |     if (0 -eq $LASTEXITCODE) {
65 |       $stats_loss = $lines[0]
66 |       $stats_ping = $lines[1]
67 |       $probe_lost = $false
68 |       if ("$stats_loss" -match '\((\d+)%' -and [int]$Matches[1] -gt 0) {
69 |         $probe_lost = $true
70 |       }
71 |       # Record timings only when the statistics line carries all three values;
72 |       # a probe without timings is counted as lost.
73 |       if ("$stats_ping" -match 'Minimum\s*=\s*(\d+)ms,\s*Maximum\s*=\s*(\d+)ms,\s*Average\s*=\s*(\d+)ms') {
74 |         $ping_min += [int]$Matches[1]
75 |         $ping_max += [int]$Matches[2]
76 |         $ping_avg += [int]$Matches[3]
77 |       }
78 |       else {
79 |         $probe_lost = $true
80 |       }
81 |       if ($probe_lost) { $lost_packets++ }
82 |       sleep $interval
83 |     }
84 |     else {
85 |       $lost_packets++
86 |     }
87 |   }
88 |   # Metrics are only meaningful if at least one probe produced timings.
89 |   if ( $ping_min.Count -gt 0 ) {
90 |     Write-Output "metric minimum int32 $(($ping_min | measure -Minimum).Minimum) milliseconds"
91 |     Write-Output "metric average double $(($ping_avg | measure -Average).Average) milliseconds"
92 |     Write-Output "metric maximum int32 $(($ping_max | measure -Maximum).Maximum) milliseconds"
93 |     Write-Output "metric lost_packets int32 $([int](([int]$lost_packets / [int]$count) * 100)) percent"
94 |     Write-Output "status ok"
95 |   }
96 |   else {
97 |     Write-Output "status err $TargetHost could not be reached"
98 |   }
99 | }
100 |
101 | if($args.Count -lt 1) {
102 |   Write-Output "status err Missing required parameter"
103 |   exit
104 | }
105 |
106 | CM-Ping -TargetHost $args[0] $args[1] $args[2]
107 |
--------------------------------------------------------------------------------
/windows/service_mon.ps1:
--------------------------------------------------------------------------------
1 | <#
2 |
3 | Script to return status of a Windows service.
4 |
5 | Teddy Schmitz
6 | All Rights Reserved.
7 |
8 | Licensed under the Apache License, Version 2.0 (the "License"); you may
9 | not use this file except in compliance with the License.
10 | You may obtain a copy of the License at
11 |
12 |     http://www.apache.org/licenses/LICENSE-2.0
13 |
14 | Unless required by applicable law or agreed to in writing, software
15 | distributed under the License is distributed on an "AS IS" BASIS,
16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | See the License for the specific language governing permissions and
18 | limitations under the License.
19 |
20 | Configuration:
21 |
22 | You must supply the name of the service in the arguments section of the agent config.
23 | For example to monitor the plug in play service your check config should look like this:
24 |
25 | "details": {
26 |     "args": [
27 |         "Plug and Play"
28 |     ],
29 |     "file": "service_mon.ps1"
30 | }
31 |
32 |
33 | Example alarm criteria:
34 |
35 | if (metric['service_status'] != 'running') {
36 |     return new AlarmStatus(CRITICAL, 'Service is NOT running.');
37 | }
38 |
39 |
40 | #>
41 |
42 |
43 | # Looks up the named service and prints a service_status metric ("running" or
44 | # "notrunning") followed by a status line.
45 | function FuncCheckService{
46 |     param($ServiceName)
47 |     try {
48 |         $service = Get-Service -Name $ServiceName -ErrorAction Stop
49 |     }
50 |     catch [Microsoft.PowerShell.Commands.ServiceCommandException] {
51 |         # The lookup failed: report it and end the script.
52 |         Write-Output "status err $ServiceName service not found"
53 |         exit
54 |     }
55 |
56 |     $currentStatus = $service.Status
57 |     if ($currentStatus -ne "Running") {
58 |         Write-Output "metric service_status string notrunning"
59 |         Write-Output "status ok found service"
60 |     }
61 |     if ($currentStatus -eq "running") {
62 |         Write-Output "metric service_status string running"
63 |         Write-Output "status ok found service"
64 |     }
65 | }
66 |
67 | # The service name is the first (and only required) script argument.
68 | if ($args.Count -ge 1) {
69 |     FuncCheckService -ServiceName $args[0]
70 | }
71 | else {
72 |     Write-Output "status err no service specified"
73 |     exit
74 | }
--------------------------------------------------------------------------------
/yum_updates_check.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Description: Custom plugin returns number of pending updates on a
4 | # yum-based system.
5 | # Author: Andrew Regner 6 | # License: MIT 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in 16 | # all copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | # THE SOFTWARE. 25 | 26 | yum check-update | awk ' 27 | $2 ~ /^[0-9]/ { 28 | count[$3] += 1; 29 | total += 1; 30 | } 31 | 32 | END { 33 | if(total > 0) 34 | printf("status pending updates: %d\n", total); 35 | else 36 | printf("status no updates\n"); 37 | 38 | printf("metric total_updates uint32 %d\n", total); 39 | for(repo in count) 40 | printf("metric pending_%s uint32 %d\n", repo, count[repo]); 41 | }' 42 | --------------------------------------------------------------------------------