├── README └── check_graphite_data /README: -------------------------------------------------------------------------------- 1 | Some handy scripts we've written for Nagios, at Etsy: 2 | 3 | 4 | check_graphite_data: 5 | Alert on data, based on number from Graphite. 6 | Usage: 7 | check_graphite_data 8 | Options: 9 | -c --crit= Critical threshold 10 | -w --warn= Warning threshold 11 | -u --url= Graphite graph URL 12 | -r Reverse - Alert when the value is UNDER warn/crit instead of OVER 13 | -s --seconds= Average over the last N seconds of data 14 | --d1 --d2 Diff the latest values between two graphs 15 | -W --holt-winters Perform a Holt-Winters check 16 | -U --critupper Upper Holt-Winters band breach causes a crit, 17 | - breaching lower band causes a warn 18 | -L --critlower Lower Holt-Winters band breach causes a crit, 19 | - breaching upper band causes a warn 20 | (If -W, but neither -U nor -L are given, we will always warn) 21 | --scale= scale, eg 8 to treat bytes as bits 22 | --scale-invert= scale, eg 1073741824 to treat bytes as GiB 23 | - --scale{-invert} lets you give thresholds in sane units 24 | -------------------------------------------------------------------------------- /check_graphite_data: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import getopt 4 | import sys 5 | import urllib 6 | 7 | STATE_OK = 0 8 | STATE_WARNING = 1 9 | STATE_CRITICAL = 2 10 | STATE_UNKNOWN = 3 11 | STATE_DEPENDENT = 4 12 | 13 | def usage(): 14 | print 'Usage:' 15 | print '\tcheck_graphite_data ' 16 | print 'Options:' 17 | print '\t-c --crit=\t\tCritical threshold' 18 | print '\t-w --warn=\t\tWarning threshold' 19 | print '\t-u --url=\t\tGraphite graph URL' 20 | print '\t-r\t\t\t\tReverse - Alert when the value is UNDER warn/crit instead of OVER' 21 | print '\t-s --seconds=\tAverage over the last N seconds of data' 22 | print '\t--d1 --d2 \t\tDiff the latest values between two graphs' 23 | print '\t-W --holt-winters\t\tPerform a Holt-Winters check' 24 | print '\t-U --critupper\t\t\tUpper Holt-Winters band breach causes a crit,' 25 | print '\t\t\t\t\t- breaching lower band causes a warn' 26 | print '\t-L --critlower\t\t\tLower Holt-Winters band breach causes a crit,' 27 | print '\t\t\t\t\t- breaching upper band causes a warn' 28 | print '\t(If -W, but neither -U nor -L are given, we will always warn)' 29 | print '\t--scale=\t\tscale, eg 8 to treat bytes as bits' 30 | print '\t--scale-invert=\tscale, eg 1073741824 to treat bytes as GiB' 31 | print '\t\t\t\t\t- --scale{-invert} lets you give thresholds in sane units' 32 | 33 | 34 | def pull_graphite_data(url): 35 | """Pull down raw data from Graphite""" 36 | # Make sure the url ends with '&rawData' 37 | if not url.endswith('&rawData'): 38 | url = url + '&rawData' 39 | 40 | # Catch URL errors 41 | try: 42 | data = urllib.urlopen(url).read() 43 | if len(data) == 0: 44 | print "Error: No data was returned. Did you specify an existing metric? - " + url 45 | sys.exit(STATE_UNKNOWN) 46 | return data 47 | except Exception,e: 48 | print "Error: "+ str(e) +" - " + url 49 | sys.exit(STATE_UNKNOWN) 50 | 51 | def eval_graphite_data(data, seconds): 52 | """Get the most recent correct value from the data""" 53 | 54 | sample_period = int(data.split('|')[0].split(',')[-1]) 55 | all_data_points = data.split('|')[-1].split(',') 56 | 57 | # Evaluate what graphite returned, should either be a float, or None 58 | # First, if the number of seconds of data we want to examine is smaller or 59 | # equals the graphite sample period, just grab the latest data point. 60 | # If that data point is None, grab the one before it. 61 | # If that is None too, return 0.0. 62 | if seconds <= sample_period: 63 | if not all_data_points[-1].startswith("None"): 64 | data_value = float(all_data_points[-1]) 65 | elif not all_data_points[-2].startswith("None"): 66 | data_value = float(all_data_points[-2]) 67 | else: 68 | data_value = 0.0 69 | else: 70 | # Second, if we requested more than on graphite sample period, work out how 71 | # many sample periods we wanted (python always rounds division *down*) 72 | data_points = (seconds/sample_period) 73 | data_set = [ float(x) for x in all_data_points[-data_points:] 74 | if not x.startswith("None") ] 75 | if data_set: 76 | data_value = float( sum(data_set) / len(data_set) ) 77 | else: 78 | data_value = 0.0 79 | return data_value 80 | 81 | 82 | def get_hw_value(url, seconds=0): 83 | """Get the Holt-Winters value from a Graphite graph""" 84 | 85 | data = pull_graphite_data(url) 86 | for line in data.split(): 87 | if line.startswith('holtWintersConfidenceUpper'): 88 | graphite_upper = eval_graphite_data(line, seconds) 89 | elif line.startswith('holtWintersConfidenceLower'): 90 | graphite_lower = eval_graphite_data(line, seconds) 91 | else: 92 | graphite_data = eval_graphite_data(line, seconds) 93 | 94 | return graphite_data, graphite_lower, graphite_upper 95 | 96 | 97 | def get_value(url, seconds=0): 98 | """Get the value from a Graphite graph""" 99 | 100 | data = pull_graphite_data(url) 101 | data_value = eval_graphite_data(data, seconds) 102 | return data_value 103 | 104 | 105 | def main(argv): 106 | try: 107 | opts, args = getopt.getopt(argv, 'hWULru:c:w:s:', 108 | ['help', 'holt-winters', 'critupper', 109 | 'critlower', 'url=', 'crit=', 'warn=', 110 | 'seconds=', 'd1=', 'd2=' 111 | ,'scale=' ,'scale-invert=' 112 | ]) 113 | except getopt.GetoptError, err: 114 | print str(err) 115 | sys.exit(STATE_UNKNOWN) 116 | 117 | url = None 118 | warn = None 119 | crit = None 120 | seconds = 0 121 | diff1 = None 122 | diff2 = None 123 | reverse = False 124 | hw = None 125 | critupper = None 126 | critlower = None 127 | scale = 1 128 | for opt, arg in opts: 129 | if opt in ('-h', '--help'): 130 | usage() 131 | sys.exit() 132 | elif opt in ('-u', '--url'): 133 | url = arg 134 | elif opt in ('-w', '--warn'): 135 | warn = float(arg) 136 | elif opt in ('-c', '--crit'): 137 | crit = float(arg) 138 | elif opt in ('-s', '--seconds'): 139 | seconds = int(arg) 140 | elif opt in ('-r'): 141 | reverse = True 142 | elif opt in ('--d1'): 143 | diff1 = arg 144 | elif opt in ('--d2'): 145 | diff2 = arg 146 | elif opt in ('-W', '--holtwinters'): 147 | hw = True 148 | elif opt in ('-U', '--critupper'): 149 | critupper = True 150 | elif opt in ('-L', '--critlower'): 151 | critlower = True 152 | elif opt in ('--scale'): 153 | scale = float(arg) 154 | elif opt in ('--scale-invert'): 155 | scale = 1 / float(arg) 156 | if not hw and ((url == None) or (warn == None) or (crit == None)) \ 157 | and not diff1 and not diff2: 158 | usage() 159 | sys.exit(STATE_UNKNOWN) 160 | 161 | if (diff1 == None and diff2 != None) or (diff1 != None and diff2 == None): 162 | usage() 163 | sys.exit(STATE_UNKNOWN) 164 | 165 | if hw: 166 | graphite_data, graphite_lower, graphite_upper = get_hw_value(url, seconds) 167 | print 'Current value: %s, lower band: %s, upper band: %s' % \ 168 | (graphite_data, graphite_lower, graphite_upper) 169 | if (graphite_data > graphite_upper) or (graphite_data < graphite_lower): 170 | if critupper or critlower: 171 | sys.exit(STATE_CRITICAL) 172 | else: 173 | sys.exit(STATE_WARNING) 174 | else: 175 | sys.exit(STATE_OK) 176 | elif diff1 or diff2: 177 | graphite_data1 = get_value(diff1, seconds) 178 | graphite_data2 = get_value(diff2, seconds) 179 | graphite_data = abs(graphite_data1 - graphite_data2) 180 | else: 181 | graphite_data = get_value(url, seconds) 182 | graphite_data *= scale 183 | 184 | print 'Current value: %s, warn threshold: %s, crit threshold: %s' % \ 185 | (graphite_data, warn, crit) 186 | if reverse == True: 187 | if crit >= graphite_data: 188 | sys.exit(STATE_CRITICAL) 189 | elif warn >= graphite_data: 190 | sys.exit(STATE_WARNING) 191 | else: 192 | sys.exit(STATE_OK) 193 | else: 194 | if graphite_data >= crit: 195 | sys.exit(STATE_CRITICAL) 196 | elif graphite_data >= warn: 197 | sys.exit(STATE_WARNING) 198 | else: 199 | sys.exit(STATE_OK) 200 | 201 | 202 | if __name__ == '__main__': 203 | main(sys.argv[1:]) 204 | --------------------------------------------------------------------------------