├── graphite_metrics ├── __init__.py ├── sinks │ ├── __init__.py │ ├── dump.py │ ├── carbon_socket.py │ └── librato_metrics.py ├── processors │ ├── __init__.py │ └── hostname_prefix.py ├── loops │ ├── __init__.py │ └── basic.py ├── collectors │ ├── stats.py │ ├── irq.py │ ├── ping.py │ ├── memstats.py │ ├── slabinfo.py │ ├── memfrag.py │ ├── __init__.py │ ├── iptables_counts.py │ ├── cron_log.py │ ├── _ping.py │ ├── cjdns_peer_stats.py │ ├── sysstat.py │ └── cgacct.py ├── harvestd.py └── harvestd.yaml ├── .gitignore ├── MANIFEST.in ├── requirements.txt ├── COPYING ├── setup.py └── README.md /graphite_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /*.egg-info 2 | /build 3 | /dist 4 | /README.txt 5 | *.pyc 6 | *.pyo 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include graphite_metrics/harvestd.yaml 2 | include COPYING README.txt 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==3.09 2 | dbus-python==0.84 3 | distribute==0.6.24 4 | gevent==0.13.6 5 | greenlet==0.3.1 6 | iso8601==0.1.4 7 | layered-yaml-attrdict-config==12.05.3 8 | requests==0.11.1 9 | simplejson==2.1.1 10 | xattr==0.6.2 11 | -------------------------------------------------------------------------------- /graphite_metrics/sinks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | 5 | import logging 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | class Sink(object): 10 | 11 | def __init__(self, conf): 12 | self.conf = conf 13 | 14 | def dispatch(self, *tuples): 15 | raise NotImplementedError( 'Sink.dispatch method should be overidden in sink' 16 | ' subclasses to dispatch (metric_name, value, timestamp) tuples to whatever destination.' ) 17 | -------------------------------------------------------------------------------- /graphite_metrics/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | 5 | import logging 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | class Processor(object): 10 | 11 | def __init__(self, conf): 12 | self.conf = conf 13 | 14 | def process(self, dp_tuple, sinks): 15 | raise NotImplementedError( 'Processor.process method' 16 | ' should be overidden in processor subclasses to mangle' 17 | ' (name, value, timestamp) tuple in some way.' ) 18 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2012 Mike Kazantsev 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 
9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 12 | 13 | 0. You just DO WHAT THE FUCK YOU WANT TO. 14 | -------------------------------------------------------------------------------- /graphite_metrics/sinks/dump.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | 5 | from . import Sink 6 | 7 | import logging 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class Dumper(Sink): 12 | 13 | 'Just dumps the data to log. Useful for debugging.' 14 | 15 | def dispatch(self, *tuples): 16 | log.info('--- dump of {} datapoints'.format(len(tuples))) 17 | for name, value, ts_dp in tuples: 18 | log.info('Datapoint: {} {} {}'.format(name, value, ts_dp)) 19 | log.info('--- dump end') 20 | 21 | 22 | sink = Dumper 23 | -------------------------------------------------------------------------------- /graphite_metrics/loops/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from time import time 5 | 6 | import logging 7 | log = logging.getLogger(__name__) 8 | 9 | # Global configuration for harvestd, 10 | # intended to be set before initializing loops, 11 | # but should not be really relied upon - can be empty. 12 | cfg = dict() 13 | 14 | 15 | class Loop(object): 16 | 17 | def __init__(self, conf, time_func=time): 18 | self.conf, self.time_func = conf, time_func 19 | 20 | def start(self, collectors, processors, sinks): 21 | raise NotImplementedError( 'Loop.start method should be' 22 | ' overidden in loop subclasses to start poll/process/send loop' 23 | ' using passed Collector, Processor and Sink objects.' ) 24 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/stats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from io import open 5 | 6 | from . import Collector, Datapoint 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class Stats(Collector): 13 | 14 | def read(self): 15 | with open('/proc/stat', 'rb') as table: 16 | for line in table: 17 | label, vals = line.split(None, 1) 18 | total = int(vals.split(None, 1)[0]) 19 | if label == 'intr': name = 'irq.total.hard' 20 | elif label == 'softirq': name = 'irq.total.soft' 21 | elif label == 'processes': name = 'processes.forks' 22 | else: continue # no more useful data here 23 | yield Datapoint(name, 'counter', total, None) 24 | 25 | 26 | collector = Stats 27 | -------------------------------------------------------------------------------- /graphite_metrics/processors/hostname_prefix.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | import os 5 | 6 | from . import Processor 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class HostnamePrefix(Processor): 13 | 14 | 'Adds a hostname as a prefix to metric name.' 
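# Editor's note - illustrative example, not from the original source: with conf.hostname
# unset on a host named "webhost1", process() turns the hypothetical datapoint
# ('load.shortterm', 0.42, 1338198000) into ('webhost1.load.shortterm', 0.42, 1338198000)
# and passes the sinks mapping through unchanged.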
15 | 16 | def __init__(self, *argz, **kwz): 17 | super(HostnamePrefix, self).__init__(*argz, **kwz) 18 | self.prefix = self.conf.hostname 19 | if self.prefix is None: self.prefix = os.uname()[1] 20 | if not self.prefix.endswith('.'): self.prefix += '.' 21 | 22 | def process(self, dp_tuple, sinks): 23 | name, value, ts_dp = dp_tuple 24 | return (self.prefix + name, value, ts_dp), sinks 25 | 26 | 27 | processor = HostnamePrefix 28 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/irq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from io import open 5 | 6 | from . import Collector, Datapoint 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class IRQ(Collector): 13 | 14 | @staticmethod 15 | def _parse_irq_table(table): 16 | irqs = dict() 17 | bindings = map(bytes.lower, table.readline().strip().split()) 18 | bindings_cnt = len(bindings) 19 | for line in it.imap(bytes.strip, table): 20 | irq, line = line.split(None, 1) 21 | irq = irq.rstrip(':').lower() 22 | if irq in irqs: 23 | log.warn('Conflicting irq name/id: {!r}, skipping'.format(irq)) 24 | continue 25 | irqs[irq] = map(int, line.split(None, bindings_cnt)[:bindings_cnt]) 26 | return bindings, irqs 27 | 28 | def read(self): 29 | irq_tables = list() 30 | # /proc/interrupts 31 | with open('/proc/interrupts', 'rb') as table: 32 | irq_tables.append(self._parse_irq_table(table)) 33 | # /proc/softirqs 34 | with open('/proc/softirqs', 'rb') as table: 35 | irq_tables.append(self._parse_irq_table(table)) 36 | # dispatch 37 | for bindings, irqs in irq_tables: 38 | for irq, counts in irqs.viewitems(): 39 | if sum(counts) == 0: continue 40 | for bind, count in it.izip(bindings, counts): 41 | yield Datapoint('irq.{}.{}'.format(irq, bind), 'counter', count, None) 42 | 43 | 44 | collector = IRQ 45 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/ping.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from subprocess import Popen, PIPE 5 | from io import open 6 | import os, signal 7 | 8 | from . 
import Collector, Datapoint 9 | 10 | import logging 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class PingerInterface(Collector): 15 | 16 | def __init__(self, *argz, **kwz): 17 | super(PingerInterface, self).__init__(*argz, **kwz) 18 | self.hosts = dict(it.chain( 19 | ( ('v4:{}'.format(spec), name) 20 | for name, spec in (self.conf.hosts.ipv4 or dict()).viewitems() ), 21 | ( ('v6:{}'.format(spec), name) 22 | for name, spec in (self.conf.hosts.ipv6 or dict()).viewitems() ) )) 23 | if not self.hosts: 24 | log.info('No valid hosts to ping specified, disabling collector') 25 | self.conf.enabled = False 26 | else: self.spawn_pinger() 27 | 28 | def spawn_pinger(self): 29 | cmd = ( 30 | ['python', os.path.join(os.path.dirname(__file__), '_ping.py')] 31 | + map(bytes, [ self.conf.interval, 32 | self.conf.resolve.no_reply or 0, self.conf.resolve.time or 0, 33 | self.conf.ewma_factor, os.getpid(), self.conf.resolve.max_retries ]) 34 | + self.hosts.keys() ) 35 | log.debug('Starting pinger subprocess: {}'.format(' '.join(cmd))) 36 | self.proc = Popen(cmd, stdout=PIPE, close_fds=True) 37 | self.proc.stdout.readline() # wait until it's initialized 38 | 39 | def read(self): 40 | err = self.proc.poll() 41 | if err is not None: 42 | log.warn( 'Pinger subprocess has failed' 43 | ' (exit code: {}), restarting it'.format(err) ) 44 | self.spawn_pinger() 45 | else: 46 | self.proc.send_signal(signal.SIGQUIT) 47 | for line in iter(self.proc.stdout.readline, ''): 48 | line = line.strip() 49 | if not line: break 50 | host, ts_offset, rtt, lost = line.split() 51 | host = self.hosts[host] 52 | yield Datapoint('network.ping.{}.ping'.format(host), 'gauge', float(rtt), None) 53 | yield Datapoint('network.ping.{}.droprate'.format(host), 'counter', int(lost), None) 54 | 55 | 56 | collector = PingerInterface 57 | -------------------------------------------------------------------------------- /graphite_metrics/loops/basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | 5 | from . import Loop 6 | 7 | import logging 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class BasicLoop(Loop): 12 | 13 | 'Simple synchronous "while True: fetch && process && send" loop.' 
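# Editor's note - summary comment, not from the original source. Each pass of the
# loop below: poll every collector's read() for Datapoint objects, resolve them to
# (name, value, timestamp) tuples via Datapoint.get(), run each tuple through all
# processors (each returns the possibly-rewritten tuple plus the sink set to use),
# batch the surviving tuples per sink, dispatch the batches unless debug.dry_run is
# set, then sleep until the next multiple of conf.interval.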
14 | 15 | def start(self, collectors, processors, sinks): 16 | from time import time, sleep 17 | 18 | ts = self.time_func() 19 | while True: 20 | data = list() 21 | for name, collector in collectors.viewitems(): 22 | log.debug('Polling data from a collector (name: {}): {}'.format(name, collector)) 23 | try: data.extend(collector.read()) 24 | except Exception as err: 25 | log.exception( 'Failed to poll collector' 26 | ' (name: {}, obj: {}): {}'.format(name, collector, err) ) 27 | 28 | ts_now = self.time_func() 29 | sink_data = dict() # to batch datapoints on per-sink basis 30 | 31 | log.debug('Processing {} datapoints'.format(len(data))) 32 | for dp in it.ifilter(None, (dp.get(ts=ts_now) for dp in data)): 33 | proc_sinks = sinks.copy() 34 | for name, proc in processors.viewitems(): 35 | if dp is None: break 36 | try: dp, sinks = proc.process(dp, sinks) 37 | except Exception as err: 38 | log.exception(( 'Failed to process datapoint (data: {},' 39 | ' processor: {}, obj: {}): {}, discarding' ).format(dp, name, proc, err)) 40 | break 41 | else: 42 | if dp is None: continue 43 | for name, sink in proc_sinks.viewitems(): 44 | try: sink_data[name].append(dp) 45 | except KeyError: sink_data[name] = [dp] 46 | 47 | log.debug('Dispatching data to {} sink(s)'.format(len(sink_data))) 48 | if not self.conf.debug.dry_run: 49 | for name, tuples in sink_data.viewitems(): 50 | log.debug(( 'Sending {} datapoints to sink' 51 | ' (name: {}): {}' ).format(len(tuples), name, sink)) 52 | try: sinks[name].dispatch(*tuples) 53 | except Exception as err: 54 | log.exception( 'Failed to dispatch data to sink' 55 | ' (name: {}, obj: {}): {}'.format(name, sink, err) ) 56 | 57 | while ts < ts_now: ts += self.conf.interval 58 | ts_sleep = max(0, ts - self.time_func()) 59 | log.debug('Sleep: {}s'.format(ts_sleep)) 60 | sleep(ts_sleep) 61 | 62 | 63 | loop = BasicLoop 64 | -------------------------------------------------------------------------------- /graphite_metrics/sinks/carbon_socket.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from time import sleep 5 | import socket 6 | 7 | from . 
import Sink 8 | 9 | import logging 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class CarbonSocket(Sink): 14 | 15 | '''Simple blocking non-buffering sender 16 | to graphite carbon tcp linereceiver interface.''' 17 | 18 | def __init__(self, conf): 19 | super(CarbonSocket, self).__init__(conf) 20 | if not self.conf.debug.dry_run: self.connect() 21 | 22 | def connect(self, send=None): 23 | host, port = self.conf.host 24 | reconnects = self.conf.max_reconnects 25 | while True: 26 | try: 27 | try: 28 | addrinfo = list(reversed(socket.getaddrinfo( 29 | host, port, socket.AF_UNSPEC, socket.SOCK_STREAM ))) 30 | except socket.error as err: 31 | raise socket.gaierror(err.message) 32 | assert addrinfo, addrinfo 33 | while addrinfo: 34 | # Try connecting to all of the returned addresses 35 | af, socktype, proto, canonname, sa = addrinfo.pop() 36 | try: 37 | self.sock = socket.socket(af, socktype, proto) 38 | self.sock.connect(sa) 39 | except socket.error: 40 | if not addrinfo: raise 41 | log.debug('Connected to Carbon at {}:{}'.format(*sa)) 42 | if send: self.sock.sendall(send) 43 | 44 | except (socket.error, socket.gaierror) as err: 45 | if reconnects is not None: 46 | reconnects -= 1 47 | if reconnects <= 0: raise 48 | if isinstance(err, socket.gaierror): 49 | log.info('Failed to resolve host ({!r}): {}'.format(host, err)) 50 | else: log.info('Failed to connect to {}:{}: {}'.format(host, port, err)) 51 | if self.conf.reconnect_delay: 52 | sleep(max(0, self.conf.reconnect_delay)) 53 | 54 | else: break 55 | 56 | def close(self): 57 | try: self.sock.close() 58 | except: pass 59 | 60 | def reconnect(self, send=None): 61 | self.close() 62 | self.connect(send=send) 63 | 64 | def dispatch(self, *tuples): 65 | reconnects = self.conf.max_reconnects 66 | packet = ''.join(it.starmap('{} {} {}\n'.format, tuples)) 67 | try: self.sock.sendall(packet) 68 | except socket.error as err: 69 | log.error('Failed to send data to Carbon server: {}'.format(err)) 70 | self.reconnect(send=packet) 71 | 72 | 73 | sink = CarbonSocket 74 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/memstats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | import re 5 | 6 | from . 
import Collector, Datapoint 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class MemStats(Collector): 13 | 14 | _warn_hp = True 15 | 16 | @staticmethod 17 | def _camelcase_fix( name, 18 | _re1=re.compile(r'(.)([A-Z][a-z]+)'), 19 | _re2=re.compile(r'([a-z0-9])([A-Z])'), 20 | _re3=re.compile(r'_+') ): 21 | return _re3.sub('_', _re2.sub( 22 | r'\1_\2', _re1.sub(r'\1_\2', name) )).lower() 23 | 24 | def read(self): 25 | # /proc/vmstat 26 | with open('/proc/vmstat', 'rb') as table: 27 | for line in table: 28 | metric, val = line.strip().split(None, 1) 29 | val = int(val) 30 | if metric.startswith('nr_'): 31 | yield Datapoint( 'memory.pages.allocation.{}'\ 32 | .format(metric[3:]), 'gauge', val, None ) 33 | else: 34 | yield Datapoint( 'memory.pages.activity.{}'\ 35 | .format(metric), 'gauge', val, None ) 36 | # /proc/meminfo 37 | with open('/proc/meminfo', 'rb') as table: 38 | table = dict(line.strip().split(None, 1) for line in table) 39 | hp_size = table.pop('Hugepagesize:', None) 40 | if hp_size and not hp_size.endswith(' kB'): hp_size = None 41 | if hp_size: hp_size = int(hp_size[:-3]) 42 | elif self._warn_hp: 43 | log.warn('Unable to get hugepage size from /proc/meminfo') 44 | self._warn_hp = False 45 | for metric, val in table.viewitems(): 46 | if metric.startswith('DirectMap'): continue # static info 47 | # Name mangling 48 | metric = self._camelcase_fix( 49 | metric.rstrip(':').replace('(', '_').replace(')', '') ) 50 | if metric.startswith('s_'): metric = 'slab_{}'.format(metric[2:]) 51 | elif metric.startswith('mem_'): metric = metric[4:] 52 | elif metric == 'slab': metric = 'slab_total' 53 | # Value processing 54 | try: val, val_unit = val.split() 55 | except ValueError: # no units assumed as number of pages 56 | if not metric.startswith('huge_pages_'): 57 | log.warn( 'Unhandled page-measured' 58 | ' metric in /proc/meminfo: {}'.format(metric) ) 59 | continue 60 | val = int(val) * hp_size 61 | else: 62 | if val_unit != 'kB': 63 | log.warn('Unhandled unit type in /proc/meminfo: {}'.format(val_unit)) 64 | continue 65 | val = int(val) 66 | yield Datapoint( 'memory.allocation.{}'\ 67 | .format(metric), 'gauge', val * 1024, None ) 68 | 69 | 70 | collector = MemStats 71 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from glob import iglob 5 | import os, sys 6 | 7 | from setuptools import setup, find_packages 8 | 9 | pkg_root = os.path.dirname(__file__) 10 | 11 | entry_points = dict(console_scripts=['harvestd = graphite_metrics.harvestd:main']) 12 | entry_points.update( 13 | ('graphite_metrics.{}'.format(ep_type), list( 14 | '{0} = graphite_metrics.{1}.{0}'\ 15 | .format(os.path.basename(fn)[:-3], ep_type) 16 | for fn in iglob(os.path.join( 17 | pkg_root, 'graphite_metrics', ep_type, '[!_]*.py' )) )) 18 | for ep_type in ['collectors', 'processors', 'sinks', 'loops'] ) 19 | 20 | # Error-handling here is to allow package to be built w/o README included 21 | try: readme = open(os.path.join(pkg_root, 'README.txt')).read() 22 | except IOError: readme = '' 23 | 24 | setup( 25 | 26 | name = 'graphite-metrics', 27 | version = '15.7.0', 28 | author = 'Mike Kazantsev', 29 | author_email = 'mk.fraggod@gmail.com', 30 | license = 'WTFPL', 31 | keywords = 'graphite sysstat systemd cgroups metrics proc', 32 | url = 'http://github.com/mk-fg/graphite-metrics', 33 | 34 |
description = 'Standalone Graphite metric data collectors for' 35 | ' various stuff thats not (or poorly) handled by other monitoring daemons', 36 | long_description = readme, 37 | 38 | classifiers = [ 39 | 'Development Status :: 4 - Beta', 40 | 'Environment :: No Input/Output (Daemon)', 41 | 'Intended Audience :: Developers', 42 | 'Intended Audience :: System Administrators', 43 | 'Intended Audience :: Telecommunications Industry', 44 | 'License :: OSI Approved', 45 | 'Operating System :: POSIX', 46 | 'Operating System :: Unix', 47 | 'Programming Language :: Python', 48 | 'Programming Language :: Python :: 2.7', 49 | 'Programming Language :: Python :: 2 :: Only', 50 | 'Topic :: Internet', 51 | 'Topic :: Internet :: Log Analysis', 52 | 'Topic :: System :: Monitoring', 53 | 'Topic :: System :: Networking :: Monitoring', 54 | 'Topic :: System :: Operating System Kernels :: Linux' ], 55 | 56 | install_requires = ['layered-yaml-attrdict-config', 'setuptools'], 57 | extras_require = { 58 | 'collectors.cgacct': ['dbus-python'], 59 | 'collectors.cron_log': ['xattr', 'iso8601'], 60 | 'collectors.sysstat': ['xattr'], 61 | 'sinks.librato_metrics': ['requests'], 62 | 'sinks.librato_metrics.async': ['gevent'] }, 63 | 64 | packages = find_packages(), 65 | package_data = {'': ['README.txt'], 'graphite_metrics': ['harvestd.yaml']}, 66 | exclude_package_data = {'': ['README.*']}, 67 | 68 | entry_points = entry_points ) 69 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/slabinfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from collections import namedtuple 5 | from io import open 6 | 7 | from . 
import Collector, Datapoint, page_size 8 | 9 | import logging 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class SlabInfo(Collector): 14 | 15 | version_check = '2.1' 16 | 17 | def __init__(self, *argz, **kwz): 18 | super(SlabInfo, self).__init__(*argz, **kwz) 19 | 20 | for k in 'include_prefixes', 'exclude_prefixes': 21 | if not self.conf.get(k): self.conf[k] = list() 22 | 23 | with open('/proc/slabinfo', 'rb') as table: 24 | line = table.readline() 25 | self.version = line.split(':')[-1].strip() 26 | if self.version_check\ 27 | and self.version != self.version_check: 28 | log.warn( 'Slabinfo header indicates' 29 | ' different schema version (expecting: {}): {}'\ 30 | .format(self.version_check, line) ) 31 | line = table.readline().strip().split() 32 | if line[0] != '#' or line[1] != 'name': 33 | log.error('Unexpected slabinfo format, not processing it') 34 | return 35 | headers = dict(name=0) 36 | for idx,header in enumerate(line[2:], 1): 37 | if header[0] == '<' and header[-1] == '>': headers[header[1:-1]] = idx 38 | pick = 'name', 'active_objs', 'objsize', 'pagesperslab', 'active_slabs', 'num_slabs' 39 | picker = op.itemgetter(*op.itemgetter(*pick)(headers)) 40 | record = namedtuple('slabinfo_record', ' '.join(pick)) 41 | self.parse_line = lambda line: record(*( (int(val) if idx else val) 42 | for idx,val in enumerate(picker(line.strip().split())) )) 43 | 44 | # http://elinux.org/Slab_allocator 45 | def read(self): 46 | parse_line, ps = self.parse_line, page_size 47 | with open('/proc/slabinfo', 'rb') as table: 48 | table.readline(), table.readline() # header 49 | for line in table: 50 | info = parse_line(line) 51 | for prefix in self.conf.include_prefixes: 52 | if info.name.startswith(prefix): break # force-include 53 | else: 54 | for prefix in self.conf.exclude_prefixes: 55 | if info.name.startswith(prefix): 56 | info = None 57 | break 58 | if info: 59 | vals = [ 60 | ('obj_active', info.active_objs * info.objsize), 61 | ('slab_active', info.active_slabs * info.pagesperslab * ps), 62 | ('slab_allocated', info.num_slabs * info.pagesperslab * ps) ] 63 | if self.conf.pass_zeroes or sum(it.imap(op.itemgetter(1), vals)) != 0: 64 | for val_name, val in vals: 65 | yield Datapoint( 'memory.slabs.{}.bytes_{}'\ 66 | .format(info.name, val_name), 'gauge', val, None ) 67 | 68 | 69 | collector = SlabInfo 70 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/memfrag.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from io import open 5 | import re 6 | 7 | from . 
import Collector, Datapoint, page_size_kb 8 | 9 | import logging 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class MemFrag(Collector): 14 | 15 | def read( self, 16 | _re_buddyinfo=re.compile(r'^\s*Node\s+(?P\d+)' 17 | r',\s+zone\s+(?P\S+)\s+(?P.*)$'), 18 | _re_ptinfo=re.compile(r'^\s*Node\s+(?P\d+)' 19 | r',\s+zone\s+(?P\S+),\s+type\s+(?P\S+)\s+(?P.*)$') ): 20 | mmap, pskb = dict(), page_size_kb 21 | 22 | # /proc/buddyinfo 23 | with open('/proc/buddyinfo', 'rb') as table: 24 | for line in it.imap(bytes.strip, table): 25 | match = _re_buddyinfo.search(line) 26 | if not match: 27 | log.warn('Unrecognized line in /proc/buddyinfo, skipping: {!r}'.format(line)) 28 | continue 29 | node, zone = int(match.group('node')), match.group('zone').lower() 30 | counts = dict( ('{}k'.format(pskb*2**order),count) 31 | for order,count in enumerate(it.imap(int, match.group('counts').strip().split())) ) 32 | if node not in mmap: mmap[node] = dict() 33 | if zone not in mmap[node]: mmap[node][zone] = dict() 34 | mmap[node][zone]['available'] = counts 35 | 36 | # /proc/pagetypeinfo 37 | with open('/proc/pagetypeinfo', 'rb') as table: 38 | page_counts_found = False 39 | while True: 40 | line = table.readline() 41 | if not line: break 42 | elif 'Free pages count' not in line: 43 | while line.strip(): line = table.readline() 44 | continue 45 | elif page_counts_found: 46 | log.warn( 'More than one free pages' 47 | ' counters section found in /proc/pagetypeinfo' ) 48 | continue 49 | else: 50 | page_counts_found = True 51 | for line in it.imap(bytes.strip, table): 52 | if not line: break 53 | match = _re_ptinfo.search(line) 54 | if not match: 55 | log.warn( 'Unrecognized line' 56 | ' in /proc/pagetypeinfo, skipping: {!r}'.format(line) ) 57 | continue 58 | node, zone, mtype = int(match.group('node')),\ 59 | match.group('zone').lower(), match.group('mtype').lower() 60 | counts = dict( ('{}k'.format(pskb*2**order),count) 61 | for order,count in enumerate(it.imap(int, match.group('counts').strip().split())) ) 62 | if node not in mmap: mmap[node] = dict() 63 | if zone not in mmap[node]: mmap[node][zone] = dict() 64 | mmap[node][zone][mtype] = counts 65 | if not page_counts_found: 66 | log.warn('Failed to find free pages counters in /proc/pagetypeinfo') 67 | 68 | # Dispatch values from mmap 69 | for node,zones in mmap.viewitems(): 70 | for zone,mtypes in zones.viewitems(): 71 | for mtype,counts in mtypes.viewitems(): 72 | if sum(counts.viewvalues()) == 0: continue 73 | for size,count in counts.viewitems(): 74 | yield Datapoint( 'memory.fragmentation.{}'\ 75 | .format('.'.join(it.imap( bytes, 76 | ['node_{}'.format(node),zone,mtype,size] ))), 77 | 'gauge', count, None ) 78 | 79 | 80 | collector = MemFrag 81 | -------------------------------------------------------------------------------- /graphite_metrics/sinks/librato_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from time import time 5 | import types 6 | 7 | from requests.auth import HTTPBasicAuth 8 | import requests 9 | 10 | try: from simplejson import dumps 11 | except ImportError: from json import dumps 12 | 13 | from . import Sink 14 | 15 | import logging 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | class LibratoMetrics(Sink): 20 | 21 | '''Interface to a Librato Metrics API v1. Uses JSON Array format. 
22 | Relevant part of the docs: http://dev.librato.com/v1/post/metrics''' 23 | 24 | def __init__(self, *argz, **kwz): 25 | super(LibratoMetrics, self).__init__(*argz, **kwz) 26 | 27 | # Try to set reasonable defaults 28 | if self.conf.http_parameters.timeout is None: 29 | try: 30 | from . import cfg 31 | self.conf.http_parameters.timeout = cfg.loop.interval / 2 32 | except (ImportError, KeyError): self.conf.http_parameters.timeout = 30 33 | self.conf.http_parameters.auth = HTTPBasicAuth(*self.conf.http_parameters.auth) 34 | 35 | requests.defaults.keep_alive = True 36 | requests.defaults.max_retries = max(3, self.conf.http_parameters.timeout / 5) 37 | 38 | # Try to init concurrent (async) dispatcher 39 | self.send = lambda chunk, **kwz: requests.post(data=chunk, **kwz) 40 | if self.conf.chunk_data.enabled or self.conf.chunk_data.enabled is None: 41 | try: from requests import async 42 | except RuntimeError as err: 43 | if self.conf.chunk_data.enabled: raise 44 | else: 45 | log.warn(( 'Failed to initialize requests.async' 46 | ' engine (gevent module missing?): {}, concurrent' 47 | ' (chunked) measurements submission will be disabled' ).format(err)) 48 | self.conf.chunk_data.enabled = False 49 | else: 50 | self.conf.chunk_data.enabled = True 51 | if not self.conf.chunk_data.max_concurrent_requests\ 52 | or self.conf.chunk_data.max_concurrent_requests <= 0: 53 | self.conf.chunk_data.max_concurrent_requests = None 54 | self.send = lambda *chunks, **kwz:\ 55 | map( op.methodcaller('raise_for_status'), 56 | async.map( 57 | list(async.post(data=chunk, **kwz) for chunk in chunks), 58 | size=self.conf.chunk_data.max_concurrent_requests ) ) 59 | 60 | def measurement(self, name, value, ts_dp=None): 61 | measurement = dict() 62 | if self.conf.source_from_prefix: 63 | measurement['source'], name = name.split('.', 1) 64 | elif self.conf.source: measurement['source'] = self.conf.source 65 | if ts_dp: measurement['measure_time'] = ts_dp 66 | measurement.update(name=name, value=value) 67 | return measurement 68 | 69 | def dispatch(self, *tuples): 70 | data = dict() 71 | if self.conf.unified_measure_time: 72 | data['measure_time'] = int(time()) 73 | tuples = list((name, value, None) for name, value, ts_dp in tuples) 74 | if self.conf.chunk_data.enabled\ 75 | and len(tuples) > self.conf.chunk_data.max_chunk_size: 76 | chunks, n = list(), 0 77 | while n < len(tuples): 78 | n_to = n + self.conf.chunk_data.max_chunk_size 79 | chunk = data.copy() 80 | chunk['gauges'] = list(it.starmap(self.measurement, tuples[n:n_to])) 81 | chunks.append(chunk) 82 | n = n_to 83 | log.debug(( 'Splitting {} measurements' 84 | ' into {} concurrent requests' ).format(len(tuples), len(chunks))) 85 | data = map(dumps, chunks) 86 | del tuples, chunk, chunks # to gc ram from this corpus of data 87 | else: # single chunk 88 | data['gauges'] = list(it.starmap(self.measurement, tuples)) 89 | data = [dumps(data)] 90 | del tuples 91 | self.send(*data, headers={ 92 | 'content-type': 'application/json' }, **self.conf.http_parameters) 93 | 94 | 95 | sink = LibratoMetrics 96 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from collections import namedtuple 5 | from glob import iglob 6 | from time import time 7 | import os 8 | 9 | import logging 10 | log = logging.getLogger(__name__) 11 | 12 | 13 
| page_size = os.sysconf('SC_PAGE_SIZE') 14 | page_size_kb = page_size // 1024 15 | user_hz = os.sysconf('SC_CLK_TCK') 16 | sector_bytes = 512 17 | 18 | # Global configuration for harvestd, 19 | # intended to be set before initializing collectors, 20 | # but should not be really relied upon - can be empty. 21 | cfg = dict() 22 | 23 | 24 | def rate_limit(max_interval=20, sampling=3, f=lambda x: x): 25 | '''x rises by 1 from 0 on each iteraton, back to 0 on triggering. 26 | f(x) should rise up to f(max_interval) in some way (with default 27 | "f(x)=x" probability rises lineary with 100% chance on "x=max_interval"). 28 | "sampling" affect probablility in an "c=1-(1-c0)*(1-c1)*...*(1-cx)" exponential way.''' 29 | from random import random 30 | val = 0 31 | val_max = float(f(max_interval)) 32 | while True: 33 | if val % sampling == 0: 34 | trigger = random() > (val_max - f(val)) / val_max 35 | if trigger: val = 0 36 | yield trigger 37 | else: yield False 38 | val += 1 39 | 40 | 41 | def dev_resolve( major, minor, 42 | log_fails=True, _cache = dict(), _cache_time=600 ): 43 | ts_now, dev_cached = time(), False 44 | while True: 45 | if not _cache: ts = 0 46 | else: 47 | dev = major, minor 48 | dev_cached, ts = (None, _cache[None])\ 49 | if dev not in _cache else _cache[dev] 50 | # Update cache, if necessary 51 | if ts_now > ts + _cache_time or dev_cached is False: 52 | _cache.clear() 53 | for link in it.chain(iglob('/dev/mapper/*'), iglob('/dev/sd*'), iglob('/dev/xvd*')): 54 | link_name = os.path.basename(link) 55 | try: link_dev = os.stat(link).st_rdev 56 | except OSError: continue # EPERM, EINVAL 57 | _cache[(os.major(link_dev), os.minor(link_dev))] = link_name, ts_now 58 | _cache[None] = ts_now 59 | continue # ...and try again 60 | if dev_cached: dev_cached = dev_cached.replace('.', '_') 61 | elif log_fails: 62 | log.warn( 'Unable to resolve device' 63 | ' from major/minor numbers: {}:{}'.format(major, minor) ) 64 | return dev_cached or None 65 | 66 | 67 | class Collector(object): 68 | 69 | def __init__(self, conf): 70 | self.conf = conf 71 | 72 | def read(self): 73 | raise NotImplementedError( 'Collector.read method should be' 74 | ' overidden in collector subclasses to return list of Datapoint objects.' ) 75 | # return [Datapoint(...), Datapoint(...), ...] 
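# Editor's note: the sketch below is an illustrative addition, not part of the
# original module. It shows the minimal shape of a collector plugin built on the
# Collector/Datapoint API above - subclass Collector, override read() to yield
# Datapoint(name, type, value, ts) entries (type 'gauge' or 'counter', ts=None to
# use the poll timestamp), and, in a real plugin module, expose the class as a
# module-level "collector" attribute for the harvestd entry-point loader. The
# class and metric names here are made up for the example.

class UptimeExample(Collector):

	def read(self):
		# First field of /proc/uptime is system uptime in seconds
		with open('/proc/uptime', 'rb') as src:
			yield Datapoint('uptime', 'gauge', float(src.read().split()[0]), None)

# In a separate plugin module this would be followed by: collector = UptimeExample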
76 | 77 | 78 | class Datapoint(namedtuple('Value', 'name type value ts')): 79 | 80 | # These are globals 81 | _counter_cache = dict() 82 | _counter_cache_check_ts = 0 83 | _counter_cache_check_timeout = 12 * 3600 # 12h 84 | _counter_cache_check_count = 4 # cleanup will trigger every timeout/count period 85 | 86 | @classmethod 87 | def _counter_cache_cleanup(cls, ts_min): 88 | cleanup_list = list( k for k,(v,ts_chk) in 89 | cls._counter_cache.viewitems() if ts_min > ts_chk ) 90 | log.debug('Counter cache cleanup: {} buckets'.format(len(cleanup_list))) 91 | for k in cleanup_list: del cls._counter_cache[k] 92 | 93 | def get(self, ts=None, prefix=None): 94 | ts = self.ts or ts or time() 95 | if ts > Datapoint._counter_cache_check_ts: 96 | Datapoint._counter_cache_cleanup( 97 | ts - Datapoint._counter_cache_check_timeout ) 98 | Datapoint._counter_cache_check_ts = ts\ 99 | + Datapoint._counter_cache_check_timeout\ 100 | / Datapoint._counter_cache_check_count 101 | if self.type == 'counter': 102 | if self.name not in Datapoint._counter_cache: 103 | log.debug('Initializing bucket for new counter: {}'.format(self.name)) 104 | Datapoint._counter_cache[self.name] = self.value, ts 105 | return None 106 | v0, ts0 = Datapoint._counter_cache[self.name] 107 | if ts == ts0: 108 | log.warn('Double-poll of a counter for {!r}'.format(self.name)) 109 | return None 110 | value = float(self.value - v0) / (ts - ts0) 111 | Datapoint._counter_cache[self.name] = self.value, ts 112 | if value < 0: 113 | # TODO: handle overflows properly, w/ limits 114 | log.debug( 'Detected counter overflow' 115 | ' (negative delta): {}, {} -> {}'.format(self.name, v0, self.value) ) 116 | return None 117 | elif self.type == 'gauge': value = self.value 118 | else: raise TypeError('Unknown type: {}'.format(self.type)) 119 | name = self.name if not prefix else '{}.{}'.format(prefix, self.name) 120 | return name, value, int(ts) 121 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/iptables_counts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from subprocess import Popen, PIPE 5 | from collections import namedtuple, defaultdict 6 | from io import open 7 | import os, errno 8 | 9 | from . 
import Collector, Datapoint 10 | 11 | import logging 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | class IPTables(Collector): 16 | 17 | iptables = dict(ipv4='iptables-save', ipv6='ip6tables-save') # binaries 18 | metric_units = metric_tpl = None 19 | 20 | def __init__(self, *argz, **kwz): 21 | super(IPTables, self).__init__(*argz, **kwz) 22 | 23 | if not self.conf.rule_metrics_path.ipv4\ 24 | and not self.conf.rule_metrics_path.ipv6: 25 | log.info('No paths for rule_metrics_path specified, disabling collector') 26 | self.conf.enabled = False 27 | 28 | assert self.conf.units in ['pkt', 'bytes', 'both', 'both_flat'] 29 | if self.conf.units.startswith('both'): 30 | self.metric_units = ['pkt', 'bytes'] 31 | self.metric_tpl = '{}.{}' if self.conf.units == 'both' else '{}_{}' 32 | else: self.metric_units, self.metric_tpl = self.conf.units, '{}' 33 | 34 | 35 | _rule_metrics = namedtuple('RuleMetrics', 'table path mtime') 36 | _rule_metrics_cache = dict() 37 | 38 | @property 39 | def rule_metrics(self): 40 | rule_metrics = dict() 41 | for v in 'ipv4', 'ipv6': 42 | path = self.conf.rule_metrics_path[v] 43 | try: 44 | if not path: raise OSError() 45 | mtime = os.stat(path).st_mtime 46 | except (OSError, IOError) as err: 47 | if err.args and err.errno != errno.ENOENT: raise # to raise EPERM, EACCES and such 48 | self._rule_metrics_cache[v] = None 49 | continue 50 | cache = self._rule_metrics_cache.get(v) 51 | if not cache or path != cache.path or mtime != cache.mtime: 52 | log.debug('Detected rule_metrics file update: {} (cached: {})'.format(path, cache)) 53 | metrics_table = dict() 54 | with open(path, 'rb') as src: 55 | for line in it.imap(op.methodcaller('strip'), src): 56 | if not line: continue 57 | table, chain, rule, metric = line.split(None, 3) 58 | metrics_table[table, chain, int(rule)] = metric 59 | cache = self._rule_metrics_cache[v]\ 60 | = self._rule_metrics(metrics_table, path, mtime) 61 | rule_metrics[v] = cache 62 | return rule_metrics 63 | 64 | 65 | _table_hash = dict() 66 | 67 | def read(self): 68 | metric_counts = dict() 69 | hashes = defaultdict(lambda: defaultdict(list)) 70 | 71 | for v, metrics in self.rule_metrics.viewitems(): 72 | if not metrics: continue 73 | 74 | # Used to detect rule changes 75 | try: 76 | hash_old, metrics_old, warnings = self._table_hash[v] 77 | if metrics is not metrics_old: raise KeyError 78 | except KeyError: hash_old, warnings = None, dict() 79 | hash_new = hashes[v] 80 | 81 | # iptables-save invocation and output processing loop 82 | proc = Popen([self.iptables[v], '-c'], stdout=PIPE) 83 | chain_counts = defaultdict(int) 84 | for line in it.imap(op.methodcaller('strip'), proc.stdout): 85 | if line[0] != '[': # chain/table spec or comment 86 | if line[0] == '*': table = line[1:] 87 | continue 88 | counts, append, chain, rule = line.split(None, 3) 89 | assert append == '-A' 90 | 91 | rule_key = table, chain 92 | chain_counts[rule_key] += 1 # iptables rules are 1-indexed 93 | chain_count = chain_counts[rule_key] 94 | # log.debug('{}, Rule: {}'.format([table, chain, chain_count], rule)) 95 | hash_new[rule_key].append(rule) # but py lists are 0-indexed 96 | try: metric = metrics.table[table, chain, chain_count] 97 | except KeyError: continue # no point checking rules w/o metrics attached 98 | # log.debug('Metric: {} ({}), rule: {}'.format( 99 | # metric, [table, chain, chain_count], rule )) 100 | 101 | # Check for changed rules 102 | try: rule_chk = hash_old and hash_old[rule_key][chain_count - 1] 103 | except (KeyError, IndexError): rule_chk = 
None 104 | if hash_old and rule_chk != rule: 105 | if chain_count not in warnings: 106 | log.warn( 107 | ( 'Detected changed netfilter rule (chain: {}, pos: {})' 108 | ' without corresponding rule_metrics file update: {}' )\ 109 | .format(chain, chain_count, rule) ) 110 | warnings[chain_count] = True 111 | if self.conf.discard_changed_rules: continue 112 | 113 | counts = map(int, counts.strip('[]').split(':', 1)) 114 | try: 115 | metric_counts[metric] = list(it.starmap( 116 | op.add, it.izip(metric_counts[metric], counts) )) 117 | except KeyError: metric_counts[metric] = counts 118 | proc.wait() 119 | 120 | # Detect if there are any changes in the table, 121 | # possibly messing the metrics, even if corresponding rules are the same 122 | hash_new = dict( (rule_key, tuple(rules)) 123 | for rule_key, rules in hash_new.viewitems() ) 124 | if hash_old\ 125 | and frozenset(hash_old.viewitems()) != frozenset(hash_new.viewitems()): 126 | log.warn('Detected iptables changes without changes to rule_metrics file') 127 | hash_old = None 128 | if not hash_old: self._table_hash[v] = hash_new, metrics, dict() 129 | 130 | # Dispatch collected metrics 131 | for metric, counts in metric_counts.viewitems(): 132 | for unit, count in it.izip(['pkt', 'bytes'], counts): 133 | if unit not in self.metric_units: continue 134 | yield Datapoint(self.metric_tpl.format(metric, unit), 'counter', count, None) 135 | 136 | 137 | collector = IPTables 138 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/cron_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | import re, iso8601, calendar 5 | 6 | from . 
import Collector, Datapoint 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | def file_follow( src, open_tail=True, 13 | read_interval_min=0.1, 14 | read_interval_max=20, read_interval_mul=1.1, 15 | rotation_check_interval=20, yield_file=False, **open_kwz ): 16 | from time import time, sleep 17 | from io import open 18 | import os, types 19 | 20 | open_tail = open_tail and isinstance(src, types.StringTypes) 21 | src_open = lambda: open(path, mode='rb', **open_kwz) 22 | stat = lambda f: (os.fstat(f) if isinstance(f, int) else os.stat(f)) 23 | sanity_chk_stats = lambda stat: (stat.st_ino, stat.st_dev) 24 | sanity_chk_ts = lambda ts=None: (ts or time()) + rotation_check_interval 25 | 26 | if isinstance(src, types.StringTypes): src, path = None, src 27 | else: 28 | path = src.name 29 | src_inode, src_inode_ts =\ 30 | sanity_chk_stats(stat(src.fileno())), sanity_chk_ts() 31 | line, read_chk = '', read_interval_min 32 | 33 | while True: 34 | 35 | if not src: # (re)open 36 | src = src_open() 37 | if open_tail: 38 | src.seek(0, os.SEEK_END) 39 | open_tail = False 40 | src_inode, src_inode_ts =\ 41 | sanity_chk_stats(stat(src.fileno())), sanity_chk_ts() 42 | src_inode_chk = None 43 | 44 | ts = time() 45 | if ts > src_inode_ts: # rotation check 46 | src_inode_chk, src_inode_ts =\ 47 | sanity_chk_stats(stat(path)), sanity_chk_ts(ts) 48 | if stat(src.fileno()).st_size < src.tell(): src.seek(0) # truncated 49 | else: src_inode_chk = None 50 | 51 | buff = src.readline() 52 | if not buff: # eof 53 | if src_inode_chk and src_inode_chk != src_inode: # rotated 54 | src.close() 55 | src, line = None, '' 56 | continue 57 | if read_chk is None: 58 | yield (buff if not yield_file else (buff, src)) 59 | else: 60 | sleep(read_chk) 61 | read_chk *= read_interval_mul 62 | if read_chk > read_interval_max: 63 | read_chk = read_interval_max 64 | else: 65 | line += buff 66 | read_chk = read_interval_min 67 | 68 | if line and line[-1] == '\n': # complete line 69 | try: 70 | val = yield (line if not yield_file else (line, src)) 71 | if val is not None: raise KeyboardInterrupt 72 | except KeyboardInterrupt: break 73 | line = '' 74 | 75 | src.close() 76 | 77 | 78 | def file_follow_durable( path, 79 | min_dump_interval=10, 80 | xattr_name='user.collectd.logtail.pos', xattr_update=True, 81 | **follow_kwz ): 82 | '''Records log position into xattrs after reading line every 83 | min_dump_interval seconds. 
84 | Checksum of the last line 85 | is also recorded (so the line itself doesn't have to fit into xattr) to make sure 86 | file wasn't truncated between last xattr dump and re-open.''' 87 | 88 | from xattr import xattr 89 | from io import open 90 | from hashlib import sha1 91 | from time import time 92 | import struct 93 | 94 | # Try to restore position 95 | src = open(path, mode='rb') 96 | src_xattr = xattr(src) 97 | try: 98 | if not xattr_name: raise KeyError 99 | pos = src_xattr[xattr_name] 100 | except KeyError: pos = None 101 | if pos: 102 | data_len = struct.calcsize('=I') 103 | (pos,), chksum = struct.unpack('=I', pos[:data_len]), pos[data_len:] 104 | (data_len,), chksum = struct.unpack('=I', chksum[:data_len]), chksum[data_len:] 105 | try: 106 | src.seek(pos - data_len) 107 | if sha1(src.read(data_len)).digest() != chksum: 108 | raise IOError('Last log line does not match checksum') 109 | except (OSError, IOError) as err: 110 | log.warn('Failed to restore log position: {}'.format(err)) 111 | src.seek(0) 112 | tailer = file_follow(src, yield_file=True, **follow_kwz) 113 | 114 | # ...and keep it updated 115 | pos_dump_ts_get = lambda ts=None: (ts or time()) + min_dump_interval 116 | pos_dump_ts = pos_dump_ts_get() 117 | while True: 118 | line, src_chk = next(tailer) 119 | if not line: pos_dump_ts = 0 # force-write xattr 120 | ts = time() 121 | if ts > pos_dump_ts: 122 | if src is not src_chk: 123 | src, src_xattr = src_chk, xattr(src_chk) 124 | pos_new = src.tell() 125 | if pos != pos_new: 126 | pos = pos_new 127 | if xattr_update: 128 | src_xattr[xattr_name] =\ 129 | struct.pack('=I', pos)\ 130 | + struct.pack('=I', len(line))\ 131 | + sha1(line).digest() 132 | pos_dump_ts = pos_dump_ts_get(ts) 133 | if (yield line.decode('utf-8', 'replace')): 134 | tailer.send(StopIteration) 135 | break 136 | 137 | 138 | class CronJobs(Collector): 139 | 140 | lines, aliases = dict(), list() 141 | 142 | def __init__(self, *argz, **kwz): 143 | super(CronJobs, self).__init__(*argz, **kwz) 144 | 145 | try: 146 | src, self.lines, self.aliases =\ 147 | op.attrgetter('source', 'lines', 'aliases')(self.conf) 148 | if not (src and self.lines and self.aliases): raise KeyError() 149 | except KeyError as err: 150 | if err.args: 151 | log.error('Failed to get required config parameter "{}"'.format(err.args[0])) 152 | else: 153 | log.warn( 'Collector requires all of "source",' 154 | ' "lines" and "aliases" specified to work properly' ) 155 | self.conf.enabled = False 156 | return 157 | 158 | for k,v in self.lines.viewitems(): self.lines[k] = re.compile(v) 159 | for idx,(k,v) in enumerate(self.aliases): self.aliases[idx] = k, re.compile(v) 160 | self.log_tailer = file_follow_durable( src, read_interval_min=None, 161 | xattr_name=self.conf.xattr_name, xattr_update=not self.conf.debug.dry_run ) 162 | 163 | def read(self, _re_sanitize=re.compile('\s+|-')): 164 | # Cron 165 | if self.log_tailer: 166 | for line in iter(self.log_tailer.next, u''): 167 | # log.debug('LINE: {!r}'.format(line)) 168 | ts, line = line.strip().split(None, 1) 169 | ts = calendar.timegm(iso8601.parse_date(ts).utctimetuple()) 170 | matched = False 171 | for ev, regex in self.lines.viewitems(): 172 | if not regex: continue 173 | match = regex.search(line) 174 | if match: 175 | matched = True 176 | job = match.group('job') 177 | for alias, regex in self.aliases: 178 | group = alias[1:] if alias.startswith('_') else None 179 | alias_match = regex.search(job) 180 | if alias_match: 181 | if group is not None: 182 | job =
_re_sanitize.sub('_', alias_match.group(group)) 183 | else: job = alias 184 | break 185 | else: 186 | log.warn('No alias for cron job: {!r}, skipping'.format(line)) 187 | continue 188 | try: value = float(match.group('val')) 189 | except IndexError: value = 1 190 | # log.debug('TS: {}, EV: {}, JOB: {}'.format(ts, ev, job)) 191 | yield Datapoint('cron.tasks.{}.{}'.format(job, ev), 'gauge', value, ts) 192 | if not matched: 193 | log.debug('Failed to match line: {!r}'.format(line)) 194 | 195 | 196 | collector = CronJobs 197 | -------------------------------------------------------------------------------- /graphite_metrics/harvestd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import itertools as it, operator as op, functools as ft 5 | from lya import AttrDict, configure_logging 6 | from collections import OrderedDict 7 | import os, sys 8 | 9 | 10 | def main(): 11 | import argparse 12 | parser = argparse.ArgumentParser( 13 | description='Collect and dispatch various metrics to destinations.') 14 | parser.add_argument('-t', '--destination', metavar='host[:port]', 15 | help='host[:port] (default port: 2003, can be overidden' 16 | ' via config file) of sink destination endpoint (e.g. carbon' 17 | ' linereceiver tcp port, by default).') 18 | parser.add_argument('-i', '--interval', type=int, metavar='seconds', 19 | help='Interval between collecting and sending the datapoints.') 20 | 21 | parser.add_argument('-e', '--collector-enable', 22 | action='append', metavar='collector', default=list(), 23 | help='Enable only the specified metric collectors,' 24 | ' can be specified multiple times.') 25 | parser.add_argument('-d', '--collector-disable', 26 | action='append', metavar='collector', default=list(), 27 | help='Explicitly disable specified metric collectors,' 28 | ' can be specified multiple times. Overrides --collector-enable.') 29 | 30 | parser.add_argument('-s', '--sink-enable', 31 | action='append', metavar='sink', default=list(), 32 | help='Enable only the specified datapoint sinks,' 33 | ' can be specified multiple times.') 34 | parser.add_argument('-x', '--sink-disable', 35 | action='append', metavar='sink', default=list(), 36 | help='Explicitly disable specified datapoint sinks,' 37 | ' can be specified multiple times. Overrides --sink-enable.') 38 | 39 | parser.add_argument('-p', '--processor-enable', 40 | action='append', metavar='processor', default=list(), 41 | help='Enable only the specified datapoint processors,' 42 | ' can be specified multiple times.') 43 | parser.add_argument('-z', '--processor-disable', 44 | action='append', metavar='processor', default=list(), 45 | help='Explicitly disable specified datapoint processors,' 46 | ' can be specified multiple times. Overrides --processor-enable.') 47 | 48 | parser.add_argument('-c', '--config', 49 | action='append', metavar='path', default=list(), 50 | help='Configuration files to process.' 51 | ' Can be specified more than once.' 52 | ' Values from the latter ones override values in the former.' 
53 | ' Available CLI options override the values in any config.') 54 | 55 | parser.add_argument('-a', '--xattr-emulation', metavar='db-path', 56 | help='Emulate filesystem extended attributes (used in' 57 | ' some collectors like sysstat or cron_log), storing per-path' 58 | ' data in a simple shelve db.') 59 | parser.add_argument('-n', '--dry-run', 60 | action='store_true', help='Do not actually send data.') 61 | parser.add_argument('--debug-memleaks', action='store_true', 62 | help='Import guppy and enable its manhole to debug memleaks (requires guppy module).') 63 | parser.add_argument('--debug', 64 | action='store_true', help='Verbose operation mode.') 65 | optz = parser.parse_args() 66 | 67 | # Read configuration files 68 | cfg = AttrDict.from_yaml('{}.yaml'.format( 69 | os.path.splitext(os.path.realpath(__file__))[0] )) 70 | for k in optz.config: cfg.update_yaml(k) 71 | 72 | # Logging 73 | import logging 74 | configure_logging( cfg.logging, 75 | logging.DEBUG if optz.debug else logging.WARNING ) 76 | if not cfg.logging.tracebacks: 77 | class NoTBLogger(logging.Logger): 78 | def exception(self, *argz, **kwz): self.error(*argz, **kwz) 79 | logging.setLoggerClass(NoTBLogger) 80 | log = logging.getLogger(__name__) 81 | 82 | # Manholes 83 | if optz.debug_memleaks: 84 | import guppy 85 | from guppy.heapy import Remote 86 | Remote.on() 87 | 88 | # Fill "auto-detected" blanks in the configuration, CLI overrides 89 | try: 90 | if optz.destination: cfg.sinks._default.host = optz.destination 91 | cfg.sinks._default.host = cfg.sinks._default.host.rsplit(':', 1) 92 | if len(cfg.sinks._default.host) == 1: 93 | cfg.sinks._default.host =\ 94 | cfg.sinks._default.host[0], cfg.sinks._default.default_port 95 | else: cfg.sinks._default.host[1] = int(cfg.sinks._default.host[1]) 96 | except KeyError: pass 97 | if optz.interval: cfg.loop.interval = optz.interval 98 | if optz.dry_run: cfg.debug.dry_run = optz.dry_run 99 | if optz.xattr_emulation: cfg.core.xattr_emulation = optz.xattr_emulation 100 | 101 | # Fake "xattr" module, if requested 102 | if cfg.core.xattr_emulation: 103 | import shelve 104 | xattr_db = shelve.open(cfg.core.xattr_emulation, 'c') 105 | class xattr_path(object): 106 | def __init__(self, base): 107 | assert isinstance(base, str) 108 | self.base = base 109 | def key(self, k): return '{}\0{}'.format(self.base, k) 110 | def __setitem__(self, k, v): xattr_db[self.key(k)] = v 111 | def __getitem__(self, k): return xattr_db[self.key(k)] 112 | def __del__(self): xattr_db.sync() 113 | class xattr_module(object): xattr = xattr_path 114 | sys.modules['xattr'] = xattr_module 115 | 116 | # Override "enabled" collector/sink parameters, based on CLI 117 | ep_conf = dict() 118 | for ep, enabled, disabled in\ 119 | [ ('collectors', optz.collector_enable, optz.collector_disable), 120 | ('processors', optz.processor_enable, optz.processor_disable), 121 | ('sinks', optz.sink_enable, optz.sink_disable) ]: 122 | conf = cfg[ep] 123 | conf_base = conf.pop('_default') 124 | if 'debug' not in conf_base: conf_base['debug'] = cfg.debug 125 | ep_conf[ep] = conf_base, conf, OrderedDict(), enabled, disabled 126 | 127 | # Init global cfg for collectors/sinks' usage 128 | from graphite_metrics import collectors, sinks, loops 129 | collectors.cfg = sinks.cfg = loops.cfg = cfg 130 | 131 | # Init pluggable components 132 | import pkg_resources 133 | 134 | for ep_type in 'collector', 'processor', 'sink': 135 | ep_key = '{}s'.format(ep_type) # a bit of a hack 136 | conf_base, conf, objects, enabled, disabled = 
ep_conf[ep_key] 137 | ep_dict = dict( (ep.name, ep) for ep in 138 | pkg_resources.iter_entry_points('graphite_metrics.{}'.format(ep_key)) ) 139 | eps = OrderedDict( 140 | (name, (ep_dict.pop(name), subconf or AttrDict())) 141 | for name, subconf in conf.viewitems() if name in ep_dict ) 142 | eps.update( (name, (module, conf_base)) 143 | for name, module in ep_dict.viewitems() ) 144 | for ep_name, (ep_module, subconf) in eps.viewitems(): 145 | if ep_name[0] == '_': 146 | log.debug( 'Skipping {} enty point,' 147 | ' prefixed by underscore: {}'.format(ep_type, ep_name) ) 148 | subconf.rebase(conf_base) # fill in "_default" collector parameters 149 | if enabled: 150 | if ep_name in enabled: subconf['enabled'] = True 151 | else: subconf['enabled'] = False 152 | if disabled and ep_name in disabled: subconf['enabled'] = False 153 | if subconf.get('enabled', True): 154 | log.debug('Loading {}: {}'.format(ep_type, ep_name)) 155 | try: obj = getattr(ep_module.load(), ep_type)(subconf) 156 | except Exception as err: 157 | log.exception('Failed to load/init {} ({}): {}'.format(ep_type, ep_name, err)) 158 | subconf.enabled = False 159 | obj = None 160 | if subconf.get('enabled', True): objects[ep_name] = obj 161 | else: 162 | log.debug(( '{} {} (entry point: {})' 163 | ' was disabled after init' ).format(ep_type.title(), obj, ep_name)) 164 | if ep_type != 'processor' and not objects: 165 | log.fatal('No {}s were properly enabled/loaded, bailing out'.format(ep_type)) 166 | sys.exit(1) 167 | log.debug('{}: {}'.format(ep_key.title(), objects)) 168 | 169 | loop = dict( (ep.name, ep) for ep in 170 | pkg_resources.iter_entry_points('graphite_metrics.loops') ) 171 | conf = AttrDict(**cfg.loop) 172 | if 'debug' not in conf: conf.debug = cfg.debug 173 | loop = loop[cfg.loop.name].load().loop(conf) 174 | 175 | collectors, processors, sinks = it.imap( op.itemgetter(2), 176 | op.itemgetter('collectors', 'processors', 'sinks')(ep_conf) ) 177 | log.debug( 178 | 'Starting main loop: {} ({} collectors, {} processors, {} sinks)'\ 179 | .format(loop, len(collectors), len(processors), len(sinks)) ) 180 | loop.start(collectors, processors, sinks) 181 | 182 | if __name__ == '__main__': main() 183 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/_ping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import print_function 4 | 5 | import itertools as it, operator as op, functools as ft 6 | from contextlib import closing 7 | from select import epoll, EPOLLIN, EPOLLOUT 8 | from time import time, sleep 9 | import os, sys, socket, struct, random, signal, re, logging 10 | 11 | 12 | class LinkError(Exception): pass 13 | 14 | class Pinger(object): 15 | 16 | @staticmethod 17 | def calculate_checksum(src): 18 | shift, src = sys.byteorder != 'little', bytearray(src) 19 | chksum = 0 20 | for c in src: 21 | chksum += (c << 8) if shift else c 22 | shift = not shift 23 | chksum = (chksum & 0xffff) + (chksum >> 16) 24 | chksum += chksum >> 16 25 | chksum = ~chksum & 0xffff 26 | return struct.pack('!H', socket.htons(chksum)) 27 | 28 | 29 | def resolve(self, host, family=0, socktype=0, proto=0, flags=0): 30 | try: f, host = host.split(':', 1) 31 | except ValueError: pass 32 | else: 33 | assert f in ['v4', 'v6'], f 34 | if f == 'v4': 35 | family, sock = socket.AF_INET, self.ipv4 36 | elif f == 'v6': 37 | family, sock = socket.AF_INET6, self.ipv6 38 | match = 
re.search(r'^\[([0-9:a-fA-F]+)\]$', host) 39 | if match: host = match.group(1) 40 | addrs = set( addrinfo[-1] for addrinfo in 41 | socket.getaddrinfo(host, 0, family, socktype, proto, flags) ) 42 | return sock, random.choice(list(addrs)) 43 | 44 | def test_link(self, addrinfo, ping_id=0xffff, seq=0): 45 | 'Test if it is possible to send packets out at all (i.e. link is not down).' 46 | try: self.pkt_send(addrinfo, ping_id, seq) 47 | except IOError as err: raise LinkError(str(err)) 48 | 49 | def pkt_send(self, addrinfo, ping_id, seq): 50 | sock, addr = addrinfo 51 | if sock is self.ipv4: icmp_type = 0x08 52 | elif sock is self.ipv6: icmp_type = 0x80 53 | else: raise ValueError(sock) 54 | ts = time() 55 | ts_secs = int(ts) 56 | ts_usecs = int((ts - ts_secs) * 1e6) 57 | # Timestamp is packed in wireshark-friendly format 58 | # Using time.clock() would probably be better here, 59 | # as it should work better with time corrections (by e.g. ntpd) 60 | pkt = bytearray(struct.pack( '!BBHHHII', 61 | icmp_type, 0, 0, ping_id, seq, ts_secs, ts_usecs )) 62 | pkt[2:4] = self.calculate_checksum(pkt) 63 | sock.sendto(bytes(pkt), addr) 64 | 65 | def pkt_recv(self, sock): 66 | # None gets returned in cases when we get whatever other icmp thing 67 | pkt, src = sock.recvfrom(2048) 68 | if sock is self.ipv4: start = 20 69 | elif sock is self.ipv6: start = 0 70 | else: raise ValueError(sock) 71 | try: pkt = struct.unpack('!BBHHHII', pkt[start:start + 16]) 72 | except struct.error: return 73 | if sock is self.ipv4 and (pkt[0] != 0 or pkt[1] != 0): return 74 | elif sock is self.ipv6 and (pkt[0] != 0x81 or pkt[1] != 0): return 75 | return src[0], pkt[3], pkt[4], pkt[5] + (pkt[6] / 1e6) # addr, ping_id, seq, ts 76 | 77 | 78 | def start(self, *args, **kws): 79 | with\ 80 | closing(socket.socket( socket.AF_INET, 81 | socket.SOCK_RAW, socket.getprotobyname('icmp') )) as self.ipv4,\ 82 | closing(socket.socket( socket.AF_INET6, 83 | socket.SOCK_RAW, socket.getprotobyname('ipv6-icmp') )) as self.ipv6: 84 | return self._start(*args, **kws) 85 | 86 | def _start( self, host_specs, interval, 87 | resolve_no_reply, resolve_fixed, ewma_factor, ping_pid, log=None, 88 | warn_tries=5, warn_repeat=None, warn_delay_k=5, warn_delay_min=5 ): 89 | ts = time() 90 | seq_gen = it.chain.from_iterable(it.imap(xrange, it.repeat(2**15))) 91 | resolve_fixed_deadline = ts + resolve_fixed 92 | resolve_retry = dict() 93 | self.discard_rtts = False 94 | if not log: log = logging.getLogger(__name__) 95 | 96 | ### First resolve all hosts, waiting for it, if necessary 97 | hosts, host_ids = dict(), dict() 98 | for host in host_specs: 99 | while True: 100 | ping_id = random.randint(0, 0xfffe) 101 | if ping_id not in host_ids: break 102 | warn = warn_ts = 0 103 | while True: 104 | try: 105 | addrinfo = self.resolve(host) 106 | self.test_link(addrinfo) 107 | 108 | except (socket.gaierror, socket.error, LinkError) as err: 109 | ts = time() 110 | if warn < warn_tries: 111 | warn_force, warn_chk = False, True 112 | else: 113 | warn_force, warn_chk = True, warn_repeat\ 114 | and (warn_repeat is True or ts - warn_ts > warn_repeat) 115 | if warn_chk: warn_ts = ts 116 | err_info = type(err).__name__ 117 | if str(err): err_info += ': {}'.format(err) 118 | (log.warn if warn_chk else log.info)\ 119 | ( '{}Unable to resolve/send-to name spec: {} ({})'\ 120 | .format('' if not warn_force else '(STILL) ', host, err_info) ) 121 | warn += 1 122 | if warn_repeat is not True and warn == warn_tries: 123 | log.warn( 'Disabling name-resolver/link-test warnings (failures: 
{},' 124 | ' name spec: {}) until next successful attempt'.format(warn, host) ) 125 | sleep(max(interval / float(warn_delay_k), warn_delay_min)) 126 | 127 | else: 128 | hosts[host] = host_ids[ping_id] = dict( 129 | ping_id=ping_id, addrinfo=addrinfo, 130 | last_reply=0, rtt=0, sent=0, recv=0 ) 131 | if warn >= warn_tries: 132 | log.warn('Was able to resolve host spec: {} (attempts: {})'.format(host, warn)) 133 | break 134 | 135 | ### Handler to emit results on-demand 136 | def dump(sig, frm): 137 | self.discard_rtts = True # make sure results won't be tainted by this delay 138 | ts = time() 139 | try: 140 | for spec, host in hosts.viewitems(): 141 | sys.stdout.write('{} {:.10f} {:.10f} {:010d}\n'.format( 142 | spec, ts - host['last_reply'], host['rtt'], 143 | max(host['sent'] - host['recv'] - 1, 0) )) # 1 pkt can be in-transit 144 | if host['sent'] > 2**30: host['sent'] = host['recv'] = 0 145 | sys.stdout.write('\n') 146 | sys.stdout.flush() 147 | except IOError: sys.exit() 148 | signal.signal(signal.SIGQUIT, dump) 149 | 150 | ### Actual ping-loop 151 | poller, sockets = epoll(), dict() 152 | for sock in self.ipv4, self.ipv6: 153 | sockets[sock.fileno()] = sock 154 | poller.register(sock, EPOLLIN) 155 | sys.stdout.write('\n') 156 | sys.stdout.flush() 157 | 158 | ts_send = 0 # when last packet(s) were sent out 159 | while True: 160 | while True: 161 | poll_time = max(0, ts_send + interval - time()) 162 | try: 163 | poll_res = poller.poll(poll_time) 164 | if not poll_res or not poll_res[0][1] & EPOLLIN: break 165 | pkt = self.pkt_recv(sockets[poll_res[0][0]]) 166 | if not pkt: continue 167 | addr, ping_id, seq, ts_pkt = pkt 168 | except IOError: continue 169 | if not ts_send: continue 170 | ts = time() 171 | try: host = host_ids[ping_id] 172 | except KeyError: pass 173 | else: 174 | host['last_reply'] = ts 175 | host['recv'] += 1 176 | if not self.discard_rtts: 177 | host['rtt'] = host['rtt'] + ewma_factor * (ts - ts_pkt - host['rtt']) 178 | 179 | if resolve_retry: 180 | for spec, host in resolve_retry.items(): 181 | try: host['addrinfo'] = self.resolve(spec) 182 | except socket.gaierror as err: 183 | log.warn('Failed to resolve spec: {} (host: {}): {}'.format(spec, host, err)) 184 | host['resolve_fails'] = host.get('resolve_fails', 0) + 1 185 | if host['resolve_fails'] >= warn_tries: 186 | log.error(( 'Failed to resolve host spec {} (host: {}) after {} attempts,' 187 | ' exiting (so subprocess can be restarted)' ).format(spec, host, warn_tries)) 188 | # More complex "retry until forever" logic is used on process start, 189 | # so exit here should be performed only once per major (non-transient) failure 190 | sys.exit(0) 191 | else: 192 | host['resolve_fails'] = 0 193 | del resolve_retry[spec] 194 | 195 | if time() > resolve_fixed_deadline: 196 | for spec,host in hosts.viewitems(): 197 | try: host['addrinfo'] = self.resolve(spec) 198 | except socket.gaierror: resolve_retry[spec] = host 199 | resolve_fixed_deadline = ts + resolve_fixed 200 | 201 | if ping_pid: 202 | try: os.kill(ping_pid, 0) 203 | except OSError: sys.exit() 204 | 205 | resolve_reply_deadline = time() - resolve_no_reply 206 | self.discard_rtts, seq = False, next(seq_gen) 207 | for spec, host in hosts.viewitems(): 208 | if host['last_reply'] < resolve_reply_deadline: 209 | try: host['addrinfo'] = self.resolve(spec) 210 | except socket.gaierror: resolve_retry[spec] = host 211 | send_retries = 30 212 | while True: 213 | try: self.pkt_send(host['addrinfo'], host['ping_id'], seq) 214 | except IOError as err: 215 | send_retries -= 1 
216 | if send_retries == 0:
217 | log.error(( 'Failed sending pings from socket to host spec {}'
218 | ' (host: {}, error: {}), killing pinger (so it can be restarted).' )\
219 | .format(spec, host, err))
220 | sys.exit(0) # same idea as with resolver errors above
221 | continue
222 | else: break
223 | host['sent'] += 1
224 | ts_send = time() # used to calculate when to send next batch of pings
225 | 
226 | 
227 | if __name__ == '__main__':
228 | signal.signal(signal.SIGQUIT, signal.SIG_IGN)
229 | logging.basicConfig()
230 | # Inputs
231 | Pinger().start( sys.argv[7:], interval=float(sys.argv[1]),
232 | resolve_no_reply=float(sys.argv[2]), resolve_fixed=float(sys.argv[3]),
233 | ewma_factor=float(sys.argv[4]), ping_pid=int(sys.argv[5]),
234 | warn_tries=int(sys.argv[6]), log=logging.getLogger('pinger'),
235 | warn_repeat=8 * 3600, warn_delay_k=5, warn_delay_min=5 )
236 | # Output on SIGQUIT: "host_spec time_since_last_reply rtt_median pkt_lost"
237 | # pkt_lost is a counter ("sent - received" for whole runtime)
238 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | graphite-metrics: metric collectors for various stuff not (or poorly) handled by other monitoring daemons
2 | --------------------
3 | 
4 | Core of the project is a simple daemon (harvestd), which collects metric values
5 | and sends them to graphite carbon daemon (and/or other configured destinations)
6 | once per interval.
7 | 
8 | Includes separate data collection components ("collectors") for processing of:
9 | 
10 | * /proc/slabinfo for useful-to-watch values, not everything (configurable).
11 | * /proc/vmstat and /proc/meminfo in a consistent way.
12 | * /proc/stat for irq, softirq, forks.
13 | * /proc/buddyinfo and /proc/pagetypeinfo (memory fragmentation).
14 | * /proc/interrupts and /proc/softirqs.
15 | * Cron log to produce start/finish events and duration for each job as
16 | separate metrics, mapping jobs to metric names with regexes.
17 | * Per-system-service accounting using
18 | [systemd](http://www.freedesktop.org/wiki/Software/systemd) and its cgroups
19 | ("Default...Accounting=" options in system.conf have to be enabled for more
20 | recent versions).
21 | * [sysstat](http://sebastien.godard.pagesperso-orange.fr/) data from sadc logs
22 | (use something like `sadc -F -L -S DISK -S XDISK -S POWER 60` to have more
23 | stuff logged there) via the sadf binary and its JSON export (`sadf -j`, supported
24 | since sysstat-10.0.something, iirc).
25 | * iptables rule "hits" packet and byte counters, taken from ip{,6}tables-save,
26 | mapped via separate "table chain_name rule_no metric_name" file, which should
27 | be generated along with firewall rules (I use [this
28 | script](https://github.com/mk-fg/trilobite) to do that).
29 | 
30 | Additional metric collectors can be added via setuptools/distribute
31 | graphite_metrics.collectors [entry
32 | point](http://packages.python.org/distribute/setuptools.html?highlight=entry%20points#dynamic-discovery-of-services-and-plugins)
33 | and configured via the common configuration mechanism.
34 | 
35 | Same for the datapoint sinks (destinations - it doesn't have to be a single
36 | carbon host), datapoint processors (mangle/rename/filter datapoints) and the
37 | main loop, which can be replaced with an async (simple case - threads or
38 | gevent) or buffering loop.
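For illustration, a minimal third-party collector might look like this (a hedged sketch: the `my_metrics` package, `uptime` module and metric name here are made up, and the exact `Collector`/`Datapoint` API should be checked against the shipped collectors):

    # my_metrics/uptime.py - hypothetical out-of-tree collector
    from time import time
    from graphite_metrics.collectors import Collector, Datapoint

    class Uptime(Collector):
        'Yields system uptime (in seconds) as a single gauge datapoint.'
        def read(self):
            with open('/proc/uptime', 'rb') as src:
                yield Datapoint('uptime', 'gauge', float(src.read().split()[0]), time())

    collector = Uptime  # module attribute that the entry point loader looks up

The module would then be registered under the entry point group in that package's
setup.py, e.g. `'graphite_metrics.collectors': ['uptime = my_metrics.uptime']`, after
which it can be enabled and configured under its entry point name in the usual
"collectors" config section, same as the bundled ones.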
39 | 40 | Currently supported backends (data destinations, sinks): 41 | 42 | * [graphite carbon 43 | daemon](http://graphite.readthedocs.org/en/latest/carbon-daemons.html) 44 | (enabled/used by default) 45 | * [librato metrics](https://metrics.librato.com/) 46 | 47 | Look at the shipped collectors, processors, sinks and loops and their base 48 | classes (like 49 | [graphite_metrics.sinks.Sink](https://github.com/mk-fg/graphite-metrics/blob/master/graphite_metrics/sinks/__init__.py) 50 | or 51 | [loops.Basic](https://github.com/mk-fg/graphite-metrics/blob/master/graphite_metrics/loops/basic.py)) 52 | for API examples. 53 | 54 | 55 | Installation 56 | -------------------- 57 | 58 | It's a regular package for Python 2.7 (not 3.X). 59 | 60 | Using [pip](http://pip-installer.org/) is the best way: 61 | 62 | % pip install graphite-metrics 63 | 64 | If you don't have it, use: 65 | 66 | % easy_install pip 67 | % pip install graphite-metrics 68 | 69 | Alternatively ([see 70 | also](http://www.pip-installer.org/en/latest/installing.html)): 71 | 72 | % curl https://raw.github.com/pypa/pip/master/contrib/get-pip.py | python 73 | % pip install graphite-metrics 74 | 75 | Or, if you absolutely must: 76 | 77 | % easy_install graphite-metrics 78 | 79 | But, you really shouldn't do that. 80 | 81 | Current-git version can be installed like this: 82 | 83 | % pip install 'git+https://github.com/mk-fg/graphite-metrics.git#egg=graphite-metrics' 84 | 85 | ### Requirements 86 | 87 | Basic requirements are (pip or easy_install should handle these for you): 88 | 89 | * [setuptools / distribute](https://pypi.python.org/pypi/distribute/) (for entry points) 90 | * [layered-yaml-attrdict-config](https://pypi.python.org/pypi/layered-yaml-attrdict-config/) 91 | 92 | Some shipped modules require additional packages to function (which can be 93 | installed automatically by specifying extras on install, example: `pip install 94 | 'graphite-metrics[collectors.cgacct]'`): 95 | 96 | * collectors 97 | 98 | * cgacct 99 | * [dbus-python](https://pypi.python.org/pypi/dbus-python/) 100 | 101 | * cron_log 102 | * [xattr](http://pypi.python.org/pypi/xattr/) (unless --xattr-emulation is used) 103 | * [iso8601](http://pypi.python.org/pypi/iso8601/) 104 | 105 | * sysstat 106 | * [xattr](http://pypi.python.org/pypi/xattr/) (unless --xattr-emulation is used) 107 | * (optional) [simplejson](http://pypi.python.org/pypi/simplejson/) - for 108 | better performance than stdlib json module 109 | 110 | * sinks 111 | 112 | * librato_metrics 113 | * [requests](http://pypi.python.org/pypi/requests/) 114 | * (optional) [simplejson](http://pypi.python.org/pypi/simplejson/) - for 115 | better performance than stdlib json module 116 | * (optional) [gevent](http://pypi.python.org/pypi/gevent/) - to enable 117 | constant-time (more scalable) async submissions of large data chunks via 118 | concurrent API requests 119 | 120 | Also see 121 | [requirements.txt](https://github.com/mk-fg/graphite-metrics/blob/master/requirements.txt) 122 | file or "install_requires" and "extras_require" in 123 | [setup.py](https://github.com/mk-fg/graphite-metrics/blob/master/setup.py). 
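To illustrate how those extras tie into packaging, the mapping in setup.py looks roughly like this (an illustrative sketch derived from the lists above, not a verbatim copy - see the actual setup.py for the authoritative version):

    # setup.py fragment (illustrative) - each extra pulls in optional deps for one module
    extras_require = {
        'collectors.cgacct': ['dbus-python'],
        'collectors.cron_log': ['xattr', 'iso8601'],
        'collectors.sysstat': ['xattr'],
        'sinks.librato_metrics': ['requests'],
    }

So e.g. `pip install 'graphite-metrics[collectors.cron_log]'` should also pull in xattr and iso8601 (unless --xattr-emulation is going to be used).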
124 | 
125 | 
126 | Running
127 | --------------------
128 | 
129 | First run should probably look like this:
130 | 
131 |     % harvestd --debug -s dump -i10
132 | 
133 | That will use the default configuration with all the collectors enabled, dumping
134 | data to stderr (only the "dump" data-sink enabled) and using a short (10s) interval
135 | between collected datapoints, printing additional info about what's being done.
136 | 
137 | After that, see the [default harvestd.yaml configuration
138 | file](https://github.com/mk-fg/graphite-metrics/blob/master/graphite_metrics/harvestd.yaml),
139 | which contains configuration for all loaded collectors and can/should be
140 | overridden using the -c option.
141 | 
142 | Note that you don't have to specify all the options in each override-config,
143 | just the ones you need to update.
144 | 
145 | For example, a simple configuration file (say, /etc/harvestd.yaml) just to specify
146 | the carbon host and log line format (dropping the timestamp, since it will be piped to
147 | syslog or systemd-journal anyway) might look like this:
148 | 
149 |     sinks:
150 |       carbon_socket:
151 |         host: carbon.example.host
152 | 
153 |     logging:
154 |       formatters:
155 |         basic:
156 |           format: '%(levelname)s :: %(name)s: %(message)s'
157 | 
158 | And be started like this: `harvestd -c /etc/harvestd.yaml`
159 | 
160 | See `harvestd --help` output for a full CLI reference.
161 | 
162 | 
163 | Caveats, Stern Warnings and Apocalyptic Prophecies
164 | --------------------
165 | 
166 | While most stock collectors here pull metrics from /proc once per some interval,
167 | same as the other tools, be especially wary of the ones that process memory
168 | metrics, like the /proc/slabinfo and cgroup value parsers.
169 | 
170 | So-called "files" in /proc are actually callbacks in the kernel code, and to get
171 | a consistent reading of the whole slabinfo table, at least some versions of the
172 | kernel have to lock some operations, causing unexpected lags and delays on the
173 | whole system under some workloads (e.g. memcache servers).
174 | 
175 | The cgroup data collector processes lots of files, potentially dozens, hundreds or
176 | even thousands per collection cycle, which may also cause similar issues.
177 | 
178 | Special thanks to Marcus Barczak for pointing that out.
179 | 
180 | 
181 | Rationale
182 | --------------------
183 | 
184 | Most other tools can (in theory) collect this data, and I've used
185 | [collectd](http://collectd.org) for most of these, but it:
186 | 
187 | * Doesn't provide some of the most useful stuff - nfs stats, disk utilization
188 | time percentage, etc.
189 | 
190 | * Fails to collect some other stats, producing strange values like zeroes,
191 | unrealistic or negative values (for io, network, sensors, ...).
192 | 
193 | * General-purpose plugins like "tail" add a lot of complexity, making
194 | configuration into a mess, while still lacking some basic functionality which
195 | 10 lines of plugin code can easily provide (support is there, but see
196 | below).
197 | 
198 | * Plugins change metric names from the ones provided by /proc, referenced in
199 | kernel Documentation and on the internets, making collected data unnecessarily
200 | hard to interpret and raising questions about its meaning (which is
201 | increasingly important for low-level and calculated metrics).
202 | 
203 | Initially I tried to address these issues (implement the same collectors)
204 | with collectd plugins, but its python plugin system turned out to be leaking
205 | RAM and collectd itself segfaulted something like once a day, even in the latest
206 | releases, although probably because of issues in C plugins.
207 | 
208 | Plus, collectd data requires post-processing anyway - proper metric namespaces,
209 | counter handling, etc.
210 | 
211 | Given that the alternative is to just get the data and echo it as "name val
212 | timestamp" to a tcp socket, I decided to avoid the extra complexity and problems
213 | that collectd provides.
214 | 
215 | Other than collectd, I've experimented with
216 | [ganglia](http://ganglia.sourceforge.net/),
217 | [munin](http://munin-monitoring.org/), and some other monitoring
218 | infrastructures, but found little justification in re-using their aggregation
219 | and/or collection infrastructure, if not outright limitations (like the static data
220 | schema in ganglia).
221 | 
222 | The daemon binary is (weirdly) called "harvestd" because the "metricsd" name is already
223 | used to refer to [another related daemon](https://github.com/kpumuk/metricsd)
224 | (also, [there's a "metrics" w/o "d"](https://github.com/codahale/metrics),
225 | probably others), and is too generic to be used w/o extra confusion, I think.
226 | That, and I seem to lack creativity to come up with a saner name ("reaperd"
227 | sounds too MassEffect'ish these days).
228 | 
--------------------------------------------------------------------------------
/graphite_metrics/collectors/cjdns_peer_stats.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import itertools as it, operator as op, functools as ft
4 | from io import open
5 | from hashlib import sha256, sha512
6 | from base64 import b32decode
7 | from collections import defaultdict
8 | import os, sys, json, socket, struct, time, types
9 | from .
import Collector, Datapoint 10 | 11 | import logging 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | ### For bencode bits below 16 | # Derived from a thing under BitTorrent Open Source License, written by Petru Paler 17 | 18 | # Different from vanilla bencode in: 19 | # * Handling "leading zeroes" in keys (doesn't error - for cjdns compat) 20 | # * encode_none method (to "n") 21 | # * encode_string encodes unicode as utf-8 bytes 22 | 23 | def _ns_class(cls_name, cls_parents, cls_attrs): 24 | for k, v in cls_attrs.viewitems(): 25 | if isinstance(v, types.FunctionType): 26 | cls_attrs[k] = classmethod(v) 27 | return type(cls_name, cls_parents, cls_attrs) 28 | 29 | class BTEError(Exception): pass 30 | 31 | class Bencached(object): 32 | __slots__ = 'bencoded', 33 | def __init__(self, s): self.bencoded = s 34 | 35 | class BTE(object): 36 | __metaclass__ = _ns_class 37 | 38 | unicode_enc = 'utf-8' 39 | enable_none = False 40 | enable_bool = True 41 | cjdns_compat = True 42 | 43 | def decode_int(cls, x, f): 44 | f += 1 45 | newf = x.index('e', f) 46 | n = int(x[f:newf]) 47 | if x[f] == '-': 48 | if x[f + 1] == '0': raise ValueError 49 | elif x[f] == '0' and newf != f+1: raise ValueError 50 | return n, newf+1 51 | def decode_string(cls, x, f): 52 | colon = x.index(':', f) 53 | n = int(x[f:colon]) 54 | if not cls.cjdns_compat\ 55 | and x[f] == '0' and colon != f+1: raise ValueError 56 | colon += 1 57 | return (x[colon:colon+n], colon+n) 58 | def decode_list(cls, x, f): 59 | r, f = [], f+1 60 | while x[f] != 'e': 61 | v, f = cls.decode_func[x[f]](cls, x, f) 62 | r.append(v) 63 | return r, f + 1 64 | def decode_dict(cls, x, f): 65 | r, f = {}, f+1 66 | while x[f] != 'e': 67 | k, f = cls.decode_string(x, f) 68 | r[k], f = cls.decode_func[x[f]](cls, x, f) 69 | return r, f + 1 70 | def decode_none(cls, x, f): 71 | if not cls.enable_none: raise ValueError(x[f]) 72 | return None, f+1 73 | decode_func = dict(l=decode_list, d=decode_dict, i=decode_int, n=decode_none) 74 | for n in xrange(10): decode_func[bytes(n)] = decode_string 75 | 76 | def encode_bencached(cls, x, r): r.append(x.bencoded) 77 | def encode_int(cls, x, r): r.extend(('i', str(x), 'e')) 78 | def encode_float(cls, x, r): r.extend(('f', struct.pack('!d', x), 'e')) 79 | def encode_bool(cls, x, r): 80 | if not cls.enable_bool: raise ValueError(x) 81 | if x: cls.encode_int(1, r) 82 | else: cls.encode_int(0, r) 83 | def encode_string(cls, x, r): 84 | if isinstance(x, unicode): 85 | if not cls.unicode_enc: raise ValueError(x) 86 | x = x.encode(cls.unicode_enc) 87 | r.extend((str(len(x)), ':', x)) 88 | def encode_list(cls, x, r): 89 | r.append('l') 90 | for i in x: cls.encode_func[type(i)](cls, i, r) 91 | r.append('e') 92 | def encode_dict(cls, x, r): 93 | r.append('d') 94 | ilist = x.items() 95 | ilist.sort() 96 | for k, v in ilist: 97 | r.extend((str(len(k)), ':', k)) 98 | cls.encode_func[type(v)](cls, v, r) 99 | r.append('e') 100 | def encode_none(cls, x, r): 101 | if not cls.enable_none: raise ValueError(x) 102 | r.append('n') 103 | encode_func = { 104 | Bencached: encode_bencached, 105 | unicode: encode_string, 106 | str: encode_string, 107 | types.IntType: encode_int, 108 | types.LongType: encode_int, 109 | types.FloatType: encode_float, 110 | types.ListType: encode_list, 111 | types.TupleType: encode_list, 112 | types.DictType: encode_dict, 113 | types.BooleanType: encode_bool, 114 | types.NoneType: encode_none, 115 | } 116 | 117 | def bdecode(cls, x): 118 | try: r, l = cls.decode_func[x[0]](cls, x, 0) 119 | except (IndexError, 
KeyError, ValueError) as err: 120 | raise BTEError('Not a valid bencoded string: {}'.format(err)) 121 | if l != len(x): 122 | raise BTEError('Invalid bencoded value (data after valid prefix)') 123 | return r 124 | 125 | def bencode(cls, x): 126 | r = [] 127 | cls.encode_func[type(x)](cls, x, r) 128 | return ''.join(r) 129 | 130 | 131 | def pubkey_to_ipv6(key, 132 | _cjdns_b32_map = [ # directly from util/Base32.h 133 | 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99, 134 | 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99, 135 | 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99, 136 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,99,99,99,99,99,99, 137 | 99,99,10,11,12,99,13,14,15,99,16,17,18,19,20,99, 138 | 21,22,23,24,25,26,27,28,29,30,31,99,99,99,99,99, 139 | 99,99,10,11,12,99,13,14,15,99,16,17,18,19,20,99, 140 | 21,22,23,24,25,26,27,28,29,30,31,99,99,99,99,99 ]): 141 | if key.endswith('.k'): key = key[:-2] 142 | 143 | bits, byte, res = 0, 0, list() 144 | for c in key: 145 | n = _cjdns_b32_map[ord(c)] 146 | if n > 31: raise ValueError('Invalid key: {!r}, char: {!r}'.format(key, n)) 147 | byte |= n << bits 148 | bits += 5 149 | if bits >= 8: 150 | bits -= 8 151 | res.append(chr(byte & 0xff)) 152 | byte >>= 8 153 | if bits >= 5 or byte: 154 | raise ValueError('Invalid key length: {!r} (leftover bits: {})'.format(key, bits)) 155 | res = ''.join(res) 156 | 157 | addr = sha512(sha512(res).digest()).hexdigest()[:32] 158 | if addr[:2] != 'fc': 159 | raise ValueError( 'Invalid cjdns key (first' 160 | ' addr byte is not 0xfc, addr: {!r}): {!r}'.format(addr, key) ) 161 | return addr 162 | 163 | 164 | class PeerStatsFailure(Exception): 165 | 166 | def __init__(self, msg, err=None): 167 | if err is not None: msg += ': {} {}'.format(type(err), err) 168 | super(PeerStatsFailure, self).__init__(msg) 169 | 170 | def __hash__(self): 171 | return hash(self.message) 172 | 173 | 174 | class CjdnsPeerStats(Collector): 175 | 176 | last_err = None 177 | last_err_count = None # None (pre-init), True (shut-up mode) or int 178 | last_err_count_max = 3 # max repeated errors to report 179 | 180 | def __init__(self, *argz, **kwz): 181 | super(CjdnsPeerStats, self).__init__(*argz, **kwz) 182 | 183 | assert self.conf.filter.direction in\ 184 | ['any', 'incoming', 'outgoing'], self.conf.filter.direction 185 | 186 | if isinstance(self.conf.peer_id, types.StringTypes): 187 | self.conf.peer_id = [self.conf.peer_id] 188 | 189 | conf_admin, conf_admin_path = None,\ 190 | os.path.expanduser(self.conf.cjdnsadmin_conf) 191 | try: 192 | with open(conf_admin_path) as src: conf_admin = json.load(src) 193 | except (OSError, IOError) as err: 194 | log.warn('Unable to open cjdnsadmin config: %s', err) 195 | except ValueError as err: 196 | log.warn('Unable to process cjdnsadmin config: %s', err) 197 | if conf_admin is None: 198 | log.error('Failed to process cjdnsadmin config, disabling collector') 199 | self.conf.enabled = False 200 | return 201 | 202 | sock_addr = conf_admin['addr'], conf_admin['port'] 203 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 204 | self.sock.settimeout(self.conf.timeout) 205 | log.debug('Using cjdns socket: {}:{}'.format(*sock_addr)) 206 | self.sock.connect(sock_addr) 207 | 208 | self.admin_password = conf_admin['password'] 209 | self.peer_ipv6_cache = dict() 210 | 211 | def get_stats_page(self, page, password, bs=2**30): 212 | try: 213 | self.sock.send(BTE.bencode(dict(q='cookie'))) 214 | cookie = BTE.bdecode(self.sock.recv(bs))['cookie'] 215 | except Exception as err: 216 | raise PeerStatsFailure('Failed to 
get auth cookie', err) 217 | 218 | req = dict( q='auth', 219 | aq='InterfaceController_peerStats', 220 | args=dict(page=page), 221 | hash=sha256('{}{}'.format(password, cookie)).hexdigest(), 222 | cookie=cookie, txid=os.urandom(5).encode('hex') ) 223 | req['hash'] = sha256(BTE.bencode(req)).hexdigest() 224 | 225 | try: 226 | self.sock.send(BTE.bencode(req)) 227 | for n in xrange(self.conf.recv_retries + 1): 228 | resp = BTE.bdecode(self.sock.recv(bs)) 229 | if resp.get('txid') != req['txid']: # likely timed-out responses to old requests 230 | log.warn('Received out-of-order response (n: %s, request: %s): %s', n, req, resp) 231 | continue 232 | return resp['peers'], resp.get('more', False) 233 | except Exception as err: 234 | raise PeerStatsFailure('Failure communicating with cjdns', err) 235 | raise PeerStatsFailure( 'Too many bogus (wrong or no txid) responses' 236 | ' in a row (count: {}), last req/res: {} / {}'.format(self.conf.recv_retries, req, resp) ) 237 | 238 | def get_peer_stats(self): 239 | peers, page, more = list(), 0, True 240 | while more: 241 | stats, more = self.get_stats_page(page, self.admin_password) 242 | peers.extend(stats) 243 | page += 1 244 | return peers 245 | 246 | def read(self): 247 | try: peers = self.get_peer_stats() 248 | # PeerStatsFailure errors' reporting is rate-limited 249 | except PeerStatsFailure as err: 250 | if hash(err) == hash(self.last_err): 251 | if self.last_err_count is True: return 252 | elif self.last_err_count < self.last_err_count_max: self.last_err_count += 1 253 | else: 254 | log.warn( 'Failed getting cjdns peer stats:' 255 | ' {} -- disabling reporting of recurring errors'.format(err) ) 256 | self.last_err_count = True 257 | return 258 | else: self.last_err, self.last_err_count = err, 1 259 | log.warn('Failed getting cjdns peer stats: {}'.format(err)) 260 | return 261 | else: 262 | if self.last_err_count is True: 263 | log.warn('Previous recurring failure ({}) was resolved'.format(self.last_err)) 264 | self.last_err = self.last_err_count = None 265 | 266 | # Detect peers with 2 links having different isIncoming 267 | peers_bidir = dict() 268 | for peer in peers: 269 | val = peers_bidir.get(peer['publicKey']) 270 | if val is False: peers_bidir[peer['publicKey']] = True 271 | elif val is None: peers_bidir[peer['publicKey']] = False 272 | 273 | ts, peer_states = time.time(), defaultdict(int) 274 | for peer in peers: 275 | state = peer['state'].lower() 276 | peer_states[state] += 1 277 | 278 | # Check filters 279 | if self.conf.filter.established_only and state != 'established': continue 280 | if self.conf.filter.direction != 'any': 281 | if self.conf.filter.direction == 'incoming' and not peer['isIncoming']: continue 282 | elif self.conf.filter.direction == 'outgoing' and peer['isIncoming']: continue 283 | else: raise ValueError(self.conf.filter.direction) 284 | 285 | # Generate metric name 286 | pubkey = peer['publicKey'] 287 | if pubkey.endswith('.k'): pubkey = pubkey[:-2] 288 | peer['pubkey'] = pubkey 289 | if 'ipv6' in self.conf.peer_id: 290 | if pubkey not in self.peer_ipv6_cache: 291 | self.peer_ipv6_cache[pubkey] = pubkey_to_ipv6(pubkey) 292 | peer['ipv6'] = self.peer_ipv6_cache[pubkey] 293 | for k in self.conf.peer_id: 294 | if k in peer: 295 | peer_id = peer[k] 296 | break 297 | else: raise KeyError(self.conf.peer_id, peer) 298 | name = '{}.{}.{{}}'.format(self.conf.prefix, peer_id) 299 | if peers_bidir[peer['publicKey']]: 300 | name = name.format('incoming_{}' if peer['isIncoming'] else 'outgoing_{}') 301 | 302 | # Per-peer 
metrics 303 | name_bytes = name.format('bytes_{}') 304 | for k, d in [('bytesIn', 'in'), ('bytesOut', 'out')]: 305 | yield Datapoint(name_bytes.format(d), 'counter', peer[k], ts) 306 | if self.conf.special_metrics.peer_link: 307 | link = 1 if state == 'established' else 0 308 | yield Datapoint(name.format(self.conf.special_metrics.peer_link), 'gauge', link, ts) 309 | 310 | # Common metrics 311 | if self.conf.special_metrics.count: 312 | yield Datapoint(self.conf.special_metrics.count, 'gauge', len(peers), ts) 313 | if self.conf.special_metrics.count_state: 314 | for k, v in peer_states.viewitems(): 315 | name = '{}.{}'.format(self.conf.special_metrics.count_state, k) 316 | yield Datapoint(name, 'gauge', v, ts) 317 | 318 | 319 | collector = CjdnsPeerStats 320 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/sysstat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from subprocess import Popen, PIPE, STDOUT 5 | from time import time, sleep, strptime, mktime 6 | from calendar import timegm 7 | from datetime import datetime, timedelta 8 | from xattr import xattr 9 | import os, sys, socket, struct 10 | 11 | from . import Collector, Datapoint, dev_resolve, sector_bytes, rate_limit 12 | 13 | try: from simplejson import loads, dumps, JSONDecodeError 14 | except ImportError: 15 | from json import loads, dumps 16 | JSONDecodeError = ValueError 17 | 18 | import logging 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class SADF(Collector): 23 | 24 | 25 | def __init__(self, *argz, **kwz): 26 | super(SADF, self).__init__(*argz, **kwz) 27 | 28 | # Set force_interval margins, if used 29 | if self.conf.force_interval: 30 | try: 31 | from . 
import cfg 32 | interval = cfg.loop.interval 33 | except (ImportError, KeyError, AttributeError): 34 | log.warn( 'Failed to apply force_interval option' 35 | ' - unable to access global configuration to get data collection interval' ) 36 | self.force_interval = None 37 | else: 38 | if self.conf.force_interval_fuzz: 39 | fuzz = interval * self.conf.force_interval_fuzz / 100.0 40 | else: fuzz = 0 41 | self.force_interval = interval - fuzz, interval + fuzz 42 | else: self.force_interval = None 43 | 44 | self.rate_limit = rate_limit( 45 | max_interval=self.conf.rate.max_interval, 46 | sampling=self.conf.rate.sampling )\ 47 | if self.conf.rate.limiting_enabled else None 48 | 49 | 50 | def process_entry(self, entry): 51 | 52 | # Timestamp 53 | try: ts = entry.pop('timestamp') 54 | except KeyError: 55 | log.info( 'Detected sysstat entry' 56 | ' without timestamp, skipping: {!r}'.format(entry) ) 57 | return # happens, no idea what to do with these 58 | interval = ts['interval'] 59 | for fmt in '%Y-%m-%d %H-%M-%S', '%Y-%m-%d %H:%M:%S': 60 | try: 61 | ts = (mktime if not ts['utc'] else timegm)\ 62 | (strptime('{} {}'.format(ts['date'], ts['time']), fmt)) 63 | except ValueError: pass 64 | else: break 65 | else: 66 | raise ValueError( 'Unable to process' 67 | ' sysstat timestamp: {!r} {!r}'.format(ts['date'], ts['time']) ) 68 | 69 | # Metrics 70 | metrics = list() 71 | 72 | if self.conf.skip.sections: 73 | for k in self.conf.skip.sections: 74 | if k in entry: del entry[k] 75 | else: log.debug('Section-to-skip {!r} not found in sysstat entry'.format(k)) 76 | process_redundant = not self.conf.skip.redundant 77 | 78 | if 'cpu-load-all' in entry: 79 | for stats in entry.pop('cpu-load-all'): 80 | prefix = stats.pop('cpu') 81 | if prefix == 'all': continue # can be derived by aggregator/webapp 82 | prefix = ['cpu', prefix] 83 | metrics.extend((prefix + [k], v) for k,v in stats.viewitems()) 84 | 85 | if 'process-and-context-switch' in entry: 86 | stats = entry.pop('process-and-context-switch') 87 | metrics.append((['misc', 'contextswitch'], stats['cswch'])) 88 | if process_redundant: # also processed in "stats" 89 | metrics.append((['processes', 'forks'], stats['proc'])) 90 | 91 | if process_redundant: 92 | if 'interrupts' in entry: # with "irq" 93 | for stats in entry.pop('interrupts'): 94 | if stats['intr'] == 'sum': continue # can be derived by aggregator/webapp 95 | metrics.append((['irq', stats['intr'], 'sum'], stats['value'])) 96 | if 'swap-pages' in entry: # with "memstats" 97 | for k,v in entry.pop('swap-pages').viewitems(): 98 | metrics.append((['memory', 'pages', 'activity', k], v)) 99 | # if 'memory' in entry: # with "memstats" 100 | # if 'hugepages' in entry: # with "memstats" 101 | 102 | if 'disk' in entry: 103 | for disk in entry.pop('disk'): 104 | dev_sadf = disk['disk-device'] 105 | if not dev_sadf.startswith('dev'): 106 | log.warn('Unknown device name format: {}, skipping'.format(dev_sadf)) 107 | continue 108 | dev = dev_resolve(*it.imap(int, dev_sadf[3:].split('-')), log_fails=False) 109 | if dev is None: 110 | log.warn('Unable to resolve name for device {!r}, skipping'.format(dev_sadf)) 111 | continue 112 | prefix = ['disk', 'load', dev] 113 | metrics.extend([ 114 | (prefix + ['utilization'], disk['util-percent']), 115 | (prefix + ['req_size'], disk['avgrq-sz']), 116 | (prefix + ['queue_len'], disk['avgqu-sz']), 117 | (prefix + ['bytes_read'], sector_bytes * disk['rd_sec']), 118 | (prefix + ['bytes_write'], sector_bytes * disk['wr_sec']), 119 | (prefix + ['serve_time'], disk['await']), 120 
| (prefix + ['tps'], disk['tps']) ]) 121 | # if 'io' in entry: # can be derived by aggregator/webapp 122 | 123 | if 'paging' in entry: 124 | metrics.append(( 125 | ['memory', 'pages', 'vm_efficiency'], 126 | entry.pop('paging')['vmeff-percent'] )) 127 | # XXX: lots of redundant metrics here 128 | 129 | if 'queue' in entry: 130 | stats = entry.pop('queue') 131 | for n in 1, 5, 15: 132 | k = 'ldavg-{}'.format(n) 133 | metrics.append((['load', k], stats[k])) 134 | metrics.extend( 135 | (['processes', 'state', k], stats[k]) 136 | for k in ['runq-sz', 'plist-sz', 'blocked'] ) 137 | 138 | if 'kernel' in entry: 139 | stats = entry.pop('kernel') 140 | metrics.extend([ 141 | (['misc', 'dent_unused'], stats['dentunusd']), 142 | (['misc', 'file_handles'], stats['file-nr']), 143 | (['misc', 'inode_handles'], stats['inode-nr']), 144 | (['misc', 'pty'], stats['pty-nr']) ]) 145 | 146 | if 'network' in entry: 147 | stats = entry.pop('network') 148 | iface_stats = stats.get('net-dev', list()) 149 | for iface in iface_stats: 150 | prefix = ['network', 'interfaces', iface['iface']] 151 | metrics.extend([ 152 | (prefix + ['rx', 'bytes'], iface['rxkB'] * 2**10), 153 | (prefix + ['rx', 'packets', 'total'], iface['rxpck']), 154 | (prefix + ['rx', 'packets', 'compressed'], iface['rxcmp']), 155 | (prefix + ['rx', 'packets', 'multicast'], iface['rxmcst']), 156 | (prefix + ['tx', 'bytes'], iface['txkB'] * 2**10), 157 | (prefix + ['tx', 'packets', 'total'], iface['txpck']), 158 | (prefix + ['tx', 'packets', 'compressed'], iface['txpck']) ]) 159 | iface_stats = stats.get('net-edev', list()) 160 | iface_errs_common = [('err', 'total'), ('fifo', 'overflow_fifo'), ('drop', 'overflow_kbuff')] 161 | for iface in iface_stats: 162 | prefix = ['network', 'interfaces', iface['iface']] 163 | for src,dst in iface_errs_common + [('fram', 'frame_alignment')]: 164 | metrics.append((prefix + ['rx', 'errors', dst], iface['rx{}'.format(src)])) 165 | for src,dst in iface_errs_common + [('carr', 'carrier')]: 166 | metrics.append((prefix + ['tx', 'errors', dst], iface['tx{}'.format(src)])) 167 | metrics.append((prefix + ['tx', 'errors', 'collision'], iface['coll'])) 168 | if 'net-nfs' in stats: 169 | for k,v in stats['net-nfs'].viewitems(): 170 | metrics.append((['network', 'nfs', 'client', k], v)) 171 | for k,v in stats['net-nfsd'].viewitems(): 172 | metrics.append((['network', 'nfs', 'server', k], v)) 173 | if 'net-sock' in stats: 174 | for k,v in stats['net-sock'].viewitems(): 175 | if k.endswith('sck'): 176 | k = k[:-3] 177 | if k == 'tot': k = 'total' 178 | metrics.append((['network', 'sockets', k], v)) 179 | 180 | if 'power-management' in entry: 181 | stats = entry.pop('power-management') 182 | for metric in stats.get('temperature', list()): 183 | name = ['sensors', 'temperature', metric['device'].replace('.', '_')] 184 | if 'number' in metric: name.append(bytes(metric['number'])) 185 | metrics.append((name, metric['degC'])) 186 | 187 | return ts, interval, metrics 188 | 189 | 190 | def _read(self, ts_to=None): 191 | if not ts_to: ts_to = datetime.now() 192 | 193 | sa_days = dict( (ts.day, ts) 194 | for ts in ((ts_to - timedelta(i)) 195 | for i in xrange(self.conf.skip.older_than_days+1)) ) 196 | sa_files = sorted(it.ifilter( 197 | op.methodcaller('startswith', 'sa'), os.listdir(self.conf.sa_path) )) 198 | host = os.uname()[1] # to check vs nodename in data 199 | log.debug('SA files to process: {}'.format(sa_files)) 200 | 201 | for sa in sa_files: 202 | sa_day = int(sa[2:]) 203 | try: sa_day = sa_days[sa_day] 204 | except KeyError: 
continue # too old or new 205 | 206 | sa = os.path.join(self.conf.sa_path, sa) 207 | log.debug('Processing file: {}'.format(sa)) 208 | 209 | # Read xattr timestamp 210 | sa_xattr = xattr(sa) 211 | try: sa_ts_from = sa_xattr[self.conf.xattr_name] 212 | except KeyError: sa_ts_from = None 213 | if sa_ts_from: 214 | sa_ts_from = datetime.fromtimestamp( 215 | struct.unpack('=I', sa_ts_from)[0] ) 216 | if sa_day - sa_ts_from > timedelta(1) + timedelta(seconds=60): 217 | log.debug( 'Discarding xattr timestamp, because' 218 | ' it doesnt seem to belong to the same date as file' 219 | ' (day: {}, xattr: {})'.format(sa_day, sa_ts_from) ) 220 | sa_ts_from = None 221 | if sa_ts_from and sa_ts_from.date() != sa_day.date(): 222 | log.debug('File xattr timestamp points to the next day, skipping file') 223 | continue 224 | if not self.conf.max_dump_span: sa_ts_to = None 225 | else: 226 | # Use 00:00 of sa_day + max_dump_span if there's no xattr 227 | ts = sa_ts_from or datetime(sa_day.year, sa_day.month, sa_day.day) 228 | sa_ts_to = ts + timedelta(0, self.conf.max_dump_span) 229 | # Avoid adding restrictions, if they make no sense anyway 230 | if sa_ts_to >= datetime.now(): sa_ts_to = None 231 | 232 | # Get data from sadf 233 | sa_cmd = ['sadf', '-jt'] 234 | if sa_ts_from: sa_cmd.extend(['-s', sa_ts_from.strftime('%H:%M:%S')]) 235 | if sa_ts_to: sa_cmd.extend(['-e', sa_ts_to.strftime('%H:%M:%S')]) 236 | sa_cmd.extend(['--', '-A']) 237 | sa_cmd.append(sa) 238 | log.debug('sadf command: {}'.format(sa_cmd)) 239 | sa_proc = Popen(sa_cmd, stdout=PIPE) 240 | try: data = loads(sa_proc.stdout.read()) 241 | except JSONDecodeError as err: 242 | log.exception(( 'Failed to process sadf (file:' 243 | ' {}, command: {}) output: {}' ).format(sa, sa_cmd, err)) 244 | data = None 245 | if sa_proc.wait(): 246 | log.error('sadf (command: {}) exited with error'.format(sa_cmd)) 247 | data = None 248 | if not data: 249 | log.warn('Skipping processing of sa file: {}'.format(sa)) 250 | continue 251 | 252 | # Process and dispatch the datapoints 253 | sa_ts_max = 0 254 | for data in data['sysstat']['hosts']: 255 | if data['nodename'] != host: 256 | log.warn( 'Mismatching hostname in sa data:' 257 | ' {} (uname: {}), skipping'.format(data['nodename'], host) ) 258 | continue 259 | sa_day_ts = mktime(sa_day.timetuple()) 260 | # Read the data 261 | for ts, interval, metrics in it.ifilter( 262 | None, it.imap(self.process_entry, data['statistics']) ): 263 | if ts - 1 > sa_ts_max: 264 | # has to be *before* beginning of the next interval 265 | sa_ts_max = ts - 1 266 | if abs(ts - sa_day_ts) > 24*3600 + interval + 1: 267 | log.warn( 'Dropping sample because of timestamp mismatch' 268 | ' (timestamp: {}, expected date: {})'.format(ts, sa_day_ts) ) 269 | continue 270 | if self.force_interval and ( 271 | interval < self.force_interval[0] 272 | or interval > self.force_interval[1] ): 273 | log.warn( 'Dropping sample because of interval mismatch' 274 | ' (file: {sa}, interval: {interval},' 275 | ' required: {margins[0]}-{margins[1]}, timestamp: {ts})'\ 276 | .format(sa=sa, interval=interval, ts=ts, margins=self.force_interval) ) 277 | continue 278 | ts_val = int(ts) 279 | for name, val in metrics: 280 | yield Datapoint('.'.join(name), 'gauge', val, ts_val) 281 | 282 | # Update xattr timestamp, if any entries were processed 283 | if sa_ts_max: 284 | log.debug('Updating xattr timestamp to {}'.format(sa_ts_max)) 285 | if not self.conf.debug.dry_run: 286 | sa_xattr[self.conf.xattr_name] = struct.pack('=I', int(sa_ts_max)) 287 | 288 | 289 | def 
read(self): 290 | if not self.rate_limit or next(self.rate_limit): 291 | log.debug('Running sysstat data processing cycle') 292 | return self._read() 293 | else: return list() 294 | 295 | 296 | collector = SADF 297 | -------------------------------------------------------------------------------- /graphite_metrics/harvestd.yaml: -------------------------------------------------------------------------------- 1 | ### Default (baseline) configuration parameters. 2 | ### DO NOT ever change this config, use -c commandline option instead! 3 | 4 | 5 | collectors: 6 | # Modules that collect the actual datapoints to be sent 7 | 8 | _default: # used as a base for all other sections here 9 | enabled: true 10 | # debug: # auto-filled from global "debug" section, if not specified 11 | 12 | ping: 13 | # Reports average (ewma) rtt of icmp ping to each specified host and packet loss (if any). 14 | interval: 5 # seconds between sending-out pings 15 | ewma_factor: 0.3 # ewma factor for rtt values 16 | resolve: 17 | no_reply: 30 # re-resolve hostnames after 30 seconds w/o reply 18 | time: 600 # re-resolve hostnames after fixed 600s intervals 19 | # "max_retries" restarts ping subprocess (e.g. to apply changes to 20 | # /etc/hosts or other libc resolver configuration) after N name resolution failures. 21 | # Also, if resolver fails even after restart (i.e. on start), disable warnings 22 | # (but issuing a message on next success) after that number of retries. 23 | max_retries: 5 24 | hosts: # explicitly split into ipv4/ipv6 to control how hostnames are resolved 25 | ipv4: 26 | # google_com: google.com 27 | # google_dns: 8.8.8.8 28 | ipv6: 29 | # ipv6_google_com: ipv6.google.com 30 | # ipv6_tunnelbroker_net: ipv6.tunnelbroker.net 31 | 32 | cron_log: 33 | # Reports start/stop, run time and errors for cron jobs from a logfile. 34 | # I use simple wrappers for cron-jobs to produce these logs (among other things): 35 | # https://github.com/mk-fg/fgtk#task https://github.com/mk-fg/fgtk/tree/master/task 36 | source: # must be filled with path to a log file 37 | aliases: # either [alias, regexp] or ["_" + regexp_group, regexp], see "_script" example below 38 | # - ['logrotate', '(^|\b)logrotate\b'] 39 | # - ['locate', '(^|\b)updatedb\b'] 40 | # - ['_script', '/etc/cron\.\w+/*(?P