├── graphite_metrics ├── __init__.py ├── sinks │ ├── __init__.py │ ├── dump.py │ ├── carbon_socket.py │ └── librato_metrics.py ├── processors │ ├── __init__.py │ └── hostname_prefix.py ├── loops │ ├── __init__.py │ └── basic.py ├── collectors │ ├── stats.py │ ├── irq.py │ ├── ping.py │ ├── memstats.py │ ├── slabinfo.py │ ├── memfrag.py │ ├── __init__.py │ ├── iptables_counts.py │ ├── cron_log.py │ ├── _ping.py │ ├── cjdns_peer_stats.py │ ├── sysstat.py │ └── cgacct.py ├── harvestd.py └── harvestd.yaml ├── .gitignore ├── MANIFEST.in ├── requirements.txt ├── COPYING ├── setup.py └── README.md /graphite_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /*.egg-info 2 | /build 3 | /dist 4 | /README.txt 5 | *.pyc 6 | *.pyo 7 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include graphite_metrics/harvestd.yaml 2 | include COPYING README.txt 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PyYAML==3.09 2 | dbus-python==0.84 3 | distribute==0.6.24 4 | gevent==0.13.6 5 | greenlet==0.3.1 6 | iso8601==0.1.4 7 | layered-yaml-attrdict-config==12.05.3 8 | requests==0.11.1 9 | simplejson==2.1.1 10 | xattr==0.6.2 11 | -------------------------------------------------------------------------------- /graphite_metrics/sinks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | 5 | import logging 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | class Sink(object): 10 | 11 | def __init__(self, conf): 12 | self.conf = conf 13 | 14 | def dispatch(self, *tuples): 15 | raise NotImplementedError( 'Sink.dispatch method should be overidden in sink' 16 | ' subclasses to dispatch (metric_name, value, timestamp) tuples to whatever destination.' ) 17 | -------------------------------------------------------------------------------- /graphite_metrics/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | 5 | import logging 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | class Processor(object): 10 | 11 | def __init__(self, conf): 12 | self.conf = conf 13 | 14 | def process(self, dp_tuple, sinks): 15 | raise NotImplementedError( 'Processor.process method' 16 | ' should be overidden in processor subclasses to mangle' 17 | ' (name, value, timestamp) tuple in some way.' ) 18 | -------------------------------------------------------------------------------- /COPYING: -------------------------------------------------------------------------------- 1 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 2 | Version 2, December 2004 3 | 4 | Copyright (C) 2012 Mike Kazantsev 5 | 6 | Everyone is permitted to copy and distribute verbatim or modified 7 | copies of this license document, and changing it is allowed as long 8 | as the name is changed. 
9 | 10 | DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 11 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 12 | 13 | 0. You just DO WHAT THE FUCK YOU WANT TO. 14 | -------------------------------------------------------------------------------- /graphite_metrics/sinks/dump.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | 5 | from . import Sink 6 | 7 | import logging 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class Dumper(Sink): 12 | 13 | 'Just dumps the data to log. Useful for debugging.' 14 | 15 | def dispatch(self, *tuples): 16 | log.info('--- dump of {} datapoints'.format(len(tuples))) 17 | for name, value, ts_dp in tuples: 18 | log.info('Datapoint: {} {} {}'.format(name, value, ts_dp)) 19 | log.info('--- dump end') 20 | 21 | 22 | sink = Dumper 23 | -------------------------------------------------------------------------------- /graphite_metrics/loops/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from time import time 5 | 6 | import logging 7 | log = logging.getLogger(__name__) 8 | 9 | # Global configuration for harvestd, 10 | # intended to be set before initializing loops, 11 | # but should not be really relied upon - can be empty. 12 | cfg = dict() 13 | 14 | 15 | class Loop(object): 16 | 17 | def __init__(self, conf, time_func=time): 18 | self.conf, self.time_func = conf, time_func 19 | 20 | def start(self, collectors, processors, sinks): 21 | raise NotImplementedError( 'Loop.start method should be' 22 | ' overidden in loop subclasses to start poll/process/send loop' 23 | ' using passed Collector, Processor and Sink objects.' ) 24 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/stats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from io import open 5 | 6 | from . import Collector, Datapoint 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class Stats(Collector): 13 | 14 | def read(self): 15 | with open('/proc/stat', 'rb') as table: 16 | for line in table: 17 | label, vals = line.split(None, 1) 18 | total = int(vals.split(None, 1)[0]) 19 | if label == 'intr': name = 'irq.total.hard' 20 | elif label == 'softirq': name = 'irq.total.soft' 21 | elif label == 'processes': name = 'processes.forks' 22 | else: continue # no more useful data here 23 | yield Datapoint(name, 'counter', total, None) 24 | 25 | 26 | collector = Stats 27 | -------------------------------------------------------------------------------- /graphite_metrics/processors/hostname_prefix.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | import os 5 | 6 | from . import Processor 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class HostnamePrefix(Processor): 13 | 14 | 'Adds a hostname as a prefix to metric name.' 
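# Editor's note - illustrative example, not from the original source: with conf.hostname
# unset on a host named "webhost1", process() turns the hypothetical datapoint
# ('load.shortterm', 0.42, 1338198000) into ('webhost1.load.shortterm', 0.42, 1338198000)
# and passes the sinks mapping through unchanged.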
15 | 16 | def __init__(self, *argz, **kwz): 17 | super(HostnamePrefix, self).__init__(*argz, **kwz) 18 | self.prefix = self.conf.hostname 19 | if self.prefix is None: self.prefix = os.uname()[1] 20 | if not self.prefix.endswith('.'): self.prefix += '.' 21 | 22 | def process(self, dp_tuple, sinks): 23 | name, value, ts_dp = dp_tuple 24 | return (self.prefix + name, value, ts_dp), sinks 25 | 26 | 27 | processor = HostnamePrefix 28 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/irq.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from io import open 5 | 6 | from . import Collector, Datapoint 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class IRQ(Collector): 13 | 14 | @staticmethod 15 | def _parse_irq_table(table): 16 | irqs = dict() 17 | bindings = map(bytes.lower, table.readline().strip().split()) 18 | bindings_cnt = len(bindings) 19 | for line in it.imap(bytes.strip, table): 20 | irq, line = line.split(None, 1) 21 | irq = irq.rstrip(':').lower() 22 | if irq in irqs: 23 | log.warn('Conflicting irq name/id: {!r}, skipping'.format(irq)) 24 | continue 25 | irqs[irq] = map(int, line.split(None, bindings_cnt)[:bindings_cnt]) 26 | return bindings, irqs 27 | 28 | def read(self): 29 | irq_tables = list() 30 | # /proc/interrupts 31 | with open('/proc/interrupts', 'rb') as table: 32 | irq_tables.append(self._parse_irq_table(table)) 33 | # /proc/softirqs 34 | with open('/proc/softirqs', 'rb') as table: 35 | irq_tables.append(self._parse_irq_table(table)) 36 | # dispatch 37 | for bindings, irqs in irq_tables: 38 | for irq, counts in irqs.viewitems(): 39 | if sum(counts) == 0: continue 40 | for bind, count in it.izip(bindings, counts): 41 | yield Datapoint('irq.{}.{}'.format(irq, bind), 'counter', count, None) 42 | 43 | 44 | collector = IRQ 45 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/ping.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from subprocess import Popen, PIPE 5 | from io import open 6 | import os, signal 7 | 8 | from . 
import Collector, Datapoint 9 | 10 | import logging 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | class PingerInterface(Collector): 15 | 16 | def __init__(self, *argz, **kwz): 17 | super(PingerInterface, self).__init__(*argz, **kwz) 18 | self.hosts = dict(it.chain( 19 | ( ('v4:{}'.format(spec), name) 20 | for name, spec in (self.conf.hosts.ipv4 or dict()).viewitems() ), 21 | ( ('v6:{}'.format(spec), name) 22 | for name, spec in (self.conf.hosts.ipv6 or dict()).viewitems() ) )) 23 | if not self.hosts: 24 | log.info('No valid hosts to ping specified, disabling collector') 25 | self.conf.enabled = False 26 | else: self.spawn_pinger() 27 | 28 | def spawn_pinger(self): 29 | cmd = ( 30 | ['python', os.path.join(os.path.dirname(__file__), '_ping.py')] 31 | + map(bytes, [ self.conf.interval, 32 | self.conf.resolve.no_reply or 0, self.conf.resolve.time or 0, 33 | self.conf.ewma_factor, os.getpid(), self.conf.resolve.max_retries ]) 34 | + self.hosts.keys() ) 35 | log.debug('Starting pinger subprocess: {}'.format(' '.join(cmd))) 36 | self.proc = Popen(cmd, stdout=PIPE, close_fds=True) 37 | self.proc.stdout.readline() # wait until it's initialized 38 | 39 | def read(self): 40 | err = self.proc.poll() 41 | if err is not None: 42 | log.warn( 'Pinger subprocess has failed' 43 | ' (exit code: {}), restarting it'.format(err) ) 44 | self.spawn_pinger() 45 | else: 46 | self.proc.send_signal(signal.SIGQUIT) 47 | for line in iter(self.proc.stdout.readline, ''): 48 | line = line.strip() 49 | if not line: break 50 | host, ts_offset, rtt, lost = line.split() 51 | host = self.hosts[host] 52 | yield Datapoint('network.ping.{}.ping'.format(host), 'gauge', float(rtt), None) 53 | yield Datapoint('network.ping.{}.droprate'.format(host), 'counter', int(lost), None) 54 | 55 | 56 | collector = PingerInterface 57 | -------------------------------------------------------------------------------- /graphite_metrics/loops/basic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | 5 | from . import Loop 6 | 7 | import logging 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | class BasicLoop(Loop): 12 | 13 | 'Simple synchronous "while True: fetch && process && send" loop.' 
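# Editor's note - summary comment, not from the original source. Each pass of the
# loop below: poll every collector's read() for Datapoint objects, resolve them to
# (name, value, timestamp) tuples via Datapoint.get(), run each tuple through all
# processors (each returns the possibly-rewritten tuple plus the sink set to use),
# batch the surviving tuples per sink, dispatch the batches unless debug.dry_run is
# set, then sleep until the next multiple of conf.interval.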
14 | 15 | def start(self, collectors, processors, sinks): 16 | from time import time, sleep 17 | 18 | ts = self.time_func() 19 | while True: 20 | data = list() 21 | for name, collector in collectors.viewitems(): 22 | log.debug('Polling data from a collector (name: {}): {}'.format(name, collector)) 23 | try: data.extend(collector.read()) 24 | except Exception as err: 25 | log.exception( 'Failed to poll collector' 26 | ' (name: {}, obj: {}): {}'.format(name, collector, err) ) 27 | 28 | ts_now = self.time_func() 29 | sink_data = dict() # to batch datapoints on per-sink basis 30 | 31 | log.debug('Processing {} datapoints'.format(len(data))) 32 | for dp in it.ifilter(None, (dp.get(ts=ts_now) for dp in data)): 33 | proc_sinks = sinks.copy() 34 | for name, proc in processors.viewitems(): 35 | if dp is None: break 36 | try: dp, sinks = proc.process(dp, sinks) 37 | except Exception as err: 38 | log.exception(( 'Failed to process datapoint (data: {},' 39 | ' processor: {}, obj: {}): {}, discarding' ).format(dp, name, proc, err)) 40 | break 41 | else: 42 | if dp is None: continue 43 | for name, sink in proc_sinks.viewitems(): 44 | try: sink_data[name].append(dp) 45 | except KeyError: sink_data[name] = [dp] 46 | 47 | log.debug('Dispatching data to {} sink(s)'.format(len(sink_data))) 48 | if not self.conf.debug.dry_run: 49 | for name, tuples in sink_data.viewitems(): 50 | log.debug(( 'Sending {} datapoints to sink' 51 | ' (name: {}): {}' ).format(len(tuples), name, sink)) 52 | try: sinks[name].dispatch(*tuples) 53 | except Exception as err: 54 | log.exception( 'Failed to dispatch data to sink' 55 | ' (name: {}, obj: {}): {}'.format(name, sink, err) ) 56 | 57 | while ts < ts_now: ts += self.conf.interval 58 | ts_sleep = max(0, ts - self.time_func()) 59 | log.debug('Sleep: {}s'.format(ts_sleep)) 60 | sleep(ts_sleep) 61 | 62 | 63 | loop = BasicLoop 64 | -------------------------------------------------------------------------------- /graphite_metrics/sinks/carbon_socket.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from time import sleep 5 | import socket 6 | 7 | from . 
import Sink 8 | 9 | import logging 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class CarbonSocket(Sink): 14 | 15 | '''Simple blocking non-buffering sender 16 | to graphite carbon tcp linereceiver interface.''' 17 | 18 | def __init__(self, conf): 19 | super(CarbonSocket, self).__init__(conf) 20 | if not self.conf.debug.dry_run: self.connect() 21 | 22 | def connect(self, send=None): 23 | host, port = self.conf.host 24 | reconnects = self.conf.max_reconnects 25 | while True: 26 | try: 27 | try: 28 | addrinfo = list(reversed(socket.getaddrinfo( 29 | host, port, socket.AF_UNSPEC, socket.SOCK_STREAM ))) 30 | except socket.error as err: 31 | raise socket.gaierror(err.message) 32 | assert addrinfo, addrinfo 33 | while addrinfo: 34 | # Try connecting to all of the returned addresses 35 | af, socktype, proto, canonname, sa = addrinfo.pop() 36 | try: 37 | self.sock = socket.socket(af, socktype, proto) 38 | self.sock.connect(sa) 39 | except socket.error: 40 | if not addrinfo: raise 41 | log.debug('Connected to Carbon at {}:{}'.format(*sa)) 42 | if send: self.sock.sendall(send) 43 | 44 | except (socket.error, socket.gaierror) as err: 45 | if reconnects is not None: 46 | reconnects -= 1 47 | if reconnects <= 0: raise 48 | if isinstance(err, socket.gaierror): 49 | log.info('Failed to resolve host ({!r}): {}'.format(host, err)) 50 | else: log.info('Failed to connect to {}:{}: {}'.format(host, port, err)) 51 | if self.conf.reconnect_delay: 52 | sleep(max(0, self.conf.reconnect_delay)) 53 | 54 | else: break 55 | 56 | def close(self): 57 | try: self.sock.close() 58 | except: pass 59 | 60 | def reconnect(self, send=None): 61 | self.close() 62 | self.connect(send=send) 63 | 64 | def dispatch(self, *tuples): 65 | reconnects = self.conf.max_reconnects 66 | packet = ''.join(it.starmap('{} {} {}\n'.format, tuples)) 67 | try: self.sock.sendall(packet) 68 | except socket.error as err: 69 | log.error('Failed to send data to Carbon server: {}'.format(err)) 70 | self.reconnect(send=packet) 71 | 72 | 73 | sink = CarbonSocket 74 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/memstats.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | import re 5 | 6 | from . 
import Collector, Datapoint 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | class MemStats(Collector): 13 | 14 | _warn_hp = True 15 | 16 | @staticmethod 17 | def _camelcase_fix( name, 18 | _re1=re.compile(r'(.)([A-Z][a-z]+)'), 19 | _re2=re.compile(r'([a-z0-9])([A-Z])'), 20 | _re3=re.compile(r'_+') ): 21 | return _re3.sub('_', _re2.sub( 22 | r'\1_\2', _re1.sub(r'\1_\2', name) )).lower() 23 | 24 | def read(self): 25 | # /proc/vmstat 26 | with open('/proc/vmstat', 'rb') as table: 27 | for line in table: 28 | metric, val = line.strip().split(None, 1) 29 | val = int(val) 30 | if metric.startswith('nr_'): 31 | yield Datapoint( 'memory.pages.allocation.{}'\ 32 | .format(metric[3:]), 'gauge', val, None ) 33 | else: 34 | yield Datapoint( 'memory.pages.activity.{}'\ 35 | .format(metric), 'gauge', val, None ) 36 | # /proc/meminfo 37 | with open('/proc/meminfo', 'rb') as table: 38 | table = dict(line.strip().split(None, 1) for line in table) 39 | hp_size = table.pop('Hugepagesize:', None) 40 | if hp_size and not hp_size.endswith(' kB'): hp_size = None 41 | if hp_size: hp_size = int(hp_size[:-3]) 42 | elif self._warn_hp: 43 | log.warn('Unable to get hugepage size from /proc/meminfo') 44 | self._warn_hp = False 45 | for metric, val in table.viewitems(): 46 | if metric.startswith('DirectMap'): continue # static info 47 | # Name mangling 48 | metric = self._camelcase_fix( 49 | metric.rstrip(':').replace('(', '_').replace(')', '') ) 50 | if metric.startswith('s_'): metric = 'slab_{}'.format(metric[2:]) 51 | elif metric.startswith('mem_'): metric = metric[4:] 52 | elif metric == 'slab': metric = 'slab_total' 53 | # Value processing 54 | try: val, val_unit = val.split() 55 | except ValueError: # no units assumed as number of pages 56 | if not metric.startswith('huge_pages_'): 57 | log.warn( 'Unhandled page-measured' 58 | ' metric in /proc/meminfo: {}'.format(metric) ) 59 | continue 60 | val = int(val) * hp_size 61 | else: 62 | if val_unit != 'kB': 63 | log.warn('Unhandled unit type in /proc/meminfo: {}'.format(val_unit)) 64 | continue 65 | val = int(val) 66 | yield Datapoint( 'memory.allocation.{}'\ 67 | .format(metric), 'gauge', val * 1024, None ) 68 | 69 | 70 | collector = MemStats 71 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from glob import iglob 5 | import os, sys 6 | 7 | from setuptools import setup, find_packages 8 | 9 | pkg_root = os.path.dirname(__file__) 10 | 11 | entry_points = dict(console_scripts=['harvestd = graphite_metrics.harvestd:main']) 12 | entry_points.update( 13 | ('graphite_metrics.{}'.format(ep_type), list( 14 | '{0} = graphite_metrics.{1}.{0}'\ 15 | .format(os.path.basename(fn)[:-3], ep_type) 16 | for fn in iglob(os.path.join( 17 | pkg_root, 'graphite_metrics', ep_type, '[!_]*.py' )) )) 18 | for ep_type in ['collectors', 'processors', 'sinks', 'loops'] ) 19 | 20 | # Error-handling here is to allow package to be built w/o README included 21 | try: readme = open(os.path.join(pkg_root, 'README.txt')).read() 22 | except IOError: readme = '' 23 | 24 | setup( 25 | 26 | name = 'graphite-metrics', 27 | version = '15.7.0', 28 | author = 'Mike Kazantsev', 29 | author_email = 'mk.fraggod@gmail.com', 30 | license = 'WTFPL', 31 | keywords = 'graphite sysstat systemd cgroups metrics proc', 32 | url = 'http://github.com/mk-fg/graphite-metrics', 33 | 34 |
description = 'Standalone Graphite metric data collectors for' 35 | ' various stuff thats not (or poorly) handled by other monitoring daemons', 36 | long_description = readme, 37 | 38 | classifiers = [ 39 | 'Development Status :: 4 - Beta', 40 | 'Environment :: No Input/Output (Daemon)', 41 | 'Intended Audience :: Developers', 42 | 'Intended Audience :: System Administrators', 43 | 'Intended Audience :: Telecommunications Industry', 44 | 'License :: OSI Approved', 45 | 'Operating System :: POSIX', 46 | 'Operating System :: Unix', 47 | 'Programming Language :: Python', 48 | 'Programming Language :: Python :: 2.7', 49 | 'Programming Language :: Python :: 2 :: Only', 50 | 'Topic :: Internet', 51 | 'Topic :: Internet :: Log Analysis', 52 | 'Topic :: System :: Monitoring', 53 | 'Topic :: System :: Networking :: Monitoring', 54 | 'Topic :: System :: Operating System Kernels :: Linux' ], 55 | 56 | install_requires = ['layered-yaml-attrdict-config', 'setuptools'], 57 | extras_require = { 58 | 'collectors.cgacct': ['dbus-python'], 59 | 'collectors.cron_log': ['xattr', 'iso8601'], 60 | 'collectors.sysstat': ['xattr'], 61 | 'sinks.librato_metrics': ['requests'], 62 | 'sinks.librato_metrics.async': ['gevent'] }, 63 | 64 | packages = find_packages(), 65 | package_data = {'': ['README.txt'], 'graphite_metrics': ['harvestd.yaml']}, 66 | exclude_package_data = {'': ['README.*']}, 67 | 68 | entry_points = entry_points ) 69 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/slabinfo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from collections import namedtuple 5 | from io import open 6 | 7 | from . 
import Collector, Datapoint, page_size 8 | 9 | import logging 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class SlabInfo(Collector): 14 | 15 | version_check = '2.1' 16 | 17 | def __init__(self, *argz, **kwz): 18 | super(SlabInfo, self).__init__(*argz, **kwz) 19 | 20 | for k in 'include_prefixes', 'exclude_prefixes': 21 | if not self.conf.get(k): self.conf[k] = list() 22 | 23 | with open('/proc/slabinfo', 'rb') as table: 24 | line = table.readline() 25 | self.version = line.split(':')[-1].strip() 26 | if self.version_check\ 27 | and self.version != self.version_check: 28 | log.warn( 'Slabinfo header indicates' 29 | ' different schema version (expecting: {}): {}'\ 30 | .format(self.version_check, line) ) 31 | line = table.readline().strip().split() 32 | if line[0] != '#' or line[1] != 'name': 33 | log.error('Unexpected slabinfo format, not processing it') 34 | return 35 | headers = dict(name=0) 36 | for idx,header in enumerate(line[2:], 1): 37 | if header[0] == '<' and header[-1] == '>': headers[header[1:-1]] = idx 38 | pick = 'name', 'active_objs', 'objsize', 'pagesperslab', 'active_slabs', 'num_slabs' 39 | picker = op.itemgetter(*op.itemgetter(*pick)(headers)) 40 | record = namedtuple('slabinfo_record', ' '.join(pick)) 41 | self.parse_line = lambda line: record(*( (int(val) if idx else val) 42 | for idx,val in enumerate(picker(line.strip().split())) )) 43 | 44 | # http://elinux.org/Slab_allocator 45 | def read(self): 46 | parse_line, ps = self.parse_line, page_size 47 | with open('/proc/slabinfo', 'rb') as table: 48 | table.readline(), table.readline() # header 49 | for line in table: 50 | info = parse_line(line) 51 | for prefix in self.conf.include_prefixes: 52 | if info.name.startswith(prefix): break # force-include 53 | else: 54 | for prefix in self.conf.exclude_prefixes: 55 | if info.name.startswith(prefix): 56 | info = None 57 | break 58 | if info: 59 | vals = [ 60 | ('obj_active', info.active_objs * info.objsize), 61 | ('slab_active', info.active_slabs * info.pagesperslab * ps), 62 | ('slab_allocated', info.num_slabs * info.pagesperslab * ps) ] 63 | if self.conf.pass_zeroes or sum(it.imap(op.itemgetter(1), vals)) != 0: 64 | for val_name, val in vals: 65 | yield Datapoint( 'memory.slabs.{}.bytes_{}'\ 66 | .format(info.name, val_name), 'gauge', val, None ) 67 | 68 | 69 | collector = SlabInfo 70 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/memfrag.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from io import open 5 | import re 6 | 7 | from . 
import Collector, Datapoint, page_size_kb 8 | 9 | import logging 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | class MemFrag(Collector): 14 | 15 | def read( self, 16 | _re_buddyinfo=re.compile(r'^\s*Node\s+(?P\d+)' 17 | r',\s+zone\s+(?P\S+)\s+(?P.*)$'), 18 | _re_ptinfo=re.compile(r'^\s*Node\s+(?P\d+)' 19 | r',\s+zone\s+(?P\S+),\s+type\s+(?P\S+)\s+(?P.*)$') ): 20 | mmap, pskb = dict(), page_size_kb 21 | 22 | # /proc/buddyinfo 23 | with open('/proc/buddyinfo', 'rb') as table: 24 | for line in it.imap(bytes.strip, table): 25 | match = _re_buddyinfo.search(line) 26 | if not match: 27 | log.warn('Unrecognized line in /proc/buddyinfo, skipping: {!r}'.format(line)) 28 | continue 29 | node, zone = int(match.group('node')), match.group('zone').lower() 30 | counts = dict( ('{}k'.format(pskb*2**order),count) 31 | for order,count in enumerate(it.imap(int, match.group('counts').strip().split())) ) 32 | if node not in mmap: mmap[node] = dict() 33 | if zone not in mmap[node]: mmap[node][zone] = dict() 34 | mmap[node][zone]['available'] = counts 35 | 36 | # /proc/pagetypeinfo 37 | with open('/proc/pagetypeinfo', 'rb') as table: 38 | page_counts_found = False 39 | while True: 40 | line = table.readline() 41 | if not line: break 42 | elif 'Free pages count' not in line: 43 | while line.strip(): line = table.readline() 44 | continue 45 | elif page_counts_found: 46 | log.warn( 'More than one free pages' 47 | ' counters section found in /proc/pagetypeinfo' ) 48 | continue 49 | else: 50 | page_counts_found = True 51 | for line in it.imap(bytes.strip, table): 52 | if not line: break 53 | match = _re_ptinfo.search(line) 54 | if not match: 55 | log.warn( 'Unrecognized line' 56 | ' in /proc/pagetypeinfo, skipping: {!r}'.format(line) ) 57 | continue 58 | node, zone, mtype = int(match.group('node')),\ 59 | match.group('zone').lower(), match.group('mtype').lower() 60 | counts = dict( ('{}k'.format(pskb*2**order),count) 61 | for order,count in enumerate(it.imap(int, match.group('counts').strip().split())) ) 62 | if node not in mmap: mmap[node] = dict() 63 | if zone not in mmap[node]: mmap[node][zone] = dict() 64 | mmap[node][zone][mtype] = counts 65 | if not page_counts_found: 66 | log.warn('Failed to find free pages counters in /proc/pagetypeinfo') 67 | 68 | # Dispatch values from mmap 69 | for node,zones in mmap.viewitems(): 70 | for zone,mtypes in zones.viewitems(): 71 | for mtype,counts in mtypes.viewitems(): 72 | if sum(counts.viewvalues()) == 0: continue 73 | for size,count in counts.viewitems(): 74 | yield Datapoint( 'memory.fragmentation.{}'\ 75 | .format('.'.join(it.imap( bytes, 76 | ['node_{}'.format(node),zone,mtype,size] ))), 77 | 'gauge', count, None ) 78 | 79 | 80 | collector = MemFrag 81 | -------------------------------------------------------------------------------- /graphite_metrics/sinks/librato_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from time import time 5 | import types 6 | 7 | from requests.auth import HTTPBasicAuth 8 | import requests 9 | 10 | try: from simplejson import dumps 11 | except ImportError: from json import dumps 12 | 13 | from . import Sink 14 | 15 | import logging 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | class LibratoMetrics(Sink): 20 | 21 | '''Interface to a Librato Metrics API v1. Uses JSON Array format. 
22 | Relevant part of the docs: http://dev.librato.com/v1/post/metrics''' 23 | 24 | def __init__(self, *argz, **kwz): 25 | super(LibratoMetrics, self).__init__(*argz, **kwz) 26 | 27 | # Try to set reasonable defaults 28 | if self.conf.http_parameters.timeout is None: 29 | try: 30 | from . import cfg 31 | self.conf.http_parameters.timeout = cfg.loop.interval / 2 32 | except (ImportError, KeyError): self.conf.http_parameters.timeout = 30 33 | self.conf.http_parameters.auth = HTTPBasicAuth(*self.conf.http_parameters.auth) 34 | 35 | requests.defaults.keep_alive = True 36 | requests.defaults.max_retries = max(3, self.conf.http_parameters.timeout / 5) 37 | 38 | # Try to init concurrent (async) dispatcher 39 | self.send = lambda chunk, **kwz: requests.post(data=chunk, **kwz) 40 | if self.conf.chunk_data.enabled or self.conf.chunk_data.enabled is None: 41 | try: from requests import async 42 | except RuntimeError as err: 43 | if self.conf.chunk_data.enabled: raise 44 | else: 45 | log.warn(( 'Failed to initialize requests.async' 46 | ' engine (gevent module missing?): {}, concurrent' 47 | ' (chunked) measurements submission will be disabled' ).format(err)) 48 | self.conf.chunk_data.enabled = False 49 | else: 50 | self.conf.chunk_data.enabled = True 51 | if not self.conf.chunk_data.max_concurrent_requests\ 52 | or self.conf.chunk_data.max_concurrent_requests <= 0: 53 | self.conf.chunk_data.max_concurrent_requests = None 54 | self.send = lambda *chunks, **kwz:\ 55 | map( op.methodcaller('raise_for_status'), 56 | async.map( 57 | list(async.post(data=chunk, **kwz) for chunk in chunks), 58 | size=self.conf.chunk_data.max_concurrent_requests ) ) 59 | 60 | def measurement(self, name, value, ts_dp=None): 61 | measurement = dict() 62 | if self.conf.source_from_prefix: 63 | measurement['source'], name = name.split('.', 1) 64 | elif self.conf.source: measurement['source'] = self.conf.source 65 | if ts_dp: measurement['measure_time'] = ts_dp 66 | measurement.update(name=name, value=value) 67 | return measurement 68 | 69 | def dispatch(self, *tuples): 70 | data = dict() 71 | if self.conf.unified_measure_time: 72 | data['measure_time'] = int(time()) 73 | tuples = list((name, value, None) for name, value, ts_dp in tuples) 74 | if self.conf.chunk_data.enabled\ 75 | and len(tuples) > self.conf.chunk_data.max_chunk_size: 76 | chunks, n = list(), 0 77 | while n < len(tuples): 78 | n_to = n + self.conf.chunk_data.max_chunk_size 79 | chunk = data.copy() 80 | chunk['gauges'] = list(it.starmap(self.measurement, tuples[n:n_to])) 81 | chunks.append(chunk) 82 | n = n_to 83 | log.debug(( 'Splitting {} measurements' 84 | ' into {} concurrent requests' ).format(len(tuples), len(chunks))) 85 | data = map(dumps, chunks) 86 | del tuples, chunk, chunks # to gc ram from this corpus of data 87 | else: # single chunk 88 | data['gauges'] = list(it.starmap(self.measurement, tuples)) 89 | data = [dumps(data)] 90 | del tuples 91 | self.send(*data, headers={ 92 | 'content-type': 'application/json' }, **self.conf.http_parameters) 93 | 94 | 95 | sink = LibratoMetrics 96 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from collections import namedtuple 5 | from glob import iglob 6 | from time import time 7 | import os 8 | 9 | import logging 10 | log = logging.getLogger(__name__) 11 | 12 | 13 
| page_size = os.sysconf('SC_PAGE_SIZE') 14 | page_size_kb = page_size // 1024 15 | user_hz = os.sysconf('SC_CLK_TCK') 16 | sector_bytes = 512 17 | 18 | # Global configuration for harvestd, 19 | # intended to be set before initializing collectors, 20 | # but should not be really relied upon - can be empty. 21 | cfg = dict() 22 | 23 | 24 | def rate_limit(max_interval=20, sampling=3, f=lambda x: x): 25 | '''x rises by 1 from 0 on each iteraton, back to 0 on triggering. 26 | f(x) should rise up to f(max_interval) in some way (with default 27 | "f(x)=x" probability rises lineary with 100% chance on "x=max_interval"). 28 | "sampling" affect probablility in an "c=1-(1-c0)*(1-c1)*...*(1-cx)" exponential way.''' 29 | from random import random 30 | val = 0 31 | val_max = float(f(max_interval)) 32 | while True: 33 | if val % sampling == 0: 34 | trigger = random() > (val_max - f(val)) / val_max 35 | if trigger: val = 0 36 | yield trigger 37 | else: yield False 38 | val += 1 39 | 40 | 41 | def dev_resolve( major, minor, 42 | log_fails=True, _cache = dict(), _cache_time=600 ): 43 | ts_now, dev_cached = time(), False 44 | while True: 45 | if not _cache: ts = 0 46 | else: 47 | dev = major, minor 48 | dev_cached, ts = (None, _cache[None])\ 49 | if dev not in _cache else _cache[dev] 50 | # Update cache, if necessary 51 | if ts_now > ts + _cache_time or dev_cached is False: 52 | _cache.clear() 53 | for link in it.chain(iglob('/dev/mapper/*'), iglob('/dev/sd*'), iglob('/dev/xvd*')): 54 | link_name = os.path.basename(link) 55 | try: link_dev = os.stat(link).st_rdev 56 | except OSError: continue # EPERM, EINVAL 57 | _cache[(os.major(link_dev), os.minor(link_dev))] = link_name, ts_now 58 | _cache[None] = ts_now 59 | continue # ...and try again 60 | if dev_cached: dev_cached = dev_cached.replace('.', '_') 61 | elif log_fails: 62 | log.warn( 'Unable to resolve device' 63 | ' from major/minor numbers: {}:{}'.format(major, minor) ) 64 | return dev_cached or None 65 | 66 | 67 | class Collector(object): 68 | 69 | def __init__(self, conf): 70 | self.conf = conf 71 | 72 | def read(self): 73 | raise NotImplementedError( 'Collector.read method should be' 74 | ' overidden in collector subclasses to return list of Datapoint objects.' ) 75 | # return [Datapoint(...), Datapoint(...), ...] 
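# Editor's note: the sketch below is an illustrative addition, not part of the
# original module. It shows the minimal shape of a collector plugin built on the
# Collector/Datapoint API above - subclass Collector, override read() to yield
# Datapoint(name, type, value, ts) entries (type 'gauge' or 'counter', ts=None to
# use the poll timestamp), and, in a real plugin module, expose the class as a
# module-level "collector" attribute for the harvestd entry-point loader. The
# class and metric names here are made up for the example.

class UptimeExample(Collector):

	def read(self):
		# First field of /proc/uptime is system uptime in seconds
		with open('/proc/uptime', 'rb') as src:
			yield Datapoint('uptime', 'gauge', float(src.read().split()[0]), None)

# In a separate plugin module this would be followed by: collector = UptimeExample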
76 | 77 | 78 | class Datapoint(namedtuple('Value', 'name type value ts')): 79 | 80 | # These are globals 81 | _counter_cache = dict() 82 | _counter_cache_check_ts = 0 83 | _counter_cache_check_timeout = 12 * 3600 # 12h 84 | _counter_cache_check_count = 4 # cleanup will trigger every timeout/count period 85 | 86 | @classmethod 87 | def _counter_cache_cleanup(cls, ts_min): 88 | cleanup_list = list( k for k,(v,ts_chk) in 89 | cls._counter_cache.viewitems() if ts_min > ts_chk ) 90 | log.debug('Counter cache cleanup: {} buckets'.format(len(cleanup_list))) 91 | for k in cleanup_list: del cls._counter_cache[k] 92 | 93 | def get(self, ts=None, prefix=None): 94 | ts = self.ts or ts or time() 95 | if ts > Datapoint._counter_cache_check_ts: 96 | Datapoint._counter_cache_cleanup( 97 | ts - Datapoint._counter_cache_check_timeout ) 98 | Datapoint._counter_cache_check_ts = ts\ 99 | + Datapoint._counter_cache_check_timeout\ 100 | / Datapoint._counter_cache_check_count 101 | if self.type == 'counter': 102 | if self.name not in Datapoint._counter_cache: 103 | log.debug('Initializing bucket for new counter: {}'.format(self.name)) 104 | Datapoint._counter_cache[self.name] = self.value, ts 105 | return None 106 | v0, ts0 = Datapoint._counter_cache[self.name] 107 | if ts == ts0: 108 | log.warn('Double-poll of a counter for {!r}'.format(self.name)) 109 | return None 110 | value = float(self.value - v0) / (ts - ts0) 111 | Datapoint._counter_cache[self.name] = self.value, ts 112 | if value < 0: 113 | # TODO: handle overflows properly, w/ limits 114 | log.debug( 'Detected counter overflow' 115 | ' (negative delta): {}, {} -> {}'.format(self.name, v0, self.value) ) 116 | return None 117 | elif self.type == 'gauge': value = self.value 118 | else: raise TypeError('Unknown type: {}'.format(self.type)) 119 | name = self.name if not prefix else '{}.{}'.format(prefix, self.name) 120 | return name, value, int(ts) 121 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/iptables_counts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from subprocess import Popen, PIPE 5 | from collections import namedtuple, defaultdict 6 | from io import open 7 | import os, errno 8 | 9 | from . 
import Collector, Datapoint 10 | 11 | import logging 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | class IPTables(Collector): 16 | 17 | iptables = dict(ipv4='iptables-save', ipv6='ip6tables-save') # binaries 18 | metric_units = metric_tpl = None 19 | 20 | def __init__(self, *argz, **kwz): 21 | super(IPTables, self).__init__(*argz, **kwz) 22 | 23 | if not self.conf.rule_metrics_path.ipv4\ 24 | and not self.conf.rule_metrics_path.ipv6: 25 | log.info('No paths for rule_metrics_path specified, disabling collector') 26 | self.conf.enabled = False 27 | 28 | assert self.conf.units in ['pkt', 'bytes', 'both', 'both_flat'] 29 | if self.conf.units.startswith('both'): 30 | self.metric_units = ['pkt', 'bytes'] 31 | self.metric_tpl = '{}.{}' if self.conf.units == 'both' else '{}_{}' 32 | else: self.metric_units, self.metric_tpl = self.conf.units, '{}' 33 | 34 | 35 | _rule_metrics = namedtuple('RuleMetrics', 'table path mtime') 36 | _rule_metrics_cache = dict() 37 | 38 | @property 39 | def rule_metrics(self): 40 | rule_metrics = dict() 41 | for v in 'ipv4', 'ipv6': 42 | path = self.conf.rule_metrics_path[v] 43 | try: 44 | if not path: raise OSError() 45 | mtime = os.stat(path).st_mtime 46 | except (OSError, IOError) as err: 47 | if err.args and err.errno != errno.ENOENT: raise # to raise EPERM, EACCES and such 48 | self._rule_metrics_cache[v] = None 49 | continue 50 | cache = self._rule_metrics_cache.get(v) 51 | if not cache or path != cache.path or mtime != cache.mtime: 52 | log.debug('Detected rule_metrics file update: {} (cached: {})'.format(path, cache)) 53 | metrics_table = dict() 54 | with open(path, 'rb') as src: 55 | for line in it.imap(op.methodcaller('strip'), src): 56 | if not line: continue 57 | table, chain, rule, metric = line.split(None, 3) 58 | metrics_table[table, chain, int(rule)] = metric 59 | cache = self._rule_metrics_cache[v]\ 60 | = self._rule_metrics(metrics_table, path, mtime) 61 | rule_metrics[v] = cache 62 | return rule_metrics 63 | 64 | 65 | _table_hash = dict() 66 | 67 | def read(self): 68 | metric_counts = dict() 69 | hashes = defaultdict(lambda: defaultdict(list)) 70 | 71 | for v, metrics in self.rule_metrics.viewitems(): 72 | if not metrics: continue 73 | 74 | # Used to detect rule changes 75 | try: 76 | hash_old, metrics_old, warnings = self._table_hash[v] 77 | if metrics is not metrics_old: raise KeyError 78 | except KeyError: hash_old, warnings = None, dict() 79 | hash_new = hashes[v] 80 | 81 | # iptables-save invocation and output processing loop 82 | proc = Popen([self.iptables[v], '-c'], stdout=PIPE) 83 | chain_counts = defaultdict(int) 84 | for line in it.imap(op.methodcaller('strip'), proc.stdout): 85 | if line[0] != '[': # chain/table spec or comment 86 | if line[0] == '*': table = line[1:] 87 | continue 88 | counts, append, chain, rule = line.split(None, 3) 89 | assert append == '-A' 90 | 91 | rule_key = table, chain 92 | chain_counts[rule_key] += 1 # iptables rules are 1-indexed 93 | chain_count = chain_counts[rule_key] 94 | # log.debug('{}, Rule: {}'.format([table, chain, chain_count], rule)) 95 | hash_new[rule_key].append(rule) # but py lists are 0-indexed 96 | try: metric = metrics.table[table, chain, chain_count] 97 | except KeyError: continue # no point checking rules w/o metrics attached 98 | # log.debug('Metric: {} ({}), rule: {}'.format( 99 | # metric, [table, chain, chain_count], rule )) 100 | 101 | # Check for changed rules 102 | try: rule_chk = hash_old and hash_old[rule_key][chain_count - 1] 103 | except (KeyError, IndexError): rule_chk = 
None 104 | if hash_old and rule_chk != rule: 105 | if chain_count not in warnings: 106 | log.warn( 107 | ( 'Detected changed netfilter rule (chain: {}, pos: {})' 108 | ' without corresponding rule_metrics file update: {}' )\ 109 | .format(chain, chain_count, rule) ) 110 | warnings[chain_count] = True 111 | if self.conf.discard_changed_rules: continue 112 | 113 | counts = map(int, counts.strip('[]').split(':', 1)) 114 | try: 115 | metric_counts[metric] = list(it.starmap( 116 | op.add, it.izip(metric_counts[metric], counts) )) 117 | except KeyError: metric_counts[metric] = counts 118 | proc.wait() 119 | 120 | # Detect if there are any changes in the table, 121 | # possibly messing the metrics, even if corresponding rules are the same 122 | hash_new = dict( (rule_key, tuple(rules)) 123 | for rule_key, rules in hash_new.viewitems() ) 124 | if hash_old\ 125 | and frozenset(hash_old.viewitems()) != frozenset(hash_new.viewitems()): 126 | log.warn('Detected iptables changes without changes to rule_metrics file') 127 | hash_old = None 128 | if not hash_old: self._table_hash[v] = hash_new, metrics, dict() 129 | 130 | # Dispatch collected metrics 131 | for metric, counts in metric_counts.viewitems(): 132 | for unit, count in it.izip(['pkt', 'bytes'], counts): 133 | if unit not in self.metric_units: continue 134 | yield Datapoint(self.metric_tpl.format(metric, unit), 'counter', count, None) 135 | 136 | 137 | collector = IPTables 138 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/cron_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | import re, iso8601, calendar 5 | 6 | from . 
import Collector, Datapoint 7 | 8 | import logging 9 | log = logging.getLogger(__name__) 10 | 11 | 12 | def file_follow( src, open_tail=True, 13 | read_interval_min=0.1, 14 | read_interval_max=20, read_interval_mul=1.1, 15 | rotation_check_interval=20, yield_file=False, **open_kwz ): 16 | from time import time, sleep 17 | from io import open 18 | import os, types 19 | 20 | open_tail = open_tail and isinstance(src, types.StringTypes) 21 | src_open = lambda: open(path, mode='rb', **open_kwz) 22 | stat = lambda f: (os.fstat(f) if isinstance(f, int) else os.stat(f)) 23 | sanity_chk_stats = lambda stat: (stat.st_ino, stat.st_dev) 24 | sanity_chk_ts = lambda ts=None: (ts or time()) + rotation_check_interval 25 | 26 | if isinstance(src, types.StringTypes): src, path = None, src 27 | else: 28 | path = src.name 29 | src_inode, src_inode_ts =\ 30 | sanity_chk_stats(stat(src.fileno())), sanity_chk_ts() 31 | line, read_chk = '', read_interval_min 32 | 33 | while True: 34 | 35 | if not src: # (re)open 36 | src = src_open() 37 | if open_tail: 38 | src.seek(0, os.SEEK_END) 39 | open_tail = False 40 | src_inode, src_inode_ts =\ 41 | sanity_chk_stats(stat(src.fileno())), sanity_chk_ts() 42 | src_inode_chk = None 43 | 44 | ts = time() 45 | if ts > src_inode_ts: # rotation check 46 | src_inode_chk, src_inode_ts =\ 47 | sanity_chk_stats(stat(path)), sanity_chk_ts(ts) 48 | if stat(src.fileno()).st_size < src.tell(): src.seek(0) # truncated 49 | else: src_inode_chk = None 50 | 51 | buff = src.readline() 52 | if not buff: # eof 53 | if src_inode_chk and src_inode_chk != src_inode: # rotated 54 | src.close() 55 | src, line = None, '' 56 | continue 57 | if read_chk is None: 58 | yield (buff if not yield_file else (buff, src)) 59 | else: 60 | sleep(read_chk) 61 | read_chk *= read_interval_mul 62 | if read_chk > read_interval_max: 63 | read_chk = read_interval_max 64 | else: 65 | line += buff 66 | read_chk = read_interval_min 67 | 68 | if line and line[-1] == '\n': # complete line 69 | try: 70 | val = yield (line if not yield_file else (line, src)) 71 | if val is not None: raise KeyboardInterrupt 72 | except KeyboardInterrupt: break 73 | line = '' 74 | 75 | src.close() 76 | 77 | 78 | def file_follow_durable( path, 79 | min_dump_interval=10, 80 | xattr_name='user.collectd.logtail.pos', xattr_update=True, 81 | **follow_kwz ): 82 | '''Records log position into xattrs after reading line every 83 | min_dump_interval seconds. 
84 | Checksum of the last line 85 | is also recorded (so the line itself doesn't have to fit into xattr) to make sure 86 | file wasn't truncated between last xattr dump and re-open.''' 87 | 88 | from xattr import xattr 89 | from io import open 90 | from hashlib import sha1 91 | from time import time 92 | import struct 93 | 94 | # Try to restore position 95 | src = open(path, mode='rb') 96 | src_xattr = xattr(src) 97 | try: 98 | if not xattr_name: raise KeyError 99 | pos = src_xattr[xattr_name] 100 | except KeyError: pos = None 101 | if pos: 102 | data_len = struct.calcsize('=I') 103 | (pos,), chksum = struct.unpack('=I', pos[:data_len]), pos[data_len:] 104 | (data_len,), chksum = struct.unpack('=I', chksum[:data_len]), chksum[data_len:] 105 | try: 106 | src.seek(pos - data_len) 107 | if sha1(src.read(data_len)).digest() != chksum: 108 | raise IOError('Last log line does not match checksum') 109 | except (OSError, IOError) as err: 110 | log.warn('Failed to restore log position: {}'.format(err)) 111 | src.seek(0) 112 | tailer = file_follow(src, yield_file=True, **follow_kwz) 113 | 114 | # ...and keep it updated 115 | pos_dump_ts_get = lambda ts=None: (ts or time()) + min_dump_interval 116 | pos_dump_ts = pos_dump_ts_get() 117 | while True: 118 | line, src_chk = next(tailer) 119 | if not line: pos_dump_ts = 0 # force-write xattr 120 | ts = time() 121 | if ts > pos_dump_ts: 122 | if src is not src_chk: 123 | src, src_xattr = src_chk, xattr(src_chk) 124 | pos_new = src.tell() 125 | if pos != pos_new: 126 | pos = pos_new 127 | if xattr_update: 128 | src_xattr[xattr_name] =\ 129 | struct.pack('=I', pos)\ 130 | + struct.pack('=I', len(line))\ 131 | + sha1(line).digest() 132 | pos_dump_ts = pos_dump_ts_get(ts) 133 | if (yield line.decode('utf-8', 'replace')): 134 | tailer.send(StopIteration) 135 | break 136 | 137 | 138 | class CronJobs(Collector): 139 | 140 | lines, aliases = dict(), list() 141 | 142 | def __init__(self, *argz, **kwz): 143 | super(CronJobs, self).__init__(*argz, **kwz) 144 | 145 | try: 146 | src, self.lines, self.aliases =\ 147 | op.attrgetter('source', 'lines', 'aliases')(self.conf) 148 | if not (src and self.lines and self.aliases): raise KeyError() 149 | except KeyError as err: 150 | if err.args: 151 | log.error('Failed to get required config parameter "{}"'.format(err.args[0])) 152 | else: 153 | log.warn( 'Collector requires all of "source",' 154 | ' "lines" and "aliases" specified to work properly' ) 155 | self.conf.enabled = False 156 | return 157 | 158 | for k,v in self.lines.viewitems(): self.lines[k] = re.compile(v) 159 | for idx,(k,v) in enumerate(self.aliases): self.aliases[idx] = k, re.compile(v) 160 | self.log_tailer = file_follow_durable( src, read_interval_min=None, 161 | xattr_name=self.conf.xattr_name, xattr_update=not self.conf.debug.dry_run ) 162 | 163 | def read(self, _re_sanitize=re.compile('\s+|-')): 164 | # Cron 165 | if self.log_tailer: 166 | for line in iter(self.log_tailer.next, u''): 167 | # log.debug('LINE: {!r}'.format(line)) 168 | ts, line = line.strip().split(None, 1) 169 | ts = calendar.timegm(iso8601.parse_date(ts).utctimetuple()) 170 | matched = False 171 | for ev, regex in self.lines.viewitems(): 172 | if not regex: continue 173 | match = regex.search(line) 174 | if match: 175 | matched = True 176 | job = match.group('job') 177 | for alias, regex in self.aliases: 178 | group = alias[1:] if alias.startswith('_') else None 179 | alias_match = regex.search(job) 180 | if alias_match: 181 | if group is not None: 182 | job =
_re_sanitize.sub('_', alias_match.group(group)) 183 | else: job = alias 184 | break 185 | else: 186 | log.warn('No alias for cron job: {!r}, skipping'.format(line)) 187 | continue 188 | try: value = float(match.group('val')) 189 | except IndexError: value = 1 190 | # log.debug('TS: {}, EV: {}, JOB: {}'.format(ts, ev, job)) 191 | yield Datapoint('cron.tasks.{}.{}'.format(job, ev), 'gauge', value, ts) 192 | if not matched: 193 | log.debug('Failed to match line: {!r}'.format(line)) 194 | 195 | 196 | collector = CronJobs 197 | -------------------------------------------------------------------------------- /graphite_metrics/harvestd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import itertools as it, operator as op, functools as ft 5 | from lya import AttrDict, configure_logging 6 | from collections import OrderedDict 7 | import os, sys 8 | 9 | 10 | def main(): 11 | import argparse 12 | parser = argparse.ArgumentParser( 13 | description='Collect and dispatch various metrics to destinations.') 14 | parser.add_argument('-t', '--destination', metavar='host[:port]', 15 | help='host[:port] (default port: 2003, can be overidden' 16 | ' via config file) of sink destination endpoint (e.g. carbon' 17 | ' linereceiver tcp port, by default).') 18 | parser.add_argument('-i', '--interval', type=int, metavar='seconds', 19 | help='Interval between collecting and sending the datapoints.') 20 | 21 | parser.add_argument('-e', '--collector-enable', 22 | action='append', metavar='collector', default=list(), 23 | help='Enable only the specified metric collectors,' 24 | ' can be specified multiple times.') 25 | parser.add_argument('-d', '--collector-disable', 26 | action='append', metavar='collector', default=list(), 27 | help='Explicitly disable specified metric collectors,' 28 | ' can be specified multiple times. Overrides --collector-enable.') 29 | 30 | parser.add_argument('-s', '--sink-enable', 31 | action='append', metavar='sink', default=list(), 32 | help='Enable only the specified datapoint sinks,' 33 | ' can be specified multiple times.') 34 | parser.add_argument('-x', '--sink-disable', 35 | action='append', metavar='sink', default=list(), 36 | help='Explicitly disable specified datapoint sinks,' 37 | ' can be specified multiple times. Overrides --sink-enable.') 38 | 39 | parser.add_argument('-p', '--processor-enable', 40 | action='append', metavar='processor', default=list(), 41 | help='Enable only the specified datapoint processors,' 42 | ' can be specified multiple times.') 43 | parser.add_argument('-z', '--processor-disable', 44 | action='append', metavar='processor', default=list(), 45 | help='Explicitly disable specified datapoint processors,' 46 | ' can be specified multiple times. Overrides --processor-enable.') 47 | 48 | parser.add_argument('-c', '--config', 49 | action='append', metavar='path', default=list(), 50 | help='Configuration files to process.' 51 | ' Can be specified more than once.' 52 | ' Values from the latter ones override values in the former.' 
53 | ' Available CLI options override the values in any config.') 54 | 55 | parser.add_argument('-a', '--xattr-emulation', metavar='db-path', 56 | help='Emulate filesystem extended attributes (used in' 57 | ' some collectors like sysstat or cron_log), storing per-path' 58 | ' data in a simple shelve db.') 59 | parser.add_argument('-n', '--dry-run', 60 | action='store_true', help='Do not actually send data.') 61 | parser.add_argument('--debug-memleaks', action='store_true', 62 | help='Import guppy and enable its manhole to debug memleaks (requires guppy module).') 63 | parser.add_argument('--debug', 64 | action='store_true', help='Verbose operation mode.') 65 | optz = parser.parse_args() 66 | 67 | # Read configuration files 68 | cfg = AttrDict.from_yaml('{}.yaml'.format( 69 | os.path.splitext(os.path.realpath(__file__))[0] )) 70 | for k in optz.config: cfg.update_yaml(k) 71 | 72 | # Logging 73 | import logging 74 | configure_logging( cfg.logging, 75 | logging.DEBUG if optz.debug else logging.WARNING ) 76 | if not cfg.logging.tracebacks: 77 | class NoTBLogger(logging.Logger): 78 | def exception(self, *argz, **kwz): self.error(*argz, **kwz) 79 | logging.setLoggerClass(NoTBLogger) 80 | log = logging.getLogger(__name__) 81 | 82 | # Manholes 83 | if optz.debug_memleaks: 84 | import guppy 85 | from guppy.heapy import Remote 86 | Remote.on() 87 | 88 | # Fill "auto-detected" blanks in the configuration, CLI overrides 89 | try: 90 | if optz.destination: cfg.sinks._default.host = optz.destination 91 | cfg.sinks._default.host = cfg.sinks._default.host.rsplit(':', 1) 92 | if len(cfg.sinks._default.host) == 1: 93 | cfg.sinks._default.host =\ 94 | cfg.sinks._default.host[0], cfg.sinks._default.default_port 95 | else: cfg.sinks._default.host[1] = int(cfg.sinks._default.host[1]) 96 | except KeyError: pass 97 | if optz.interval: cfg.loop.interval = optz.interval 98 | if optz.dry_run: cfg.debug.dry_run = optz.dry_run 99 | if optz.xattr_emulation: cfg.core.xattr_emulation = optz.xattr_emulation 100 | 101 | # Fake "xattr" module, if requested 102 | if cfg.core.xattr_emulation: 103 | import shelve 104 | xattr_db = shelve.open(cfg.core.xattr_emulation, 'c') 105 | class xattr_path(object): 106 | def __init__(self, base): 107 | assert isinstance(base, str) 108 | self.base = base 109 | def key(self, k): return '{}\0{}'.format(self.base, k) 110 | def __setitem__(self, k, v): xattr_db[self.key(k)] = v 111 | def __getitem__(self, k): return xattr_db[self.key(k)] 112 | def __del__(self): xattr_db.sync() 113 | class xattr_module(object): xattr = xattr_path 114 | sys.modules['xattr'] = xattr_module 115 | 116 | # Override "enabled" collector/sink parameters, based on CLI 117 | ep_conf = dict() 118 | for ep, enabled, disabled in\ 119 | [ ('collectors', optz.collector_enable, optz.collector_disable), 120 | ('processors', optz.processor_enable, optz.processor_disable), 121 | ('sinks', optz.sink_enable, optz.sink_disable) ]: 122 | conf = cfg[ep] 123 | conf_base = conf.pop('_default') 124 | if 'debug' not in conf_base: conf_base['debug'] = cfg.debug 125 | ep_conf[ep] = conf_base, conf, OrderedDict(), enabled, disabled 126 | 127 | # Init global cfg for collectors/sinks' usage 128 | from graphite_metrics import collectors, sinks, loops 129 | collectors.cfg = sinks.cfg = loops.cfg = cfg 130 | 131 | # Init pluggable components 132 | import pkg_resources 133 | 134 | for ep_type in 'collector', 'processor', 'sink': 135 | ep_key = '{}s'.format(ep_type) # a bit of a hack 136 | conf_base, conf, objects, enabled, disabled = 
ep_conf[ep_key] 137 | ep_dict = dict( (ep.name, ep) for ep in 138 | pkg_resources.iter_entry_points('graphite_metrics.{}'.format(ep_key)) ) 139 | eps = OrderedDict( 140 | (name, (ep_dict.pop(name), subconf or AttrDict())) 141 | for name, subconf in conf.viewitems() if name in ep_dict ) 142 | eps.update( (name, (module, conf_base)) 143 | for name, module in ep_dict.viewitems() ) 144 | for ep_name, (ep_module, subconf) in eps.viewitems(): 145 | if ep_name[0] == '_': 146 | log.debug( 'Skipping {} enty point,' 147 | ' prefixed by underscore: {}'.format(ep_type, ep_name) ) 148 | subconf.rebase(conf_base) # fill in "_default" collector parameters 149 | if enabled: 150 | if ep_name in enabled: subconf['enabled'] = True 151 | else: subconf['enabled'] = False 152 | if disabled and ep_name in disabled: subconf['enabled'] = False 153 | if subconf.get('enabled', True): 154 | log.debug('Loading {}: {}'.format(ep_type, ep_name)) 155 | try: obj = getattr(ep_module.load(), ep_type)(subconf) 156 | except Exception as err: 157 | log.exception('Failed to load/init {} ({}): {}'.format(ep_type, ep_name, err)) 158 | subconf.enabled = False 159 | obj = None 160 | if subconf.get('enabled', True): objects[ep_name] = obj 161 | else: 162 | log.debug(( '{} {} (entry point: {})' 163 | ' was disabled after init' ).format(ep_type.title(), obj, ep_name)) 164 | if ep_type != 'processor' and not objects: 165 | log.fatal('No {}s were properly enabled/loaded, bailing out'.format(ep_type)) 166 | sys.exit(1) 167 | log.debug('{}: {}'.format(ep_key.title(), objects)) 168 | 169 | loop = dict( (ep.name, ep) for ep in 170 | pkg_resources.iter_entry_points('graphite_metrics.loops') ) 171 | conf = AttrDict(**cfg.loop) 172 | if 'debug' not in conf: conf.debug = cfg.debug 173 | loop = loop[cfg.loop.name].load().loop(conf) 174 | 175 | collectors, processors, sinks = it.imap( op.itemgetter(2), 176 | op.itemgetter('collectors', 'processors', 'sinks')(ep_conf) ) 177 | log.debug( 178 | 'Starting main loop: {} ({} collectors, {} processors, {} sinks)'\ 179 | .format(loop, len(collectors), len(processors), len(sinks)) ) 180 | loop.start(collectors, processors, sinks) 181 | 182 | if __name__ == '__main__': main() 183 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/_ping.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from __future__ import print_function 4 | 5 | import itertools as it, operator as op, functools as ft 6 | from contextlib import closing 7 | from select import epoll, EPOLLIN, EPOLLOUT 8 | from time import time, sleep 9 | import os, sys, socket, struct, random, signal, re, logging 10 | 11 | 12 | class LinkError(Exception): pass 13 | 14 | class Pinger(object): 15 | 16 | @staticmethod 17 | def calculate_checksum(src): 18 | shift, src = sys.byteorder != 'little', bytearray(src) 19 | chksum = 0 20 | for c in src: 21 | chksum += (c << 8) if shift else c 22 | shift = not shift 23 | chksum = (chksum & 0xffff) + (chksum >> 16) 24 | chksum += chksum >> 16 25 | chksum = ~chksum & 0xffff 26 | return struct.pack('!H', socket.htons(chksum)) 27 | 28 | 29 | def resolve(self, host, family=0, socktype=0, proto=0, flags=0): 30 | try: f, host = host.split(':', 1) 31 | except ValueError: pass 32 | else: 33 | assert f in ['v4', 'v6'], f 34 | if f == 'v4': 35 | family, sock = socket.AF_INET, self.ipv4 36 | elif f == 'v6': 37 | family, sock = socket.AF_INET6, self.ipv6 38 | match = 
re.search(r'^\[([0-9:a-fA-F]+)\]$', host) 39 | if match: host = match.group(1) 40 | addrs = set( addrinfo[-1] for addrinfo in 41 | socket.getaddrinfo(host, 0, family, socktype, proto, flags) ) 42 | return sock, random.choice(list(addrs)) 43 | 44 | def test_link(self, addrinfo, ping_id=0xffff, seq=0): 45 | 'Test if it is possible to send packets out at all (i.e. link is not down).' 46 | try: self.pkt_send(addrinfo, ping_id, seq) 47 | except IOError as err: raise LinkError(str(err)) 48 | 49 | def pkt_send(self, addrinfo, ping_id, seq): 50 | sock, addr = addrinfo 51 | if sock is self.ipv4: icmp_type = 0x08 52 | elif sock is self.ipv6: icmp_type = 0x80 53 | else: raise ValueError(sock) 54 | ts = time() 55 | ts_secs = int(ts) 56 | ts_usecs = int((ts - ts_secs) * 1e6) 57 | # Timestamp is packed in wireshark-friendly format 58 | # Using time.clock() would probably be better here, 59 | # as it should work better with time corrections (by e.g. ntpd) 60 | pkt = bytearray(struct.pack( '!BBHHHII', 61 | icmp_type, 0, 0, ping_id, seq, ts_secs, ts_usecs )) 62 | pkt[2:4] = self.calculate_checksum(pkt) 63 | sock.sendto(bytes(pkt), addr) 64 | 65 | def pkt_recv(self, sock): 66 | # None gets returned in cases when we get whatever other icmp thing 67 | pkt, src = sock.recvfrom(2048) 68 | if sock is self.ipv4: start = 20 69 | elif sock is self.ipv6: start = 0 70 | else: raise ValueError(sock) 71 | try: pkt = struct.unpack('!BBHHHII', pkt[start:start + 16]) 72 | except struct.error: return 73 | if sock is self.ipv4 and (pkt[0] != 0 or pkt[1] != 0): return 74 | elif sock is self.ipv6 and (pkt[0] != 0x81 or pkt[1] != 0): return 75 | return src[0], pkt[3], pkt[4], pkt[5] + (pkt[6] / 1e6) # addr, ping_id, seq, ts 76 | 77 | 78 | def start(self, *args, **kws): 79 | with\ 80 | closing(socket.socket( socket.AF_INET, 81 | socket.SOCK_RAW, socket.getprotobyname('icmp') )) as self.ipv4,\ 82 | closing(socket.socket( socket.AF_INET6, 83 | socket.SOCK_RAW, socket.getprotobyname('ipv6-icmp') )) as self.ipv6: 84 | return self._start(*args, **kws) 85 | 86 | def _start( self, host_specs, interval, 87 | resolve_no_reply, resolve_fixed, ewma_factor, ping_pid, log=None, 88 | warn_tries=5, warn_repeat=None, warn_delay_k=5, warn_delay_min=5 ): 89 | ts = time() 90 | seq_gen = it.chain.from_iterable(it.imap(xrange, it.repeat(2**15))) 91 | resolve_fixed_deadline = ts + resolve_fixed 92 | resolve_retry = dict() 93 | self.discard_rtts = False 94 | if not log: log = logging.getLogger(__name__) 95 | 96 | ### First resolve all hosts, waiting for it, if necessary 97 | hosts, host_ids = dict(), dict() 98 | for host in host_specs: 99 | while True: 100 | ping_id = random.randint(0, 0xfffe) 101 | if ping_id not in host_ids: break 102 | warn = warn_ts = 0 103 | while True: 104 | try: 105 | addrinfo = self.resolve(host) 106 | self.test_link(addrinfo) 107 | 108 | except (socket.gaierror, socket.error, LinkError) as err: 109 | ts = time() 110 | if warn < warn_tries: 111 | warn_force, warn_chk = False, True 112 | else: 113 | warn_force, warn_chk = True, warn_repeat\ 114 | and (warn_repeat is True or ts - warn_ts > warn_repeat) 115 | if warn_chk: warn_ts = ts 116 | err_info = type(err).__name__ 117 | if str(err): err_info += ': {}'.format(err) 118 | (log.warn if warn_chk else log.info)\ 119 | ( '{}Unable to resolve/send-to name spec: {} ({})'\ 120 | .format('' if not warn_force else '(STILL) ', host, err_info) ) 121 | warn += 1 122 | if warn_repeat is not True and warn == warn_tries: 123 | log.warn( 'Disabling name-resolver/link-test warnings (failures: 
{},' 124 | ' name spec: {}) until next successful attempt'.format(warn, host) ) 125 | sleep(max(interval / float(warn_delay_k), warn_delay_min)) 126 | 127 | else: 128 | hosts[host] = host_ids[ping_id] = dict( 129 | ping_id=ping_id, addrinfo=addrinfo, 130 | last_reply=0, rtt=0, sent=0, recv=0 ) 131 | if warn >= warn_tries: 132 | log.warn('Was able to resolve host spec: {} (attempts: {})'.format(host, warn)) 133 | break 134 | 135 | ### Handler to emit results on-demand 136 | def dump(sig, frm): 137 | self.discard_rtts = True # make sure results won't be tainted by this delay 138 | ts = time() 139 | try: 140 | for spec, host in hosts.viewitems(): 141 | sys.stdout.write('{} {:.10f} {:.10f} {:010d}\n'.format( 142 | spec, ts - host['last_reply'], host['rtt'], 143 | max(host['sent'] - host['recv'] - 1, 0) )) # 1 pkt can be in-transit 144 | if host['sent'] > 2**30: host['sent'] = host['recv'] = 0 145 | sys.stdout.write('\n') 146 | sys.stdout.flush() 147 | except IOError: sys.exit() 148 | signal.signal(signal.SIGQUIT, dump) 149 | 150 | ### Actual ping-loop 151 | poller, sockets = epoll(), dict() 152 | for sock in self.ipv4, self.ipv6: 153 | sockets[sock.fileno()] = sock 154 | poller.register(sock, EPOLLIN) 155 | sys.stdout.write('\n') 156 | sys.stdout.flush() 157 | 158 | ts_send = 0 # when last packet(s) were sent out 159 | while True: 160 | while True: 161 | poll_time = max(0, ts_send + interval - time()) 162 | try: 163 | poll_res = poller.poll(poll_time) 164 | if not poll_res or not poll_res[0][1] & EPOLLIN: break 165 | pkt = self.pkt_recv(sockets[poll_res[0][0]]) 166 | if not pkt: continue 167 | addr, ping_id, seq, ts_pkt = pkt 168 | except IOError: continue 169 | if not ts_send: continue 170 | ts = time() 171 | try: host = host_ids[ping_id] 172 | except KeyError: pass 173 | else: 174 | host['last_reply'] = ts 175 | host['recv'] += 1 176 | if not self.discard_rtts: 177 | host['rtt'] = host['rtt'] + ewma_factor * (ts - ts_pkt - host['rtt']) 178 | 179 | if resolve_retry: 180 | for spec, host in resolve_retry.items(): 181 | try: host['addrinfo'] = self.resolve(spec) 182 | except socket.gaierror as err: 183 | log.warn('Failed to resolve spec: {} (host: {}): {}'.format(spec, host, err)) 184 | host['resolve_fails'] = host.get('resolve_fails', 0) + 1 185 | if host['resolve_fails'] >= warn_tries: 186 | log.error(( 'Failed to resolve host spec {} (host: {}) after {} attempts,' 187 | ' exiting (so subprocess can be restarted)' ).format(spec, host, warn_tries)) 188 | # More complex "retry until forever" logic is used on process start, 189 | # so exit here should be performed only once per major (non-transient) failure 190 | sys.exit(0) 191 | else: 192 | host['resolve_fails'] = 0 193 | del resolve_retry[spec] 194 | 195 | if time() > resolve_fixed_deadline: 196 | for spec,host in hosts.viewitems(): 197 | try: host['addrinfo'] = self.resolve(spec) 198 | except socket.gaierror: resolve_retry[spec] = host 199 | resolve_fixed_deadline = ts + resolve_fixed 200 | 201 | if ping_pid: 202 | try: os.kill(ping_pid, 0) 203 | except OSError: sys.exit() 204 | 205 | resolve_reply_deadline = time() - resolve_no_reply 206 | self.discard_rtts, seq = False, next(seq_gen) 207 | for spec, host in hosts.viewitems(): 208 | if host['last_reply'] < resolve_reply_deadline: 209 | try: host['addrinfo'] = self.resolve(spec) 210 | except socket.gaierror: resolve_retry[spec] = host 211 | send_retries = 30 212 | while True: 213 | try: self.pkt_send(host['addrinfo'], host['ping_id'], seq) 214 | except IOError as err: 215 | send_retries -= 1 
216 | if send_retries == 0:
217 | log.error(( 'Failed sending pings from socket to host spec {}'
218 | ' (host: {}, error: {}), killing pinger (so it can be restarted).' )\
219 | .format(spec, host, err))
220 | sys.exit(0) # same idea as with resolver errors above
221 | continue
222 | else: break
223 | host['sent'] += 1
224 | ts_send = time() # used to calculate when to send next batch of pings
225 | 
226 | 
227 | if __name__ == '__main__':
228 | signal.signal(signal.SIGQUIT, signal.SIG_IGN)
229 | logging.basicConfig()
230 | # Inputs
231 | Pinger().start( sys.argv[7:], interval=float(sys.argv[1]),
232 | resolve_no_reply=float(sys.argv[2]), resolve_fixed=float(sys.argv[3]),
233 | ewma_factor=float(sys.argv[4]), ping_pid=int(sys.argv[5]),
234 | warn_tries=int(sys.argv[6]), log=logging.getLogger('pinger'),
235 | warn_repeat=8 * 3600, warn_delay_k=5, warn_delay_min=5 )
236 | # Output on SIGQUIT: "host_spec time_since_last_reply rtt_median pkt_lost"
237 | # pkt_lost is a counter ("sent - received" for whole runtime)
238 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | graphite-metrics: metric collectors for various stuff not (or poorly) handled by other monitoring daemons
2 | --------------------
3 | 
4 | Core of the project is a simple daemon (harvestd), which collects metric values
5 | and sends them to graphite carbon daemon (and/or other configured destinations)
6 | once per interval.
7 | 
8 | Includes separate data collection components ("collectors") for processing of:
9 | 
10 | * /proc/slabinfo for useful-to-watch values, not everything (configurable).
11 | * /proc/vmstat and /proc/meminfo in a consistent way.
12 | * /proc/stat for irq, softirq, forks.
13 | * /proc/buddyinfo and /proc/pagetypeinfo (memory fragmentation).
14 | * /proc/interrupts and /proc/softirqs.
15 | * Cron log to produce start/finish events and duration for each job as
16 | separate metrics, mapping jobs to metric names with regexes.
17 | * Per-system-service accounting using
18 | [systemd](http://www.freedesktop.org/wiki/Software/systemd) and its cgroups
19 | ("Default...Accounting=" options in system.conf have to be enabled for more
20 | recent versions).
21 | * [sysstat](http://sebastien.godard.pagesperso-orange.fr/) data from sadc logs
22 | (use something like `sadc -F -L -S DISK -S XDISK -S POWER 60` to have more
23 | stuff logged there) via the sadf binary and its JSON export (`sadf -j`, supported
24 | since sysstat-10.0.something, iirc).
25 | * iptables rule "hits" packet and byte counters, taken from ip{,6}tables-save,
26 | mapped via separate "table chain_name rule_no metric_name" file, which should
27 | be generated along with firewall rules (I use [this
28 | script](https://github.com/mk-fg/trilobite) to do that).
29 | 
30 | Additional metric collectors can be added via setuptools/distribute
31 | graphite_metrics.collectors [entry
32 | point](http://packages.python.org/distribute/setuptools.html?highlight=entry%20points#dynamic-discovery-of-services-and-plugins)
33 | and configured via the common configuration mechanism.
34 | 
35 | Same for the datapoint sinks (destinations - it doesn't have to be a single
36 | carbon host), datapoint processors (mangle/rename/filter datapoints) and the
37 | main loop, which can be replaced with an async (simple case - threads or
38 | gevent) or buffering loop.
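For illustration, a minimal third-party collector might look like this (a hedged sketch: the `my_metrics` package, `uptime` module and metric name here are made up, and the exact `Collector`/`Datapoint` API should be checked against the shipped collectors):

    # my_metrics/uptime.py - hypothetical out-of-tree collector
    from time import time
    from graphite_metrics.collectors import Collector, Datapoint

    class Uptime(Collector):
        'Yields system uptime (in seconds) as a single gauge datapoint.'
        def read(self):
            with open('/proc/uptime', 'rb') as src:
                yield Datapoint('uptime', 'gauge', float(src.read().split()[0]), time())

    collector = Uptime  # module attribute that the entry point loader looks up

The module would then be registered under the entry point group in that package's
setup.py, e.g. `'graphite_metrics.collectors': ['uptime = my_metrics.uptime']`, after
which it can be enabled and configured under its entry point name in the usual
"collectors" config section, same as the bundled ones.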
39 | 40 | Currently supported backends (data destinations, sinks): 41 | 42 | * [graphite carbon 43 | daemon](http://graphite.readthedocs.org/en/latest/carbon-daemons.html) 44 | (enabled/used by default) 45 | * [librato metrics](https://metrics.librato.com/) 46 | 47 | Look at the shipped collectors, processors, sinks and loops and their base 48 | classes (like 49 | [graphite_metrics.sinks.Sink](https://github.com/mk-fg/graphite-metrics/blob/master/graphite_metrics/sinks/__init__.py) 50 | or 51 | [loops.Basic](https://github.com/mk-fg/graphite-metrics/blob/master/graphite_metrics/loops/basic.py)) 52 | for API examples. 53 | 54 | 55 | Installation 56 | -------------------- 57 | 58 | It's a regular package for Python 2.7 (not 3.X). 59 | 60 | Using [pip](http://pip-installer.org/) is the best way: 61 | 62 | % pip install graphite-metrics 63 | 64 | If you don't have it, use: 65 | 66 | % easy_install pip 67 | % pip install graphite-metrics 68 | 69 | Alternatively ([see 70 | also](http://www.pip-installer.org/en/latest/installing.html)): 71 | 72 | % curl https://raw.github.com/pypa/pip/master/contrib/get-pip.py | python 73 | % pip install graphite-metrics 74 | 75 | Or, if you absolutely must: 76 | 77 | % easy_install graphite-metrics 78 | 79 | But, you really shouldn't do that. 80 | 81 | Current-git version can be installed like this: 82 | 83 | % pip install 'git+https://github.com/mk-fg/graphite-metrics.git#egg=graphite-metrics' 84 | 85 | ### Requirements 86 | 87 | Basic requirements are (pip or easy_install should handle these for you): 88 | 89 | * [setuptools / distribute](https://pypi.python.org/pypi/distribute/) (for entry points) 90 | * [layered-yaml-attrdict-config](https://pypi.python.org/pypi/layered-yaml-attrdict-config/) 91 | 92 | Some shipped modules require additional packages to function (which can be 93 | installed automatically by specifying extras on install, example: `pip install 94 | 'graphite-metrics[collectors.cgacct]'`): 95 | 96 | * collectors 97 | 98 | * cgacct 99 | * [dbus-python](https://pypi.python.org/pypi/dbus-python/) 100 | 101 | * cron_log 102 | * [xattr](http://pypi.python.org/pypi/xattr/) (unless --xattr-emulation is used) 103 | * [iso8601](http://pypi.python.org/pypi/iso8601/) 104 | 105 | * sysstat 106 | * [xattr](http://pypi.python.org/pypi/xattr/) (unless --xattr-emulation is used) 107 | * (optional) [simplejson](http://pypi.python.org/pypi/simplejson/) - for 108 | better performance than stdlib json module 109 | 110 | * sinks 111 | 112 | * librato_metrics 113 | * [requests](http://pypi.python.org/pypi/requests/) 114 | * (optional) [simplejson](http://pypi.python.org/pypi/simplejson/) - for 115 | better performance than stdlib json module 116 | * (optional) [gevent](http://pypi.python.org/pypi/gevent/) - to enable 117 | constant-time (more scalable) async submissions of large data chunks via 118 | concurrent API requests 119 | 120 | Also see 121 | [requirements.txt](https://github.com/mk-fg/graphite-metrics/blob/master/requirements.txt) 122 | file or "install_requires" and "extras_require" in 123 | [setup.py](https://github.com/mk-fg/graphite-metrics/blob/master/setup.py). 
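To illustrate how those extras tie into packaging, the mapping in setup.py looks roughly like this (an illustrative sketch derived from the lists above, not a verbatim copy - see the actual setup.py for the authoritative version):

    # setup.py fragment (illustrative) - each extra pulls in optional deps for one module
    extras_require = {
        'collectors.cgacct': ['dbus-python'],
        'collectors.cron_log': ['xattr', 'iso8601'],
        'collectors.sysstat': ['xattr'],
        'sinks.librato_metrics': ['requests'],
    }

So e.g. `pip install 'graphite-metrics[collectors.cron_log]'` should also pull in xattr and iso8601 (unless --xattr-emulation is going to be used).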
124 | 
125 | 
126 | Running
127 | --------------------
128 | 
129 | First run should probably look like this:
130 | 
131 |     % harvestd --debug -s dump -i10
132 | 
133 | That will use the default configuration with all the collectors enabled, dumping
134 | data to stderr (only the "dump" data-sink enabled) and using a short (10s) interval
135 | between collected datapoints, printing additional info about what's being done.
136 | 
137 | After that, see the [default harvestd.yaml configuration
138 | file](https://github.com/mk-fg/graphite-metrics/blob/master/graphite_metrics/harvestd.yaml),
139 | which contains configuration for all loaded collectors and can/should be
140 | overridden using the -c option.
141 | 
142 | Note that you don't have to specify all the options in each override-config,
143 | just the ones you need to update.
144 | 
145 | For example, a simple configuration file (say, /etc/harvestd.yaml) just to specify
146 | the carbon host and log line format (dropping the timestamp, since it will be piped to
147 | syslog or systemd-journal anyway) might look like this:
148 | 
149 |     sinks:
150 |       carbon_socket:
151 |         host: carbon.example.host
152 | 
153 |     logging:
154 |       formatters:
155 |         basic:
156 |           format: '%(levelname)s :: %(name)s: %(message)s'
157 | 
158 | And be started like this: `harvestd -c /etc/harvestd.yaml`
159 | 
160 | See `harvestd --help` output for a full CLI reference.
161 | 
162 | 
163 | Caveats, Stern Warnings and Apocalyptic Prophecies
164 | --------------------
165 | 
166 | While most stock collectors here pull metrics from /proc once per some interval,
167 | same as the other tools, be especially wary of the ones that process memory
168 | metrics, like the /proc/slabinfo and cgroup value parsers.
169 | 
170 | So-called "files" in /proc are actually callbacks in the kernel code, and to get
171 | a consistent reading of the whole slabinfo table, at least some versions of the
172 | kernel have to lock some operations, causing unexpected lags and delays on the
173 | whole system under some workloads (e.g. memcache servers).
174 | 
175 | The cgroup data collector processes lots of files, potentially dozens, hundreds or
176 | even thousands per collection cycle, which may also cause similar issues.
177 | 
178 | Special thanks to Marcus Barczak for pointing that out.
179 | 
180 | 
181 | Rationale
182 | --------------------
183 | 
184 | Most other tools can (in theory) collect this data, and I've used
185 | [collectd](http://collectd.org) for most of these, but it:
186 | 
187 | * Doesn't provide some of the most useful stuff - nfs stats, disk utilization
188 | time percentage, etc.
189 | 
190 | * Fails to collect some other stats, producing strange values like zeroes,
191 | unrealistic or negative values (for io, network, sensors, ...).
192 | 
193 | * General-purpose plugins like "tail" add a lot of complexity, making
194 | configuration into a mess, while still lacking some basic functionality which
195 | 10 lines of plugin code can easily provide (support is there, but see
196 | below).
197 | 
198 | * Plugins change metric names from the ones provided by /proc, referenced in
199 | kernel Documentation and on the internets, making collected data unnecessarily
200 | hard to interpret and raising questions about its meaning (which is
201 | increasingly important for low-level and calculated metrics).
202 | 
203 | Initially I tried to address these issues (implement the same collectors)
204 | with collectd plugins, but its python plugin system turned out to be leaking
205 | RAM and collectd itself segfaulted something like once a day, even in the latest
206 | releases, although probably because of issues in C plugins.
207 | 
208 | Plus, collectd data requires post-processing anyway - proper metric namespaces,
209 | counter handling, etc.
210 | 
211 | Given that the alternative is to just get the data and echo it as "name val
212 | timestamp" to a tcp socket, I decided to avoid the extra complexity and problems
213 | that collectd provides.
214 | 
215 | Other than collectd, I've experimented with
216 | [ganglia](http://ganglia.sourceforge.net/),
217 | [munin](http://munin-monitoring.org/), and some other monitoring
218 | infrastructures, but found little justification in re-using their aggregation
219 | and/or collection infrastructure, if not outright limitations (like the static data
220 | schema in ganglia).
221 | 
222 | The daemon binary is (weirdly) called "harvestd" because the "metricsd" name is already
223 | used to refer to [another related daemon](https://github.com/kpumuk/metricsd)
224 | (also, [there's a "metrics" w/o "d"](https://github.com/codahale/metrics),
225 | probably others), and is too generic to be used w/o extra confusion, I think.
226 | That, and I seem to lack creativity to come up with a saner name ("reaperd"
227 | sounds too MassEffect'ish these days).
228 | 
--------------------------------------------------------------------------------
/graphite_metrics/collectors/cjdns_peer_stats.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import itertools as it, operator as op, functools as ft
4 | from io import open
5 | from hashlib import sha256, sha512
6 | from base64 import b32decode
7 | from collections import defaultdict
8 | import os, sys, json, socket, struct, time, types
9 | from .
import Collector, Datapoint 10 | 11 | import logging 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | ### For bencode bits below 16 | # Derived from a thing under BitTorrent Open Source License, written by Petru Paler 17 | 18 | # Different from vanilla bencode in: 19 | # * Handling "leading zeroes" in keys (doesn't error - for cjdns compat) 20 | # * encode_none method (to "n") 21 | # * encode_string encodes unicode as utf-8 bytes 22 | 23 | def _ns_class(cls_name, cls_parents, cls_attrs): 24 | for k, v in cls_attrs.viewitems(): 25 | if isinstance(v, types.FunctionType): 26 | cls_attrs[k] = classmethod(v) 27 | return type(cls_name, cls_parents, cls_attrs) 28 | 29 | class BTEError(Exception): pass 30 | 31 | class Bencached(object): 32 | __slots__ = 'bencoded', 33 | def __init__(self, s): self.bencoded = s 34 | 35 | class BTE(object): 36 | __metaclass__ = _ns_class 37 | 38 | unicode_enc = 'utf-8' 39 | enable_none = False 40 | enable_bool = True 41 | cjdns_compat = True 42 | 43 | def decode_int(cls, x, f): 44 | f += 1 45 | newf = x.index('e', f) 46 | n = int(x[f:newf]) 47 | if x[f] == '-': 48 | if x[f + 1] == '0': raise ValueError 49 | elif x[f] == '0' and newf != f+1: raise ValueError 50 | return n, newf+1 51 | def decode_string(cls, x, f): 52 | colon = x.index(':', f) 53 | n = int(x[f:colon]) 54 | if not cls.cjdns_compat\ 55 | and x[f] == '0' and colon != f+1: raise ValueError 56 | colon += 1 57 | return (x[colon:colon+n], colon+n) 58 | def decode_list(cls, x, f): 59 | r, f = [], f+1 60 | while x[f] != 'e': 61 | v, f = cls.decode_func[x[f]](cls, x, f) 62 | r.append(v) 63 | return r, f + 1 64 | def decode_dict(cls, x, f): 65 | r, f = {}, f+1 66 | while x[f] != 'e': 67 | k, f = cls.decode_string(x, f) 68 | r[k], f = cls.decode_func[x[f]](cls, x, f) 69 | return r, f + 1 70 | def decode_none(cls, x, f): 71 | if not cls.enable_none: raise ValueError(x[f]) 72 | return None, f+1 73 | decode_func = dict(l=decode_list, d=decode_dict, i=decode_int, n=decode_none) 74 | for n in xrange(10): decode_func[bytes(n)] = decode_string 75 | 76 | def encode_bencached(cls, x, r): r.append(x.bencoded) 77 | def encode_int(cls, x, r): r.extend(('i', str(x), 'e')) 78 | def encode_float(cls, x, r): r.extend(('f', struct.pack('!d', x), 'e')) 79 | def encode_bool(cls, x, r): 80 | if not cls.enable_bool: raise ValueError(x) 81 | if x: cls.encode_int(1, r) 82 | else: cls.encode_int(0, r) 83 | def encode_string(cls, x, r): 84 | if isinstance(x, unicode): 85 | if not cls.unicode_enc: raise ValueError(x) 86 | x = x.encode(cls.unicode_enc) 87 | r.extend((str(len(x)), ':', x)) 88 | def encode_list(cls, x, r): 89 | r.append('l') 90 | for i in x: cls.encode_func[type(i)](cls, i, r) 91 | r.append('e') 92 | def encode_dict(cls, x, r): 93 | r.append('d') 94 | ilist = x.items() 95 | ilist.sort() 96 | for k, v in ilist: 97 | r.extend((str(len(k)), ':', k)) 98 | cls.encode_func[type(v)](cls, v, r) 99 | r.append('e') 100 | def encode_none(cls, x, r): 101 | if not cls.enable_none: raise ValueError(x) 102 | r.append('n') 103 | encode_func = { 104 | Bencached: encode_bencached, 105 | unicode: encode_string, 106 | str: encode_string, 107 | types.IntType: encode_int, 108 | types.LongType: encode_int, 109 | types.FloatType: encode_float, 110 | types.ListType: encode_list, 111 | types.TupleType: encode_list, 112 | types.DictType: encode_dict, 113 | types.BooleanType: encode_bool, 114 | types.NoneType: encode_none, 115 | } 116 | 117 | def bdecode(cls, x): 118 | try: r, l = cls.decode_func[x[0]](cls, x, 0) 119 | except (IndexError, 
KeyError, ValueError) as err: 120 | raise BTEError('Not a valid bencoded string: {}'.format(err)) 121 | if l != len(x): 122 | raise BTEError('Invalid bencoded value (data after valid prefix)') 123 | return r 124 | 125 | def bencode(cls, x): 126 | r = [] 127 | cls.encode_func[type(x)](cls, x, r) 128 | return ''.join(r) 129 | 130 | 131 | def pubkey_to_ipv6(key, 132 | _cjdns_b32_map = [ # directly from util/Base32.h 133 | 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99, 134 | 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99, 135 | 99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99, 136 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,99,99,99,99,99,99, 137 | 99,99,10,11,12,99,13,14,15,99,16,17,18,19,20,99, 138 | 21,22,23,24,25,26,27,28,29,30,31,99,99,99,99,99, 139 | 99,99,10,11,12,99,13,14,15,99,16,17,18,19,20,99, 140 | 21,22,23,24,25,26,27,28,29,30,31,99,99,99,99,99 ]): 141 | if key.endswith('.k'): key = key[:-2] 142 | 143 | bits, byte, res = 0, 0, list() 144 | for c in key: 145 | n = _cjdns_b32_map[ord(c)] 146 | if n > 31: raise ValueError('Invalid key: {!r}, char: {!r}'.format(key, n)) 147 | byte |= n << bits 148 | bits += 5 149 | if bits >= 8: 150 | bits -= 8 151 | res.append(chr(byte & 0xff)) 152 | byte >>= 8 153 | if bits >= 5 or byte: 154 | raise ValueError('Invalid key length: {!r} (leftover bits: {})'.format(key, bits)) 155 | res = ''.join(res) 156 | 157 | addr = sha512(sha512(res).digest()).hexdigest()[:32] 158 | if addr[:2] != 'fc': 159 | raise ValueError( 'Invalid cjdns key (first' 160 | ' addr byte is not 0xfc, addr: {!r}): {!r}'.format(addr, key) ) 161 | return addr 162 | 163 | 164 | class PeerStatsFailure(Exception): 165 | 166 | def __init__(self, msg, err=None): 167 | if err is not None: msg += ': {} {}'.format(type(err), err) 168 | super(PeerStatsFailure, self).__init__(msg) 169 | 170 | def __hash__(self): 171 | return hash(self.message) 172 | 173 | 174 | class CjdnsPeerStats(Collector): 175 | 176 | last_err = None 177 | last_err_count = None # None (pre-init), True (shut-up mode) or int 178 | last_err_count_max = 3 # max repeated errors to report 179 | 180 | def __init__(self, *argz, **kwz): 181 | super(CjdnsPeerStats, self).__init__(*argz, **kwz) 182 | 183 | assert self.conf.filter.direction in\ 184 | ['any', 'incoming', 'outgoing'], self.conf.filter.direction 185 | 186 | if isinstance(self.conf.peer_id, types.StringTypes): 187 | self.conf.peer_id = [self.conf.peer_id] 188 | 189 | conf_admin, conf_admin_path = None,\ 190 | os.path.expanduser(self.conf.cjdnsadmin_conf) 191 | try: 192 | with open(conf_admin_path) as src: conf_admin = json.load(src) 193 | except (OSError, IOError) as err: 194 | log.warn('Unable to open cjdnsadmin config: %s', err) 195 | except ValueError as err: 196 | log.warn('Unable to process cjdnsadmin config: %s', err) 197 | if conf_admin is None: 198 | log.error('Failed to process cjdnsadmin config, disabling collector') 199 | self.conf.enabled = False 200 | return 201 | 202 | sock_addr = conf_admin['addr'], conf_admin['port'] 203 | self.sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 204 | self.sock.settimeout(self.conf.timeout) 205 | log.debug('Using cjdns socket: {}:{}'.format(*sock_addr)) 206 | self.sock.connect(sock_addr) 207 | 208 | self.admin_password = conf_admin['password'] 209 | self.peer_ipv6_cache = dict() 210 | 211 | def get_stats_page(self, page, password, bs=2**30): 212 | try: 213 | self.sock.send(BTE.bencode(dict(q='cookie'))) 214 | cookie = BTE.bdecode(self.sock.recv(bs))['cookie'] 215 | except Exception as err: 216 | raise PeerStatsFailure('Failed to 
get auth cookie', err) 217 | 218 | req = dict( q='auth', 219 | aq='InterfaceController_peerStats', 220 | args=dict(page=page), 221 | hash=sha256('{}{}'.format(password, cookie)).hexdigest(), 222 | cookie=cookie, txid=os.urandom(5).encode('hex') ) 223 | req['hash'] = sha256(BTE.bencode(req)).hexdigest() 224 | 225 | try: 226 | self.sock.send(BTE.bencode(req)) 227 | for n in xrange(self.conf.recv_retries + 1): 228 | resp = BTE.bdecode(self.sock.recv(bs)) 229 | if resp.get('txid') != req['txid']: # likely timed-out responses to old requests 230 | log.warn('Received out-of-order response (n: %s, request: %s): %s', n, req, resp) 231 | continue 232 | return resp['peers'], resp.get('more', False) 233 | except Exception as err: 234 | raise PeerStatsFailure('Failure communicating with cjdns', err) 235 | raise PeerStatsFailure( 'Too many bogus (wrong or no txid) responses' 236 | ' in a row (count: {}), last req/res: {} / {}'.format(self.conf.recv_retries, req, resp) ) 237 | 238 | def get_peer_stats(self): 239 | peers, page, more = list(), 0, True 240 | while more: 241 | stats, more = self.get_stats_page(page, self.admin_password) 242 | peers.extend(stats) 243 | page += 1 244 | return peers 245 | 246 | def read(self): 247 | try: peers = self.get_peer_stats() 248 | # PeerStatsFailure errors' reporting is rate-limited 249 | except PeerStatsFailure as err: 250 | if hash(err) == hash(self.last_err): 251 | if self.last_err_count is True: return 252 | elif self.last_err_count < self.last_err_count_max: self.last_err_count += 1 253 | else: 254 | log.warn( 'Failed getting cjdns peer stats:' 255 | ' {} -- disabling reporting of recurring errors'.format(err) ) 256 | self.last_err_count = True 257 | return 258 | else: self.last_err, self.last_err_count = err, 1 259 | log.warn('Failed getting cjdns peer stats: {}'.format(err)) 260 | return 261 | else: 262 | if self.last_err_count is True: 263 | log.warn('Previous recurring failure ({}) was resolved'.format(self.last_err)) 264 | self.last_err = self.last_err_count = None 265 | 266 | # Detect peers with 2 links having different isIncoming 267 | peers_bidir = dict() 268 | for peer in peers: 269 | val = peers_bidir.get(peer['publicKey']) 270 | if val is False: peers_bidir[peer['publicKey']] = True 271 | elif val is None: peers_bidir[peer['publicKey']] = False 272 | 273 | ts, peer_states = time.time(), defaultdict(int) 274 | for peer in peers: 275 | state = peer['state'].lower() 276 | peer_states[state] += 1 277 | 278 | # Check filters 279 | if self.conf.filter.established_only and state != 'established': continue 280 | if self.conf.filter.direction != 'any': 281 | if self.conf.filter.direction == 'incoming' and not peer['isIncoming']: continue 282 | elif self.conf.filter.direction == 'outgoing' and peer['isIncoming']: continue 283 | else: raise ValueError(self.conf.filter.direction) 284 | 285 | # Generate metric name 286 | pubkey = peer['publicKey'] 287 | if pubkey.endswith('.k'): pubkey = pubkey[:-2] 288 | peer['pubkey'] = pubkey 289 | if 'ipv6' in self.conf.peer_id: 290 | if pubkey not in self.peer_ipv6_cache: 291 | self.peer_ipv6_cache[pubkey] = pubkey_to_ipv6(pubkey) 292 | peer['ipv6'] = self.peer_ipv6_cache[pubkey] 293 | for k in self.conf.peer_id: 294 | if k in peer: 295 | peer_id = peer[k] 296 | break 297 | else: raise KeyError(self.conf.peer_id, peer) 298 | name = '{}.{}.{{}}'.format(self.conf.prefix, peer_id) 299 | if peers_bidir[peer['publicKey']]: 300 | name = name.format('incoming_{}' if peer['isIncoming'] else 'outgoing_{}') 301 | 302 | # Per-peer 
metrics 303 | name_bytes = name.format('bytes_{}') 304 | for k, d in [('bytesIn', 'in'), ('bytesOut', 'out')]: 305 | yield Datapoint(name_bytes.format(d), 'counter', peer[k], ts) 306 | if self.conf.special_metrics.peer_link: 307 | link = 1 if state == 'established' else 0 308 | yield Datapoint(name.format(self.conf.special_metrics.peer_link), 'gauge', link, ts) 309 | 310 | # Common metrics 311 | if self.conf.special_metrics.count: 312 | yield Datapoint(self.conf.special_metrics.count, 'gauge', len(peers), ts) 313 | if self.conf.special_metrics.count_state: 314 | for k, v in peer_states.viewitems(): 315 | name = '{}.{}'.format(self.conf.special_metrics.count_state, k) 316 | yield Datapoint(name, 'gauge', v, ts) 317 | 318 | 319 | collector = CjdnsPeerStats 320 | -------------------------------------------------------------------------------- /graphite_metrics/collectors/sysstat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import itertools as it, operator as op, functools as ft 4 | from subprocess import Popen, PIPE, STDOUT 5 | from time import time, sleep, strptime, mktime 6 | from calendar import timegm 7 | from datetime import datetime, timedelta 8 | from xattr import xattr 9 | import os, sys, socket, struct 10 | 11 | from . import Collector, Datapoint, dev_resolve, sector_bytes, rate_limit 12 | 13 | try: from simplejson import loads, dumps, JSONDecodeError 14 | except ImportError: 15 | from json import loads, dumps 16 | JSONDecodeError = ValueError 17 | 18 | import logging 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | class SADF(Collector): 23 | 24 | 25 | def __init__(self, *argz, **kwz): 26 | super(SADF, self).__init__(*argz, **kwz) 27 | 28 | # Set force_interval margins, if used 29 | if self.conf.force_interval: 30 | try: 31 | from . 
import cfg 32 | interval = cfg.loop.interval 33 | except (ImportError, KeyError, AttributeError): 34 | log.warn( 'Failed to apply force_interval option' 35 | ' - unable to access global configuration to get data collection interval' ) 36 | self.force_interval = None 37 | else: 38 | if self.conf.force_interval_fuzz: 39 | fuzz = interval * self.conf.force_interval_fuzz / 100.0 40 | else: fuzz = 0 41 | self.force_interval = interval - fuzz, interval + fuzz 42 | else: self.force_interval = None 43 | 44 | self.rate_limit = rate_limit( 45 | max_interval=self.conf.rate.max_interval, 46 | sampling=self.conf.rate.sampling )\ 47 | if self.conf.rate.limiting_enabled else None 48 | 49 | 50 | def process_entry(self, entry): 51 | 52 | # Timestamp 53 | try: ts = entry.pop('timestamp') 54 | except KeyError: 55 | log.info( 'Detected sysstat entry' 56 | ' without timestamp, skipping: {!r}'.format(entry) ) 57 | return # happens, no idea what to do with these 58 | interval = ts['interval'] 59 | for fmt in '%Y-%m-%d %H-%M-%S', '%Y-%m-%d %H:%M:%S': 60 | try: 61 | ts = (mktime if not ts['utc'] else timegm)\ 62 | (strptime('{} {}'.format(ts['date'], ts['time']), fmt)) 63 | except ValueError: pass 64 | else: break 65 | else: 66 | raise ValueError( 'Unable to process' 67 | ' sysstat timestamp: {!r} {!r}'.format(ts['date'], ts['time']) ) 68 | 69 | # Metrics 70 | metrics = list() 71 | 72 | if self.conf.skip.sections: 73 | for k in self.conf.skip.sections: 74 | if k in entry: del entry[k] 75 | else: log.debug('Section-to-skip {!r} not found in sysstat entry'.format(k)) 76 | process_redundant = not self.conf.skip.redundant 77 | 78 | if 'cpu-load-all' in entry: 79 | for stats in entry.pop('cpu-load-all'): 80 | prefix = stats.pop('cpu') 81 | if prefix == 'all': continue # can be derived by aggregator/webapp 82 | prefix = ['cpu', prefix] 83 | metrics.extend((prefix + [k], v) for k,v in stats.viewitems()) 84 | 85 | if 'process-and-context-switch' in entry: 86 | stats = entry.pop('process-and-context-switch') 87 | metrics.append((['misc', 'contextswitch'], stats['cswch'])) 88 | if process_redundant: # also processed in "stats" 89 | metrics.append((['processes', 'forks'], stats['proc'])) 90 | 91 | if process_redundant: 92 | if 'interrupts' in entry: # with "irq" 93 | for stats in entry.pop('interrupts'): 94 | if stats['intr'] == 'sum': continue # can be derived by aggregator/webapp 95 | metrics.append((['irq', stats['intr'], 'sum'], stats['value'])) 96 | if 'swap-pages' in entry: # with "memstats" 97 | for k,v in entry.pop('swap-pages').viewitems(): 98 | metrics.append((['memory', 'pages', 'activity', k], v)) 99 | # if 'memory' in entry: # with "memstats" 100 | # if 'hugepages' in entry: # with "memstats" 101 | 102 | if 'disk' in entry: 103 | for disk in entry.pop('disk'): 104 | dev_sadf = disk['disk-device'] 105 | if not dev_sadf.startswith('dev'): 106 | log.warn('Unknown device name format: {}, skipping'.format(dev_sadf)) 107 | continue 108 | dev = dev_resolve(*it.imap(int, dev_sadf[3:].split('-')), log_fails=False) 109 | if dev is None: 110 | log.warn('Unable to resolve name for device {!r}, skipping'.format(dev_sadf)) 111 | continue 112 | prefix = ['disk', 'load', dev] 113 | metrics.extend([ 114 | (prefix + ['utilization'], disk['util-percent']), 115 | (prefix + ['req_size'], disk['avgrq-sz']), 116 | (prefix + ['queue_len'], disk['avgqu-sz']), 117 | (prefix + ['bytes_read'], sector_bytes * disk['rd_sec']), 118 | (prefix + ['bytes_write'], sector_bytes * disk['wr_sec']), 119 | (prefix + ['serve_time'], disk['await']), 120 
| (prefix + ['tps'], disk['tps']) ]) 121 | # if 'io' in entry: # can be derived by aggregator/webapp 122 | 123 | if 'paging' in entry: 124 | metrics.append(( 125 | ['memory', 'pages', 'vm_efficiency'], 126 | entry.pop('paging')['vmeff-percent'] )) 127 | # XXX: lots of redundant metrics here 128 | 129 | if 'queue' in entry: 130 | stats = entry.pop('queue') 131 | for n in 1, 5, 15: 132 | k = 'ldavg-{}'.format(n) 133 | metrics.append((['load', k], stats[k])) 134 | metrics.extend( 135 | (['processes', 'state', k], stats[k]) 136 | for k in ['runq-sz', 'plist-sz', 'blocked'] ) 137 | 138 | if 'kernel' in entry: 139 | stats = entry.pop('kernel') 140 | metrics.extend([ 141 | (['misc', 'dent_unused'], stats['dentunusd']), 142 | (['misc', 'file_handles'], stats['file-nr']), 143 | (['misc', 'inode_handles'], stats['inode-nr']), 144 | (['misc', 'pty'], stats['pty-nr']) ]) 145 | 146 | if 'network' in entry: 147 | stats = entry.pop('network') 148 | iface_stats = stats.get('net-dev', list()) 149 | for iface in iface_stats: 150 | prefix = ['network', 'interfaces', iface['iface']] 151 | metrics.extend([ 152 | (prefix + ['rx', 'bytes'], iface['rxkB'] * 2**10), 153 | (prefix + ['rx', 'packets', 'total'], iface['rxpck']), 154 | (prefix + ['rx', 'packets', 'compressed'], iface['rxcmp']), 155 | (prefix + ['rx', 'packets', 'multicast'], iface['rxmcst']), 156 | (prefix + ['tx', 'bytes'], iface['txkB'] * 2**10), 157 | (prefix + ['tx', 'packets', 'total'], iface['txpck']), 158 | (prefix + ['tx', 'packets', 'compressed'], iface['txpck']) ]) 159 | iface_stats = stats.get('net-edev', list()) 160 | iface_errs_common = [('err', 'total'), ('fifo', 'overflow_fifo'), ('drop', 'overflow_kbuff')] 161 | for iface in iface_stats: 162 | prefix = ['network', 'interfaces', iface['iface']] 163 | for src,dst in iface_errs_common + [('fram', 'frame_alignment')]: 164 | metrics.append((prefix + ['rx', 'errors', dst], iface['rx{}'.format(src)])) 165 | for src,dst in iface_errs_common + [('carr', 'carrier')]: 166 | metrics.append((prefix + ['tx', 'errors', dst], iface['tx{}'.format(src)])) 167 | metrics.append((prefix + ['tx', 'errors', 'collision'], iface['coll'])) 168 | if 'net-nfs' in stats: 169 | for k,v in stats['net-nfs'].viewitems(): 170 | metrics.append((['network', 'nfs', 'client', k], v)) 171 | for k,v in stats['net-nfsd'].viewitems(): 172 | metrics.append((['network', 'nfs', 'server', k], v)) 173 | if 'net-sock' in stats: 174 | for k,v in stats['net-sock'].viewitems(): 175 | if k.endswith('sck'): 176 | k = k[:-3] 177 | if k == 'tot': k = 'total' 178 | metrics.append((['network', 'sockets', k], v)) 179 | 180 | if 'power-management' in entry: 181 | stats = entry.pop('power-management') 182 | for metric in stats.get('temperature', list()): 183 | name = ['sensors', 'temperature', metric['device'].replace('.', '_')] 184 | if 'number' in metric: name.append(bytes(metric['number'])) 185 | metrics.append((name, metric['degC'])) 186 | 187 | return ts, interval, metrics 188 | 189 | 190 | def _read(self, ts_to=None): 191 | if not ts_to: ts_to = datetime.now() 192 | 193 | sa_days = dict( (ts.day, ts) 194 | for ts in ((ts_to - timedelta(i)) 195 | for i in xrange(self.conf.skip.older_than_days+1)) ) 196 | sa_files = sorted(it.ifilter( 197 | op.methodcaller('startswith', 'sa'), os.listdir(self.conf.sa_path) )) 198 | host = os.uname()[1] # to check vs nodename in data 199 | log.debug('SA files to process: {}'.format(sa_files)) 200 | 201 | for sa in sa_files: 202 | sa_day = int(sa[2:]) 203 | try: sa_day = sa_days[sa_day] 204 | except KeyError: 
continue # too old or new 205 | 206 | sa = os.path.join(self.conf.sa_path, sa) 207 | log.debug('Processing file: {}'.format(sa)) 208 | 209 | # Read xattr timestamp 210 | sa_xattr = xattr(sa) 211 | try: sa_ts_from = sa_xattr[self.conf.xattr_name] 212 | except KeyError: sa_ts_from = None 213 | if sa_ts_from: 214 | sa_ts_from = datetime.fromtimestamp( 215 | struct.unpack('=I', sa_ts_from)[0] ) 216 | if sa_day - sa_ts_from > timedelta(1) + timedelta(seconds=60): 217 | log.debug( 'Discarding xattr timestamp, because' 218 | ' it doesnt seem to belong to the same date as file' 219 | ' (day: {}, xattr: {})'.format(sa_day, sa_ts_from) ) 220 | sa_ts_from = None 221 | if sa_ts_from and sa_ts_from.date() != sa_day.date(): 222 | log.debug('File xattr timestamp points to the next day, skipping file') 223 | continue 224 | if not self.conf.max_dump_span: sa_ts_to = None 225 | else: 226 | # Use 00:00 of sa_day + max_dump_span if there's no xattr 227 | ts = sa_ts_from or datetime(sa_day.year, sa_day.month, sa_day.day) 228 | sa_ts_to = ts + timedelta(0, self.conf.max_dump_span) 229 | # Avoid adding restrictions, if they make no sense anyway 230 | if sa_ts_to >= datetime.now(): sa_ts_to = None 231 | 232 | # Get data from sadf 233 | sa_cmd = ['sadf', '-jt'] 234 | if sa_ts_from: sa_cmd.extend(['-s', sa_ts_from.strftime('%H:%M:%S')]) 235 | if sa_ts_to: sa_cmd.extend(['-e', sa_ts_to.strftime('%H:%M:%S')]) 236 | sa_cmd.extend(['--', '-A']) 237 | sa_cmd.append(sa) 238 | log.debug('sadf command: {}'.format(sa_cmd)) 239 | sa_proc = Popen(sa_cmd, stdout=PIPE) 240 | try: data = loads(sa_proc.stdout.read()) 241 | except JSONDecodeError as err: 242 | log.exception(( 'Failed to process sadf (file:' 243 | ' {}, command: {}) output: {}' ).format(sa, sa_cmd, err)) 244 | data = None 245 | if sa_proc.wait(): 246 | log.error('sadf (command: {}) exited with error'.format(sa_cmd)) 247 | data = None 248 | if not data: 249 | log.warn('Skipping processing of sa file: {}'.format(sa)) 250 | continue 251 | 252 | # Process and dispatch the datapoints 253 | sa_ts_max = 0 254 | for data in data['sysstat']['hosts']: 255 | if data['nodename'] != host: 256 | log.warn( 'Mismatching hostname in sa data:' 257 | ' {} (uname: {}), skipping'.format(data['nodename'], host) ) 258 | continue 259 | sa_day_ts = mktime(sa_day.timetuple()) 260 | # Read the data 261 | for ts, interval, metrics in it.ifilter( 262 | None, it.imap(self.process_entry, data['statistics']) ): 263 | if ts - 1 > sa_ts_max: 264 | # has to be *before* beginning of the next interval 265 | sa_ts_max = ts - 1 266 | if abs(ts - sa_day_ts) > 24*3600 + interval + 1: 267 | log.warn( 'Dropping sample because of timestamp mismatch' 268 | ' (timestamp: {}, expected date: {})'.format(ts, sa_day_ts) ) 269 | continue 270 | if self.force_interval and ( 271 | interval < self.force_interval[0] 272 | or interval > self.force_interval[1] ): 273 | log.warn( 'Dropping sample because of interval mismatch' 274 | ' (file: {sa}, interval: {interval},' 275 | ' required: {margins[0]}-{margins[1]}, timestamp: {ts})'\ 276 | .format(sa=sa, interval=interval, ts=ts, margins=self.force_interval) ) 277 | continue 278 | ts_val = int(ts) 279 | for name, val in metrics: 280 | yield Datapoint('.'.join(name), 'gauge', val, ts_val) 281 | 282 | # Update xattr timestamp, if any entries were processed 283 | if sa_ts_max: 284 | log.debug('Updating xattr timestamp to {}'.format(sa_ts_max)) 285 | if not self.conf.debug.dry_run: 286 | sa_xattr[self.conf.xattr_name] = struct.pack('=I', int(sa_ts_max)) 287 | 288 | 289 | def 
read(self): 290 | if not self.rate_limit or next(self.rate_limit): 291 | log.debug('Running sysstat data processing cycle') 292 | return self._read() 293 | else: return list() 294 | 295 | 296 | collector = SADF 297 | -------------------------------------------------------------------------------- /graphite_metrics/harvestd.yaml: -------------------------------------------------------------------------------- 1 | ### Default (baseline) configuration parameters. 2 | ### DO NOT ever change this config, use -c commandline option instead! 3 | 4 | 5 | collectors: 6 | # Modules that collect the actual datapoints to be sent 7 | 8 | _default: # used as a base for all other sections here 9 | enabled: true 10 | # debug: # auto-filled from global "debug" section, if not specified 11 | 12 | ping: 13 | # Reports average (ewma) rtt of icmp ping to each specified host and packet loss (if any). 14 | interval: 5 # seconds between sending-out pings 15 | ewma_factor: 0.3 # ewma factor for rtt values 16 | resolve: 17 | no_reply: 30 # re-resolve hostnames after 30 seconds w/o reply 18 | time: 600 # re-resolve hostnames after fixed 600s intervals 19 | # "max_retries" restarts ping subprocess (e.g. to apply changes to 20 | # /etc/hosts or other libc resolver configuration) after N name resolution failures. 21 | # Also, if resolver fails even after restart (i.e. on start), disable warnings 22 | # (but issuing a message on next success) after that number of retries. 23 | max_retries: 5 24 | hosts: # explicitly split into ipv4/ipv6 to control how hostnames are resolved 25 | ipv4: 26 | # google_com: google.com 27 | # google_dns: 8.8.8.8 28 | ipv6: 29 | # ipv6_google_com: ipv6.google.com 30 | # ipv6_tunnelbroker_net: ipv6.tunnelbroker.net 31 | 32 | cron_log: 33 | # Reports start/stop, run time and errors for cron jobs from a logfile. 34 | # I use simple wrappers for cron-jobs to produce these logs (among other things): 35 | # https://github.com/mk-fg/fgtk#task https://github.com/mk-fg/fgtk/tree/master/task 36 | source: # must be filled with path to a log file 37 | aliases: # either [alias, regexp] or ["_" + regexp_group, regexp], see "_script" example below 38 | # - ['logrotate', '(^|\b)logrotate\b'] 39 | # - ['locate', '(^|\b)updatedb\b'] 40 | # - ['_script', '/etc/cron\.\w+/*(?P