├── docs ├── realms.rst ├── .gitignore ├── _static │ └── images │ │ └── nymms_arch.png ├── demo.rst ├── hidden_code_block.py ├── index.rst ├── Makefile ├── getting_started.rst └── conf.py ├── nymms ├── api │ ├── __init__.py │ ├── plugins │ │ ├── __init__.py │ │ └── sdb_handler.py │ └── routes.py ├── probe │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── test_probe.py │ ├── sqs_probe.py │ └── Probe.py ├── state │ ├── __init__.py │ ├── sdb_state.py │ └── State.py ├── tests │ ├── __init__.py │ ├── test_registry.py │ └── test_resources.py ├── config │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ ├── empty.yaml │ │ ├── included_yaml │ │ │ ├── 1.yaml │ │ │ ├── 2.yaml │ │ │ └── a.yaml │ │ ├── indent_include.yaml │ │ ├── relative_includes.yaml │ │ ├── config.yaml │ │ └── test_yaml_config.py │ ├── yaml_config.py │ └── config.py ├── providers │ ├── __init__.py │ └── sdb.py ├── reactor │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── test_reactor.py │ ├── handlers │ │ ├── __init__.py │ │ ├── log_handler.py │ │ ├── sdb_handler.py │ │ ├── ses_handler.py │ │ ├── pagerduty_handler.py │ │ └── Handler.py │ ├── filters │ │ ├── tests │ │ │ └── __init__.py │ │ └── __init__.py │ ├── aws_reactor.py │ └── Reactor.py ├── scheduler │ ├── __init__.py │ ├── lock │ │ ├── __init__.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ └── test_scheduler_lock.py │ │ ├── SchedulerLock.py │ │ └── SDBLock.py │ ├── backends │ │ ├── __init__.py │ │ ├── yaml_backend.py │ │ └── Backend.py │ ├── aws_scheduler.py │ └── Scheduler.py ├── suppress │ ├── __init__.py │ ├── tests │ │ ├── __init__.py │ │ └── test_suppression_manager.py │ ├── sdb_suppress.py │ └── suppress.py ├── utils │ ├── tests │ │ ├── __init__.py │ │ ├── test_commands.py │ │ └── test_base.py │ ├── cli.py │ ├── templates.py │ ├── aws_helper.py │ ├── commands.py │ ├── __init__.py │ └── logutil.py ├── __init__.py ├── daemon.py ├── exceptions.py ├── registry.py ├── schemas │ ├── types │ │ └── __init__.py │ └── __init__.py └── resources.py ├── MANIFEST.in ├── .landscape.yaml ├── .travis.yml ├── docker ├── conf │ ├── nodes.yaml │ ├── private.yaml │ ├── handlers │ │ ├── pagerduty.conf │ │ ├── log_handler.conf │ │ └── ses_handler.conf │ ├── resources.yaml │ └── config.yaml ├── probe ├── reactor └── scheduler ├── config ├── nodes.yaml ├── handlers │ └── sdb_handler.yaml ├── config.yaml ├── resources.yaml └── aws_policy.json ├── CHANGELOG.md ├── scripts ├── nymms_api ├── nymms_reactor ├── nymms_delete_suppressions ├── nymms_probe ├── nymms_scheduler ├── nymms_create_suppression ├── nymms_list_suppressions └── make_ami.py ├── Dockerfile ├── .gitignore ├── LICENSE ├── setup.py └── README.rst /docs/realms.rst: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/probe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/state/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /nymms/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/providers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/reactor/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/suppress/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.rst 2 | -------------------------------------------------------------------------------- /nymms/api/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/config/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/config/tests/empty.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/probe/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/reactor/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/scheduler/lock/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/suppress/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/utils/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/reactor/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/scheduler/backends/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/scheduler/lock/tests/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nymms/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.5.0' 2 | -------------------------------------------------------------------------------- /nymms/config/tests/included_yaml/1.yaml: -------------------------------------------------------------------------------- 1 | file1: 1 2 | -------------------------------------------------------------------------------- /nymms/config/tests/included_yaml/2.yaml: -------------------------------------------------------------------------------- 1 | file2: 2 2 | -------------------------------------------------------------------------------- /nymms/config/tests/included_yaml/a.yaml: -------------------------------------------------------------------------------- 1 | filea: a 2 | -------------------------------------------------------------------------------- /nymms/config/tests/indent_include.yaml: -------------------------------------------------------------------------------- 1 | a: 1 2 | b: 2 3 | -------------------------------------------------------------------------------- /nymms/config/tests/relative_includes.yaml: -------------------------------------------------------------------------------- 1 | foo: bar 2 | -------------------------------------------------------------------------------- /.landscape.yaml: -------------------------------------------------------------------------------- 1 | ignore-paths: 2 | - docs 3 | max-line-length: 80 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | # command to run tests 5 | script: python setup.py test 6 | -------------------------------------------------------------------------------- /docs/_static/images/nymms_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cloudtools/nymms/HEAD/docs/_static/images/nymms_arch.png -------------------------------------------------------------------------------- /docker/conf/nodes.yaml: -------------------------------------------------------------------------------- 1 | !include /etc/nymms/nodes/*.yaml 2 | 3 | www.google.com: 4 | monitoring_groups: 5 | - google 6 | -------------------------------------------------------------------------------- /docker/probe: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | exec /sbin/setuser nymms /usr/local/bin/nymms_probe -c /etc/nymms/config.yaml $* $EXTRA_OPTS 4 | -------------------------------------------------------------------------------- /docker/reactor: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | exec /sbin/setuser nymms /usr/local/bin/nymms_reactor -c /etc/nymms/config.yaml $* $EXTRA_OPTS 4 | -------------------------------------------------------------------------------- /docker/scheduler: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | exec /sbin/setuser nymms /usr/local/bin/nymms_scheduler -c /etc/nymms/config.yaml $* $EXTRA_OPTS 4 | -------------------------------------------------------------------------------- /config/nodes.yaml: 
-------------------------------------------------------------------------------- 1 | !include /etc/nymms/nodes/*.yaml 2 | 3 | local: 4 | monitoring_groups: 5 | - local 6 | www.google.com: 7 | monitoring_groups: 8 | - google 9 | -------------------------------------------------------------------------------- /nymms/config/tests/config.yaml: -------------------------------------------------------------------------------- 1 | base: 2 | - 1 3 | - 2 4 | - 3 5 | !include relative_includes.yaml 6 | !include included_yaml/*.yaml 7 | included: 8 | !include indent_include.yaml 9 | -------------------------------------------------------------------------------- /config/handlers/sdb_handler.yaml: -------------------------------------------------------------------------------- 1 | # Disabled by default, here for an example. 2 | handler_class: nymms.reactor.handlers.sdb_handler.SDBHandler 3 | enabled: true 4 | region: us-east-1 5 | alerts_domain: nymms_alerts 6 | -------------------------------------------------------------------------------- /docker/conf/private.yaml: -------------------------------------------------------------------------------- 1 | # The variables in this context are shared with monitors, but are never passed 2 | # along between hosts. This means that you must distribute it between all 3 | # probe hosts that use it. 4 | # The context is accessed in __private, so for example for the example_password 5 | # below: 6 | # __private.example_password 7 | example_password: example 8 | -------------------------------------------------------------------------------- /docker/conf/handlers/pagerduty.conf: -------------------------------------------------------------------------------- 1 | handler_class: nymms.reactor.handlers.pagerduty_handler.PagerDutyHandler 2 | enabled: false 3 | subject_template: | 4 | {{state_type_name}}/{{state_name}}: {{id}} 5 | # You need a pagerduty service key to use this. 6 | service_keys: 7 | - 8 | filters: 9 | - nymms.reactor.filters.hard_state 10 | - nymms.reactor.filters.changed_state 11 | -------------------------------------------------------------------------------- /docker/conf/handlers/log_handler.conf: -------------------------------------------------------------------------------- 1 | handler_class: nymms.reactor.handlers.log_handler.LogHandler 2 | enabled: true 3 | filename: /var/log/nymms/reactor.log 4 | # See: http://docs.python.org/2/library/logging.handlers.html#timedrotatingfilehandler 5 | when: midnight 6 | interval: 1 7 | backup_count: 7 8 | # If you specify no filter it's the same as a filter that always returns True 9 | # but we put this here as an example.
10 | filters: 11 | - nymms.reactor.filters.always_true 12 | -------------------------------------------------------------------------------- /nymms/scheduler/backends/yaml_backend.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from nymms.scheduler.backends.Backend import Backend 4 | from nymms.config import yaml_config 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class YamlBackend(Backend): 10 | def __init__(self, path): 11 | self.path = path 12 | 13 | def _load_nodes(self): 14 | version, nodes = yaml_config.load_config(self.path) 15 | logger.debug("Loaded node config (%s) from %s.", version, self.path) 16 | return nodes 17 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # pending 2 | 3 | # 0.5.0 (2015-04-10) 4 | - Fix error in nymms\_delete\_suppressions (GH-26) 5 | - A bunch of doc updates 6 | - Moved to backend provider / state/suppression manager split 7 | - Added landscape.io checking 8 | - Fixed nymms\_create\_suppression name (it was missing an s) 9 | - moved to schematics for most (all?) datastructures, so much nicer 10 | - Move to using iso8601 dates almost everywhere 11 | - Added REST API (GH-31, GH-32 - thanks @berndtj) 12 | - API Plugin support (GH-34 - thanks @berndtj) 13 | -------------------------------------------------------------------------------- /nymms/daemon.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | import nymms 6 | 7 | 8 | class NymmsDaemon(object): 9 | def __init__(self): 10 | logger.debug("%s initialized.", self.__class__.__name__) 11 | 12 | def run(self, *args, **kwargs): 13 | raise NotImplementedError 14 | 15 | def main(self, *args, **kwargs): 16 | logger.debug("Launching %s version %s.", self.__class__.__name__, 17 | nymms.__version__) 18 | self.run(*args, **kwargs) 19 | -------------------------------------------------------------------------------- /scripts/nymms_api: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ Command line script to launch the NYMMS REST API. """ 3 | 4 | import importlib 5 | 6 | from nymms.config import config 7 | from nymms.utils import cli 8 | 9 | parser = cli.NymmsCommandArgs(__doc__) 10 | args = parser.parse_args() 11 | logger = cli.setup_logging(args.verbose) 12 | 13 | config.load_config(args.config) 14 | 15 | from nymms.api import routes 16 | 17 | for plugin in config.settings.get('api', {}).get('plugins', []): 18 | importlib.import_module(plugin) 19 | 20 | routes.nymms_api.run(debug=True) 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM phusion/baseimage:0.9.17 2 | MAINTAINER Michael Barrett 3 | 4 | RUN apt-get update && apt-get -y install nagios-plugins ncurses-dev libreadline-dev python-dev python-setuptools libyaml-dev && easy_install pip && apt-get clean && rm -rf /var/lib/apt/lists/* 5 | 6 | RUN adduser --system nymms; mkdir -p /var/log/nymms; chown -R nymms /var/log/nymms 7 | 8 | COPY . /src
9 | RUN cd /src; python setup.py install 10 | 11 | COPY docker/conf /etc/nymms 12 | COPY docker/scheduler /scheduler 13 | COPY docker/reactor /reactor 14 | COPY docker/probe /probe 15 | 16 | CMD ["/sbin/my_init"] 17 | -------------------------------------------------------------------------------- /nymms/utils/tests/test_commands.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from nymms.utils import commands 4 | 5 | 6 | class TestCommands(unittest.TestCase): 7 | def test_execute_failure(self): 8 | with self.assertRaises(commands.CommandFailure): 9 | # Non-existent command 10 | commands.execute('xxxps auwwwx', 10) 11 | 12 | def test_execute_timeout(self): 13 | with self.assertRaises(commands.CommandTimeout): 14 | commands.execute('sleep 2', 1) 15 | 16 | def test_execute_success(self): 17 | out = commands.execute('echo test', 10) 18 | self.assertEqual(out, 'test\n') 19 | -------------------------------------------------------------------------------- /docker/conf/handlers/ses_handler.conf: -------------------------------------------------------------------------------- 1 | # Disabled by default, here for an example. 2 | handler_class: nymms.reactor.handlers.ses_handler.SESHandler 3 | enabled: false 4 | region: us-east-1 5 | sender: admin@example.com 6 | subject_template: | 7 | {{state_type_name}}/{{state_name}}: {{id}} 8 | body_template: | 9 | Output: {{output}} 10 | Task Data: {{task_context}} 11 | recipients: 12 | - example@example.com 13 | # We only send email if it passes both of these filters 14 | # ie: the result needs to be a HARD state_type 15 | # AND it needs to have changed state (or not had a previous state) 16 | filters: 17 | - nymms.reactor.filters.hard_state 18 | - nymms.reactor.filters.changed_state 19 | -------------------------------------------------------------------------------- /nymms/scheduler/lock/tests/test_scheduler_lock.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from nymms.scheduler.lock.SchedulerLock import SchedulerLock 4 | 5 | NOW = 1391115581.238759 6 | DURATION = 30 7 | 8 | 9 | class TestSchedulerLock(unittest.TestCase): 10 | def setUp(self): 11 | self.lock = SchedulerLock(DURATION) 12 | 13 | def test_lock_expired(self): 14 | no_lock = None 15 | self.assertIs(self.lock.lock_expired(no_lock, NOW), True) 16 | expired_lock = NOW - (DURATION + 5) 17 | self.assertIs(self.lock.lock_expired(expired_lock, NOW), 18 | True) 19 | valid_lock = NOW + 5 20 | self.assertIs(self.lock.lock_expired(valid_lock, NOW), False) 21 | -------------------------------------------------------------------------------- /nymms/utils/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | from nymms.utils import logutil 7 | 8 | 9 | def setup_logging(verbose=False): 10 | level = logutil.INFO 11 | if verbose: 12 | level = logutil.DEBUG 13 | if not verbose > 1: 14 | logutil.quiet_boto_logging() 15 | return logutil.setup_root_logger(stdout=level) 16 | 17 | 18 | class NymmsCommandArgs(argparse.ArgumentParser): 19 | def __init__(self, *args, **kwargs): 20 | super(NymmsCommandArgs, self).__init__(*args, **kwargs) 21 | self.add_argument('-v', '--verbose', action='count', default=0) 22 | self.add_argument('-c', '--config', default='/etc/nymms/config.yaml') 23 | --------------------------------------------------------------------------------
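A minimal sketch of how the nymms.utils.cli helpers above are wired together (illustrative only, not a file in this tree; the same pattern appears in scripts/nymms_api earlier in this dump and in scripts/nymms_reactor and scripts/nymms_probe later):

#!/usr/bin/env python
# Illustrative sketch of the CLI wiring pattern shared by the nymms_* scripts.
from nymms.utils import cli

parser = cli.NymmsCommandArgs(__doc__)    # adds -v/--verbose and -c/--config
args = parser.parse_args()
logger = cli.setup_logging(args.verbose)  # -v enables DEBUG; boto logging stays quiet unless -vv

# Config is imported and loaded only after logging is configured, so load
# errors are logged through the root logger set up above.
from nymms.config import config
config.load_config(args.config)           # default config: /etc/nymms/config.yaml
region = config.settings['region']        # config.settings holds the parsed YAML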
/docker/conf/resources.yaml: -------------------------------------------------------------------------------- 1 | commands: 2 | check_https: 3 | command_string: /usr/lib/nagios/plugins/check_http -H {{address}} -S -u {{url}} -m {{minimum_size}} -w {{warn_timeout}} -c {{crit_timeout}} 4 | warn_timeout: 1 5 | crit_timeout: 10 6 | check_http: 7 | command_string: /usr/lib/nagios/plugins/check_http -H {{address}} -u {{url}} -w {{warn_timeout}} -c {{crit_timeout}} 8 | warn_timeout: 1 9 | crit_timeout: 10 10 | check_https_cert: 11 | command_string: /usr/lib/nagios/plugins/check_http -H {{address}} -S -u {{url}} -C {{cert_days}} 12 | 13 | monitoring_groups: 14 | all: 15 | google: 16 | 17 | monitors: 18 | google_http: 19 | command: check_http 20 | url: / 21 | monitoring_groups: 22 | - google 23 | -------------------------------------------------------------------------------- /nymms/utils/templates.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Undefined 2 | 3 | 4 | class SimpleUndefined(Undefined): 5 | """ A version of undefined that doesn't freak out when a context 6 | variable is missing and gives non-verbose help. Unfortunately it's all 7 | but impossible with jinja to return the name of a dictionary that is 8 | missing a key. We'll make do for now. 9 | 10 | This is a weak hack - I really need to find a way to provide more 11 | 'context' about what is missing in the context. 12 | """ 13 | __slots__ = () 14 | 15 | def __unicode__(self): 16 | return u'{{MISSING_CONTEXT}}' 17 | 18 | def __getattr__(self, attr): 19 | return u'{{MISSING_CONTEXT}}' 20 | 21 | def __getitem__(self, item): 22 | return u'{{MISSING_CONTEXT}}' 23 | -------------------------------------------------------------------------------- /config/config.yaml: -------------------------------------------------------------------------------- 1 | # can be defined on a task by task basis 2 | monitor_timeout: 15 3 | resources: config/resources.yaml 4 | region: us-east-1 5 | results_topic: nymms_results 6 | tasks_queue: nymms_tasks 7 | state_domain: nymms_state 8 | result_domain: nymms_alerts 9 | 10 | probe: 11 | queue_wait_time: 20 12 | # can be defined on a task by task basis 13 | max_retries: 3 14 | # can be defined on a task by task basis 15 | retry_delay: 10 16 | 17 | scheduler: 18 | interval: 60 19 | backend: nymms.scheduler.backends.yaml_backend.YamlBackend 20 | backend_args: 21 | path: config/nodes.yaml 22 | 23 | reactor: 24 | queue_wait_time: 20 25 | visibility_timeout: 30 26 | queue_name: reactor_queue 27 | handler_config_path: config/handlers 28 | 29 | api: 30 | plugins: 31 | - nymms.api.plugins.sdb_handler -------------------------------------------------------------------------------- /docker/conf/config.yaml: -------------------------------------------------------------------------------- 1 | # can be defined on a task by task basis 2 | monitor_timeout: 15 3 | resources: /etc/nymms/resources.yaml 4 | region: us-east-1 5 | results_topic: nymms_results 6 | tasks_queue: nymms_tasks 7 | state_domain: nymms_state 8 | private_context_file: /etc/nymms/private.yaml 9 | task_expiration: 300 10 | 11 | probe: 12 | queue_wait_time: 20 13 | # can be defined on a task by task basis 14 | max_retries: 3 15 | # can be defined on a task by task basis 16 | retry_delay: 10 17 | 18 | scheduler: 19 | interval: 60 20 | backend: nymms.scheduler.backends.yaml_backend.YamlBackend 21 | backend_args: 22 | path: /etc/nymms/nodes.yaml 23 | 24 | reactor: 25 | queue_wait_time: 20 26 | 
visibility_timeout: 30 27 | queue_name: reactor_queue 28 | handler_config_path: /etc/nymms/handlers 29 | -------------------------------------------------------------------------------- /nymms/scheduler/backends/Backend.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from nymms.resources import load_resource, Node 4 | from nymms.registry import DuplicateEntryError 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Backend(object): 10 | def _load_nodes(self): 11 | """ Should return a dictionary of Node information in this form: 12 | {'<node_name>': {<node_attributes>}, ...} 13 | 14 | Meant to be overridden by subclasses. 15 | """ 16 | raise NotImplementedError 17 | 18 | def load_nodes(self, reset=False): 19 | nodes = self._load_nodes() 20 | try: 21 | load_resource(nodes, Node, reset=reset) 22 | except DuplicateEntryError: 23 | # TODO: Need to figure out the story for reloading nodes, etc. 24 | logger.debug("Nodes already loaded and reset is False, skipping.") 25 | -------------------------------------------------------------------------------- /nymms/reactor/tests/test_reactor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from nymms.reactor.Reactor import Reactor 4 | from nymms.reactor.handlers.Handler import Handler 5 | 6 | 7 | enabled_config = {'handler_class': 'nymms.reactor.handlers.Handler.Handler', 8 | 'enabled': True} 9 | disabled_config = {'handler_class': 'nymms.reactor.handlers.Handler.Handler', 10 | 'enabled': False} 11 | 12 | 13 | class TestReactor(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.reactor = Reactor() 17 | 18 | def test_load_enabled_handler(self): 19 | handler = self.reactor.load_handler('dummy_handler', enabled_config) 20 | self.assertIsInstance(handler, Handler) 21 | 22 | def test_load_disabled_handler(self): 23 | handler = self.reactor.load_handler('dummy_handler', disabled_config) 24 | self.assertIs(handler, None) 25 | -------------------------------------------------------------------------------- /config/resources.yaml: -------------------------------------------------------------------------------- 1 | commands: 2 | check_https: 3 | command_string: /usr/lib/nagios/plugins/check_http -H {{address}} -S -u {{url}} -m {{minimum_size}} -w {{warn_timeout}} -c {{crit_timeout}} 4 | warn_timeout: 1 5 | crit_timeout: 10 6 | check_http: 7 | command_string: /usr/lib/nagios/plugins/check_http -H {{address}} -u {{url}} -w {{warn_timeout}} -c {{crit_timeout}} 8 | warn_timeout: 1 9 | crit_timeout: 10 10 | check_https_cert: 11 | command_string: /usr/lib/nagios/plugins/check_http -H {{address}} -S -u {{url}} -C {{cert_days}} 12 | check_file: 13 | command_string: /bin/test -f {{file_name}} 14 | 15 | monitoring_groups: 16 | all: 17 | local: 18 | google: 19 | 20 | monitors: 21 | google_http: 22 | command: check_http 23 | url: / 24 | monitoring_groups: 25 | - google 26 | file_tmp_woot: 27 | command: check_file 28 | file_name: /tmp/woot 29 | monitoring_groups: 30 | - local 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | 10 | # Packages # 11 | ############ 12 | # it's better to unpack these files and commit the raw source 13 | # git has its own built in compression methods 14 | *.7z 15 | *.dmg 16 | *.gz 17
| *.iso 18 | *.jar 19 | *.rar 20 | *.tar 21 | *.zip 22 | 23 | # Logs and databases # 24 | ###################### 25 | *.log 26 | *.sql 27 | *.sqlite 28 | 29 | # OS generated files # 30 | ###################### 31 | .DS_Store* 32 | ehthumbs.db 33 | Icon? 34 | Thumbs.db 35 | 36 | # Vagrant 37 | .vagrant 38 | 39 | # Editor crap 40 | *.sw* 41 | *~ 42 | 43 | # Byte-compiled python 44 | *.pyc 45 | 46 | # Package directory 47 | build/ 48 | 49 | # Build object file directory 50 | objdir/ 51 | dist/ 52 | nymms.egg-info 53 | 54 | # nosetest --with-coverage dumps these in CWD 55 | .coverage 56 | 57 | Vagrantfile 58 | 59 | vm_setup.sh 60 | 61 | *.egg 62 | .eggs/ 63 | -------------------------------------------------------------------------------- /nymms/exceptions.py: -------------------------------------------------------------------------------- 1 | class NymmsException(Exception): 2 | """ NYMMS base exception class. 3 | """ 4 | pass 5 | 6 | 7 | class OutOfDateState(NymmsException): 8 | def __init__(self, current, previous): 9 | self.current_state = current 10 | self.previous_state = previous 11 | 12 | def __str__(self): 13 | return "Previous (%d) state newer than current (%d)." % ( 14 | self.previous_state.last_update, self.current_state.last_update) 15 | 16 | 17 | class MissingCommandContext(NymmsException): 18 | def __init__(self, message): 19 | self.message = message 20 | 21 | def __str__(self): 22 | return "Invalid command variable: %s" % self.message 23 | 24 | 25 | class InvalidConfig(NymmsException): 26 | def __init__(self, path, message): 27 | self.path = path 28 | self.message = message 29 | 30 | def __str__(self): 31 | return "Invalid config file '%s': %s" % (self.path, self.message) 32 | -------------------------------------------------------------------------------- /nymms/utils/tests/test_base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from nymms import utils 4 | 5 | 6 | class TestBase(unittest.TestCase): 7 | def test_load_class_from_string(self): 8 | from logging import Logger 9 | cls = utils.load_object_from_string('logging.Logger') 10 | self.assertIs(cls, Logger) 11 | 12 | def test_parse_time(self): 13 | base_time = 1415311935 14 | self.assertEqual(utils.parse_time('+60s', base_time).timestamp, 15 | base_time + 60) 16 | self.assertEqual(utils.parse_time('+10m', base_time).timestamp, 17 | base_time + (60 * 10)) 18 | self.assertEqual(utils.parse_time('+10h', base_time).timestamp, 19 | base_time + (60 * 60 * 10)) 20 | self.assertEqual(utils.parse_time('-10d', base_time).timestamp, 21 | base_time - (10 * 60 * 60 * 24)) 22 | with self.assertRaises(ValueError): 23 | utils.parse_time('+2000xxx') 24 | -------------------------------------------------------------------------------- /scripts/nymms_reactor: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from nymms.utils import cli 4 | 5 | parser = cli.NymmsCommandArgs() 6 | args = parser.parse_args() 7 | 8 | cli.setup_logging(args.verbose) 9 | 10 | from nymms.config import config 11 | from nymms.reactor.aws_reactor import AWSReactor 12 | 13 | config.load_config(args.config) 14 | 15 | region = config.settings['region'] 16 | 17 | results_topic = config.settings['results_topic'] 18 | state_domain = config.settings['state_domain'] 19 | queue_name = config.settings['reactor']['queue_name'] 20 | handler_config_path = config.settings['reactor']['handler_config_path'] 21 | wait_time = 
config.settings['reactor']['queue_wait_time'] 22 | visibility_timeout = config.settings['reactor']['visibility_timeout'] 23 | suppress_domain = config.settings['suppress']['domain'] 24 | suppress_cache_timeout = config.settings['suppress']['cache_timeout'] 25 | daemon = AWSReactor(region, results_topic, state_domain, queue_name, 26 | suppress_domain, suppress_cache_timeout) 27 | daemon.main(handler_config_path, wait_time=wait_time, 28 | visibility_timeout=visibility_timeout) 29 | -------------------------------------------------------------------------------- /nymms/registry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from nymms import exceptions 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class DuplicateEntryError(exceptions.NymmsException): 9 | def __init__(self, name, obj, registry): 10 | self.name = name 11 | self.obj = obj 12 | self.registry = registry 13 | 14 | def __str__(self): 15 | return "Duplicate entry in '%s' registry for '%s'." % ( 16 | self.registry.object_type.__name__, self.name) 17 | 18 | 19 | class Registry(dict): 20 | def __init__(self, object_type, *args, **kwargs): 21 | logger.debug("New '%s' registry id: %d", object_type, id(self)) 22 | self.object_type = object_type 23 | dict.__init__(self, *args, **kwargs) 24 | 25 | def __setitem__(self, name, value): 26 | if not isinstance(value, self.object_type): 27 | raise TypeError("This registry only accepts objects of " 28 | "type %s." % self.object_type.__name__) 29 | 30 | if name in self: 31 | raise DuplicateEntryError(name, value, self) 32 | dict.__setitem__(self, name, value) 33 | -------------------------------------------------------------------------------- /nymms/config/tests/test_yaml_config.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | from nymms.config import yaml_config 5 | 6 | 7 | class TestIncludeLoader(unittest.TestCase): 8 | def setUp(self): 9 | self.root = os.path.dirname(__file__) 10 | 11 | def test_relative_include(self): 12 | full_path = os.path.join(self.root, 'config.yaml') 13 | version, relative_config = yaml_config.load_config(full_path) 14 | self.assertEqual(relative_config['foo'], 'bar') 15 | self.assertEqual(relative_config['file1'], 1) 16 | 17 | def test_missing_config(self): 18 | with self.assertRaises(IOError): 19 | yaml_config.load_config('nonexistant.yaml') 20 | 21 | def test_indent_include(self): 22 | full_path = os.path.join(self.root, 'config.yaml') 23 | version, relative_config = yaml_config.load_config(full_path) 24 | self.assertEqual(relative_config['included']['a'], 1) 25 | 26 | def test_empty_config(self): 27 | full_path = os.path.join(self.root, 'empty.yaml') 28 | with self.assertRaises(yaml_config.EmptyConfig) as ee: 29 | yaml_config.load_config(full_path) 30 | self.assertEqual(ee.exception.filename, full_path) 31 | -------------------------------------------------------------------------------- /nymms/reactor/handlers/log_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import TimedRotatingFileHandler 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | from nymms.reactor.handlers.Handler import Handler 7 | 8 | 9 | class LogHandler(Handler): 10 | """ A basic handler to send alerts to a log file via python's logging 11 | module.
12 | """ 13 | def _setup_logger(self): 14 | if getattr(self, '_file_logger', None): 15 | return 16 | filename = self.config['filename'] 17 | when = self.config['when'] 18 | interval = self.config['interval'] 19 | backup_count = self.config['backup_count'] 20 | handler = TimedRotatingFileHandler(filename, when, interval, 21 | backup_count) 22 | handler.setLevel(logging.INFO) 23 | msg_fmt = '[%(asctime)s] %(message)s' 24 | handler.setFormatter(logging.Formatter(msg_fmt)) 25 | self._file_logger = logging.getLogger('LogHandler') 26 | self._file_logger.propagate = False 27 | self._file_logger.addHandler(handler) 28 | self._file_logger.setLevel(logging.INFO) 29 | 30 | def _process(self, result, previous_state): 31 | self._setup_logger() 32 | self._file_logger.info(result.serialize()) 33 | -------------------------------------------------------------------------------- /scripts/nymms_delete_suppressions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """CLI tool for marking reactor suppression as inactive""" 4 | 5 | from nymms.utils import cli 6 | 7 | 8 | parser = cli.NymmsCommandArgs(__doc__) 9 | parser.add_argument('-k', '--key', dest='rowkey', 10 | help='rowkey of suppression filter to deactivate') 11 | parser.add_argument('--deactivate-all', dest='all_filters', 12 | help='Deactivate all the suppression filters', 13 | action='store_true') 14 | 15 | args = parser.parse_args() 16 | logger = cli.setup_logging(args.verbose) 17 | 18 | from nymms.suppress.sdb_suppress import SDBSuppressionManager 19 | from nymms.config import config 20 | config.load_config(args.config) 21 | 22 | region = config.settings['region'] 23 | cache_timeout = config.settings['suppress']['cache_timeout'] 24 | domain = config.settings['suppress']['domain'] 25 | 26 | suppress = SDBSuppressionManager(region, cache_timeout, domain) 27 | 28 | if args.all_filters: 29 | suppress.deactivate_all_suppressions() 30 | print "Deactivated all active suppression filters" 31 | elif args.rowkey: 32 | print "Deactivating %s" % (args.rowkey,) 33 | if not suppress.deactivate_suppression(args.rowkey): 34 | print " ! Key %s not found." % (args.rowkey) 35 | else: 36 | print "Please provide a --key or --deactivate-all" 37 | exit(-1) 38 | -------------------------------------------------------------------------------- /nymms/api/plugins/sdb_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import arrow 3 | 4 | from flask import request 5 | 6 | from nymms import schemas 7 | from nymms.api import routes 8 | from nymms.config import config 9 | from nymms.providers.sdb import SimpleDBBackend 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | logger.debug('imported API plugin: %s', __name__) 14 | 15 | 16 | @routes.nymms_api.route('/result', methods=['GET']) 17 | def result(): 18 | """ 19 | List past results. 
20 | 21 | Query Params: 22 | - limit (default 1000) 23 | - from_timestamp 24 | - to_timestamp 25 | """ 26 | region = config.settings['region'] 27 | domain_name = config.settings['result_domain'] 28 | backend = SimpleDBBackend(region, domain_name) 29 | args = request.args.to_dict(flat=True) 30 | limit = int(args.pop('limit', routes.DEFAULT_RESULT_LIMIT)) 31 | from_timestamp = args.pop('from_timestamp', None) 32 | to_timestamp = args.pop('to_timestamp', None) 33 | filters = [] 34 | if from_timestamp: 35 | filters.append( 36 | 'timestamp >= "%s"' % arrow.get(from_timestamp)) 37 | if to_timestamp: 38 | filters.append( 39 | 'timestamp <= "%s"' % arrow.get(to_timestamp)) 40 | results, _ = backend.filter(filters=filters, max_items=limit) 41 | return [schemas.APIResult(r).to_primitive(role='sdb') for r in results] 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012-2013, Michael Barrett 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 14 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 17 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 18 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 19 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 20 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 21 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 22 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 23 | POSSIBILITY OF SUCH DAMAGE. 24 | -------------------------------------------------------------------------------- /nymms/scheduler/lock/SchedulerLock.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | import uuid 6 | 7 | 8 | class SchedulerLock(object): 9 | def __init__(self, duration, lock_name="scheduler_lock"): 10 | self.id = self.get_instance_id() 11 | self.duration = duration 12 | self.lock_name = lock_name 13 | logger.debug("%s:%s initialized with %s duration.", 14 | self.__class__.__name__, self.id, duration) 15 | 16 | def get_instance_id(self): 17 | """ Can be overridden, but a random UUID at launch is probably good 18 | enough. 19 | """ 20 | return uuid.uuid4().hex 21 | 22 | def lock_expired(self, expiry, now): 23 | """ Returns True if the lock is expired, False otherwise. """ 24 | if not expiry or int(now) > int(expiry): 25 | return True 26 | return False 27 | 28 | def acquire(self): 29 | """ Should be overridden and return True or False depending on whether 30 | it got the lock or not.
31 | """ 32 | raise NotImplementedError 33 | 34 | 35 | class NoOpLock(SchedulerLock): 36 | def __init__(self): 37 | logger.warning("!!! Using NoOpLock") 38 | logger.warning("!!! Do not do this if you are planning to run more ") 39 | logger.warning("!!! than one scheduler.""") 40 | 41 | def acquire(self): 42 | return True 43 | -------------------------------------------------------------------------------- /nymms/tests/test_registry.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from nymms import registry 4 | from nymms.resources import Command, MonitoringGroup 5 | from weakref import WeakValueDictionary 6 | 7 | 8 | class TestRegistry(unittest.TestCase): 9 | def setUp(self): 10 | # Ensure we have a fresh registry before every test 11 | Command.registry.clear() 12 | 13 | def test_empty_registry(self): 14 | self.assertEqual(Command.registry, WeakValueDictionary()) 15 | 16 | def test_register_object(self): 17 | # First test it's empty 18 | self.assertEqual(Command.registry, WeakValueDictionary()) 19 | # Add a command 20 | command = Command('test_command', '/bin/true') 21 | # verify that there is only a single command in the registry 22 | self.assertEqual(len(Command.registry), 1) 23 | # Verify that the registered command is the same as command 24 | self.assertIs(Command.registry[command.name], command) 25 | 26 | def test_duplicate_register(self): 27 | # add a command 28 | print Command.registry 29 | Command('test_command', '/bin/true') 30 | with self.assertRaises(registry.DuplicateEntryError): 31 | Command('test_command', '/bin/true') 32 | 33 | def test_invalid_resource_register(self): 34 | with self.assertRaises(TypeError): 35 | Command.registry['test'] = MonitoringGroup('test_group') 36 | -------------------------------------------------------------------------------- /nymms/suppress/tests/test_suppression_manager.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import copy 3 | 4 | from nymms.suppress.suppress import SuppressionManager 5 | from nymms.schemas import Suppression 6 | 7 | now = 0 8 | 9 | SUPPRESSION_COMMON = { 10 | 'ipaddr': '10.0.0.1', 'userid': 'testuser', 'comment': 'testcomment'} 11 | 12 | SUPPRESSIONS = ( 13 | {'rowkey': '4b068858-4e98-4028-8413-7cd9e4dd94d1', 14 | 'regex': 'test_foo', 15 | 'expires': now + 60}, 16 | {'rowkey': '8a8fa3c8-29ee-4ad2-931a-57287c36b151', 17 | 'regex': 'test_bar.*', 18 | 'expires': now + 60}) 19 | 20 | 21 | class MockSuppressionManager(SuppressionManager): 22 | def migrate_suppressions(self): 23 | return 24 | 25 | def get_suppressions(self, expire, include_disabled=False): 26 | suppressions = copy.deepcopy(SUPPRESSIONS) 27 | for s in suppressions: 28 | s.update(SUPPRESSION_COMMON) 29 | return ([Suppression(x) for x in suppressions], None) 30 | 31 | 32 | class TestBase(unittest.TestCase): 33 | def setUp(self): 34 | self.suppression_manager = MockSuppressionManager( 35 | cache_ttl=60, schema_class=Suppression) 36 | 37 | def test_is_suppressed(self): 38 | self.assertFalse(self.suppression_manager.is_suppressed('woot')) 39 | self.assertTrue(self.suppression_manager.is_suppressed('test_foo')) 40 | self.assertTrue(self.suppression_manager.is_suppressed('test_barn')) 41 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | import glob 4 | 5 | 
src_dir = os.path.dirname(__file__) 6 | 7 | install_requires = [ 8 | "arrow>=0.5.4", 9 | "boto>=2.36.0", 10 | "Flask>=0.10.1", 11 | "Jinja2>=2.7.3", 12 | "MarkupSafe>=0.23", 13 | "python-dateutil>=2.4.0", 14 | "PyYAML>=3.11", 15 | "schematics>=1.0.2", 16 | "six>=1.9.0", 17 | "validictory>=1.0.0", 18 | "Werkzeug>=0.10.1", 19 | "Flask-API>=0.6.6.post1", 20 | ] 21 | 22 | tests_require = [ 23 | 'nose>=1.0', 24 | ] 25 | 26 | 27 | def read(filename): 28 | full_path = os.path.join(src_dir, filename) 29 | with open(full_path) as fd: 30 | return fd.read() 31 | 32 | 33 | if __name__ == '__main__': 34 | setup( 35 | name='nymms', 36 | version='0.5.0', 37 | author='Michael Barrett', 38 | author_email='loki77@gmail.com', 39 | license="New BSD license", 40 | url="https://github.com/cloudtools/nymms", 41 | description='Not Your Mother\'s Monitoring System (NYMMS)', 42 | long_description=read('README.rst'), 43 | classifiers=[ 44 | "Topic :: System :: Monitoring", 45 | "License :: OSI Approved :: BSD License", 46 | "Development Status :: 3 - Alpha"], 47 | packages=find_packages(), 48 | scripts=glob.glob(os.path.join(src_dir, 'scripts', 'nymms_*')), 49 | install_requires=install_requires, 50 | tests_require=tests_require, 51 | test_suite='nose.collector', 52 | ) 53 | -------------------------------------------------------------------------------- /config/aws_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Action": [ 6 | "ses:GetSendQuota", 7 | "ses:SendEmail" 8 | ], 9 | "Sid": "NymmsSESAccess", 10 | "Resource": [ 11 | "*" 12 | ], 13 | "Effect": "Allow" 14 | }, 15 | { 16 | "Action": [ 17 | "sns:ConfirmSubscription", 18 | "sns:CreateTopic", 19 | "sns:DeleteTopic", 20 | "sns:GetTopicAttributes", 21 | "sns:ListSubscriptions", 22 | "sns:ListSubscriptionsByTopic", 23 | "sns:ListTopics", 24 | "sns:Publish", 25 | "sns:SetTopicAttributes", 26 | "sns:Subscribe", 27 | "sns:Unsubscribe" 28 | ], 29 | "Sid": "NymmsSNSAccess", 30 | "Resource": [ 31 | "*" 32 | ], 33 | "Effect": "Allow" 34 | }, 35 | { 36 | "Action": [ 37 | "sqs:ChangeMessageVisibility", 38 | "sqs:CreateQueue", 39 | "sqs:DeleteMessage", 40 | "sqs:DeleteQueue", 41 | "sqs:GetQueueAttributes", 42 | "sqs:GetQueueUrl", 43 | "sqs:ListQueues", 44 | "sqs:ReceiveMessage", 45 | "sqs:SendMessage", 46 | "sqs:SetQueueAttributes" 47 | ], 48 | "Sid": "NymmsSQSAccess", 49 | "Resource": [ 50 | "*" 51 | ], 52 | "Effect": "Allow" 53 | }, 54 | { 55 | "Action": [ 56 | "sdb:*" 57 | ], 58 | "Sid": "NymmsSDBAccess", 59 | "Resource": [ 60 | "*" 61 | ], 62 | "Effect": "Allow" 63 | } 64 | ] 65 | } -------------------------------------------------------------------------------- /scripts/nymms_probe: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from nymms.utils import cli 4 | 5 | parser = cli.NymmsCommandArgs() 6 | parser.add_argument('--realm', 7 | help="If specified this probe will only execute monitors " 8 | "in the given realm.") 9 | 10 | args = parser.parse_args() 11 | 12 | cli.setup_logging(args.verbose) 13 | 14 | from nymms.config import config 15 | from nymms.resources import load_resources 16 | from nymms.probe.sqs_probe import SQSProbe 17 | 18 | config.load_config(args.config) 19 | 20 | resource_version = load_resources(config.settings['resources']) 21 | region = config.settings['region'] 22 | 23 | tasks_queue = config.settings['tasks_queue'] 24 | if args.realm: 25 | tasks_queue += '_REALM_' + args.realm
26 | results_topic = config.settings['results_topic'] 27 | state_domain = config.settings['state_domain'] 28 | wait_timeout = config.settings['probe']['queue_wait_time'] 29 | monitor_timeout = config.settings['monitor_timeout'] 30 | max_retries = config.settings['probe']['max_retries'] 31 | retry_delay = config.settings['probe']['retry_delay'] 32 | task_expiration = config.settings['task_expiration'] 33 | private_context_file = config.settings['private_context_file'] 34 | 35 | daemon = SQSProbe(region, tasks_queue, results_topic, state_domain) 36 | daemon.main(monitor_timeout=monitor_timeout, 37 | max_retries=max_retries, 38 | retry_delay=retry_delay, 39 | queue_wait_time=wait_timeout, 40 | private_context_file=private_context_file, 41 | task_expiration=task_expiration) 42 | -------------------------------------------------------------------------------- /scripts/nymms_scheduler: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | 5 | from nymms.utils import aws_helper, cli 6 | 7 | parser = cli.NymmsCommandArgs() 8 | args = parser.parse_args() 9 | 10 | logger = cli.setup_logging(args.verbose) 11 | 12 | from nymms.config import config 13 | from nymms.resources import load_resources 14 | from nymms.scheduler.aws_scheduler import AWSScheduler 15 | from nymms.utils import load_object_from_string 16 | 17 | 18 | config.load_config(args.config) 19 | settings = config.settings 20 | 21 | resource_version = load_resources(settings['resources']) 22 | 23 | backend_cls = load_object_from_string(settings['scheduler']['backend']) 24 | backend_args = settings['scheduler']['backend_args'] 25 | backend = backend_cls(**backend_args) 26 | 27 | conn_mgr = aws_helper.ConnectionManager(config.settings['region']) 28 | 29 | interval = settings['scheduler']['interval'] 30 | task_expiration = settings['task_expiration'] 31 | 32 | lock = None 33 | 34 | lock_backend = settings['scheduler'].get('lock_backend') 35 | lock_args = settings['scheduler'].get('lock_args') 36 | 37 | if lock_args: 38 | lock_duration = lock_args.get('duration') 39 | if lock_duration <= interval: 40 | logger.error("Your lock duration (%s) should be larger than your " 41 | "scheduler interval (%s) or weird things can happen.", 42 | lock_duration, interval) 43 | sys.exit(1) 44 | 45 | if lock_backend: 46 | if lock_backend.lower() == 'sdb': 47 | from nymms.scheduler.lock.SDBLock import SDBLock 48 | lock = SDBLock(conn=conn_mgr.sdb, **lock_args) 49 | else: 50 | logger.error("Unrecognized lock_backend '%s'.", lock_backend) 51 | logger.error("Valid lock_backends are: sdb") 52 | sys.exit(1) 53 | 54 | s = AWSScheduler(backend, conn_mgr, config.settings['tasks_queue'], lock=lock) 55 | s.main(interval=interval, task_expiration=task_expiration) 56 | -------------------------------------------------------------------------------- /nymms/reactor/handlers/sdb_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from nymms.schemas.types import STATE_OK 4 | from nymms.reactor.handlers.Handler import Handler 5 | from nymms.utils.aws_helper import ConnectionManager 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class SDBHandler(Handler): 12 | """ A basic handler to persist alerts to AWS simpleDB. To filter 13 | results you should subclass this and provide a _filter method.
14 | 15 | config options: 16 | enabled: bool 17 | region: string, aws region (us-east-1, etc) 18 | alerts_domain: string, name of the simpledb domain the 19 | handler stores alerts in (it is created if it 20 | does not already exist) 21 | filters: list, filters 22 | 23 | """ 24 | 25 | def __init__(self, *args, **kwargs): 26 | super(SDBHandler, self).__init__(*args, **kwargs) 27 | self._conn = None 28 | self._domain = None 29 | self.region = self.config['region'] 30 | self.domain_name = self.config['alerts_domain'] 31 | 32 | @property 33 | def conn(self): 34 | if not getattr(self, '_conn', None): 35 | self._conn = ConnectionManager(region=self.region) 36 | return self._conn 37 | 38 | @property 39 | def domain(self): 40 | if not self._domain: 41 | self._domain = self.conn.sdb.create_domain(self.domain_name) 42 | return self._domain 43 | 44 | def _save_result(self, result, previous_state): 45 | """Adds a result to the SDB store 46 | """ 47 | item_name = '%s-%s' % (result.id, result.timestamp) 48 | # only persist alert states 49 | if result.state in (STATE_OK,): 50 | return item_name 51 | # Need to strip the context, since it could be larger than 1k 52 | self.domain.put_attributes(item_name, 53 | result.to_primitive(role='strip_context')) 54 | logger.debug("Added %s to %s", item_name, self.domain_name) 55 | return item_name 56 | 57 | def _process(self, result, previous_state): 58 | self._save_result(result, previous_state) 59 | -------------------------------------------------------------------------------- /nymms/reactor/handlers/ses_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from nymms.reactor.handlers.Handler import Handler 4 | from nymms.utils.aws_helper import ConnectionManager 5 | 6 | from jinja2 import Template 7 | from nymms.utils.templates import SimpleUndefined 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class SESHandler(Handler): 13 | """ A basic handler to send alerts to people via email through Amazon's 14 | SES service. Sends every result it receives by default. To filter
16 | 17 | config options: 18 | enabled: bool 19 | region: string, aws region (us-east-1, etc) 20 | sender: string, email address 21 | subject_template: string 22 | body_template: string 23 | recipients: list, email addresses 24 | filters: list, filters 25 | """ 26 | @property 27 | def aws_conn(self): 28 | if not getattr(self, '_aws_conn', None): 29 | self._aws_conn = ConnectionManager(region=self.config['region']) 30 | return self._aws_conn 31 | 32 | def _send_email(self, result, previous_state): 33 | subject = Template(self.config['subject_template']) 34 | subject.environment.undefined = SimpleUndefined 35 | body = Template(self.config['body_template']) 36 | body.environment.undefined = SimpleUndefined 37 | sender = self.config['sender'] 38 | recipients = self.config.get('recipients', []) 39 | result_data = result.serialize() 40 | if recipients: 41 | logger.debug("Sending SES alert to %s as %s for %s.", 42 | recipients, sender, result.id) 43 | self.aws_conn.ses.send_email( 44 | source=sender, 45 | subject=subject.render(result_data), 46 | body=body.render(result_data), 47 | to_addresses=recipients) 48 | else: 49 | logger.debug("No valid recipients found, not sending email for " 50 | "%s.", result.id) 51 | 52 | def _process(self, result, previous_state): 53 | self._send_email(result, previous_state) 54 | -------------------------------------------------------------------------------- /nymms/scheduler/aws_scheduler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | 4 | from boto.sqs.message import Message 5 | 6 | from nymms.scheduler.Scheduler import Scheduler 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class AWSScheduler(Scheduler): 12 | def __init__(self, node_backend, conn_mgr, task_queue, lock=None): 13 | self._conn = conn_mgr 14 | self._queue_name = task_queue 15 | self._default_queue = None 16 | self._realm_queues = {} 17 | super(AWSScheduler, self).__init__(node_backend, lock) 18 | 19 | def _set_expiration(self, queue, expiration): 20 | if expiration: 21 | logger.debug("Setting queue %s message retention to %d.", 22 | queue.name, expiration) 23 | queue.set_attribute('MessageRetentionPeriod', 24 | expiration) 25 | 26 | def _setup_realm(self, realm, **kwargs): 27 | if realm in self._realm_queues: 28 | return 29 | queue_name = self._queue_name + '_REALM_' + realm 30 | logger.debug("setting up realm queue %s", queue_name) 31 | queue = self._conn.sqs.create_queue(queue_name) 32 | self._set_expiration(queue, kwargs.get('task_expiration', None)) 33 | self._realm_queues[realm] = queue 34 | 35 | def _setup_queue(self, **kwargs): 36 | if self._default_queue: 37 | return 38 | logger.debug("setting up queue %s", self._queue_name) 39 | queue = self._conn.sqs.create_queue(self._queue_name) 40 | self._set_expiration(queue, kwargs.get('task_expiration', None)) 41 | self._default_queue = queue 42 | 43 | def _choose_queue(self, task, **kwargs): 44 | realm = task.context['realm'] 45 | if realm: 46 | self._setup_realm(realm, **kwargs) 47 | queue = self._realm_queues[realm] 48 | else: 49 | self._setup_queue(**kwargs) 50 | queue = self._default_queue 51 | return queue 52 | 53 | def submit_task(self, task, **kwargs): 54 | queue = self._choose_queue(task, **kwargs) 55 | logger.debug("Sending task '%s' to queue '%s'.", task.id, 56 | queue.name) 57 | m = Message() 58 | m.set_body(json.dumps(task.to_primitive())) 59 | return queue.write(m) 60 | -------------------------------------------------------------------------------- 
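AWSScheduler above has to agree with scripts/nymms_probe (earlier in this dump) on how realm-specific queue names are derived from the configured tasks_queue. A minimal sketch of that naming convention (queue_name_for is an illustrative helper, not a function in this repository):

# Tasks whose context names a realm are routed to a dedicated
# '<tasks_queue>_REALM_<realm>' SQS queue, which is the same name a probe
# started with '--realm <realm>' listens on; tasks without a realm use the
# default tasks_queue.
tasks_queue = 'nymms_tasks'  # config.settings['tasks_queue']

def queue_name_for(realm):
    # Mirrors AWSScheduler._choose_queue and the tasks_queue logic in
    # scripts/nymms_probe.
    if realm:
        return tasks_queue + '_REALM_' + realm
    return tasks_queue

assert queue_name_for(None) == 'nymms_tasks'
assert queue_name_for('east') == 'nymms_tasks_REALM_east'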
/README.rst: -------------------------------------------------------------------------------- 1 | =========================================== 2 | NYMMS (Not Your Mother's Monitoring System) 3 | =========================================== 4 | 5 | You can find the latest docs (there aren't enough!) at ReadTheDocs_. 6 | 7 | NYMMS is a monitoring framework that takes inspiration from a lot of different 8 | places. 9 | 10 | Its goals are: 11 | 12 | - Independently scalable components 13 | - Fault tolerant 14 | - Easily usable in a cloud environment 15 | - Easy to add new monitors 16 | 17 | There are many other goals, but that's a good start. 18 | 19 | Here's a somewhat hard to understand diagram (at least without some 20 | explanation): 21 | 22 | .. image:: https://raw.github.com/cloudtools/nymms/master/docs/_static/images/nymms_arch.png 23 | 24 | Requirements 25 | ============ 26 | 27 | Currently the main requirements are: 28 | 29 | - Python (2.7 - may work on older versions, haven't tested) 30 | - boto 31 | - PyYAML (used in a few backends, will eventually not be a requirement unless 32 | you need to use those backends) 33 | - Jinja2 (needed for templating) 34 | - Validictory (0.9.1 https://pypi.python.org/pypi/validictory/0.9.1) 35 | 36 | Optionally: 37 | 38 | - pagerduty (0.2.1 https://pypi.python.org/pypi/pagerduty/0.2.1) if you use the 39 | pagerduty reactor handler 40 | 41 | Docker 42 | ====== 43 | 44 | A docker image is provided that can be used to run any of the daemons used in 45 | NYMMS. It can be pulled from `phobologic/nymms`. To run the daemons, you can 46 | launch them with the following command: 47 | 48 | docker run -e "AWS_ACCESS_KEY_ID=<access key id>" -e "AWS_SECRET_ACCESS_KEY=<secret access key>" --rm -it phobologic/nymms:latest /[scheduler|probe|reactor] 49 | 50 | For example, to run the scheduler (with verbose logging, the -v) you can run: 51 | 52 | docker run --rm -it phobologic/nymms:latest /scheduler -v 53 | 54 | You can also set the `AWS_ACCESS_KEY_ID` & `AWS_SECRET_ACCESS_KEY` in a file, 55 | and then use `--env-file` rather than specifying the variables on the command 56 | line. Optionally, if you are running on a host in EC2 that has an IAM profile 57 | with all the necessary permissions, you do not need to specify the keys at all. 58 | 59 | .. _`boto pull request`: https://github.com/boto/boto/pull/1414 60 | .. _`ReadTheDocs`: https://nymms.readthedocs.io/en/latest/
_`ReadTheDocs`: https://nymms.readthedocs.io/en/latest/ 61 | -------------------------------------------------------------------------------- /nymms/suppress/sdb_suppress.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from nymms.suppress.suppress import SuppressionManager 4 | from nymms.schemas import Suppression 5 | from nymms.providers.sdb import SimpleDBBackend 6 | 7 | import arrow 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class SDBSuppressionManager(SuppressionManager): 13 | def __init__(self, region, timeout=60, domain_name='nymms_suppress', 14 | schema_class=Suppression): 15 | self.region = region 16 | self.domain_name = domain_name 17 | self.timeout = timeout 18 | 19 | super(SDBSuppressionManager, self).__init__(timeout, schema_class) 20 | 21 | @property 22 | def conn(self): 23 | return self.backend.conn 24 | 25 | @property 26 | def domain(self): 27 | return self.backend.domain 28 | 29 | def get_backend(self): 30 | return SimpleDBBackend(self.region, self.domain_name) 31 | 32 | def get_old_suppressions(self): 33 | query = ("select * from `%s` where `version` is null or " 34 | "`version` < '%s'" % (self.backend.domain_name, 35 | Suppression.CURRENT_VERSION)) 36 | return self.domain.select(query, consistent_read=True) 37 | 38 | def get_suppressions(self, expire=None, include_disabled=False, 39 | limit=None): 40 | """ Returns a list of suppressions that are not expired. 41 | 42 | expire = arrow datetime, or None for no start time 43 | include_disabled = True/False; if True, disabled suppressions are also returned 44 | """ 45 | filters = ["`created` is not null"] 46 | if expire: 47 | filters.append("`expires` >= '%s'" % expire.isoformat()) 48 | else: 49 | filters.append("`expires` > '0'") 50 | 51 | if not include_disabled: 52 | filters.append("`disabled` is null") 53 | 54 | return self.filter(filters=filters, max_items=limit) 55 | 56 | def deactivate_suppression(self, rowkey): 57 | """Deactivates a single suppression""" 58 | if self.backend.get(rowkey): 59 | self.conn.sdb.put_attributes(self.backend.domain_name, rowkey, 60 | {'disabled': arrow.get().isoformat()}) 61 | return True 62 | return False 63 | -------------------------------------------------------------------------------- /nymms/tests/test_resources.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from weakref import WeakValueDictionary 3 | 4 | from nymms import resources 5 | from nymms.exceptions import MissingCommandContext 6 | 7 | 8 | class TestNanoResources(unittest.TestCase): 9 | def test_reserved_attributes(self): 10 | with self.assertRaises(TypeError): 11 | resources.Command(name='test', command_string='test', 12 | address='10.0.0.1') 13 | 14 | def test_resource_context(self): 15 | c = resources.Command(name='test', command_string='test') 16 | context = c._context() 17 | self.assertEqual(context.keys()[0], 'command') 18 | self.assertEqual(context['command']['name'], c.name) 19 | 20 | def test_extra_attributes(self): 21 | extra_attribute_name = 'extra' 22 | extra2_value = 'extra2' 23 | c1 = resources.Command(name='test1', command_string='test1') 24 | c2 = resources.Command(name='test2', command_string='test2', 25 | extra=extra2_value) 26 | with self.assertRaises(KeyError): 27 | c1.extra_attributes[extra_attribute_name] 28 | self.assertEquals(c2.extra_attributes[extra_attribute_name], 29 | extra2_value) 30 | 31 | 32 | class TestNode(unittest.TestCase): 33 | def 
test_adding_monitoring_groups(self): 34 | mg1 = resources.MonitoringGroup('mg1') 35 | self.assertEqual(mg1.nodes, WeakValueDictionary()) 36 | mg2 = resources.MonitoringGroup('mg2') 37 | self.assertEqual(mg2.nodes, WeakValueDictionary()) 38 | node = resources.Node(name='node1', address='127.0.0.1', 39 | monitoring_groups=[mg1, mg2]) 40 | self.assertIn(node, mg1.nodes.values()) 41 | self.assertIn(node, mg2.nodes.values()) 42 | 43 | 44 | class TestCommand(unittest.TestCase): 45 | def test_format_command(self): 46 | command = "/bin/echo {{public}} {{__private.password}}" 47 | context = {'public': 'public'} 48 | private_context = {'password': 'mypassword'} 49 | c = resources.Command('echo', command) 50 | with self.assertRaises(MissingCommandContext): 51 | c.format_command(context) 52 | c_out = c.format_command(context, private_context) 53 | self.assertEquals(c_out, 54 | "/bin/echo public mypassword") 55 | -------------------------------------------------------------------------------- /scripts/nymms_create_suppression: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """CLI tool to create new reactor suppression filters""" 4 | 5 | import os 6 | import socket 7 | 8 | from nymms.schemas import Suppression 9 | from nymms.utils import cli, parse_time 10 | from nymms.suppress.sdb_suppress import SDBSuppressionManager 11 | from nymms.config import config 12 | 13 | import arrow 14 | 15 | 16 | def get_ipaddr(): 17 | """This is probably the most consistent & cross platform way 18 | to figure out what our IP address is""" 19 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 20 | s.connect(('google.com', 80)) 21 | ip = s.getsockname()[0] 22 | s.close() 23 | return ip 24 | 25 | ipaddr = get_ipaddr() 26 | userid = os.getlogin() 27 | 28 | parser = cli.NymmsCommandArgs(__doc__) 29 | parser.add_argument('-e', '--expires', dest='expires', default='+1h', 30 | help='ISO-8601 or +XXX[smhd] to expire in. 
' 31 | 'default: %(default)s') 32 | parser.add_argument('-f', '--filter', dest='filter', required=True, 33 | help='Regex to filter events with') 34 | parser.add_argument('-C', '--comment', dest='comment', required=True, 35 | help='Comment to record with filter') 36 | parser.add_argument('-i', '--ipaddr', dest='ipaddr', default=ipaddr, 37 | help='Override IP address to record: %(default)s') 38 | parser.add_argument('-u', '--userid', dest='userid', default=userid, 39 | help='Override userid to record: %(default)s') 40 | 41 | args = parser.parse_args() 42 | logger = cli.setup_logging(args.verbose) 43 | 44 | config.load_config(args.config) 45 | 46 | now = arrow.get() 47 | expires = parse_time(args.expires) 48 | 49 | if expires <= now: 50 | logger.error("Expires must be in the future") 51 | exit(1) 52 | 53 | logger.debug("Currently %s, will expire at %s.", now, expires) 54 | 55 | region = config.settings['region'] 56 | cache_timeout = config.settings['suppress']['cache_timeout'] 57 | domain = config.settings['suppress']['domain'] 58 | 59 | suppress = SDBSuppressionManager(region, cache_timeout, domain) 60 | 61 | suppress_obj = Suppression({ 62 | 'comment': args.comment, 63 | 'expires': expires, 64 | 'ipaddr': args.ipaddr, 65 | 'regex': args.filter, 66 | 'userid': args.userid}) 67 | suppress.add_suppression(suppress_obj) 68 | print "Suppression added: %s" % suppress_obj.rowkey 69 | -------------------------------------------------------------------------------- /nymms/reactor/handlers/pagerduty_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | from nymms.reactor.handlers.Handler import Handler 6 | 7 | from jinja2 import Template 8 | from nymms.utils.templates import SimpleUndefined 9 | 10 | try: 11 | import pagerduty 12 | except ImportError: 13 | logger.error("Unable to import the pagerduty module.") 14 | logger.error("Please install it from here: ") 15 | logger.error(" https://pypi.python.org/pypi/pagerduty/") 16 | logger.error("(You can use pip, ie: pip install pagerduty)") 17 | raise 18 | 19 | 20 | MISSING_SUBJECT = 'Handler %s missing subject_template.' 21 | 22 | 23 | class PagerDutyHandler(Handler): 24 | """ A basic handler to send alerts to PagerDuty via its incident API. 25 | Sends every result it receives by default. To filter 26 | results you should subclass this and provide a _filter method. 
27 | 28 | config options: 29 | enabled: bool 30 | subject_template: string, jinja2 Template string 31 | service_keys: list(string), pagerduty service keys 32 | filters: list(string), filters 33 | """ 34 | def _connect(self): 35 | if getattr(self, '_endpoints', None): 36 | return 37 | self._endpoints = [] 38 | service_keys = self.config.get('service_keys', []) 39 | if not service_keys: 40 | logger.warning("No service_keys configured for Handler %s.", 41 | self.__class__.__name__) 42 | return 43 | for key in service_keys: 44 | logger.debug("Initializing pagerduty service endpoint %s.", 45 | key) 46 | self._endpoints.append(pagerduty.PagerDuty(key)) 47 | 48 | def _send_incident(self, result, previous_state): 49 | self._connect() 50 | subject_template = self.config.get( 51 | 'subject_template', 52 | MISSING_SUBJECT % (self.__class__.__name__)) 53 | description = Template(subject_template) 54 | description.environment.undefined = SimpleUndefined 55 | result_data = result.serialize() 56 | for ep in self._endpoints: 57 | logger.debug("Submitting to pagerduty service_key %s.", 58 | ep.service_key) 59 | ep.trigger(description=description.render(result_data), 60 | incident_key=result.id, 61 | details=result_data) 62 | 63 | def _process(self, result, previous_state): 64 | self._send_incident(result, previous_state) 65 | -------------------------------------------------------------------------------- /nymms/probe/sqs_probe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | from boto.sqs.message import Message 7 | 8 | from nymms.schemas import Task 9 | from nymms.probe.Probe import Probe 10 | from nymms.state.sdb_state import SDBStateManager 11 | from nymms.utils.aws_helper import SNSTopic, ConnectionManager 12 | 13 | 14 | class SQSProbe(Probe): 15 | def __init__(self, region, task_queue, results_topic, state_domain, 16 | state_manager=SDBStateManager): 17 | self.region = region 18 | self.queue_name = task_queue 19 | self.topic_name = results_topic 20 | self.state_manager = state_manager(region, state_domain) 21 | 22 | self._conn = None 23 | self._queue = None 24 | self._topic = None 25 | 26 | super(SQSProbe, self).__init__() 27 | 28 | @property 29 | def conn(self): 30 | if not self._conn: 31 | self._conn = ConnectionManager(self.region) 32 | return self._conn 33 | 34 | @property 35 | def queue(self): 36 | if not self._queue: 37 | self._queue = self.conn.sqs.create_queue(self.queue_name) 38 | return self._queue 39 | 40 | @property 41 | def topic(self): 42 | if not self._topic: 43 | self._topic = SNSTopic(self.region, self.topic_name) 44 | return self._topic 45 | 46 | def get_task(self, **kwargs): 47 | wait_time = kwargs.get('queue_wait_time') 48 | timeout = kwargs.get('monitor_timeout') + 3 49 | logger.debug("Getting task from queue %s.", self.queue_name) 50 | task_item = self.queue.read(visibility_timeout=timeout, 51 | wait_time_seconds=wait_time) 52 | task = None 53 | if task_item: 54 | task = Task(json.loads(task_item.get_body()), origin=task_item) 55 | return task 56 | 57 | def resubmit_task(self, task, delay, **kwargs): 58 | task.increment_attempt() 59 | logger.debug("Resubmitting task %s with %d second delay.", task.id, 60 | delay) 61 | m = Message() 62 | m.set_body(json.dumps(task.serialize())) 63 | return self.queue.write(m, delay_seconds=delay) 64 | 65 | def submit_result(self, result, **kwargs): 66 | logger.debug("%s - submitting '%s/%s' result", result.id, 67 | 
result.state.name, result.state_type.name) 68 | return self.topic.publish(json.dumps(result.to_primitive())) 69 | 70 | def delete_task(self, task): 71 | self.queue.delete_message(task._origin) 72 | -------------------------------------------------------------------------------- /nymms/state/sdb_state.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | from boto.exception import SDBResponseError 6 | 7 | from nymms.state.State import StateManager 8 | from nymms.schemas import StateRecord 9 | from nymms.exceptions import OutOfDateState 10 | from nymms.providers.sdb import SimpleDBBackend 11 | 12 | 13 | class SDBStateManager(StateManager): 14 | def __init__(self, region, domain_name, schema_class=StateRecord): 15 | self.region = region 16 | self.domain_name = domain_name 17 | 18 | super(SDBStateManager, self).__init__(schema_class) 19 | 20 | @property 21 | def conn(self): 22 | return self.backend.conn 23 | 24 | @property 25 | def domain(self): 26 | return self.backend.domain 27 | 28 | def get_backend(self): 29 | return SimpleDBBackend(self.region, self.domain_name) 30 | 31 | def save_state(self, task_id, result, previous): 32 | new_state = self.build_new_state(task_id, result, previous) 33 | expected_value = ['last_update', False] 34 | if previous: 35 | expected_value = ['last_update', previous.last_update.isoformat()] 36 | if previous.last_update > new_state.last_update: 37 | logger.warning(task_id + " - found previous state that is " 38 | "newer than current state. Discarding.") 39 | logger.warning(task_id + " - previous state: %s", 40 | previous.to_primitive()) 41 | logger.warning(task_id + " - current state: %s", 42 | new_state.to_primitive()) 43 | raise OutOfDateState(new_state, previous) 44 | logger.debug(task_id + " - saving state: %s", 45 | new_state.to_primitive()) 46 | try: 47 | self.domain.put_attributes(task_id, new_state.to_primitive(), 48 | replace=True, 49 | expected_value=expected_value) 50 | except SDBResponseError as e: 51 | if e.error_code == 'ConditionalCheckFailed': 52 | logger.warning('last_update for %s was updated, skipping', 53 | task_id) 54 | return 55 | raise 56 | 57 | def get_old_states(self): 58 | query = ("select * from `%s` where `version` is null or " 59 | "`version` < '%s'" % (self.backend.domain_name, 60 | self.schema_class.CURRENT_VERSION)) 61 | return self.domain.select(query, consistent_read=True) 62 | -------------------------------------------------------------------------------- /scripts/nymms_list_suppressions: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """CLI tool to report on reactor suppression filters.""" 4 | 5 | import csv 6 | import sys 7 | from textwrap import TextWrapper 8 | 9 | from nymms.utils import cli 10 | from nymms.suppress.sdb_suppress import SDBSuppressionManager 11 | from nymms.config import config 12 | 13 | import arrow 14 | 15 | 16 | parser = cli.NymmsCommandArgs(__doc__) 17 | parser.add_argument('-f', '--format', dest='format', default='pretty', 18 | choices=('csv', 'pretty'), 19 | help='Output format. Choices: %(choices)s. 
Default: ' 20 | '%(default)s') 21 | parser.add_argument('-a', '--all', dest='show_all', action='store_true', 22 | help='Show all matching suppression filters, even ' 23 | 'inactive ones') 24 | 25 | args = parser.parse_args() 26 | logger = cli.setup_logging(args.verbose) 27 | 28 | config.load_config(args.config) 29 | 30 | now = arrow.get() 31 | 32 | region = config.settings['region'] 33 | cache_timeout = config.settings['suppress']['cache_timeout'] 34 | domain = config.settings['suppress']['domain'] 35 | 36 | suppress = SDBSuppressionManager(region, cache_timeout, domain) 37 | 38 | result, next_token = suppress.get_suppressions(None, args.show_all) 39 | if args.format == 'csv': 40 | print "regex,created,expires,userid,ipaddr,comment,rowkey,active" 41 | 42 | for item in result: 43 | if item.expires < now and not args.show_all: 44 | continue 45 | 46 | if args.format == 'pretty': 47 | comment_wrapper = TextWrapper(initial_indent=' comment: ', 48 | subsequent_indent=' ') 49 | print "regex: '%s'" % item.regex 50 | print " rowkey: %s" % item.rowkey 51 | print " created: %s (%d)" % (item.created, 52 | item.created.timestamp) 53 | print " %s" % (item.created.humanize()) 54 | print " expires: %s (%d)" % (item.expires, 55 | item.expires.timestamp) 56 | print " %s" % (item.expires.humanize()) 57 | print " author: %s@%s" % (item.userid, item.ipaddr) 58 | print " state: %s" % item.state 59 | print '\n'.join(comment_wrapper.wrap(item.comment)) 60 | print 61 | elif args.format == 'csv': 62 | writer = csv.writer(sys.stdout) 63 | writer.writerow([item.regex, item.created, item.expires, 64 | item.userid, item.ipaddr, item.comment, item.rowkey, 65 | item.active]) 66 | -------------------------------------------------------------------------------- /nymms/scheduler/lock/SDBLock.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | import time 6 | 7 | from boto.exception import SDBResponseError 8 | 9 | from nymms.scheduler.lock.SchedulerLock import SchedulerLock 10 | 11 | 12 | class SDBLock(SchedulerLock): 13 | def __init__(self, duration, conn, domain_name, 14 | lock_name="scheduler_lock"): 15 | super(SDBLock, self).__init__(duration, lock_name) 16 | self.conn = conn 17 | self.domain_name = domain_name 18 | self.domain = None 19 | self.lock = None 20 | 21 | def setup_domain(self): 22 | if self.domain: 23 | return 24 | logger.debug("Setting up lock domain %s", self.domain_name) 25 | self.domain = self.conn.create_domain(self.domain_name) 26 | 27 | def acquire(self): 28 | logger.debug("Attempting to acquire lock %s:%s", self.domain_name, 29 | self.lock_name) 30 | self.setup_domain() 31 | now = int(time.time()) 32 | existing_lock = self.domain.get_item(self.lock_name, 33 | consistent_read=True) 34 | lock_body = {'expiry': now + self.duration, 35 | 'timestamp': now, 36 | 'owner': self.id} 37 | expected_value = ['timestamp', False] 38 | if existing_lock: 39 | logger.debug("Existing lock found: %s", existing_lock) 40 | existing_ts = existing_lock['timestamp'] 41 | if not existing_lock['owner'] == self.id: 42 | if not self.lock_expired(existing_lock['expiry'], now): 43 | logger.debug("Lock still valid, not taking over.") 44 | return False 45 | else: 46 | logger.info("Lock expired, attempting takeover.") 47 | else: 48 | logger.info("I already own the lock, updating.") 49 | expected_value = ['timestamp', existing_ts] 50 | 51 | try: 52 | self.domain.put_attributes(self.lock_name, lock_body, 53 | 
replace=bool(existing_lock), 54 | expected_value=expected_value) 55 | self.lock = lock_body 56 | logger.debug("Acquired lock %s:%s", self.domain_name, 57 | self.lock_name) 58 | return True 59 | except SDBResponseError as e: 60 | if e.status == 409: 61 | logger.debug('Looks like someone else has acquired the lock.') 62 | return False 63 | raise 64 | return False 65 | -------------------------------------------------------------------------------- /nymms/utils/aws_helper.py: -------------------------------------------------------------------------------- 1 | import boto 2 | import imp 3 | 4 | CONNECT_MAP = { 5 | 'beanstalk': ['beanstalk'], 6 | 'cloudformation': ['cloudformation'], 7 | 'cloudsearch': ['cloudsearch'], 8 | 'dynamodb': ['dynamodb'], 9 | 'dynamodb2': ['dynamodb2'], 10 | 'ec2': ['ec2'], 11 | 'autoscale': ['ec2', 'autoscale'], 12 | 'cloudwatch': ['ec2', 'cloudwatch'], 13 | 'elb': ['ec2', 'elb'], 14 | 'elasticache': ['elasticache'], 15 | 'elastictranscoder': ['elastictranscoder'], 16 | 'emr': ['emr'], 17 | 'glacier': ['glacier'], 18 | 'iam': ['iam'], 19 | 'rds': ['rds'], 20 | 'redshift': ['redshift'], 21 | 'route53': ['route53'], 22 | 's3': ['s3'], 23 | 'sdb': ['sdb'], 24 | 'ses': ['ses'], 25 | 'sns': ['sns'], 26 | 'sqs': ['sqs'], 27 | 'sts': ['sts'], 28 | 'support': ['support'], 29 | 'swf': ['swf'], 30 | 'vpc': ['vpc'], 31 | } 32 | 33 | 34 | class ConnectionManager(object): 35 | """ Used to setup and maintain AWS service connections in a single region. 36 | 37 | This acts as a proxy for AWS connections to all AWS services that boto 38 | provides a connect_to_region method for. 39 | """ 40 | def __init__(self, region='us-east-1', **kw_params): 41 | self.region = region 42 | self.params = kw_params 43 | 44 | def __getattr__(self, attr): 45 | try: 46 | modules = CONNECT_MAP[attr] 47 | except KeyError: 48 | raise AttributeError(attr) 49 | 50 | current_mod = boto 51 | for m in modules: 52 | x = imp.find_module(m, current_mod.__path__) 53 | current_mod = imp.load_module(attr, *x) 54 | connect_to_region = getattr(current_mod, 'connect_to_region') 55 | conn = connect_to_region(self.region, **self.params) 56 | setattr(self, attr, conn) 57 | return conn 58 | 59 | 60 | class SNSTopic(object): 61 | def __init__(self, region, topic_name): 62 | self.region = region 63 | self.topic_name = topic_name 64 | 65 | self._conn = None 66 | self.topic_arn = None 67 | 68 | @property 69 | def conn(self): 70 | if not self._conn: 71 | self._conn = ConnectionManager(self.region).sns 72 | response = self.conn.create_topic( 73 | self.topic_name)['CreateTopicResponse'] 74 | self.topic_arn = response['CreateTopicResult']['TopicArn'] 75 | return self._conn 76 | 77 | def publish(self, *args, **kwargs): 78 | return self.conn.publish(self.topic_arn, *args, **kwargs) 79 | 80 | def subscribe_sqs_queue(self, *args, **kwargs): 81 | return self.conn.subscribe_sqs_queue(self.topic_arn, *args, **kwargs) 82 | -------------------------------------------------------------------------------- /nymms/state/State.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | from nymms.schemas import StateRecord, types 6 | 7 | 8 | class StateManager(object): 9 | def __init__(self, schema_class=StateRecord): 10 | self._backend = None 11 | self.schema_class = schema_class 12 | self.migrate() 13 | logger.debug("%s initialized.", self.__class__.__name__) 14 | 15 | @property 16 | def backend(self): 17 | if not self._backend: 18 | self._backend 
= self.get_backend() 19 | return self._backend 20 | 21 | def get_backend(self, *args, **kwargs): 22 | raise NotImplementedError 23 | 24 | def build_new_state(self, task_id, result, previous): 25 | new_state = self.schema_class({'id': task_id, 26 | 'state': result.state, 27 | 'state_type': result.state_type}) 28 | # Only update last_state_change if the state has changed to a new 29 | # HARD state_type state, otherwise we use the previous 30 | # last_state_change 31 | if previous: 32 | if (new_state.state_type is types.STATE_TYPE_SOFT or 33 | previous.state is new_state.state): 34 | new_state.last_state_change = previous.last_state_change 35 | 36 | new_state.validate() 37 | return new_state 38 | 39 | def deserialize(self, item, strict=False): 40 | try: 41 | item_obj = self.schema_class(item, strict=strict, origin=item) 42 | item_obj.validate() 43 | return item_obj 44 | except Exception: 45 | logger.exception("Problem deserializing item:") 46 | logger.error("Data: %s", str(item)) 47 | return None 48 | 49 | def get_state(self, task_id): 50 | item = self.backend.get(task_id) 51 | if item: 52 | return self.deserialize(item) 53 | return None 54 | 55 | def delete_record(self, record): 56 | return self.backend.purge(record) 57 | 58 | def filter(self, *args, **kwargs): 59 | result, next_token = self.backend.filter(*args, **kwargs) 60 | return ([self.deserialize(i) for i in result], next_token) 61 | 62 | def migrate(self): 63 | """ Temporary method, used to update all state records to the new 64 | format using schematics & arrow. 65 | """ 66 | for item in self.get_old_states(): 67 | new_state = self.schema_class.migrate(item) 68 | old_key = item['id'] 69 | self.backend.purge(old_key) 70 | if new_state: 71 | logger.debug("Migrating old state %s.", old_key) 72 | self.backend.put(new_state) 73 | -------------------------------------------------------------------------------- /nymms/utils/commands.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import signal 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class CommandException(Exception): 9 | pass 10 | 11 | 12 | class CommandTimeout(CommandException): 13 | def __init__(self, command, timeout): 14 | self.command = command 15 | self.timeout = timeout 16 | 17 | def __str__(self): 18 | return "Command '%s' took longer than %d seconds to execute." % ( 19 | self.command, self.timeout) 20 | 21 | 22 | class CommandFailure(CommandException): 23 | def __init__(self, command, return_code, output): 24 | self.command = command 25 | self.return_code = return_code 26 | self.output = output 27 | 28 | def __str__(self): 29 | return "Command '%s' exited with a return code of %d." % ( 30 | self.command, self.return_code,) 31 | 32 | 33 | def execute(command_string, timeout=None): 34 | """ 35 | Execute a command with an optional timeout. If the command takes longer 36 | than timeout raise a CommandTimeout exception. If the command fails raise 37 | a CommandFailure exception. Otherwise return stdout & stderr from the 38 | command. 
39 | """ 40 | def handle_sigalrm(signum, frame): 41 | if signum == signal.SIGALRM: 42 | logger.debug("Command '%s' timed out after %d seconds.", 43 | command_string, timeout) 44 | raise CommandTimeout(command_string, timeout) 45 | signal.signal(signal.SIGALRM, handle_sigalrm) 46 | log_header = "Executing command:" 47 | # If a timeout is given, lets setup an alarm signal 48 | if timeout: 49 | log_header += " (timeout: %d)" % (timeout) 50 | signal.alarm(timeout) 51 | # Execute the command 52 | logger.debug(log_header) 53 | logger.debug(" %s", command_string) 54 | command_object = subprocess.Popen(command_string, shell=True, 55 | stdout=subprocess.PIPE, 56 | stderr=subprocess.STDOUT) 57 | try: 58 | output = command_object.communicate() 59 | except CommandTimeout: 60 | logger.debug("Command timed out, terminating child command.") 61 | command_object.terminate() 62 | raise 63 | if not command_object.returncode == 0: 64 | signal.alarm(0) 65 | logger.debug("Command '%s' failed with return code %d:", 66 | command_string, command_object.returncode) 67 | for line in output[0].split('\n'): 68 | logger.debug(" output: %s", line) 69 | raise CommandFailure(command_string, command_object.returncode, 70 | output[0]) 71 | signal.alarm(0) 72 | return output[0] 73 | -------------------------------------------------------------------------------- /nymms/scheduler/Scheduler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from nymms.daemon import NymmsDaemon 5 | from nymms.resources import Node 6 | from nymms.scheduler.lock.SchedulerLock import NoOpLock 7 | from nymms.schemas import Task 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class Scheduler(NymmsDaemon): 13 | task_id_template = "{node[name]}:{monitor[name]}" 14 | 15 | def __init__(self, node_backend, lock=None): 16 | self._node_backend = node_backend 17 | if not lock: 18 | lock = NoOpLock() 19 | 20 | self._lock = lock 21 | super(Scheduler, self).__init__() 22 | 23 | def get_tasks(self): 24 | tasks = {} 25 | self._node_backend.load_nodes() 26 | nodes = Node.registry 27 | for node_name, node in nodes.iteritems(): 28 | tasks[node_name] = node.monitors 29 | return tasks 30 | 31 | def submit_task(self, task, **kwargs): 32 | raise NotImplementedError 33 | 34 | def run(self, **kwargs): 35 | interval = kwargs.get('interval') 36 | while True: 37 | start = time.time() 38 | if self._lock.acquire(): 39 | self.run_once(**kwargs) 40 | run_time = time.time() - start 41 | logger.info("Scheduler iteration took %d seconds.", run_time) 42 | sleep_time = interval - max(run_time, 0) 43 | logger.info("Scheduler sleeping for %d seconds.", sleep_time) 44 | else: 45 | # Only sleep for 10 seconds before checking the lock again 46 | # when we don't acquire the lock. Allows for faster takeover. 47 | sleep_time = 10 48 | logger.info("Failed to acquire lock, sleeping for %d seconds.", 49 | sleep_time) 50 | time.sleep(sleep_time) 51 | 52 | def run_once(self, **kwargs): 53 | tasks = self.get_tasks() 54 | # This is done to make sure we submit one task per node until we've 55 | # submitted all the tasks. 
This helps ensure we don't hammer a 56 | # single node with monitoring tasks 57 | while True: 58 | working_index = tasks.keys() 59 | if not working_index: 60 | break 61 | for node in working_index: 62 | try: 63 | task_context = tasks[node].pop() 64 | task_id = self.task_id_template.format(**task_context) 65 | task = Task({ 66 | 'id': task_id, 67 | 'context': task_context}) 68 | except IndexError: 69 | del(tasks[node]) 70 | continue 71 | self.submit_task(task, **kwargs) 72 | -------------------------------------------------------------------------------- /nymms/reactor/filters/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from nymms.schemas import Result, StateRecord, types 4 | from nymms.reactor import filters 5 | 6 | 7 | class TestFilters(unittest.TestCase): 8 | def setUp(self): 9 | self.result = Result({'id': 'test:filter', 10 | 'state': types.STATE_OK, 11 | 'state_type': types.STATE_TYPE_HARD}) 12 | self.result.validate() 13 | self.record = StateRecord({ 14 | 'id': 'test:filter', 15 | 'state': types.STATE_OK, 16 | 'state_type': types.STATE_TYPE_HARD}) 17 | self.record.validate() 18 | 19 | def test_hard_state(self): 20 | self.assertTrue(filters.hard_state(self.result, self.record)) 21 | 22 | self.result.state_type = types.STATE_TYPE_SOFT 23 | self.result.validate() 24 | self.assertFalse(filters.hard_state(self.result, self.record)) 25 | 26 | def test_ok_state(self): 27 | self.assertTrue(filters.ok_state(self.result, self.record)) 28 | 29 | self.result.state = types.STATE_WARNING 30 | self.result.validate() 31 | self.assertFalse(filters.ok_state(self.result, self.record)) 32 | 33 | def test_not_ok_state(self): 34 | self.assertFalse(filters.not_ok_state(self.result, self.record)) 35 | 36 | self.result.state = types.STATE_WARNING 37 | self.result.validate() 38 | self.assertTrue(filters.not_ok_state(self.result, self.record)) 39 | 40 | def test_warning_state(self): 41 | self.assertFalse(filters.warning_state(self.result, self.record)) 42 | 43 | self.result.state = types.STATE_WARNING 44 | self.result.validate() 45 | self.assertTrue(filters.warning_state(self.result, self.record)) 46 | 47 | def test_critical_state(self): 48 | self.assertFalse(filters.critical_state(self.result, self.record)) 49 | 50 | self.result.state = types.STATE_CRITICAL 51 | self.result.validate() 52 | self.assertTrue(filters.critical_state(self.result, self.record)) 53 | 54 | def test_unknown_state(self): 55 | self.assertFalse(filters.unknown_state(self.result, self.record)) 56 | 57 | self.result.state = types.STATE_UNKNOWN 58 | self.result.validate() 59 | self.assertTrue(filters.unknown_state(self.result, self.record)) 60 | 61 | def test_changed_state(self): 62 | f = filters.changed_state 63 | self.assertFalse(f(self.result, self.record)) 64 | 65 | self.assertTrue(f(self.result, None)) 66 | 67 | self.result.state = types.STATE_CRITICAL 68 | self.result.validate() 69 | self.assertTrue(f(self.result, self.record)) 70 | 71 | self.result.state = types.STATE_OK 72 | self.result.state_type = types.STATE_TYPE_SOFT 73 | self.result.validate() 74 | self.assertTrue(f(self.result, self.record)) 75 | -------------------------------------------------------------------------------- /nymms/reactor/filters/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | from nymms.schemas import types 6 | 7 | 8 | def always_true(result, previous_state): 9 
| """ Not really necessary since no filters results in an always true result, 10 | but this is useful to show an example of what a filter is without actually 11 | doing anything. 12 | """ 13 | return True 14 | 15 | 16 | def hard_state(result, previous_state): 17 | if result.state_type == types.STATE_TYPE_HARD: 18 | logger.debug("%s state_type is HARD.", result.id) 19 | return True 20 | return False 21 | 22 | 23 | def changed_state(result, previous_state): 24 | """ Only alert if the state is new or has either changed state or 25 | state_type. 26 | """ 27 | if not previous_state: 28 | logger.debug("No previous state found.") 29 | return True 30 | if not previous_state.state == result.state: 31 | logger.debug("Previous state (%s) does not match current " 32 | "state (%s).", previous_state.state.name, 33 | result.state.name) 34 | return True 35 | if not previous_state.state_type == result.state_type: 36 | logger.debug("Previous state_type (%s) does not match current " 37 | "state_type (%s).", 38 | previous_state.state.name, 39 | result.state_type.name) 40 | return True 41 | return False 42 | 43 | 44 | def ok_state(result, previous_state): 45 | if result.state == types.STATE_OK: 46 | return True 47 | return False 48 | 49 | 50 | def warning_state(result, previous_state): 51 | if result.state == types.STATE_WARNING: 52 | return True 53 | return False 54 | 55 | 56 | def critical_state(result, previous_state): 57 | if result.state == types.STATE_CRITICAL: 58 | return True 59 | return False 60 | 61 | 62 | def unknown_state(result, previous_state): 63 | if result.state >= types.STATE_UNKNOWN: 64 | return True 65 | return False 66 | 67 | 68 | def not_ok_state(result, previous_state): 69 | return not(ok_state(result, previous_state)) 70 | 71 | 72 | def passive_command(result, previous_state): 73 | return result.task_context['command_type'] == 'passive' 74 | 75 | 76 | def active_command(result, previous_state): 77 | return not passive_command(result, previous_state) 78 | 79 | 80 | def not_soft_recovery(result, previous_state): 81 | if (previous_state and 82 | previous_state.state_type == types.STATE_TYPE_SOFT and 83 | result.state == types.STATE_OK): 84 | return False 85 | return True 86 | 87 | 88 | def no_previous(result, previous_state): 89 | return not previous_state 90 | 91 | 92 | def not_first_ok(result, previous_state): 93 | if no_previous(result, previous_state) and result.state == types.STATE_OK: 94 | return False 95 | return True 96 | -------------------------------------------------------------------------------- /nymms/reactor/aws_reactor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import json 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | from nymms.reactor.Reactor import Reactor 7 | from nymms.suppress.sdb_suppress import SDBSuppressionManager 8 | from nymms.utils.aws_helper import SNSTopic, ConnectionManager 9 | from nymms.state.sdb_state import SDBStateManager 10 | from nymms.schemas import Result 11 | 12 | from boto.sqs.message import RawMessage 13 | 14 | 15 | class AWSReactor(Reactor): 16 | def __init__(self, region, topic_name, state_domain_name, queue_name, 17 | suppress_domain_name, suppress_cache_timeout=60, 18 | state_manager=SDBStateManager, 19 | suppression_manager=SDBSuppressionManager): 20 | super(AWSReactor, self).__init__() 21 | self.region = region 22 | self.topic_name = topic_name 23 | self.queue_name = queue_name 24 | 25 | self._conn = None 26 | self._queue = None 27 | 28 | self.state_manager = 
state_manager(region, state_domain_name) 29 | self.suppression_manager = suppression_manager(region, 30 | suppress_cache_timeout, 31 | suppress_domain_name) 32 | 33 | @property 34 | def conn(self): 35 | if not self._conn: 36 | self._conn = ConnectionManager(self.region) 37 | return self._conn 38 | 39 | @property 40 | def queue(self): 41 | if not self._queue: 42 | self._queue = self.conn.sqs.create_queue(self.queue_name) 43 | self._queue.set_message_class(RawMessage) 44 | topic = SNSTopic(self.region, self.topic_name) 45 | topic.subscribe_sqs_queue(self.queue) 46 | return self._queue 47 | 48 | def get_result(self, **kwargs): 49 | wait_time = kwargs.get('wait_time', 0) 50 | visibility_timeout = kwargs.get('visibility_timeout', None) 51 | 52 | logger.debug("Getting result from queue %s.", self.queue_name) 53 | result = self.queue.read(visibility_timeout=visibility_timeout, 54 | wait_time_seconds=wait_time) 55 | result_obj = None 56 | if result: 57 | result_message = json.loads(result.get_body())['Message'] 58 | result_dict = json.loads(result_message) 59 | # Not sure why these fields are sometimes serialized but 60 | # mostly not... regardless they cause problems because they 61 | # are just properties of the model and not fields. 62 | result_dict.pop('state_name', None) 63 | result_dict.pop('state_type_name', None) 64 | try: 65 | result_obj = Result(result_dict, origin=result) 66 | result_obj.validate() 67 | except Exception as e: 68 | logger.debug('Got unexpected message: %s', result_dict) 69 | logger.exception( 70 | 'Error reading result from queue: %s', e.message) 71 | return result_obj 72 | 73 | def delete_result(self, result): 74 | self.queue.delete_message(result._origin) 75 | -------------------------------------------------------------------------------- /nymms/providers/sdb.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | from nymms.utils.aws_helper import ConnectionManager 6 | from nymms.schemas import OriginModel 7 | 8 | 9 | class SimpleDBBackend(object): 10 | OPERATOR_MAP = { 11 | 'lt': '<', 12 | 'gt': '>', 13 | 'eq': '=', 14 | 'ne': '!=', 15 | 'gte': '>=', 16 | 'lte': '<=', 17 | 'like': 'like', 18 | 'notlike': 'not like'} 19 | 20 | def __init__(self, region, domain_name): 21 | self.region = region 22 | self.domain_name = domain_name 23 | 24 | self._conn = None 25 | self._domain = None 26 | 27 | @property 28 | def conn(self): 29 | if not self._conn: 30 | self._conn = ConnectionManager(self.region) 31 | return self._conn 32 | 33 | @property 34 | def domain(self): 35 | if not self._domain: 36 | self._domain = self.conn.sdb.create_domain(self.domain_name) 37 | return self._domain 38 | 39 | def get(self, item_id, consistent_read=True): 40 | logger.debug("getting item '%s'", item_id) 41 | item = self.domain.get_item(item_id, consistent_read=consistent_read) 42 | if not item: 43 | logger.debug("Item %s not found.", item_id) 44 | return None 45 | return item 46 | 47 | def filter(self, filters=None, order_by=None, consistent_read=True, 48 | max_items=None, next_token=None): 49 | order_by = order_by 50 | 51 | query = "select * from %s" % self.domain_name 52 | if filters: 53 | query += " where " 54 | query += ' and '.join(filters) 55 | 56 | # TODO: This is kind of a weak way of dealing with the fact that in 57 | # order to order by something it has to be specified in the where 58 | # field. 
59 | if order_by and order_by in query: 60 | query += " order by `%s`" % order_by 61 | 62 | results = [] 63 | 64 | if max_items: 65 | max_items = int(max_items) 66 | query += " limit %s" % max_items 67 | 68 | logger.debug("Executing query: %s", query) 69 | query_results = self.domain.select(query, 70 | consistent_read=consistent_read, 71 | max_items=max_items, 72 | next_token=next_token) 73 | for item in query_results: 74 | results.append(item) 75 | 76 | _next_token = query_results.next_token 77 | if _next_token: 78 | _next_token = _next_token.replace('\n', '') 79 | return (results, _next_token) 80 | 81 | def purge(self, item_or_key): 82 | """ Deletes from the datastore entirely. Shouldn't be used in most 83 | cases. """ 84 | if isinstance(item_or_key, OriginModel): 85 | return item_or_key._origin.delete() 86 | return self.domain.delete_attributes(item_or_key) 87 | 88 | def put(self, item, context=None): 89 | key = getattr(item, item.pk) 90 | logger.debug("Added %s to %s.", key, self.domain_name) 91 | return self.domain.put_attributes(key, 92 | item.to_primitive(context=context)) 93 | -------------------------------------------------------------------------------- /nymms/config/yaml_config.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import logging 4 | import hashlib 5 | import re 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | import yaml 10 | 11 | from nymms import exceptions 12 | 13 | include_regex = re.compile(r'^(?P<indent>\s*)\!include (?P<path>[^\s]+)$') 14 | 15 | 16 | class EmptyConfig(exceptions.NymmsException): 17 | def __init__(self, filename): 18 | self.filename = filename 19 | 20 | def __str__(self): 21 | return "Config file %s resulted in an empty config." % (self.filename) 22 | 23 | 24 | def open_config_file(config_file): 25 | """ Opens a config file, logging common IOError exceptions. 26 | """ 27 | try: 28 | return open(config_file) 29 | except IOError, e: 30 | # This should only happen with the top level config file, since 31 | # we use glob.glob on includes 32 | if e.errno == 2: 33 | logger.error("Could not find file '%s'.", config_file) 34 | elif e.errno == 13: 35 | logger.error("Invalid permissions to open '%s'.", config_file) 36 | raise 37 | 38 | 39 | def load_config(config_file): 40 | stack = [] 41 | root = os.path.dirname(os.path.abspath(os.path.expanduser(config_file))) 42 | 43 | def recursive_preprocess(filename, indent=''): 44 | filename = os.path.expanduser(filename) 45 | stack.append(os.path.abspath(filename)) 46 | c = [] 47 | 48 | with open_config_file(filename) as fd: 49 | for lineno, line in enumerate(fd): 50 | line = line.rstrip() 51 | match = include_regex.match(line) 52 | if match: 53 | path = match.group('path') 54 | new_indent = indent + match.group('indent') 55 | # if the include doesn't have a fully qualified path then 56 | # assume the relative path is based off the directory of 57 | # the initial config file 58 | if not path.startswith('/'): 59 | path = os.path.join(root, path) 60 | files = glob.glob(path) 61 | if not files: 62 | logger.warning("Include statement '%s' at %s:%d did " 63 | "not match any files. 
Skipping.", line, 64 | filename, lineno) 65 | continue 66 | for f in files: 67 | f = os.path.abspath(f) 68 | if f in stack: 69 | logger.warning("Already parsed %s, skipping " 70 | "(%s:%d) to avoid infinite loop.", 71 | f, filename, lineno) 72 | continue 73 | if os.path.isfile(f): 74 | logger.debug("Parsing include (%s:%d): %s", 75 | filename, lineno, f) 76 | c.extend(recursive_preprocess(f, indent)) 77 | else: 78 | logger.warning("%s is not a regular file, " 79 | "skipping (%s:%d).", f, filename, 80 | lineno) 81 | continue 82 | c.append(indent + line) 83 | return c 84 | logger.debug("Loading config file: %s", config_file) 85 | config = recursive_preprocess(config_file) 86 | if not config: 87 | raise EmptyConfig(config_file) 88 | config = os.linesep.join(config) 89 | version = hashlib.sha512(config).hexdigest() 90 | return (version, yaml.safe_load(config)) 91 | -------------------------------------------------------------------------------- /nymms/api/routes.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import arrow 3 | 4 | from flask import request 5 | from flask.ext.api import FlaskAPI 6 | from flask.ext.api import status 7 | 8 | from schematics.exceptions import ValidationError 9 | 10 | from nymms.state import sdb_state 11 | from nymms.suppress import sdb_suppress 12 | from nymms.config import config 13 | from nymms import schemas 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | nymms_api = FlaskAPI(__name__) 19 | 20 | 21 | DEFAULT_RESULT_LIMIT = 1000 22 | DEFAULT_SUPPRESSION_LIMIT = 1000 23 | DEFAULT_STATE_LIMIT = 1000 24 | 25 | 26 | @nymms_api.route('/state', methods=['GET']) 27 | def state(): 28 | """ 29 | List current states. 30 | 31 | Query Params: 32 | - limit (default 1000) 33 | """ 34 | region = config.settings['region'] 35 | domain = config.settings['state_domain'] 36 | 37 | state = sdb_state.SDBStateManager(region, domain, 38 | schema_class=schemas.APIStateRecord) 39 | args = request.args.to_dict(flat=True) 40 | limit = int(args.get('limit', DEFAULT_STATE_LIMIT)) 41 | states, _ = state.filter( 42 | filters=request.args, 43 | max_items=limit) 44 | return [s.to_primitive() for s in states] 45 | 46 | 47 | @nymms_api.route('/suppress', methods=['GET', 'POST']) 48 | def suppress(): 49 | """ 50 | List or create suppressions. 
51 | 52 | Query Params: 53 | - limit (default 1000) 54 | - show_inactive (default False) 55 | """ 56 | region = config.settings['region'] 57 | cache_timeout = config.settings['suppress']['cache_timeout'] 58 | domain = config.settings['suppress']['domain'] 59 | 60 | mgr = sdb_suppress.SDBSuppressionManager( 61 | region, cache_timeout, domain, schema_class=schemas.APISuppression) 62 | 63 | if request.method == 'POST': 64 | data = request.data 65 | 66 | suppress_obj = schemas.APISuppression(data) 67 | try: 68 | suppress_obj.validate() 69 | except ValidationError as e: 70 | return e.messages, status.HTTP_400_BAD_REQUEST 71 | 72 | now = arrow.get() 73 | if suppress_obj.expires <= now: 74 | return ( 75 | {'expires': 'expires must be in the future'}, 76 | status.HTTP_400_BAD_REQUEST 77 | ) 78 | mgr.add_suppression(suppress_obj) 79 | return suppress_obj.to_primitive(), status.HTTP_201_CREATED 80 | 81 | # request.method == 'GET' 82 | args = request.args.to_dict(flat=True) 83 | limit = int(args.get('limit', DEFAULT_SUPPRESSION_LIMIT)) 84 | filters = ["`expires` > '0'"] 85 | if not args.get('show_inactive', False): 86 | filters.append("`disabled` is null") 87 | 88 | suppressions, _ = mgr.filter( 89 | filters=filters, max_items=limit) 90 | return [s.to_primitive() for s in suppressions] 91 | 92 | 93 | @nymms_api.route("/suppress/<key>/", methods=['GET', 'DELETE']) 94 | def suppress_detail(key): 95 | """ 96 | View, deactivate, or delete suppressions. 97 | 98 | Query Params: 99 | - hard_delete (default False) 100 | """ 101 | region = config.settings['region'] 102 | cache_timeout = config.settings['suppress']['cache_timeout'] 103 | domain = config.settings['suppress']['domain'] 104 | 105 | mgr = sdb_suppress.SDBSuppressionManager( 106 | region, cache_timeout, domain, schema_class=schemas.APISuppression) 107 | 108 | item = mgr.get(key) 109 | if request.method == 'DELETE': 110 | if request.args.get('hard_delete', False): 111 | mgr.backend.purge(item) 112 | else: 113 | mgr.deactivate_suppression(key) 114 | return item.to_primitive() 115 | -------------------------------------------------------------------------------- /nymms/reactor/handlers/Handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | from nymms.utils import load_object_from_string 6 | 7 | 8 | class Handler(object): 9 | def __init__(self, config=None): 10 | self.config = config 11 | self._filters = [] 12 | self._suppression_enabled = self.config.pop( 13 | 'suppression_enabled', 14 | False) 15 | logger.debug("%s suppression enabled is %s", 16 | self.__class__.__name__, 17 | self._suppression_enabled) 18 | 19 | def _load_filters(self): 20 | filters = self.config.get('filters', []) 21 | if filters: 22 | for filter_string in filters: 23 | logger.debug("Adding Filter %s to Handler %s.", filter_string, 24 | self.__class__.__name__) 25 | f = load_object_from_string(filter_string) 26 | self._filters.append(f) 27 | else: 28 | logger.debug("No filters configured for Handler %s.", 29 | self.__class__.__name__) 30 | 31 | def _filter(self, result, previous_state): 32 | """ Runs the result & previous state through all the configured 33 | filters. A filter should be a callable that accepts two arguments: 34 | the result and the previous state. It should return either True or 35 | False regarding whether the message should be allowed through the 36 | handler. 
37 | """ 38 | if not self._filters: 39 | self._load_filters() 40 | # Assume that no filters means just that - that the result is 41 | # not to be filtered for the handler. 42 | if not self._filters: 43 | return True 44 | results = {} 45 | for f in self._filters: 46 | try: 47 | results[f.__name__] = f(result, previous_state) 48 | except Exception as e: 49 | logger.exception("Filter %s on Handler %s had an unhandled " 50 | "exception. Ignoring: %s", 51 | f.__name__, self.__class__.__name__, e) 52 | continue 53 | logger.debug("Handler %s filter results: %s", self.__class__.__name__, 54 | results) 55 | return all(results.values()) 56 | 57 | def process(self, result, previous_state, is_suppressed): 58 | """First checks to see if the given event should be filtered and 59 | then sees if it passes the suppressor (if enabled). If pass, then 60 | call the subclass's _process() method""" 61 | cname = self.__class__.__name__ 62 | if self._filter(result, previous_state): 63 | if not self.suppression_enabled: 64 | logger.debug("Handler %s filters returned true for %s", 65 | cname, result.id) 66 | return self._process(result, previous_state) 67 | elif self.suppression_enabled and not is_suppressed(result): 68 | logger.debug("Handler %s filters & suppressor returned true" 69 | " for %s, reacting.", cname, result.id) 70 | return self._process(result, previous_state) 71 | else: 72 | logger.debug("Handler %s suppressor returned false" 73 | " for %s, skipping.", cname, result.id) 74 | else: 75 | logger.debug("Handler %s filters returned false for %s, skipping.", 76 | cname, result.id) 77 | 78 | def _process(self, result, previous_state): 79 | """ Meant to be overridden by subclasses - should handle the actual 80 | process of reacting to a result. 81 | """ 82 | raise NotImplementedError 83 | 84 | @property 85 | def suppression_enabled(self): 86 | """Are suppressions enabled for this handler?""" 87 | return self._suppression_enabled 88 | -------------------------------------------------------------------------------- /docs/demo.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Demo AMI 3 | ======== 4 | 5 | In order to give people something easy to start playing with (and to alleviate 6 | my shame in not having amazing documentation yet) I've gone ahead and started 7 | creating Demo AMIs in Amazon AWS. These AMIs come up with a complete, 8 | all-in-one (ie: all daemons) instance that has a very basic configuration 9 | that can be used to play with NYMMS and get used to the system. 10 | 11 | Currently the AMIs are only being built in **us-west-2 (ie: oregon)** but if 12 | you have interest in running the AMI elsewhere contact me and I'll see about 13 | spinning one up for you. 14 | 15 | You can find the AMIs by searching in the EC2 console in **us-west-2** for 16 | **nymms**. The AMIs are named with a timestamp like so: 17 | 18 | *nymms-ubuntu-precise-20131014-215959* 19 | 20 | Once you launch the AMI (I suggest using an m1.medium, though it MAY be 21 | possible to use an m1.small) you'll need to provide it with the correct access 22 | to the various AWS services (SQS, SNS, SES, SDB) that NYMMS makes use of. 23 | 24 | This can be done one of two ways: 25 | 26 | - You can create an instance role with the appropriate permissions (given 27 | below) and assign the instance to it. 
28 | - You can create an IAM user and assign the appropriate permissions, then 29 | take their API credentials and put them in **/etc/default/nymms-common** 30 | 31 | 32 | The first way is the more secure, but the second is the easiest. Here's an 33 | example permission policy that should work:: 34 | 35 | { 36 | "Version": "2012-10-17", 37 | "Statement": [ 38 | { 39 | "Action": [ 40 | "ses:GetSendQuota", 41 | "ses:SendEmail" 42 | ], 43 | "Sid": "NymmsSESAccess", 44 | "Resource": [ 45 | "*" 46 | ], 47 | "Effect": "Allow" 48 | }, 49 | { 50 | "Action": [ 51 | "sns:ConfirmSubscription", 52 | "sns:CreateTopic", 53 | "sns:DeleteTopic", 54 | "sns:GetTopicAttributes", 55 | "sns:ListSubscriptions", 56 | "sns:ListSubscriptionsByTopic", 57 | "sns:ListTopics", 58 | "sns:Publish", 59 | "sns:SetTopicAttributes", 60 | "sns:Subscribe", 61 | "sns:Unsubscribe" 62 | ], 63 | "Sid": "NymmsSNSAccess", 64 | "Resource": [ 65 | "*" 66 | ], 67 | "Effect": "Allow" 68 | }, 69 | { 70 | "Action": [ 71 | "sqs:ChangeMessageVisibility", 72 | "sqs:CreateQueue", 73 | "sqs:DeleteMessage", 74 | "sqs:DeleteQueue", 75 | "sqs:GetQueueAttributes", 76 | "sqs:GetQueueUrl", 77 | "sqs:ListQueues", 78 | "sqs:ReceiveMessage", 79 | "sqs:SendMessage", 80 | "sqs:SetQueueAttributes" 81 | ], 82 | "Sid": "NymmsSQSAccess", 83 | "Resource": [ 84 | "*" 85 | ], 86 | "Effect": "Allow" 87 | }, 88 | { 89 | "Action": [ 90 | "sdb:*" 91 | ], 92 | "Sid": "NymmsSDBAccess", 93 | "Resource": [ 94 | "*" 95 | ], 96 | "Effect": "Allow" 97 | } 98 | ] 99 | } 100 | 101 | Once you've done all that you need to restart each of the three nymms daemons 102 | via upstart so that they can read their new credentials:: 103 | 104 | # restart nymms-reactor 105 | # restart nymms-probe 106 | # restart nymms-scheduler 107 | 108 | If all went well (you can tell by checking out the individual daemon logs in 109 | **/var/log/upstart/**) you should start to see the results of the very basic 110 | monitors in **/var/log/nymms/reactor.log**. 111 | 112 | You can find all of the configuration in **/etc/nymms**. 113 | 114 | Let me know if you have any questions or run into any issues bringing up the 115 | AMI/services. 116 | -------------------------------------------------------------------------------- /nymms/utils/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import importlib 4 | import sys 5 | import collections 6 | 7 | import arrow 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def retry_on_exception(exception_list, retries=3, reset_func=None, 13 | final_exception=None, delay=0): 14 | """ A decorator that executes a function and catches any exceptions in 15 | 'exception_list'. It then retries up to 'retries' times, with 'delay' 16 | seconds between retries, executing 'reset_func' each time. If it fails 17 | after reaching the retry limit it raises 'final_exception' or the last 18 | exception raised. 19 | """ 20 | def decorator(func): 21 | def wrapped(*args, **kwargs): 22 | i = 0 23 | while True: 24 | try: 25 | return func(*args, **kwargs) 26 | except exception_list as e: 27 | if reset_func: 28 | reset_func() 29 | if delay: 30 | if callable(delay): 31 | time.sleep(delay(i)) 32 | else: 33 | time.sleep(delay) 34 | logger.warn("%s exception caught. 
Retrying %d time(s): " 35 | "%s", e.__class__.__name__, retries - i, 36 | e.message) 37 | i += 1 38 | if retries and i > retries: 39 | break 40 | if final_exception: 41 | raise final_exception(str(e)) 42 | else: 43 | raise e 44 | return wrapped 45 | return decorator 46 | 47 | 48 | def load_object_from_string(fqcn): 49 | """ Given a '.' delimited string representing the full path to an object 50 | (function, class, variable) inside a module, return that object. Example: 51 | 52 | load_object_from_string('os.path.basename') 53 | load_object_from_string('logging.Logger') 54 | load_object_from_string('LocalClassName') 55 | """ 56 | module_path = '__main__' 57 | object_name = fqcn 58 | if '.' in fqcn: 59 | module_path, object_name = fqcn.rsplit('.', 1) 60 | importlib.import_module(module_path) 61 | return getattr(sys.modules[module_path], object_name) 62 | 63 | 64 | def deep_update(orig, upd): 65 | """ Does a 'deep' update of dictionary 'orig' with dictionary 'upd'.""" 66 | for k, v in upd.iteritems(): 67 | if isinstance(v, collections.Mapping): 68 | r = deep_update(orig.get(k, {}), v) 69 | orig[k] = r 70 | else: 71 | orig[k] = upd[k] 72 | return orig 73 | 74 | 75 | def parse_time(time_string, reference_time=None): 76 | """ Parses timestamps and returns an arrow time object. 77 | 78 | Takes a time_string in either ISO-8601 format or a time offset in the 79 | form of [+-]XXXX[smhd] and returns an arrow time 80 | object at that time. 81 | 82 | Can take an optional reference_time arrow object, which will only be 83 | used in the case that an offset was given, and will be used in place of 84 | now for offsets. 85 | """ 86 | suffix_map = {'s': 'seconds', 87 | 'm': 'minutes', 88 | 'h': 'hours', 89 | 'd': 'days'} 90 | if not isinstance(time_string, basestring): 91 | raise ValueError(time_string) 92 | 93 | if time_string[0] in ('+', '-'): 94 | unit = 's' 95 | offset = time_string 96 | if time_string[-1] in ('s', 'm', 'h', 'd'): 97 | unit = time_string[-1] 98 | offset = offset[:-1] 99 | result_time = arrow.get(reference_time) 100 | replace_args = {suffix_map[unit]: int(offset)} 101 | result_time = result_time.replace(**replace_args) 102 | elif '-' in time_string: 103 | result_time = arrow.get(time_string) 104 | else: 105 | raise ValueError(time_string) 106 | return result_time 107 | -------------------------------------------------------------------------------- /nymms/schemas/types/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import collections 3 | import json 4 | import time 5 | import string 6 | import random 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | from schematics.types import BaseType 11 | from schematics.exceptions import ValidationError 12 | 13 | from nymms.utils import parse_time 14 | 15 | import arrow 16 | 17 | 18 | class TimestampType(BaseType): 19 | def to_native(self, value, context=None): 20 | if isinstance(value, arrow.arrow.Arrow): 21 | return value 22 | try: 23 | return parse_time(value) 24 | except ValueError: 25 | return arrow.get(value) 26 | 27 | def to_primitive(self, value, context=None): 28 | return value.isoformat() 29 | 30 | def _mock(self, context=None): 31 | year = 86400 * 365 32 | return arrow.get(time.time() + (random.randrange(-1 * 20 * year, 33 | 200 * year))) 34 | 35 | 36 | class JSONType(BaseType): 37 | def to_native(self, value, context=None): 38 | if isinstance(value, basestring): 39 | return json.loads(value) 40 | return value 41 | 42 | def to_primitive(self, 
value, context=None): 43 | return json.dumps(value) 44 | 45 | def _mock(self, context=None): 46 | return dict( 47 | [(random.choice(string.ascii_letters), 48 | random.choice(string.printable)) for i in 49 | range(random.randrange(4, 10))]) 50 | 51 | 52 | StateObject = collections.namedtuple('StateObject', ['name', 'code']) 53 | STATE_OK = StateObject('ok', 0) 54 | STATE_WARNING = STATE_WARN = StateObject('warning', 1) 55 | STATE_CRITICAL = STATE_CRIT = StateObject('critical', 2) 56 | STATE_UNKNOWN = StateObject('unknown', 3) 57 | STATES = collections.OrderedDict([ 58 | ('ok', STATE_OK), 59 | ('warning', STATE_WARNING), 60 | ('critical', STATE_CRITICAL), 61 | ('unknown', STATE_UNKNOWN)]) 62 | 63 | 64 | class StateType(BaseType): 65 | def __init__(self, *args, **kwargs): 66 | super(StateType, self).__init__(*args, choices=STATES.values(), 67 | **kwargs) 68 | 69 | def to_native(self, value, context=None): 70 | if isinstance(value, StateObject): 71 | return value 72 | try: 73 | int_value = int(value) 74 | try: 75 | return STATES.values()[int_value] 76 | except IndexError: 77 | return STATE_UNKNOWN 78 | except ValueError: 79 | try: 80 | return STATES[value.lower()] 81 | except KeyError: 82 | raise ValidationError(self.messages['choices'].format( 83 | unicode(self.choices))) 84 | 85 | def to_primitive(self, value, context=None): 86 | return value.code 87 | 88 | 89 | class StateNameType(StateType): 90 | def to_primitive(self, value, context=None): 91 | return value.name 92 | 93 | 94 | StateTypeObject = collections.namedtuple('StateTypeObject', ['name', 'code']) 95 | STATE_TYPE_SOFT = StateTypeObject('soft', 0) 96 | STATE_TYPE_HARD = StateTypeObject('hard', 1) 97 | STATE_TYPES = collections.OrderedDict([ 98 | ('soft', STATE_TYPE_SOFT), 99 | ('hard', STATE_TYPE_HARD)]) 100 | 101 | 102 | class StateTypeType(BaseType): 103 | def __init__(self, *args, **kwargs): 104 | super(StateTypeType, self).__init__(*args, 105 | choices=STATE_TYPES.values(), 106 | **kwargs) 107 | 108 | def to_native(self, value, context=None): 109 | if isinstance(value, StateTypeObject): 110 | return value 111 | try: 112 | return STATE_TYPES.values()[int(value)] 113 | except ValueError: 114 | try: 115 | return STATE_TYPES[value.lower()] 116 | except KeyError: 117 | raise ValidationError(self.messages['choices'].format( 118 | unicode(self.choices))) 119 | 120 | def to_primitive(self, value, context=None): 121 | return value.code 122 | 123 | 124 | class StateTypeNameType(StateTypeType): 125 | def to_primitive(self, value, context=None): 126 | return value.name 127 | -------------------------------------------------------------------------------- /docs/hidden_code_block.py: -------------------------------------------------------------------------------- 1 | """Simple, inelegant Sphinx extension which adds a directive for a 2 | highlighted code-block that may be toggled hidden and shown in HTML. 3 | This is possibly useful for teaching courses. 4 | 5 | The directive, like the standard code-block directive, takes 6 | a language argument and an optional linenos parameter. The 7 | hidden-code-block adds starthidden and label as optional 8 | parameters. 9 | 10 | Examples: 11 | 12 | .. hidden-code-block:: python 13 | :starthidden: False 14 | 15 | a = 10 16 | b = a + 5 17 | 18 | .. hidden-code-block:: python 19 | :label: --- SHOW/HIDE --- 20 | 21 | x = 10 22 | y = x + 5 23 | 24 | Thanks to http://www.javascriptkit.com/javatutors/dom3.shtml for 25 | inspiration on the javascript. 
26 | 
27 | Thanks to Milad 'animal' Fatenejad for suggesting this extension
28 | in the first place.
29 | 
30 | Written by Anthony 'el Scopz' Scopatz, January 2012.
31 | 
32 | Released under the WTFPL (http://sam.zoy.org/wtfpl/).
33 | """
34 | 
35 | from docutils import nodes
36 | from docutils.parsers.rst import directives
37 | from sphinx.directives.code import CodeBlock
38 | 
39 | HCB_COUNTER = 0
40 | 
41 | js_showhide = """\
42 | <script type="text/javascript">
43 |     function showhide(element){
44 |         if (!document.getElementById)
45 |             return
46 | 
47 |         if (element.style.display == "block")
48 |             element.style.display = "none"
49 |         else
50 |             element.style.display = "block"
51 |     };
52 | </script>
53 | """
54 | 
55 | def nice_bool(arg):
56 |     tvalues = ('true', 't', 'yes', 'y')
57 |     fvalues = ('false', 'f', 'no', 'n')
58 |     arg = directives.choice(arg, tvalues + fvalues)
59 |     return arg in tvalues
60 | 
61 | 
62 | class hidden_code_block(nodes.General, nodes.FixedTextElement):
63 |     pass
64 | 
65 | 
66 | class HiddenCodeBlock(CodeBlock):
67 |     """Hidden code block is Hidden"""
68 | 
69 |     option_spec = dict(starthidden=nice_bool,
70 |                        label=str,
71 |                        **CodeBlock.option_spec)
72 | 
73 |     def run(self):
74 |         # Body of the method is more or less copied from CodeBlock
75 |         code = u'\n'.join(self.content)
76 |         hcb = hidden_code_block(code, code)
77 |         hcb['language'] = self.arguments[0]
78 |         hcb['linenos'] = 'linenos' in self.options
79 |         hcb['starthidden'] = self.options.get('starthidden', True)
80 |         hcb['label'] = self.options.get('label', '+ show/hide code')
81 |         hcb.line = self.lineno
82 |         return [hcb]
83 | 
84 | 
85 | def visit_hcb_html(self, node):
86 |     """Visit hidden code block"""
87 |     global HCB_COUNTER
88 |     HCB_COUNTER += 1
89 | 
90 |     # We want to use the original highlighter so that we don't
91 |     # have to reimplement it. However it raises a SkipNode
92 |     # error at the end of the function call. Thus we intercept
93 |     # it and raise it again later.
94 |     try:
95 |         self.visit_literal_block(node)
96 |     except nodes.SkipNode:
97 |         pass
98 | 
99 |     # The last element of the body should be the literal code
100 |     # block that was just made.
101 |     code_block = self.body[-1]
102 | 
103 |     fill_header = {'divname': 'hiddencodeblock{0}'.format(HCB_COUNTER),
104 |                    'startdisplay': 'none' if node['starthidden'] else 'block',
105 |                    'label': node.get('label'),
106 |                    }
107 | 
108 |     divheader = ("""<a href="javascript:showhide(document.getElementById('{divname}'))">"""
109 |                  """{label}</a><br />"""
110 |                  '''<div id="{divname}" style="display: {startdisplay}">'''
111 |                  ).format(**fill_header)
112 | 
113 |     code_block = js_showhide + divheader + code_block + "</div>
" 114 | 115 | # reassign and exit 116 | self.body[-1] = code_block 117 | raise nodes.SkipNode 118 | 119 | 120 | def depart_hcb_html(self, node): 121 | """Depart hidden code block""" 122 | # Stub because of SkipNode in visit 123 | 124 | def visit_hcb_latex(self, node): 125 | raise nodes.SkipNode 126 | 127 | def depart_hcb_latex(self, node): 128 | return 129 | 130 | 131 | def setup(app): 132 | app.add_directive('hidden-code-block', HiddenCodeBlock) 133 | app.add_node(hidden_code_block, html=(visit_hcb_html, depart_hcb_html), 134 | latex=(visit_hcb_latex, depart_hcb_latex)) 135 | -------------------------------------------------------------------------------- /nymms/suppress/suppress.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from nymms.schemas import Suppression 4 | 5 | import arrow 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class SuppressionManager(object): 11 | """Parent SuppressFilterBackend class. Don't use this directly! 12 | 13 | You need to define: 14 | add_suppression(self, suppress) 15 | get_suppressions(self, expire, include_disabled) 16 | deactivate_suppression(self, rowkey) 17 | """ 18 | def __init__(self, cache_ttl, schema_class): 19 | self.cache_ttl = cache_ttl 20 | self._cache_expire_time = 0 21 | self._cached_suppressions = [] 22 | self._backend = None 23 | self.schema_class = schema_class 24 | logger.debug("%s initialized.", self.__class__.__name__) 25 | 26 | self.migrate_suppressions() 27 | 28 | @property 29 | def backend(self): 30 | if not self._backend: 31 | self._backend = self.get_backend() 32 | return self._backend 33 | 34 | def deserialize(self, item, strict=False): 35 | try: 36 | item_obj = self.schema_class(item, strict=strict, origin=item) 37 | item_obj.validate() 38 | return item_obj 39 | except Exception: 40 | logger.exception("Problem deserializing item:") 41 | logger.error("Data: %s", str(item)) 42 | return None 43 | 44 | def get_active_suppressions(self, now=None): 45 | """Returns a list of suppression filters which are currently 46 | active in SDB""" 47 | now = now or arrow.get() 48 | # return the suppressions only, not the token 49 | return self.get_suppressions(now, include_disabled=False)[0] 50 | 51 | def cache_expired(self, now=None): 52 | now = now or arrow.get() 53 | return self._cache_expire_time < now.timestamp 54 | 55 | def refresh_cache(self, now=None): 56 | logger.debug("Refreshing reactor suppression cache") 57 | now = now or arrow.get() 58 | self._cache_expire_time = now.timestamp + self.cache_ttl 59 | self._cached_suppressions = [] 60 | for suppression in self.get_active_suppressions(): 61 | self._cached_suppressions.append(suppression) 62 | 63 | def get_current_suppressions(self, now=None): 64 | """Returns a list of currently active suppression filters""" 65 | now = now or arrow.get() 66 | if self.cache_expired(now): 67 | self.refresh_cache(now) 68 | return self._cached_suppressions 69 | 70 | def is_suppressed(self, message, now=None): 71 | """Returns True if given message matches one of our active filters""" 72 | now = now or arrow.get() 73 | suppressions = self.get_current_suppressions(now) 74 | for item in suppressions: 75 | if item.re.search(message): 76 | return item 77 | return False 78 | 79 | def get_backend(self, *args, **kwargs): 80 | raise NotImplementedError 81 | 82 | def add_suppression(self, suppression): 83 | """Adds a suppression filter to the SDB store 84 | """ 85 | self.backend.put(suppression) 86 | return suppression.rowkey 87 | 88 | def 
get_suppressions(self, expire=None, include_disabled=False, 89 | limit=None): 90 | """ Gets all suppressions that expire after given 'expire' time. """ 91 | raise NotImplementedError 92 | 93 | def deactivate_suppression(self, rowkey): 94 | """ Deactivates a suppression in the SuppressionBackend.""" 95 | raise NotImplementedError 96 | 97 | def get_old_suppressions(self): 98 | """ Gets all suppressions in the SuppressionBackend that are not the 99 | current version. The current_version can be gotten from 100 | nymms.schemas.Suppression.CURRENT_VERSION 101 | """ 102 | raise NotImplementedError 103 | 104 | def get(self, suppression_id): 105 | item = self.backend.get(suppression_id) 106 | if item: 107 | return self.deserialize(item) 108 | return None 109 | 110 | def deactivate_all_suppressions(self): 111 | """Deactivates all the active suppression filters we have currently.""" 112 | deactivated = [] 113 | for item in self.get_active_suppressions(): 114 | logger.debug("Deactivating %s", item.rowkey) 115 | deactivated.append(item.rowkey) 116 | self.deactivate_suppression(item.rowkey) 117 | return deactivated 118 | 119 | def migrate_suppressions(self): 120 | """ Temporary method, used to update all expressions to the new format 121 | using schematics & arrow. 122 | """ 123 | for item in self.get_old_suppressions(): 124 | new_suppression = Suppression.migrate(item) 125 | old_key = item['rowkey'] 126 | self.backend.purge_suppression(old_key) 127 | if new_suppression: 128 | logger.debug("Migrating old suppression %s.", old_key) 129 | self.add_suppression(new_suppression) 130 | 131 | def filter(self, *args, **kwargs): 132 | result, next_token = self.backend.filter(*args, **kwargs) 133 | return ([self.deserialize(s) for s in result], next_token) 134 | -------------------------------------------------------------------------------- /nymms/reactor/Reactor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import glob 3 | import os 4 | import sys 5 | 6 | from nymms.daemon import NymmsDaemon 7 | from nymms.config import yaml_config 8 | from nymms.utils import load_object_from_string, logutil 9 | from nymms.exceptions import OutOfDateState 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class Reactor(NymmsDaemon): 15 | def __init__(self): 16 | self.handlers = {} 17 | self.suppression_manager = None 18 | self.state_manager = None 19 | 20 | super(Reactor, self).__init__() 21 | 22 | def list_handler_configs(self, path): 23 | path = os.path.expanduser(path) 24 | configs = glob.glob(os.path.join(path, '*.conf')) 25 | configs += glob.glob(os.path.join(path, '*.yaml')) 26 | return configs 27 | 28 | def get_handler_name(self, filename): 29 | return os.path.basename(filename)[:-5] 30 | 31 | def load_handler(self, handler_name, config, **kwargs): 32 | enabled = config.pop('enabled', False) 33 | 34 | if not enabled: 35 | logger.debug("Handler %s 'enabled' is not set to true. 
" 36 | "Skipping.", handler_name) 37 | return None 38 | cls_string = config.pop('handler_class') 39 | logger.debug('Initializing handler %s.', handler_name) 40 | try: 41 | handler_cls = load_object_from_string(cls_string) 42 | return handler_cls(config) 43 | except Exception: 44 | logutil.log_exception("Skipping handler %s due to " 45 | "unhandled exception:" % handler_name, 46 | logger) 47 | return None 48 | 49 | def load_handlers(self, handler_config_path, **kwargs): 50 | conf_files = self.list_handler_configs(handler_config_path) 51 | logger.info("Loading handlers from %s", handler_config_path) 52 | for f in conf_files: 53 | handler_name = self.get_handler_name(f) 54 | # We could eventually have the handlers get loaded everytime and 55 | # update them if their config has changed (via config_version 56 | # below). For now lets not get that tricky. 57 | if handler_name in self.handlers: 58 | if not kwargs.get('force_load_handlers', False): 59 | logger.debug("Handler %s already loaded, skipping.", 60 | handler_name) 61 | continue 62 | _, config = yaml_config.load_config(f) 63 | handler = self.load_handler(handler_name, config, **kwargs) 64 | if handler: 65 | self.handlers[handler_name] = handler 66 | 67 | if not self.handlers: 68 | logger.error("No handlers loaded. Exiting.") 69 | sys.exit(1) 70 | 71 | def get_result(self, **kwargs): 72 | raise NotImplementedError 73 | 74 | def get_state(self, task_id): 75 | return self.state_manager.get_state(task_id) 76 | 77 | def save_state(self, task_id, result, previous): 78 | return self.state_manager.save_state(task_id, result, previous) 79 | 80 | def is_suppressed(self, result): 81 | """Returns True if we should suppress the given result for event""" 82 | if not self.suppression_manager: 83 | logger.debug("is_suppressed(): No suppress backend, so returning " 84 | "False") 85 | return False 86 | suppression_filter = self.suppression_manager.is_suppressed(result.id) 87 | if suppression_filter: 88 | suppression_filter.validate() 89 | logger.debug("Suppressed %s with '%s' (%s) created at %s", 90 | result.id, 91 | suppression_filter.regex, 92 | suppression_filter.rowkey, 93 | suppression_filter.created.isoformat()) 94 | return suppression_filter 95 | 96 | def handle_result(self, result, **kwargs): 97 | previous_state = self.get_state(result.id) 98 | for handler_name, handler in self.handlers.iteritems(): 99 | try: 100 | # We do suppression AFTER filters, so we have to 101 | # pass Reactor to the handler to do that for us 102 | handler.process(result, previous_state, self.is_suppressed) 103 | except Exception: 104 | logutil.log_exception("Unhandled %s handler " 105 | "exception:" % (handler_name,), logger) 106 | continue 107 | try: 108 | self.save_state(result.id, result, previous_state) 109 | except OutOfDateState: 110 | pass 111 | 112 | def run(self, handler_config_path, **kwargs): 113 | """ This will run in a tight loop. It is expected that the subclass's 114 | get_result() method will introduce a delay if the results queue is 115 | empty. 
116 | """ 117 | self.load_handlers(handler_config_path, **kwargs) 118 | while True: 119 | result = self.get_result(**kwargs) 120 | if not result: 121 | logger.debug('Result queue empty.') 122 | continue 123 | self.handle_result(result, **kwargs) 124 | self.delete_result(result) 125 | -------------------------------------------------------------------------------- /nymms/config/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import copy 3 | import os 4 | import warnings 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | validator = None 9 | try: 10 | import validictory 11 | validator = validictory.validate 12 | except ImportError: 13 | warnings.warn("Unable to import validictory - skipping config validation.", 14 | ImportWarning) 15 | 16 | from nymms.config import yaml_config 17 | from nymms.utils import deep_update 18 | from nymms import exceptions 19 | 20 | 21 | default_conf_dir = '/etc/nymms' 22 | 23 | SCHEMA = { 24 | 'type': 'object', 25 | 'properties': { 26 | 'monitor_timeout': { 27 | 'type': 'integer', 'minimum': 0, 28 | }, 29 | 'resources': { 30 | 'type': 'string', 31 | }, 32 | 'region': { 33 | 'type': 'string', 34 | }, 35 | 'state_domain': { 36 | 'type': 'string', 37 | }, 38 | 'tasks_queue': { 39 | 'type': 'string', 40 | }, 41 | 'results_topic': { 42 | 'type': 'string', 43 | }, 44 | 'private_context_file': { 45 | 'type': 'string', 46 | }, 47 | 'task_expiration': { 48 | 'type': 'integer', 'minimum': 0, 49 | }, 50 | 'suppress': { 51 | 'type': 'object', 52 | 'properties': { 53 | 'domain': { 54 | 'type': 'string', 55 | }, 56 | 'cache_timeout': { 57 | 'type': 'integer', 'minimum': 0, 58 | }, 59 | } 60 | }, 61 | 'probe': { 62 | 'type': 'object', 63 | 'properties': { 64 | 'max_retries': { 65 | 'type': 'integer', 'minimum': 0, 66 | }, 67 | 'queue_wait_time': { 68 | 'type': 'integer', 'minimum': 0, 'maximum': 20, 69 | }, 70 | 'retry_delay': { 71 | 'type': 'integer', 'minimum': 0, 72 | }, 73 | } 74 | }, 75 | 'reactor': { 76 | 'type': 'object', 77 | 'properties': { 78 | 'handler_config_path': { 79 | 'type': 'string', 80 | }, 81 | 'queue_name': { 82 | 'type': 'string', 83 | }, 84 | 'queue_wait_time': { 85 | 'type': 'integer', 'minimum': 0, 'maximum': 20, 86 | }, 87 | 'visibility_timeout': { 88 | 'type': 'integer', 'minimum': 5, 89 | }, 90 | }, 91 | }, 92 | 'scheduler': { 93 | 'type': 'object', 94 | 'properties': { 95 | 'interval': { 96 | 'type': 'integer', 'minimum': 30, 97 | }, 98 | 'backend': { 99 | 'type': 'string', 100 | }, 101 | 'backend_args': { 102 | 'type': 'object', 103 | }, 104 | 'lock_backend': { 105 | 'type': 'string', 106 | }, 107 | 'lock_args': { 108 | 'type': 'object', 109 | }, 110 | }, 111 | }, 112 | } 113 | } 114 | 115 | DEFAULTS = { 116 | 'monitor_timeout': 30, 117 | 'resources': os.path.join(default_conf_dir, 'resources.yaml'), 118 | 'region': 'us-east-1', 119 | 'state_domain': 'nymms_state', 120 | 'tasks_queue': 'nymms_tasks', 121 | 'results_topic': 'nymms_results', 122 | 'private_context_file': os.path.join(default_conf_dir, 'private.yaml'), 123 | 'task_expiration': 600, 124 | 125 | 'probe': { 126 | 'max_retries': 2, 127 | 'queue_wait_time': 20, 128 | 'retry_delay': 30, 129 | }, 130 | 131 | 'reactor': { 132 | 'handler_config_path': os.path.join(default_conf_dir, 'handlers'), 133 | 'queue_name': 'reactor_queue', 134 | 'queue_wait_time': 20, 135 | 'visibility_timeout': 30, 136 | }, 137 | 138 | 'scheduler': { 139 | 'interval': 300, 140 | 'backend': 'nymms.scheduler.backends.yaml_backend.YamlBackend', 141 | 
'backend_args': { 142 | 'path': os.path.join(default_conf_dir, 'nodes.yaml'), 143 | }, 144 | 'lock_backend': 'SDB', 145 | 'lock_args': { 146 | 'duration': 360, 147 | 'domain_name': 'nymms_locks', 148 | 'lock_name': 'scheduler_lock', 149 | }, 150 | }, 151 | 152 | 'suppress': { 153 | 'domain': 'nymms_suppress', 154 | 'cache_timeout': 60, 155 | } 156 | } 157 | 158 | settings = None 159 | version = None 160 | 161 | 162 | def load_config(path, force=False): 163 | global settings, version 164 | if settings and not force: 165 | return 166 | settings = copy.deepcopy(DEFAULTS) 167 | version, _config_settings = yaml_config.load_config(path) 168 | if _config_settings: 169 | deep_update(settings, _config_settings) 170 | if validator: 171 | try: 172 | validator(settings, SCHEMA) 173 | except ValueError as e: 174 | raise exceptions.InvalidConfig(path, e.message) 175 | logger.debug("Config loaded from '%s' with version '%s'.", path, version) 176 | -------------------------------------------------------------------------------- /nymms/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | import uuid 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | from nymms.schemas.types import (TimestampType, StateType, StateTypeType, 8 | JSONType, StateNameType, StateTypeNameType) 9 | 10 | from schematics.models import Model 11 | from schematics.transforms import blacklist 12 | from schematics.types import ( 13 | StringType, IPv4Type, UUIDType, IntType) 14 | import arrow 15 | 16 | 17 | class OriginModel(Model): 18 | def __init__(self, raw_data=None, deserialize_mapping=None, strict=True, 19 | origin=None): 20 | super(OriginModel, self).__init__( 21 | raw_data=raw_data, 22 | deserialize_mapping=deserialize_mapping, 23 | strict=strict) 24 | self._origin = origin 25 | 26 | 27 | class Suppression(OriginModel): 28 | CURRENT_VERSION = 2 29 | 30 | rowkey = UUIDType(default=uuid.uuid4) 31 | regex = StringType(required=True) 32 | created = TimestampType(default=arrow.get) 33 | disabled = TimestampType(serialize_when_none=False) 34 | expires = TimestampType(required=True) 35 | ipaddr = IPv4Type(required=True) 36 | userid = StringType(required=True) 37 | comment = StringType(required=True) 38 | version = IntType(default=CURRENT_VERSION) 39 | 40 | default_sort_order = 'created' 41 | pk = 'rowkey' 42 | 43 | @property 44 | def active(self): 45 | if self.disabled or self.expires < arrow.get(): 46 | return False 47 | else: 48 | return True 49 | 50 | @property 51 | def state(self): 52 | if self.disabled: 53 | return "disabled (%s, %s)" % (self.disabled, 54 | self.disabled.humanize()) 55 | elif self.expires < arrow.get(): 56 | return "expired (%s, %s)" % (self.expires, 57 | self.expires.humanize()) 58 | else: 59 | return "active" 60 | 61 | @property 62 | def re(self): 63 | return re.compile(self.regex) 64 | 65 | @classmethod 66 | def migrate(cls, item): 67 | """ Takes an old version 1 item and returns a new version 2 68 | Suppression. 
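
        For example, a v1 item shaped roughly like this (illustrative
        values only)::

            {'rowkey': 'f1f836fc-23f4-4a80-9a6c-6f63d2b8e7a1',
             'regex': 'web[0-9]+', 'created_at': '1380000000',
             'expires': '1380003600', 'ipaddr': '10.0.0.1',
             'userid': 'jdoe', 'comment': 'rolling restart',
             'active': 'True'}

        becomes a v2 Suppression with a uuid.UUID rowkey and arrow
        timestamps; an 'active' value other than 'True' is treated as the
        epoch timestamp at which the suppression was disabled.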
69 | """ 70 | new_suppression = None 71 | try: 72 | new_suppression = cls({ 73 | 'rowkey': uuid.UUID(item['rowkey']), 74 | 'regex': item['regex'], 75 | 'created': arrow.get(int(item['created_at'])), 76 | 'expires': arrow.get(int(item['expires'])), 77 | 'ipaddr': item['ipaddr'], 78 | 'userid': item['userid'], 79 | 'comment': item['comment']}) 80 | if not item['active'] == 'True': 81 | new_suppression.disabled = arrow.get(int(item['active'])) 82 | except Exception: 83 | logger.exception("Unable to migrate suppression to v2: %s", item) 84 | return new_suppression 85 | 86 | 87 | class APISuppression(Suppression): 88 | """Suppression Model with friendler date fields. 89 | """ 90 | disabled = TimestampType(serialize_when_none=True) 91 | 92 | 93 | class StateModel(Model): 94 | CURRENT_VERSION = 2 95 | 96 | state = StateType(required=True) 97 | state_type = StateTypeType(required=True) 98 | version = IntType(default=CURRENT_VERSION) 99 | 100 | @property 101 | def state_name(self): 102 | return self.state.name 103 | 104 | @property 105 | def state_type_name(self): 106 | return self.state_type.name 107 | 108 | 109 | class Task(OriginModel): 110 | id = StringType(required=True) 111 | created = TimestampType(default=arrow.get) 112 | attempt = IntType(default=0) 113 | context = JSONType() 114 | 115 | def increment_attempt(self): 116 | self.attempt += 1 117 | 118 | 119 | class Result(StateModel, OriginModel): 120 | id = StringType(required=True) 121 | timestamp = TimestampType(default=arrow.get) 122 | output = StringType() 123 | task_context = JSONType() 124 | 125 | class Options: 126 | roles = {'strip_context': blacklist('task_context')} 127 | 128 | 129 | class APIResult(Result): 130 | """Result model with friendlier fields for input/output 131 | """ 132 | state = StateNameType(required=True) 133 | state_type = StateTypeNameType(required=True) 134 | 135 | 136 | class StateRecord(StateModel, OriginModel): 137 | id = StringType(required=True) 138 | last_update = TimestampType(default=arrow.get) 139 | last_state_change = TimestampType(default=arrow.get) 140 | 141 | pk = 'id' 142 | default_sort_order = 'last_update' 143 | 144 | @classmethod 145 | def migrate(cls, item): 146 | new_state = None 147 | try: 148 | new_state = cls({ 149 | 'id': item['id'], 150 | 'last_update': arrow.get(int(item['last_update'])), 151 | 'last_state_change': arrow.get(int(item['last_state_change'])), 152 | 'state': item['state'], 153 | 'state_type': item['state_type']}) 154 | except Exception: 155 | logger.exception("Unable to migrate state record to v2: %s", item) 156 | return new_state 157 | 158 | 159 | class APIStateRecord(StateRecord): 160 | """StateRecord model with friendlier fields for input/output 161 | """ 162 | state = StateNameType(required=True) 163 | state_type = StateTypeNameType(required=True) 164 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. NYMMS documentation master file, created by 2 | sphinx-quickstart on Thu Oct 17 11:08:51 2013. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ================================= 7 | Welcome to NYMMS's documentation! 8 | ================================= 9 | 10 | NYMMS is a monitoring system written in python that takes influences from 11 | many of the existing monitoring systems. It aims to be easy to scale and 12 | extend. 
13 | 
14 | Demo
15 | ====
16 | 
17 | Before we get into the guts of NYMMS I'd like to mention that we build a
18 | demonstration Amazon AMI that comes up with a basic configuration for an
19 | all-in-one NYMMS host that runs all of the daemons. For more information
20 | on how to use it, please visit :doc:`Demo AMI <demo>`.
21 | 
22 | 
23 | Scaling
24 | =======
25 | 
26 | NYMMS intends to scale as easily as possible. It does so by separating out
27 | the work often handled in a monitoring system into multiple processes, and
28 | then handling communication between those processes with queues. None of
29 | this is revolutionary (Shinken_ broke the Nagios_ daemon up into many small
30 | pieces, Sensu_ made heavy use of queues, and both are excellent monitoring
31 | systems that we take heavy influence from) - but I'm hoping to bring the
32 | two together in useful ways.
33 | 
34 | Architecture Diagram
35 | --------------------
36 | 
37 | .. image:: _static/images/nymms_arch.png
38 |    :align: center
39 | 
40 | The Daemons
41 | -----------
42 | 
43 | **nymms-scheduler:**
44 |   The daemon responsible for reading in the configuration, figuring out what
45 |   it is you want to monitor and how you want to monitor those things, and
46 |   then submitting tasks to the queue for the probes.
47 | 
48 | **nymms-probe:**
49 |   The daemon(s) responsible for reading monitoring tasks from the task queue
50 |   and executing them. They send the results of those monitors along to the
51 |   results topic.
52 | 
53 | **nymms-reactor:**
54 |   The daemon(s) that take all the results, apply filters to them and then
55 |   pass the results that make it through the filters on to their handlers.
56 |   Handlers can do just about anything with the results, from emailing
57 |   people to triggering an incident in PagerDuty_, to submitting stats to a
58 |   stats system. Finally the reactor updates the state database with the result.
59 | 
60 | Communication
61 | -------------
62 | 
63 | I've tried to keep the interface with the various communication channels
64 | simple and easily extensible. As of this writing the entire system is very
65 | AWS_ based. We make use of the following AWS_ services:
66 | 
67 | **SQS:**
68 |   We use SQS as our general queue service. The scheduler passes tasks to the
69 |   probes via SQS directly. The reactors read the results from the probes off
70 |   SQS queues (note that the probes don't send results directly through SQS,
71 |   which leads us to...)
72 | 
73 | **SNS:**
74 |   Probes submit results to SNS topics, which then pass them on to the
75 |   reactors' SQS queues. This allows a single result to be shared amongst
76 |   multiple types of reactors, as well as allowing results to be sent to
77 |   various other endpoints.
78 | 
79 | **SDB:**
80 |   We use AWS_ SimpleDB to store state. This state database is written to by
81 |   reactors when they receive results. It's read from by probes (to make sure
82 |   we aren't beating a dead horse when something is down and has been down for
83 |   some time) and by the reactors (to allow for logic regarding reacting to
84 |   results that have changed state, or have been in a state for some length of
85 |   time).
86 | 
87 | **SES:**
88 |   We use AWS_ Simple Email Service in some reactor handlers in order to be
89 |   able to easily send email.
90 | 
91 | Each of these services is used fairly lightly, so the charges should be
92 | minimal in almost all cases.
The upside is that we currently do not 93 | require physical servers for any of these functions, which inevitably cost 94 | a significant amount to build and maintain. 95 | 96 | In the future it should be fairly easy to convert these services to other 97 | systems (such as RabbitMQ_, MongoDB_, etc). 98 | 99 | Other Details 100 | ------------- 101 | 102 | Right now all monitors are active monitors - they are fired from the probes and 103 | contact other services via various protocols to determine if the service is in 104 | an okay state. Because of the design using the various queues however, it 105 | should be simple in the future to submit passive results. The reactors are 106 | very permissive in accepting data from just about any source just as long as 107 | it comes from their queue and it fits the correct dataformat. 108 | 109 | As well we use a plugin format identical to the Nagios_ format. The benefit 110 | of this is that there is a vast wealth of nagios plugins out there, and they 111 | can be used as is with NYMMS. In the future we may come up with other plugin 112 | formats, but we haven't had a reason to so far. 113 | 114 | 115 | ======== 116 | Contents 117 | ======== 118 | 119 | .. toctree:: 120 | :maxdepth: 2 121 | 122 | config 123 | demo 124 | getting_started 125 | realms 126 | 127 | 128 | 129 | ================== 130 | Indices and tables 131 | ================== 132 | 133 | * :ref:`genindex` 134 | * :ref:`modindex` 135 | * :ref:`search` 136 | 137 | 138 | .. _Shinken: http://www.shinken-monitoring.org/ 139 | .. _Nagios: http://www.nagios.org/ 140 | .. _Sensu: http://sensuapp.org/ 141 | .. _PagerDuty: http://www.pagerduty.com/ 142 | .. _AWS: http://aws.amazon.com/ 143 | .. _RabbitMQ: http://www.rabbitmq.com/ 144 | .. _MongoDB: http://www.mongodb.org/ 145 | -------------------------------------------------------------------------------- /nymms/probe/Probe.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | 5 | from nymms.schemas import Result, types 6 | from nymms.daemon import NymmsDaemon 7 | from nymms.resources import Monitor 8 | from nymms.utils import commands 9 | from nymms.config.yaml_config import load_config, EmptyConfig 10 | 11 | import arrow 12 | 13 | 14 | TIMEOUT_OUTPUT = "Command timed out after %d seconds." 15 | 16 | 17 | class Probe(NymmsDaemon): 18 | state_manager = None 19 | 20 | def get_private_context(self, private_context_file): 21 | if not private_context_file: 22 | return None 23 | try: 24 | return load_config(private_context_file)[1] 25 | except (IOError, EmptyConfig): 26 | logger.exception("Unable to open private context file: %s", 27 | private_context_file) 28 | return None 29 | 30 | # TODO: This calls on _state_manager but setting up of the _state_manager 31 | # needs to be handled in the subclass. 
Not sure how I should handle 32 | # this, but I really like the idea of these being base class 33 | # methods since in reality all reactors should have some sort of 34 | # state backend, even if its a no-op 35 | def get_state(self, task_id): 36 | return self.state_manager.get_state(task_id) 37 | 38 | def get_task(self, **kwargs): 39 | raise NotImplementedError 40 | 41 | def resubmit_task(self, task, delay, **kwargs): 42 | raise NotImplementedError 43 | 44 | def submit_result(self, result, **kwargs): 45 | raise NotImplementedError 46 | 47 | def delete_task(self, task): 48 | raise NotImplementedError 49 | 50 | def execute_task(self, task, timeout, **kwargs): 51 | log_prefix = "%s - " % (task.id,) 52 | monitor = Monitor.registry[task.context['monitor']['name']] 53 | command = monitor.command.command_string 54 | current_attempt = int(task.attempt) + 1 55 | logger.debug(log_prefix + "attempt %d, executing: %s", current_attempt, 56 | command) 57 | result = Result({'id': task.id, 58 | 'timestamp': task.created, 59 | 'task_context': task.context}) 60 | try: 61 | output = monitor.execute(task.context, timeout, 62 | self._private_context) 63 | result.output = output 64 | result.state = types.STATE_OK 65 | except commands.CommandException as e: 66 | if isinstance(e, commands.CommandFailure): 67 | result.state = e.return_code 68 | result.output = e.output 69 | if isinstance(e, commands.CommandTimeout): 70 | result.state = types.STATE_UNKNOWN 71 | result.output = (TIMEOUT_OUTPUT % timeout) 72 | except Exception as e: 73 | result.state = types.STATE_UNKNOWN 74 | result.output = str(e) 75 | result.state_type = types.STATE_TYPE_HARD 76 | result.validate() 77 | return result 78 | 79 | def expire_task(self, task, task_expiration): 80 | if task_expiration: 81 | now = arrow.get() 82 | task_lifetime = now.timestamp - task.created.timestamp 83 | if task_lifetime > task_expiration: 84 | logger.debug("Task %s is older than expiration limit %d. " 85 | "Skipping.", task.id, task_expiration) 86 | return True 87 | return False 88 | return False 89 | 90 | def handle_task(self, task, **kwargs): 91 | log_prefix = "%s - " % (task.id,) 92 | task_expiration = kwargs.get('task_expiration', None) 93 | if self.expire_task(task, task_expiration): 94 | return None 95 | # Used to add the command context to the task 96 | monitor = Monitor.registry[task.context['monitor']['name']] 97 | command = monitor.command 98 | task.context = command.build_context(task.context) 99 | previous_state = self.get_state(task.id) 100 | # check if the timeout is defined on the task first, if not then 101 | # go with what was passed into handle_task via run 102 | timeout = task.context.get('monitor_timeout', 103 | kwargs.get('monitor_timeout')) 104 | max_retries = task.context.get('max_retries', 105 | kwargs.get('max_retries')) 106 | last_attempt = int(task.attempt) 107 | current_attempt = last_attempt + 1 108 | result = self.execute_task(task, timeout, **kwargs) 109 | # Trying to emulate this: 110 | # http://nagios.sourceforge.net/docs/3_0/statetypes.html 111 | if result.state == types.STATE_OK: 112 | if (previous_state and not 113 | previous_state.state == types.STATE_OK and 114 | previous_state.state_type == types.STATE_TYPE_SOFT): 115 | result.state_type = types.STATE_TYPE_SOFT 116 | else: 117 | logger.debug(log_prefix + "current_attempt: %d, max_retries: %d", 118 | current_attempt, max_retries) 119 | if current_attempt <= max_retries: 120 | # XXX Hate this logic - hope to find a cleaner way to handle 121 | # it someday. 
122 | if (not previous_state or 123 | previous_state.state_type == types.STATE_TYPE_SOFT or 124 | previous_state.state == types.STATE_OK): 125 | result.state_type = types.STATE_TYPE_SOFT 126 | delay = task.context.get('retry_delay', 127 | kwargs.get('retry_delay')) 128 | delay = max(delay, 0) 129 | logger.debug('Resubmitting task with %ds delay.', delay) 130 | self.resubmit_task(task, delay, **kwargs) 131 | else: 132 | logger.debug("Retry limit hit, not resubmitting.") 133 | result.validate() 134 | return result 135 | 136 | def run(self, **kwargs): 137 | """ This will run in a tight loop. It is expected that the subclass's 138 | get_task() method will introduce a delay if the results queue is 139 | empty. 140 | """ 141 | private_context_file = kwargs.get('private_context_file', None) 142 | self._private_context = self.get_private_context(private_context_file) 143 | while True: 144 | task = self.get_task(**kwargs) 145 | if not task: 146 | logger.debug("Task queue is empty.") 147 | continue 148 | result = self.handle_task(task, **kwargs) 149 | if result: 150 | self.submit_result(result, **kwargs) 151 | self.delete_task(task) 152 | -------------------------------------------------------------------------------- /scripts/make_ami.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import time 4 | import urllib2 5 | import argparse 6 | 7 | import yaml 8 | from boto.exception import EC2ResponseError 9 | from boto.ec2 import connect_to_region, RegionData 10 | 11 | from nymms.utils.commands import execute, CommandFailure, CommandTimeout 12 | from nymms.utils import logutil 13 | 14 | regions = RegionData.keys() 15 | 16 | 17 | def get_ubuntu_ami(requested_region, release='precise'): 18 | ubuntu_url = ('http://cloud-images.ubuntu.com/query/%s/server/' 19 | 'released.current.txt') % (release) 20 | logger.debug("Getting ami for region %s from %s.", requested_region, 21 | ubuntu_url) 22 | lines = urllib2.urlopen(ubuntu_url).read().split('\n') 23 | for l in lines: 24 | # skip blank lines 25 | if not l: 26 | continue 27 | # Split the tab separated list 28 | entries = l.split('\t') 29 | (disk, cpu, region, ami) = entries[4:8] 30 | virtual = entries[10] 31 | if disk == 'ebs' and cpu == 'amd64' and virtual == 'paravirtual': 32 | if region == requested_region: 33 | logger.debug("Found ami %s.", ami) 34 | return ami 35 | raise ValueError("AMI for region '%s' not found" % requested_region) 36 | 37 | 38 | def wait_for_instance_state(instance, state, timeout=None): 39 | logger.debug("Waiting for instance %s to enter %s state...", instance.id, 40 | state) 41 | waited = 1 42 | while instance.update() != state: 43 | if timeout and waited > timeout: 44 | return None 45 | if waited % 5 == 0: 46 | logger.debug("Instance has taken %d seconds...", waited) 47 | waited += 1 48 | time.sleep(1) 49 | logger.debug("Instance in %s state.", state) 50 | return True 51 | 52 | 53 | def generate_cloud_config(): 54 | sources = [ 55 | { 56 | 'source': 'deb http://ppa.launchpad.net/chris-lea/python-boto/' 57 | 'ubuntu precise main', 58 | 'keyid': 'C7917B12', 59 | 'filename': 'boto.list' 60 | }, 61 | { 62 | 'source': 'deb http://ppa.launchpad.net/loki77/nymms/ubuntu ' 63 | 'precise main', 64 | 'keyid': 'A835227D', 65 | 'filename': 'nymms.list' 66 | }, 67 | ] 68 | 69 | packages = ['python-yaml', 'python-jinja2', 'python-boto', 'python-nymms', 70 | 'nymms-common', 'nymms-scheduler', 'nymms-probe', 71 | 'nymms-reactor', 'nagios-plugins', 'python-pip'] 72 | 73 | commands = 
['pip install validictory'] 74 | 75 | cloud_config = {'apt_sources': sources, 76 | 'packages': packages, 77 | 'runcmd': commands} 78 | return "#cloud-config\n" + yaml.dump(cloud_config) 79 | 80 | 81 | def check_finish_install(address): 82 | attempt = 0 83 | while True: 84 | attempt += 1 85 | try: 86 | out = execute("ssh ubuntu@%s status cloud-config" % (address), 87 | timeout=30).strip() 88 | if out == 'cloud-config stop/waiting': 89 | logger.debug('cloud-config finished') 90 | break 91 | else: 92 | logger.debug(out.strip()) 93 | except CommandTimeout: 94 | logger.debug("ssh to %s timed out.", address) 95 | except CommandFailure, e: 96 | logger.debug("Command failed with exit code %d.", e.return_code) 97 | for i in e.output.split('\n'): 98 | logger.debug(" stdout: %s", i) 99 | time.sleep(5) 100 | 101 | 102 | def publish_ami(image_id): 103 | logger.debug("Publishing AMI %s to the world.", image_id) 104 | while True: 105 | try: 106 | ami = conn.get_image(image_id) 107 | break 108 | except EC2ResponseError: 109 | logger.debug("AMI does not exist yet.") 110 | time.sleep(2) 111 | 112 | while True: 113 | try: 114 | ami.set_launch_permissions(group_names=['all']) 115 | logger.info("AMI %s published.", image_id) 116 | break 117 | except EC2ResponseError: 118 | sleep_time = 30 119 | logger.debug("AMI not ready. Sleeping %d seconds.", sleep_time) 120 | time.sleep(sleep_time) 121 | 122 | 123 | if __name__ == '__main__': 124 | parser = argparse.ArgumentParser( 125 | description='Used to create a new NYMMS example AMI.') 126 | parser.add_argument('-v', '--verbose', action='count', default=0, 127 | help='Verbose output. Can be specified up to two ' 128 | 'times.') 129 | parser.add_argument('-k', '--ssh-key', default='default', 130 | help="SSH Keypair to use. Default: %(default)s") 131 | parser.add_argument('-t', '--instance-type', default='m3.medium', 132 | help="Instance type to use to build AMI. " 133 | "Default: %(default)s") 134 | parser.add_argument('-g', '--security-group', default='default', 135 | help="Security group of instance used to build the " 136 | "AMI. Must allow SSH (tcp port 22). 
" 137 | "Default: %(default)s") 138 | parser.add_argument('region', choices=regions, 139 | help='The region to build the ami in.') 140 | 141 | args = parser.parse_args() 142 | 143 | log_level = logutil.INFO 144 | if args.verbose: 145 | log_level = logutil.DEBUG 146 | logger = logutil.setup_root_logger(stdout=log_level) 147 | if not args.verbose > 2: 148 | logutil.quiet_boto_logging() 149 | 150 | base_image_id = get_ubuntu_ami(args.region) 151 | logger.info("Building from base ubuntu AMI %s.", base_image_id) 152 | logger.debug("Connecting to EC2 API in region %s.", args.region) 153 | conn = connect_to_region(args.region) 154 | base_ami = conn.get_image(base_image_id) 155 | logger.debug("Launching instance with AMI %s.", base_image_id) 156 | reservation = base_ami.run(key_name=args.ssh_key, 157 | security_groups=[args.security_group, ], 158 | user_data=generate_cloud_config(), 159 | instance_type=args.instance_type) 160 | instance = reservation.instances[0] 161 | wait_for_instance_state(instance, 'running') 162 | logger.debug('Instance up, public IP: %s', instance.ip_address) 163 | logger.debug('Sleeping for 30 seconds for instance to finish booting.') 164 | time.sleep(30) 165 | check_finish_install(instance.ip_address) 166 | logger.debug('Shutting down instance.') 167 | instance.stop() 168 | wait_for_instance_state(instance, 'stopped') 169 | ami_name = time.strftime("nymms-ubuntu-precise-%Y%m%d-%H%M%S") 170 | logger.debug("Creating image %s.", ami_name) 171 | image_id = conn.create_image(instance.id, ami_name) 172 | publish_ami(image_id) 173 | logger.debug("Terminating instance %s.", instance.id) 174 | instance.terminate() 175 | -------------------------------------------------------------------------------- /nymms/utils/logutil.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.handlers import SysLogHandler 3 | import os.path 4 | import sys 5 | import traceback 6 | import platform 7 | import os 8 | 9 | DEBUG = logging.DEBUG 10 | INFO = logging.INFO 11 | WARNING = logging.WARNING 12 | ERROR = logging.ERROR 13 | CRITICAL = logging.CRITICAL 14 | 15 | _pid = os.getpid() 16 | 17 | syslog_socket_paths = { 18 | 'Darwin': '/var/run/syslog', 19 | 'Linux': '/dev/log' 20 | } 21 | 22 | 23 | def quiet_boto_logging(): 24 | """ 25 | Boto's debug logs are full dumps of the XML that was passed between the 26 | client and server. This can be annoying. This is a simple function to 27 | hide those dumps whenever you put your code into debug. 28 | """ 29 | logging.getLogger('boto').setLevel(logging.CRITICAL) 30 | 31 | 32 | def quiet_paramiko_logging(): 33 | """ Paramiko is really noisy when set to INFO or below. 34 | 35 | This sets the paramiko logger to only send WARNING or above messages. 36 | """ 37 | logging.getLogger('paramiko').setLevel(logging.WARNING) 38 | 39 | 40 | def quiet_requests_connpool_logging(): 41 | """ Paramiko is really noisy when set to INFO or below. 42 | 43 | This sets the paramiko logger to only send WARNING or above messages. 44 | """ 45 | logging.getLogger('requests.packages.urllib3.connectionpool').setLevel( 46 | logging.WARNING) 47 | 48 | 49 | def get_syslog_path(): 50 | system_os = platform.system() 51 | try: 52 | return syslog_socket_paths[system_os] 53 | except KeyError: 54 | raise ValueError("Unable to find syslog unix domain socket for os " 55 | "'%s'." 
% (system_os))
56 | 
57 | DEFAULT_FORMAT = ('pid:' + str(_pid) + ' %(levelname)s %(name)s '
58 |                   '%(module)s(%(funcName)s):%(lineno)d - %(message)s')
59 | 
60 | 
61 | def setup_root_logger(stdout=INFO, filename=None, file_level=INFO,
62 |                       file_mode='w', syslog=None,
63 |                       syslog_facility=SysLogHandler.LOG_LOCAL7,
64 |                       syslog_socket_path=None, syslog_tag=None,
65 |                       time_format="%Y/%m/%d %H:%M:%S %Z",
66 |                       message_format=DEFAULT_FORMAT):
67 |     """Set up basic logging, including stdout, file, and syslog logging.
68 | 
69 |     Sets up the root logger, deleting any previously configured handlers. It
70 |     does this to make sure that we don't have multiples of the same handler
71 |     being attached to the root logger, resulting in multiple messages of the
72 |     same type.
73 | 
74 |     This should be called in the main script/command/daemon itself, and never
75 |     inside libraries unless you really know what you're doing.
76 | 
77 |     :type stdout: int
78 |     :param stdout: The logging level to send to stdout. Can be any of the
79 |         logging.* constants (logging.DEBUG, etc) or logutil constants
80 |         (logutil.DEBUG, etc) which are just pointers to the logging constants.
81 |         If set to None or False, disable stdout logging.
82 |         Default: logging.INFO
83 | 
84 |     :type filename: string
85 |     :param filename: The path to a file to log to. Setting to None disables
86 |         file logging.
87 |         Default: None
88 | 
89 |     :type file_level: int
90 |     :param file_level: The logging level to send to the file given in the
91 |         'filename' parameter. Can be any of the logging.* or logutil.*
92 |         constants like the 'stdout' parameter.
93 |         Default: logging.INFO
94 | 
95 |     :type file_mode: string
96 |     :param file_mode: The mode to open the file at the 'filename' parameter
97 |         with.
98 |         Default: 'w'
99 | 
100 |     :type syslog: int
101 |     :param syslog: The logging level to send to syslog. Can be any of the
102 |         logging.* or logutil.* constants. Set to None to disable syslog
103 |         logging.
104 |         Default: None
105 | 
106 |     :type syslog_facility: int
107 |     :param syslog_facility: The syslog facility to send messages to if syslog
108 |         is enabled. Can be any of the SysLogHandler.LOG_* facility constants.
109 |         Default: SysLogHandler.LOG_LOCAL7
110 | 
111 |     :type syslog_socket_path: string
112 |     :param syslog_socket_path: The path to the unix domain socket used by
113 |         syslog if syslog is enabled. If not given, will automatically try to
114 |         determine the correct path.
115 |         Default: None
116 | 
117 |     :type syslog_tag: string
118 |     :param syslog_tag: The tag to be prepended to syslog messages. If not
119 |         given it will try to determine the name of the command that was
120 |         called, and use that.
121 |         Default: None
122 | 
123 |     :type time_format: string
124 |     :param time_format: A time.strftime formatted string to use for the
125 |         timestamp format. This will be prepended to stdout and logfiles, but
126 |         not to syslog (since syslog has its own timestamp system)
127 | 
128 |     :type message_format: string
129 |     :param message_format: A logging.Formatter formatted string to use for
130 |         the output of log messages. See the following for variables:
131 |         http://docs.python.org/2/library/logging.html#logrecord-attributes
132 |     """
133 |     base_format = message_format
134 |     timed_format = '[%(asctime)s] ' + base_format
135 |     timed_formatter = logging.Formatter(timed_format, datefmt=time_format)
136 |     logger = logging.getLogger()
137 | 
138 |     # Delete all previous handlers.
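    # NOTE: iterate over a copy below; removeHandler() mutates
    # logger.handlers, and removing entries from the same list we're
    # iterating over would skip every other handler.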
139 |     for h in list(logger.handlers):
140 |         logger.removeHandler(h)
141 | 
142 |     # Used to track what levels are being used by handlers.
143 |     levels = []
144 | 
145 |     if stdout:
146 |         stdout_handler = logging.StreamHandler(sys.stdout)
147 |         stdout_handler.setFormatter(timed_formatter)
148 |         stdout_handler.setLevel(stdout)
149 |         levels.append(stdout)
150 |         logger.addHandler(stdout_handler)
151 | 
152 |     if filename:
153 |         file_handler = logging.FileHandler(filename, mode=file_mode)
154 |         file_handler.setFormatter(timed_formatter)
155 |         file_handler.setLevel(file_level)
156 |         levels.append(file_level)
157 |         logger.addHandler(file_handler)
158 | 
159 |     if syslog:
160 |         if not syslog_socket_path:
161 |             syslog_socket_path = get_syslog_path()
162 |         syslog_handler = SysLogHandler(syslog_socket_path,
163 |                                        facility=syslog_facility)
164 |         if not syslog_tag:
165 |             syslog_tag = os.path.basename(sys.argv[0])
166 |         syslog_format = syslog_tag + ": " + base_format
167 |         syslog_handler.setFormatter(logging.Formatter(syslog_format))
168 |         syslog_handler.setLevel(syslog)
169 |         levels.append(syslog)
170 |         logger.addHandler(syslog_handler)
171 | 
172 |     # Set the logger level to the level of the lowest leveled handler
173 |     logger.setLevel(min(levels))
174 | 
175 |     return logger
176 | 
177 | 
178 | def log_exception(message=None, logger=logging):
179 |     """
180 |     Used to produce more cleanly readable exceptions in syslog by breaking
181 |     the exception up over multiple logging calls.
182 |     """
183 |     if message:
184 |         logger.error(message)
185 |     logger.error('Exception output: ')
186 |     exc_msg = traceback.format_exc().split('\n')
187 |     for line in exc_msg:
188 |         logger.error(' %s' % (line,))
189 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    =
6 | SPHINXBUILD   = sphinx-build
7 | PAPER         =
8 | BUILDDIR      = _build
9 | 
10 | # User-friendly check for sphinx-build
11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
13 | endif
14 | 
15 | # Internal variables.
16 | PAPEROPT_a4     = -D latex_paper_size=a4
17 | PAPEROPT_letter = -D latex_paper_size=letter
18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
19 | # the i18n builder cannot share the environment and doctrees with the others
20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
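
# The variables above can be overridden per invocation, e.g. (illustrative
# invocations):
#   make html SPHINXOPTS="-W"    # treat Sphinx warnings as errors
#   make latexpdf PAPER=a4       # build the PDF with A4 paper size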
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/NYMMS.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/NYMMS.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/NYMMS" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/NYMMS" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
178 | -------------------------------------------------------------------------------- /nymms/probe/tests/test_probe.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | 4 | os.environ['PATH'] += ":/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin" 5 | os.environ['PATH'] += ":/usr/local/sbin" 6 | 7 | from nymms.probe.Probe import Probe, TIMEOUT_OUTPUT 8 | from nymms.schemas import types, Result, Task, StateRecord 9 | from nymms import resources 10 | from nymms.state.State import StateManager 11 | 12 | import arrow 13 | 14 | result_codes = [ 15 | types.STATE_CRITICAL, 16 | types.STATE_WARNING, 17 | types.STATE_CRITICAL, 18 | types.STATE_WARNING, 19 | types.STATE_WARNING, 20 | types.STATE_OK, 21 | types.STATE_OK, 22 | types.STATE_UNKNOWN, 23 | types.STATE_OK, 24 | types.STATE_OK 25 | ] 26 | 27 | true_output = "Good output" 28 | true_command = resources.Command('true_command', 'echo ' + true_output) 29 | fail_command = resources.Command('fail_command', 'false') 30 | sleep_command = resources.Command('sleep_command', 'sleep {{sleep_time}}') 31 | 32 | true_monitor = resources.Monitor('true_monitor', command=true_command) 33 | fail_monitor = resources.Monitor('fail_monitor', command=fail_command) 34 | sleep_monitor = resources.Monitor('sleep_monitor', command=sleep_command) 35 | 36 | 37 | class DummyStateBackend(object): 38 | def __init__(self): 39 | self.states = [ 40 | None, 41 | {'last_update': 1, 'last_state_change': 0, 42 | 'state': types.STATE_CRITICAL, 43 | 'state_type': types.STATE_TYPE_SOFT}, 44 | {'last_update': 2, 'last_state_change': 0, 45 | 'state': types.STATE_WARNING, 46 | 'state_type': types.STATE_TYPE_SOFT}, 47 | {'last_update': 3, 'last_state_change': 3, 48 | 'state': types.STATE_CRITICAL, 49 | 'state_type': types.STATE_TYPE_HARD}, 50 | {'last_update': 4, 'last_state_change': 4, 51 | 'state': types.STATE_WARNING, 52 | 'state_type': types.STATE_TYPE_HARD}, 53 | {'last_update': 5, 'last_state_change': 4, 54 | 'state': types.STATE_WARNING, 55 | 'state_type': types.STATE_TYPE_HARD}, 56 | {'last_update': 6, 'last_state_change': 6, 57 | 'state': types.STATE_OK, 'state_type': types.STATE_TYPE_HARD}, 58 | {'last_update': 7, 'last_state_change': 6, 59 | 'state': types.STATE_OK, 'state_type': types.STATE_TYPE_HARD}, 60 | {'last_update': 8, 'last_state_change': 8, 61 | 'state': types.STATE_UNKNOWN, 62 | 'state_type': types.STATE_TYPE_SOFT}, 63 | {'last_update': 9, 'last_state_change': 9, 64 | 'state': types.STATE_OK, 'state_type': types.STATE_TYPE_SOFT}, 65 | {'last_update': 10, 'last_state_change': 9, 66 | 'state': types.STATE_OK, 'state_type': types.STATE_TYPE_HARD} 67 | ] 68 | self.state_iter = iter(self.states) 69 | 70 | def get(self, item_id, consistent_read=True): 71 | item = next(self.state_iter) 72 | if item: 73 | item['id'] = item_id 74 | return item 75 | 76 | 77 | class DummyStateManager(StateManager): 78 | def __init__(self): 79 | self._backend = DummyStateBackend() 80 | self.schema_class = StateRecord 81 | 82 | 83 | class DummyProbe(Probe): 84 | def __init__(self, state_manager=DummyStateManager): 85 | self.task = Task({ 86 | 'id': 'test:task', 87 | 'context': {'monitor': {'name': 'true_monitor'}}}) 88 | self.state_manager = state_manager() 89 | self.results_iter = iter(self.state_manager.backend.states[1:]) 90 | 91 | def get_task(self, **kwargs): 92 | return self.task 93 | 94 | def resubmit_task(self, task, delay, **kwargs): 95 | self.task.increment_attempt() 96 | 97 | def submit_result(self, result, **kwargs): 
98 | if result.state_type == types.STATE_TYPE_HARD: 99 | self.task.attempt = 0 100 | return result 101 | 102 | def execute_task(self, task, timeout, **kwargs): 103 | result = Result({'id': task.id, 104 | 'timestamp': task.created, 105 | 'task_context': task.context}) 106 | r = next(self.results_iter) 107 | result.state = r['state'] 108 | result.state_type = r['state_type'] 109 | result.output = 'Some output here.' 110 | result.validate() 111 | return result 112 | 113 | 114 | class TestStateChange(unittest.TestCase): 115 | @classmethod 116 | def setUpClass(cls): 117 | cls.state_manager = DummyStateManager() 118 | cls.probe = DummyProbe() 119 | cls.probe.state_manager = cls.state_manager 120 | 121 | def test_state_change(self): 122 | # tests that our logic follows the nagios logic here 123 | # http://nagios.sourceforge.net/docs/3_0/statetypes.html 124 | # We take our example state changes from the table at the bottom 125 | t = self.probe.get_task() 126 | for i, code in enumerate(result_codes): 127 | r = self.probe.handle_task(t, monitor_timeout=30, 128 | max_retries=2) 129 | expected = self.state_manager.backend.states[i + 1] 130 | print "[%d] Result STATE/TYPE: %s/%s" % (i, r.state, r.state_type) 131 | print "[%d] Expected STATE/TYPE: %s/%s" % (i, expected['state'], 132 | expected['state_type']) 133 | self.assertEqual(r.state, expected['state']) 134 | self.assertEqual(r.state_type, expected['state_type']) 135 | self.probe.submit_result(r) 136 | 137 | def test_expiration(self): 138 | expiration = 30 139 | now = arrow.get() 140 | t = self.probe.get_task() 141 | t.created = now 142 | t.validate() 143 | self.assertFalse(self.probe.expire_task(t, expiration)) 144 | t.created = now.replace(seconds=-(expiration + 5)) 145 | t.validate() 146 | self.assertTrue(self.probe.expire_task(t, expiration)) 147 | 148 | 149 | class TestExecuteTask(unittest.TestCase): 150 | @classmethod 151 | def setUpClass(cls): 152 | cls.probe = Probe() 153 | cls.true_task = Task({ 154 | 'id': 'test:true_monitor', 155 | 'context': {'monitor': {'name': 'true_monitor'}}}) 156 | cls.fail_task = Task({ 157 | 'id': 'test:fail_monitor', 158 | 'context': {'monitor': {'name': 'fail_monitor'}}}) 159 | cls.timeout_task = Task({ 160 | 'id': 'test:timeout_monitor', 161 | 'context': {'monitor': {'name': 'sleep_monitor'}, 162 | 'sleep_time': 2}}) 163 | cls.probe._private_context = {} 164 | 165 | def test_successful_execute_task(self): 166 | result = self.probe.execute_task(self.true_task, 30) 167 | self.assertEqual(result.state, types.STATE_OK) 168 | self.assertEqual(result.output.strip(), true_output) 169 | 170 | def test_failed_execute_task(self): 171 | result = self.probe.execute_task(self.fail_task, 30) 172 | self.assertEqual(result.state, types.STATE_WARNING) 173 | self.assertEqual(result.output.strip(), '') 174 | 175 | def test_timeout_execute_task(self): 176 | timeout = 1 177 | result = self.probe.execute_task(self.timeout_task, timeout) 178 | self.assertEqual(result.state, types.STATE_UNKNOWN) 179 | self.assertEqual(result.output.strip(), TIMEOUT_OUTPUT % timeout) 180 | -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | Getting Started with NYMMS 3 | ========================== 4 | 5 | This tutorial will walk you through installing and configuring NYMMS. 
If you'd
6 | like to quickly spin up a NYMMS system to play with, please see
7 | the :doc:`Demo AMI <demo>` documentation.
8 | 
9 | This tutorial assumes a basic understanding of `Amazon Web Services`_. You will
10 | either need to understand how to launch an instance with an `instance profile`_
11 | with the appropriate permissions (see below) or you will need the
12 | ``Access Key ID`` and ``Secret Access Key`` for a user with the appropriate
13 | permissions.
14 | 
15 | 
16 | ----------------
17 | Installing NYMMS
18 | ----------------
19 | 
20 | On Ubuntu
21 | =========
22 | 
23 | Maintaining the Ubuntu packages proved to be difficult after NYMMS started
24 | using multiple third party python packages. Because of that, we no longer
25 | maintain the Ubuntu packages. Instead you should use the docker images (see
26 | below).
27 | 
28 | Using Docker
29 | ============
30 | 
31 | A docker image is provided that can be used to run any of the daemons used in
32 | NYMMS. It can be pulled from `phobologic/nymms`. To run the daemons, you can
33 | launch them with the following command::
34 | 
35 |     docker run -e "AWS_ACCESS_KEY_ID=<access_key_id>" -e "AWS_SECRET_ACCESS_KEY=<secret_access_key>" --rm -it phobologic/nymms:latest /[scheduler|probe|reactor]
36 | 
37 | For example, to run the scheduler (with verbose logging via the ``-v`` flag) you can run::
38 | 
39 |     docker run --rm -it phobologic/nymms:latest /scheduler -v
40 | 
41 | You can also set ``AWS_ACCESS_KEY_ID`` & ``AWS_SECRET_ACCESS_KEY`` in a file,
42 | and then use ``--env-file`` rather than specifying the variables on the command
43 | line (see the example below). Optionally, if you are running on a host in EC2
44 | that has an IAM profile with all the necessary permissions, you do not need to specify the keys at all.
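As a concrete illustration of the ``--env-file`` approach (the file name and
credential values below are placeholders; AWS's documented example keys are
used), you could put the keys in a file such as ``aws.env``::

    # aws.env -- one KEY=value pair per line, no quoting and no "export"
    AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE
    AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY

and then launch a daemon with::

    docker run --env-file ./aws.env --rm -it phobologic/nymms:latest /scheduler -v

This keeps the credentials out of your shell history and process listings.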
45 | 
46 | The docker container has the example config, which just checks that
47 | `www.google.com` is alive. It only has a single reactor handler enabled, the
48 | log handler, which logs to `/var/log/nymms/reactor.log`.
49 | 
50 | To use the docker container with your own configs, you should put them in a
51 | directory, then mount it as a volume when you run the containers. If you put
52 | the configs in the directory `/etc/nymms` on the host, you should run the
53 | container like this::
54 | 
55 |     docker run -v /etc/nymms:/etc/nymms:ro --rm -it phobologic/nymms:latest /scheduler -v
56 | 
57 | Using PIP
58 | =========
59 | 
60 | Since NYMMS is written in python I've also published it to `PyPI`_. You can
61 | install it with pip by running::
62 | 
63 |     pip install nymms
64 | 
65 | .. warning::
66 | 
67 |    The python library does not come with startup scripts, though it does
68 |    install the three daemon scripts in system directories. You should work on
69 |    your own startup scripts for the OS you are using.
70 | 
71 | .. _`PyPI`: https://pypi.python.org/pypi
72 | 
73 | Installing From Source
74 | ======================
75 | 
76 | You can also install from the latest source repo::
77 | 
78 |     git clone https://github.com/cloudtools/nymms.git
79 |     cd nymms
80 |     python setup.py install
81 | 
82 | .. warning::
83 | 
84 |    The python library does not come with startup scripts, though it does
85 |    install the three daemon scripts in system directories. You should work on
86 |    your own startup scripts for the OS you are using.
87 | 
88 | Using Virtual Environments
89 | ==========================
90 | 
91 | Another common way to install ``NYMMS`` is to use a `virtualenv`_, which
92 | provides isolated environments. This is also useful if you want to play with
93 | ``NYMMS`` but do not want to (or do not have the permissions to) install it as
94 | root. First install the ``virtualenv`` Python package::
95 | 
96 |     pip install virtualenv
97 | 
98 | Next you'll need to create a virtual environment to work in, using the newly
99 | installed ``virtualenv`` command and specifying a directory where you want
100 | the virtualenv to be created::
101 | 
102 |     mkdir ~/.virtualenvs
103 |     virtualenv ~/.virtualenvs/nymms
104 | 
105 | Now you need to activate the virtual environment::
106 | 
107 |     source ~/.virtualenvs/nymms/bin/activate
108 | 
109 | From there you can use either the instructions in `Using PIP`_ or
110 | `Installing From Source`_ above.
111 | 
112 | When you are finished using ``NYMMS`` you can deactivate your virtual
113 | environment with::
114 | 
115 |     deactivate
116 | 
117 | .. note::
118 | 
119 |    The deactivate command just unloads the virtualenv from that session.
120 |    The virtualenv still exists in the location you created it and can be
121 |    re-activated by running the activate command once more.
122 | 
123 | .. _`virtualenv`: http://www.virtualenv.org/en/latest/
124 | 
125 | 
126 | -----------
127 | Permissions
128 | -----------
129 | 
130 | NYMMS makes use of many of the `Amazon Web Services`_. In order for the
131 | daemons to use these services they have to be given access to them. Since
132 | NYMMS is written in python, we make heavy use of the `boto`_ library.
133 | Because of that we fall back on boto's way of dealing with credentials.
134 | 
135 | If you are running NYMMS on an EC2 instance, the preferred way to provide
136 | access is to use an `instance profile`_. If that is not possible (you do not
137 | run on EC2, or you don't understand how to set up the instance profile, etc.)
138 | then the next best way of providing the credentials is by creating an `IAM`_
139 | user with only the permissions necessary to run NYMMS. You would then need
140 | to get that user's Access Key ID & Secret Key and provide them as the
141 | environment variables ``AWS_ACCESS_KEY_ID`` and ``AWS_SECRET_ACCESS_KEY``.
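For example, with an IAM user's credentials (the values below are AWS's
documented placeholder keys; substitute your own), you would export the
variables before starting one of the daemons::

    export AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE
    export AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
    nymms_scheduler -v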
142 | 
143 | Whichever method you choose, you'll need to provide the following permission
144 | document (for either the user, or the role)::
145 | 
146 |     {
147 |       "Version": "2012-10-17",
148 |       "Statement": [
149 |         {
150 |           "Action": [
151 |             "ses:GetSendQuota",
152 |             "ses:SendEmail"
153 |           ],
154 |           "Sid": "NymmsSESAccess",
155 |           "Resource": [
156 |             "*"
157 |           ],
158 |           "Effect": "Allow"
159 |         },
160 |         {
161 |           "Action": [
162 |             "sns:ConfirmSubscription",
163 |             "sns:CreateTopic",
164 |             "sns:DeleteTopic",
165 |             "sns:GetTopicAttributes",
166 |             "sns:ListSubscriptions",
167 |             "sns:ListSubscriptionsByTopic",
168 |             "sns:ListTopics",
169 |             "sns:Publish",
170 |             "sns:SetTopicAttributes",
171 |             "sns:Subscribe",
172 |             "sns:Unsubscribe"
173 |           ],
174 |           "Sid": "NymmsSNSAccess",
175 |           "Resource": [
176 |             "*"
177 |           ],
178 |           "Effect": "Allow"
179 |         },
180 |         {
181 |           "Action": [
182 |             "sqs:ChangeMessageVisibility",
183 |             "sqs:CreateQueue",
184 |             "sqs:DeleteMessage",
185 |             "sqs:DeleteQueue",
186 |             "sqs:GetQueueAttributes",
187 |             "sqs:GetQueueUrl",
188 |             "sqs:ListQueues",
189 |             "sqs:ReceiveMessage",
190 |             "sqs:SendMessage",
191 |             "sqs:SetQueueAttributes"
192 |           ],
193 |           "Sid": "NymmsSQSAccess",
194 |           "Resource": [
195 |             "*"
196 |           ],
197 |           "Effect": "Allow"
198 |         },
199 |         {
200 |           "Action": [
201 |             "sdb:*"
202 |           ],
203 |           "Sid": "NymmsSDBAccess",
204 |           "Resource": [
205 |             "*"
206 |           ],
207 |           "Effect": "Allow"
208 |         }
209 |       ]
210 |     }
211 | 
212 | .. note::
213 | 
214 |    If you want to provide even tighter permissions, you can limit the SNS, SDB
215 |    and SQS stanzas to specific resources. You should provide the ARNs for
216 |    each of the resources necessary; see the example below.
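For instance, to lock the SQS stanza down to the queues NYMMS uses, you would
keep the same ``Action`` list but narrow its ``Resource`` entry (the ARN below
is hypothetical; substitute your own region, account ID, and queue name
prefix)::

    "Resource": [
        "arn:aws:sqs:us-east-1:123456789012:nymms_*"
    ]

The same pattern applies to the SNS and SDB stanzas, using ``arn:aws:sns:...``
and ``arn:aws:sdb:...`` ARNs respectively.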
217 | 
218 | 
219 | -------------
220 | Configuration
221 | -------------
222 | 
223 | Please see the :doc:`configuration ` page for information on how to
224 | configure ``NYMMS``. Usually the configuration files are located in
225 | ``/etc/nymms/config`` but that is not a requirement and all of the daemons
226 | accept the ``--config`` argument to point them at a new config file.
227 | 
228 | 
229 | .. _`Amazon Web Services`: https://aws.amazon.com/
230 | .. _`AWS`: https://aws.amazon.com/
231 | .. _`boto`: https://github.com/boto/boto
232 | .. _`instance profile`: http://docs.aws.amazon.com/IAM/latest/UserGuide/instance-profiles.html
233 | .. _`IAM`: http://aws.amazon.com/iam/
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # NYMMS documentation build configuration file, created by
4 | # sphinx-quickstart on Thu Oct 17 11:08:51 2013.
5 | #
6 | # This file is execfile()d with the current directory set to its containing dir.
7 | #
8 | # Note that not all possible configuration values are present in this
9 | # autogenerated file.
10 | #
11 | # All configuration values have a default; values that are commented out
12 | # serve to show the default.
13 | 
14 | import sys, os
15 | 
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 | sys.path.insert(0, os.path.abspath('.'))
20 | 
21 | # -- General configuration -----------------------------------------------------
22 | 
23 | # If your documentation needs a minimal Sphinx version, state it here.
24 | #needs_sphinx = '1.0'
25 | 
26 | # Add any Sphinx extension module names here, as strings. They can be extensions
27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 'sphinx.ext.ifconfig', 'sphinx.ext.viewcode', 'hidden_code_block']
29 | 
30 | # Add any paths that contain templates here, relative to this directory.
31 | templates_path = ['_templates']
32 | 
33 | # The suffix of source filenames.
34 | source_suffix = '.rst'
35 | 
36 | # The encoding of source files.
37 | #source_encoding = 'utf-8-sig'
38 | 
39 | # The master toctree document.
40 | master_doc = 'index'
41 | 
42 | # General information about the project.
43 | project = u'NYMMS'
44 | copyright = u'2013, Michael Barrett'
45 | 
46 | # The version info for the project you're documenting, acts as replacement for
47 | # |version| and |release|, also used in various other places throughout the
48 | # built documents.
49 | #
50 | # The short X.Y version.
51 | version = '0.2.6'
52 | # The full version, including alpha/beta/rc tags.
53 | release = '0.2.6'
54 | 
55 | # The language for content autogenerated by Sphinx. Refer to documentation
56 | # for a list of supported languages.
57 | #language = None
58 | 
59 | # There are two options for replacing |today|: either, you set today to some
60 | # non-false value, then it is used:
61 | #today = ''
62 | # Else, today_fmt is used as the format for a strftime call.
63 | #today_fmt = '%B %d, %Y'
64 | 
65 | # List of patterns, relative to source directory, that match files and
66 | # directories to ignore when looking for source files.
67 | exclude_patterns = ['_build']
68 | 
69 | # The reST default role (used for this markup: `text`) to use for all documents.
70 | #default_role = None
71 | 
72 | # If true, '()' will be appended to :func: etc. cross-reference text.
73 | #add_function_parentheses = True
74 | 
75 | # If true, the current module name will be prepended to all description
76 | # unit titles (such as .. function::).
77 | #add_module_names = True
78 | 
79 | # If true, sectionauthor and moduleauthor directives will be shown in the
80 | # output. They are ignored by default.
81 | #show_authors = False
82 | 
83 | # The name of the Pygments (syntax highlighting) style to use.
84 | pygments_style = 'sphinx'
85 | 
86 | # A list of ignored prefixes for module index sorting.
87 | #modindex_common_prefix = []
88 | 
89 | # If true, keep warnings as "system message" paragraphs in the built documents.
90 | #keep_warnings = False
91 | 
92 | 
93 | # -- Options for HTML output ---------------------------------------------------
94 | 
95 | # The theme to use for HTML and HTML Help pages. See the documentation for
96 | # a list of builtin themes.
97 | html_theme = 'default'
98 | 
99 | # Theme options are theme-specific and customize the look and feel of a theme
100 | # further. For a list of options available for each theme, see the
101 | # documentation.
102 | #html_theme_options = {}
103 | 
104 | # Add any paths that contain custom themes here, relative to this directory.
105 | #html_theme_path = []
106 | 
107 | # The name for this set of Sphinx documents. If None, it defaults to
108 | # "<project> v<release> documentation".
109 | #html_title = None
110 | 
111 | # A shorter title for the navigation bar. Default is the same as html_title.
112 | #html_short_title = None
113 | 
114 | # The name of an image file (relative to this directory) to place at the top
115 | # of the sidebar.
116 | #html_logo = None
117 | 
118 | # The name of an image file (within the static path) to use as favicon of the
119 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
120 | # pixels large.
121 | #html_favicon = None
122 | 
123 | # Add any paths that contain custom static files (such as style sheets) here,
124 | # relative to this directory. They are copied after the builtin static files,
125 | # so a file named "default.css" will overwrite the builtin "default.css".
126 | html_static_path = ['_static']
127 | 
128 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
129 | # using the given strftime format.
130 | #html_last_updated_fmt = '%b %d, %Y'
131 | 
132 | # If true, SmartyPants will be used to convert quotes and dashes to
133 | # typographically correct entities.
134 | #html_use_smartypants = True
135 | 
136 | # Custom sidebar templates, maps document names to template names.
137 | #html_sidebars = {}
138 | 
139 | # Additional templates that should be rendered to pages, maps page names to
140 | # template names.
141 | #html_additional_pages = {}
142 | 
143 | # If false, no module index is generated.
144 | #html_domain_indices = True
145 | 
146 | # If false, no index is generated.
147 | #html_use_index = True
148 | 
149 | # If true, the index is split into individual pages for each letter.
150 | #html_split_index = False
151 | 
152 | # If true, links to the reST sources are added to the pages.
153 | #html_show_sourcelink = True
154 | 
155 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
156 | #html_show_sphinx = True
157 | 
158 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
159 | #html_show_copyright = True
160 | 
161 | # If true, an OpenSearch description file will be output, and all pages will
162 | # contain a <link> tag referring to it. The value of this option must be the
163 | # base URL from which the finished HTML is served.
164 | #html_use_opensearch = ''
165 | 
166 | # This is the file name suffix for HTML files (e.g. ".xhtml").
167 | #html_file_suffix = None
168 | 
169 | # Output file base name for HTML help builder.
170 | htmlhelp_basename = 'NYMMSdoc'
171 | 
172 | 
173 | # -- Options for LaTeX output --------------------------------------------------
174 | 
175 | latex_elements = {
176 | # The paper size ('letterpaper' or 'a4paper').
177 | #'papersize': 'letterpaper',
178 | 
179 | # The font size ('10pt', '11pt' or '12pt').
180 | #'pointsize': '10pt',
181 | 
182 | # Additional stuff for the LaTeX preamble.
183 | #'preamble': '',
184 | }
185 | 
186 | # Grouping the document tree into LaTeX files. List of tuples
187 | # (source start file, target name, title, author, documentclass [howto/manual]).
188 | latex_documents = [
189 |   ('index', 'NYMMS.tex', u'NYMMS Documentation',
190 |    u'Michael Barrett', 'manual'),
191 | ]
192 | 
193 | # The name of an image file (relative to this directory) to place at the top of
194 | # the title page.
195 | #latex_logo = None
196 | 
197 | # For "manual" documents, if this is true, then toplevel headings are parts,
198 | # not chapters.
199 | #latex_use_parts = False
200 | 
201 | # If true, show page references after internal links.
202 | #latex_show_pagerefs = False
203 | 
204 | # If true, show URL addresses after external links.
205 | #latex_show_urls = False
206 | 
207 | # Documents to append as an appendix to all manuals.
208 | #latex_appendices = []
209 | 
210 | # If false, no module index is generated.
211 | #latex_domain_indices = True
212 | 
213 | 
214 | # -- Options for manual page output --------------------------------------------
215 | 
216 | # One entry per manual page. List of tuples
217 | # (source start file, name, description, authors, manual section).
218 | man_pages = [
219 |     ('index', 'nymms', u'NYMMS Documentation',
220 |      [u'Michael Barrett'], 1)
221 | ]
222 | 
223 | # If true, show URL addresses after external links.
224 | #man_show_urls = False
225 | 
226 | 
227 | # -- Options for Texinfo output ------------------------------------------------
228 | 
229 | # Grouping the document tree into Texinfo files. List of tuples
230 | # (source start file, target name, title, author,
231 | #  dir menu entry, description, category)
232 | texinfo_documents = [
233 |     ('index', 'NYMMS', u'NYMMS Documentation',
234 |      u'Michael Barrett', 'NYMMS', 'One line description of project.',
235 |      'Miscellaneous'),
236 | ]
237 | 
238 | # Documents to append as an appendix to all manuals.
239 | #texinfo_appendices = []
240 | 
241 | # If false, no module index is generated.
242 | #texinfo_domain_indices = True
243 | 
244 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
245 | #texinfo_show_urls = 'footnote'
246 | 
247 | # If true, do not generate a @detailmenu in the "Top" node's menu.
248 | #texinfo_no_detailmenu = False
249 | 
250 | 
251 | # Example configuration for intersphinx: refer to the Python standard library.
252 | intersphinx_mapping = {'http://docs.python.org/': None}
--------------------------------------------------------------------------------
/nymms/resources.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | logger = logging.getLogger(__name__)
4 | 
5 | import copy
6 | from weakref import WeakValueDictionary
7 | 
8 | from nymms import registry
9 | from nymms.utils import commands
10 | from nymms.config import yaml_config
11 | from nymms.exceptions import MissingCommandContext
12 | 
13 | from jinja2 import Template
14 | from jinja2.runtime import StrictUndefined
15 | from jinja2.exceptions import UndefinedError
16 | 
17 | 
18 | RESERVED_ATTRIBUTES = ['name', 'address', 'node_monitor', 'monitoring_groups',
19 |                        'command_string']
20 | 
21 | 
22 | class RegistryMetaClass(type):
23 |     """ Creates a registry of all objects of a class's type.
24 | 
25 |     This allows us to get a list of all objects of a given class type quickly
26 |     and easily. E.g.:
27 | 
28 |     >>> from nymms import resources
29 |     >>> webservers = resources.MonitoringGroup('webservers')
30 |     >>> www1 = resources.Node('www1', monitoring_groups=[webservers])
31 |     >>> resources.Node.registry.items()
32 |     [('www1', <nymms.resources.Node object at 0x...>)]
33 |     """
34 |     def __new__(cls, name, bases, dct):
35 |         new_class = super(RegistryMetaClass, cls).__new__(cls, name, bases,
36 |                                                           dct)
37 |         new_class.registry = registry.Registry(new_class)
38 |         return new_class
39 | 
40 | 
41 | class NanoResource(object):
42 |     __metaclass__ = RegistryMetaClass
43 | 
44 |     context_attributes = ['name']
45 | 
46 |     def __init__(self, name, **kwargs):
47 |         self.name = name
48 |         # Ensure no one tries to set a reserved attribute as an extra
49 |         disallowed_attributes = list(
50 |             set(RESERVED_ATTRIBUTES) & set(kwargs.keys()))
51 |         if disallowed_attributes:
52 |             raise TypeError("The following are reserved attributes and cannot "
53 |                             "be used on this resource: %s" % (', '.join(
54 |                                 disallowed_attributes)))
55 |         self.extra_attributes = kwargs
56 |         self._context_cache = None
57 | 
58 |         self.register()
59 | 
60 |     def register(self):
61 |         logger.debug(
62 |             "Registering %s resource '%s' with the '%s' registry.",
63 |             self.__class__.__name__, self.name, self.__class__.__name__)
64 |         self.registry[self.name] = self
65 | 
66 |     def _context(self, force=False):
67 |         if self._context_cache and not force:
68 |             logger.debug("Returning context cache for %s resource.", self.name)
69 |             return self._context_cache
70 |         logger.debug("Generating context cache for %s resource.", self.name)
71 |         context_key = self.__class__.__name__.lower()
72 |         context = {}
73 |         for attr in self.context_attributes:
74 |             context[attr] = getattr(self, attr)
75 |         for k, v in self.extra_attributes.items():
76 |             context[k] = v
77 |         self._context_cache = {context_key: context}
78 |         return self._context_cache
79 | 
80 |     def build_context(self, context):
81 |         context = copy.deepcopy(context)
82 |         c = self._context()
83 |         context.update(c)
84 |         for k, v in c.values()[0].iteritems():
85 |             if not k == 'name':
86 |                 context[k] = v
87 |         return context
88 | 
89 | 
90 | class MonitoringGroup(NanoResource):
91 |     context_attributes = ['name', 'realm']
92 | 
93 |     def __init__(self, name, realm=None, monitors=None, nodes=None, **kwargs):
94 |         self.nodes = WeakValueDictionary()
95 |         self.monitors = WeakValueDictionary()
96 |         self.realm = realm
97 | 
98 |         super(MonitoringGroup, self).__init__(name, **kwargs)
99 | 
100 |         if monitors:
101 |             for monitor in monitors:
102 |                 self.add_monitor(monitor)
103 |         if nodes:
104 |             for node in nodes:
105 |                 self.add_node(node)
106 | 
107 |     def add_node(self, node):
108 |         if not isinstance(node, Node):
109 |             try:
110 |                 node = Node.registry[node]
111 |             except KeyError:
112 |                 logger.error("Unable to find Node '%s' in registry.", node)
113 |         logger.debug("Adding node '%s' to monitoring group '%s'.", node.name,
114 |                      self.name)
115 |         self.nodes[node.name] = node
116 |         node.monitoring_groups[self.name] = self
117 | 
118 |     def add_monitor(self, monitor):
119 |         if not isinstance(monitor, Monitor):
120 |             try:
121 |                 monitor = Monitor.registry[monitor]
122 |             except KeyError:
123 |                 logger.error("Unable to find Monitor '%s' in registry.",
124 |                              monitor)
125 |         logger.debug("Adding monitor '%s' to monitoring group '%s'.",
126 |                      monitor.name, self.name)
127 |         self.monitors[monitor.name] = monitor
128 |         monitor.monitoring_groups[self.name] = self
129 | 
130 | 
131 | class Node(NanoResource):
132 |     context_attributes = ['name', 'realm', 'address', 'node_monitor']
133 | 
134 |     def __init__(self, name, realm=None, address=None, node_monitor=None,
135 |                  monitoring_groups=None, **kwargs):
136 |         self.name = name
137 |         self.realm = realm
138 |         self.address = address or name
139 |         self.node_monitor = node_monitor
140 |         self.monitoring_groups = WeakValueDictionary()
141 |         self._tasks = []
142 |         if monitoring_groups:
143 |             for group in monitoring_groups:
144 |                 if not isinstance(group, MonitoringGroup):
145 |                     try:
146 |                         group = MonitoringGroup.registry[group]
147 |                     except KeyError:
148 |                         logger.error("Unable to find MonitoringGroup '%s' "
149 |                                      "in registry, skipping.", group)
150 |                         continue
151 |                 group.add_node(self)
152 | 
153 |         super(Node, self).__init__(name, **kwargs)
154 | 
155 |     def _build_context(self, monitoring_group, monitor):
156 |         context = {}
157 |         for obj in (monitoring_group, self, monitor):
158 |             context = obj.build_context(context)
159 |         return context
160 | 
161 |     @property
162 |     def monitors(self):
163 |         if self._tasks:
164 |             return self._tasks
165 |         for group in self.monitoring_groups.itervalues():
166 |             for monitor in group.monitors.itervalues():
167 |                 self._tasks.append(self._build_context(group, monitor))
168 |         return self._tasks
169 | 
170 | 
171 | class Monitor(NanoResource):
172 |     context_attributes = ['name', 'realm']
173 | 
174 |     def __init__(self, name, command, realm=None, monitoring_groups=None,
175 |                  **kwargs):
176 |         self.name = name
177 |         self.realm = realm
178 |         if not isinstance(command, Command):
179 |             try:
180 |                 command = Command.registry[command]
181 |             except KeyError:
182 |                 logger.error("Unable to find Command '%s' in registry.",
183 |                              command)
184 |                 raise
185 | 
186 |         self.command = command
187 |         self.monitoring_groups = WeakValueDictionary()
188 |         if monitoring_groups:
189 |             for group in monitoring_groups:
190 |                 if not isinstance(group, MonitoringGroup):
191 |                     try:
192 |                         group = MonitoringGroup.registry[group]
193 |                     except KeyError:
194 |                         logger.error("Unable to find MonitoringGroup '%s' in "
195 |                                      "registry.", group)
196 |                         raise
197 |                 group.add_monitor(self)
198 | 
199 |         super(Monitor, self).__init__(name, **kwargs)
200 | 
201 |     def execute(self, context, timeout, private_context=None):
202 |         return self.command.execute(context, timeout, private_context)
203 | 
204 |     def format_command(self, context, private_context=None):
205 |         return self.command.format_command(context, private_context)
206 | 
207 | 
208 | class Command(NanoResource):
209 |     context_attributes = ['name', 'command_type', 'command_string']
210 | 
211 |     def __init__(self, name, command_string, command_type='nagios', **kwargs):
212 |         self.command_type = command_type
213 |         self.command_string = command_string
214 |         super(Command, self).__init__(name, **kwargs)
215 | 
216 |     def format_command(self, context, private_context=None):
217 |         my_context = self._context()
218 |         local_context = copy.deepcopy(context)
219 |         local_context.update(my_context)
220 |         local_context['__private'] = {}
221 |         if private_context:
222 |             local_context['__private'].update(private_context)
223 |         for k, v in my_context.values()[0].iteritems():
224 |             if not k == 'name' and k not in local_context:
225 |                 local_context[k] = v
226 |         t = Template(self.command_string)
227 |         t.environment.undefined = StrictUndefined
228 |         try:
229 |             out = t.render(local_context)
230 |         except UndefinedError as e:
231 |             raise MissingCommandContext(e.message)
232 |         return out
233 | 
234 |     def execute(self, context, timeout, private_context=None):
235 |         cmd = self.format_command(context, private_context)
236 |         return commands.execute(cmd, timeout)
237 | 
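# Illustrative sketch of how format_command() fills in a Command's Jinja2
# template; the command name, plugin path and context values below are
# hypothetical:
#
#     >>> check = Command('check_http', '{{path}}/check_http -H {{node.address}}')
#     >>> check.format_command({'path': '/usr/lib/nagios/plugins',
#     ...                       'node': {'address': 'www1.example.com'}})
#     '/usr/lib/nagios/plugins/check_http -H www1.example.com'
#
# Because the template's undefined handler is StrictUndefined, any
# {{placeholder}} that cannot be resolved from the context raises
# MissingCommandContext instead of silently rendering an empty string.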
238 | 
239 | def load_resource(resources, resource_class, reset=False):
240 |     """ Given a dictionary of resources of a given type, instantiate them.
241 | 
242 |     The instances are loaded into the resource class's registry.
243 |     """
244 |     if reset:
245 |         logger.debug("Clearing old %s entries from registry.",
246 |                      resource_class.__name__)
247 |         resource_class.registry.clear()
248 | 
249 |     for name, kwargs in resources.items():
250 |         if not kwargs:
251 |             kwargs = {}
252 |         resource_class(name, **kwargs)
253 | 
254 | 
255 | def load_resources(resource_file, reset=False):
256 |     """ Loads resources from a yaml formatted resource_file in the proper order.
257 | 
258 |     Returns a sha512 hash of the resources. The resources themselves are
259 |     stored in their individual registries.
260 |     """
261 |     LOAD_ORDER = [('commands', Command),
262 |                   ('monitoring_groups', MonitoringGroup),
263 |                   ('monitors', Monitor)]
264 | 
265 |     logger.info("Loading local resources from %s.", resource_file)
266 |     version, resources = yaml_config.load_config(resource_file)
267 | 
268 |     for resource_type, resource_class in LOAD_ORDER:
269 |         items = resources[resource_type]
270 |         load_resource(items, resource_class, reset=reset)
271 |     return version
272 | 
273 | 
274 | def load_nodes(node_file, reset=False):
275 |     """ Loads nodes from a yaml formatted file.
276 | 
277 |     Nodes are stored in the Node registry.
278 |     """
279 |     logger.info("Loading nodes from %s.", node_file)
280 |     version, nodes = yaml_config.load_config(node_file)
281 | 
282 |     items = nodes['nodes']
283 |     load_resource(items, Node, reset=reset)
284 |     return version
285 | 
--------------------------------------------------------------------------------