├── .gitignore
├── seneschal
│   ├── engine.py
│   ├── __init__.py
│   ├── messaging.py
│   └── managers.py
├── seneschal.py
├── docs
│   ├── seneschal_config_sample.yaml
│   └── tech_specs.md
├── test
│   └── test_tasks.py
└── seneschald.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.pyc
3 | 

--------------------------------------------------------------------------------
/seneschal/engine.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """Functions for the lifecycle of messages. The action code lives in other
4 | modules."""
5 | 
6 | 
7 | import logging
8 | 
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class Engine(object):
14 |     """Object responsible for fetching messages and delegating to
15 |     the business rules."""
16 |     running = True # When False, start shutting down.
17 | 
18 |     def __init__(self, config):
19 |         self.__dict__.update(config) # Absorb config
20 | 
21 |     def sweep(self):
22 |         """Loop over the work queue until it is exhausted, then return."""
23 |         while Engine.running:
24 |             did_work = self.do_one_message()
25 |             if not did_work:
26 |                 logger.debug('no more work')
27 |                 break
28 | 
29 |     def do_one_message(self):
30 |         """Check for incoming messages, and process the first. Return
31 |         True if a message was found, False otherwise."""
32 |         return False # TODO
33 | 

--------------------------------------------------------------------------------
/seneschal/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """Engine for handling messages that represent requests to process
4 | protected data. A request is a JSON file that contains:
5 | 
6 | 1. the desired action
7 | 2. the input data
8 | 3. the destination
9 | 4. (optional) parameters
10 | 
11 | The owner of the file is considered the sender of the message.
12 | 
13 | The lifecycle of a message:
14 | 
15 | 1. User invokes a command that:
16 |    a. writes the message in a temporary directory on the same filesystem as
17 |       the queue directory
18 |    b. moves the message to the queue directory
19 | 2. Automation (daemon):
20 |    a. Select the oldest message in the queue.
21 |    b. Move the message to the processing directory.
22 |    c. Log the message.
23 |    d. Apply business rules.
24 |    e. Execute and log the action.
25 |    f. Move the message to the finished directory.
26 | 3. Cleanup automation (crontab):
27 |    a. Creates a new directory named after a time period.
28 |    b. Moves all messages in the finished directory into the new directory.
29 |    c. Creates a tarball based on the new directory.
30 |    d. Verifies the tarball.
31 |    e. Deletes the new directory and its contents.
32 | 
33 | Some messages simply result in sending a request for human approval. The
34 | arrival of such an approval triggers the matching requested action.
35 | """
36 | 
37 | from .engine import Engine
38 | 
39 | 
40 | __author__ = """Walker Hale IV"""
41 | __email__ = 'walker.hale.iv@gmail.com'
42 | __version__ = '0.1.0'
43 | 
44 | __all__ = ['__author__', '__email__', '__version__', 'Engine']
45 | 

--------------------------------------------------------------------------------
/seneschal.py:
--------------------------------------------------------------------------------
1 | """User interface to the Seneschal automation system. Everything is a
2 | subcommand."""
3 | 
4 | 
5 | import argparse
6 | import json
7 | import sys
8 | 
9 | import yaml
10 | 
11 | from seneschal.messaging import leave_new_request
12 | 
13 | 
14 | emit = lambda *args: None # Do nothing
15 | 
16 | 
17 | def main():
18 |     global emit
19 |     args = parse_args()
20 |     if args.verbose:
21 |         emit = err_output
22 |     config = load_config_file(args.config_file)
23 |     seneschal_config = config['seneschal']
24 |     try:
25 |         args.func(seneschal_config, args)
26 |     except BrokenPipeError:
27 |         pass # Ignore. Something like head is truncating output.
28 |     finally:
29 |         sys.stderr.close() # Needed to suppress meaningless BrokenPipeError.
30 | 
31 | 
32 | def parse_args():
33 |     parser = argparse.ArgumentParser(description=__doc__)
34 |     parser.add_argument('-v', '--verbose', action='store_true')
35 |     parser.add_argument('config_file', help='path to YAML file')
36 |     subparsers = parser.add_subparsers(help='sub-commands')
37 | 
38 |     # create the parser for the "submit" command
39 |     parser_submit = subparsers.add_parser(
40 |         'submit',
41 |         help='Submit a request to the Seneschal automation system.'
42 |     )
43 |     parser_submit.add_argument('workflow', help='what to do')
44 |     parser_submit.add_argument('args', nargs='*', help='workflow-specific')
45 |     parser_submit.set_defaults(func=submit)
46 | 
47 |     # create the parser for the "ls" command
48 |     parser_ls = subparsers.add_parser('ls', help='List workflows')
49 |     parser_ls.set_defaults(func=ls)
50 | 
51 |     args = parser.parse_args()
52 | 
53 |     if not hasattr(args, 'func'):
54 |         parser.error('missing subcommand (submit, ls, etc.)')
55 | 
56 |     return args
57 | 
58 | 
59 | def load_config_file(config_file):
60 |     with open(config_file) as fin:
61 |         config = yaml.load(fin)
62 |     return config
63 | 
64 | 
65 | def submit(seneschal_config, args):
66 |     """Submit a request to the Seneschal automation system."""
67 |     emit('workflow:', args.workflow)
68 |     for arg in args.args:
69 |         emit(arg)
70 |     directory = seneschal_config['paths']['user_messages']
71 |     uuid_str = leave_new_request(directory, args.workflow, args.args)
72 |     print(uuid_str)
73 | 
74 | 
75 | def ls(seneschal_config, args):
76 |     """List workflows."""
77 |     print(args)
78 |     yaml.safe_dump(seneschal_config, sys.stdout, default_flow_style=False)
79 |     pass # TODO
80 | 
81 | 
82 | def err_output(*args):
83 |     """Send args to sys.stderr."""
84 |     print(*args, file=sys.stderr)
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     main()
89 | 

--------------------------------------------------------------------------------
/docs/seneschal_config_sample.yaml:
--------------------------------------------------------------------------------
1 | # This file is typically:
2 | # (1) maintained and deployed by System Administrators.
3 | # (2) assumed to be trusted.
4 | # (3) parsed by PyYAML running as root.
5 | 6 | # senechald config file 7 | # Specifies: 8 | # (1) logging settings 9 | # (2) daemon settings, like PID file location and UID 10 | # (3) Senechal operation settings, like location of event directories 11 | 12 | logging: 13 | formatters: 14 | verbose: 15 | format: '%(asctime)s %(levelname)-8s %(name)s %(module)s %(process)d %(message)s' 16 | audit_format: 17 | format: '%(asctime)s %(message)s' 18 | handlers: 19 | main: 20 | class : logging.handlers.RotatingFileHandler 21 | formatter: verbose 22 | filename: /var/log/seneschal/seneschal.log 23 | maxBytes: 40960 24 | backupCount: 1 25 | audit_handler: 26 | class : logging.handlers.RotatingFileHandler 27 | formatter: audit_format 28 | filename: /var/log/seneschal/audit.log 29 | maxBytes: 409600 30 | backupCount: 4 31 | loggers: 32 | audit: 33 | level: INFO 34 | handlers: 35 | - audit_handler 36 | root: 37 | level: DEBUG 38 | handlers: 39 | - main 40 | daemon: 41 | # Any settings left blank, get the default. 42 | # pidfile is REQUIRED 43 | pidfile: /var/run/seneschal.pid 44 | # working_directory default = '/' 45 | working_directory: 46 | # chroot_directory default = don`t chroot 47 | chroot_directory: 48 | # umask default = 0 49 | umask: 50 | # detach_process default = True unless determined to be already detached 51 | detach_process: 52 | # uid default = real UID as returned by getuid() (often root - 0) 53 | uid: 54 | # gid default = real GID as returned by getgid() (often root - 0) 55 | gid: 56 | # prevent_core default = True; set to False to enable a core dump 57 | prevent_core: 58 | seneschal: 59 | paths: 60 | # Depending on interpretation of the standards, you could drop "local/" or 61 | # replace "lib/" with "spool/" for "*_messages/"... 62 | # Where compute jobs will write events (They are not immediately deleted.) 63 | job_messages: /var/local/lib/seneschal/job_messages 64 | # Where user clients will write events (They are not immediately deleted.) 
65 | user_messages: /var/local/lib/seneschal/user_messages 66 | # Where state is maintained for tracking progress of automation requests 67 | requests: /var/local/lib/seneschal/requests 68 | # Where state is maintained for tracking progress of batch jobs 69 | jobs: /var/local/lib/seneschal/jobs 70 | # Where state is maintained for restarting subprocesses after system reboot 71 | subprocesses: /var/local/lib/seneschal/subprocesses 72 | # Where plugins are installed 73 | plugins: /usr/local/lib/seneschal/plugins 74 | plugins: 75 | md5: 76 | executable: /usr/bin/md5sum 77 | -------------------------------------------------------------------------------- /test/test_tasks.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | from seneschal import managers 4 | 5 | 6 | STATE_1_YAML = ''' 7 | type: sequence 8 | cwd: output_batch_dir 9 | zchildren: 10 | - type: parallel 11 | zchildren: 12 | - type: sequence 13 | executable: .../python3.6 14 | prefix_arguments: [.../topmed-1/scripts/copy_rename] 15 | child_type: subprocess 16 | zchildren: 17 | - arguments: [src_file1, dst_file1] 18 | - arguments: [src_file2, dst_file2] 19 | - type: parallel 20 | cwd: md5_dst_dir 21 | cores: 1 22 | executable: .../python3.6 23 | prefix_arguments: [.../topmed-1/scripts/md5_script] 24 | child_type: batch_job 25 | zchildren: 26 | - arguments: [src_file1, sample1] 27 | - arguments: [src_file2, sample2] 28 | - type: subprocess 29 | executable: .../python3.6 30 | arguments: [.../topmed-1/scripts/make_manifest, md5_dst_dir] 31 | ''' 32 | 33 | 34 | STATE_2_YAML = ''' 35 | cwd: output_batch_dir 36 | path: t 37 | type: sequence 38 | zchildren: 39 | - cwd: output_batch_dir 40 | index: 0 41 | path: t/0 42 | type: parallel 43 | zchildren: 44 | - child_type: subprocess 45 | cwd: output_batch_dir 46 | executable: '.../python3.6' 47 | index: 0 48 | path: t/0/0 49 | prefix_arguments: &id001 ['.../topmed-1/scripts/copy_rename'] 50 | type: sequence 51 | zchildren: 52 | - arguments: [src_file1, dst_file1] 53 | cwd: output_batch_dir 54 | executable: '.../python3.6' 55 | index: 0 56 | path: t/0/0/0 57 | prefix_arguments: *id001 58 | type: subprocess 59 | - arguments: [src_file2, dst_file2] 60 | cwd: output_batch_dir 61 | executable: '.../python3.6' 62 | index: 1 63 | path: t/0/0/1 64 | prefix_arguments: *id001 65 | type: subprocess 66 | - child_type: batch_job 67 | cores: 1 68 | cwd: md5_dst_dir 69 | executable: '.../python3.6' 70 | index: 1 71 | path: t/0/1 72 | prefix_arguments: &id002 ['.../topmed-1/scripts/md5_script'] 73 | type: parallel 74 | zchildren: 75 | - arguments: [src_file1, sample1] 76 | cores: 1 77 | cwd: md5_dst_dir 78 | executable: '.../python3.6' 79 | index: 0 80 | path: t/0/1/0 81 | prefix_arguments: *id002 82 | type: batch_job 83 | - arguments: [src_file2, sample2] 84 | cores: 1 85 | cwd: md5_dst_dir 86 | executable: '.../python3.6' 87 | index: 1 88 | path: t/0/1/1 89 | prefix_arguments: *id002 90 | type: batch_job 91 | - arguments: ['.../topmed-1/scripts/make_manifest', md5_dst_dir] 92 | cwd: output_batch_dir 93 | executable: '.../python3.6' 94 | index: 1 95 | path: t/1 96 | type: subprocess 97 | ''' 98 | 99 | T011 = ''' 100 | arguments: [src_file2, sample2] 101 | cores: 1 102 | cwd: md5_dst_dir 103 | executable: '.../python3.6' 104 | index: 1 105 | path: t/0/1/1 106 | prefix_arguments: ['.../topmed-1/scripts/md5_script'] 107 | type: batch_job 108 | ''' 109 | 110 | PATHS = ''' 111 | t 112 | t/0 113 | t/0/0 114 | t/0/0/0 115 | t/0/0/1 116 | t/0/1 117 | t/0/1/0 118 
| t/0/1/1 119 | t/1 120 | '''.split() 121 | 122 | 123 | def test_task_inheritance(): 124 | state = yaml.load(STATE_1_YAML) 125 | managers.propagate_inheritance(state) 126 | assert yaml.dump(state) == STATE_2_YAML[1:] 127 | 128 | 129 | def test_index_mappings(): 130 | state = yaml.load(STATE_2_YAML) 131 | index = managers.index_mappings(state) 132 | assert sorted(index) == PATHS 133 | for k, v in index.items(): 134 | assert k == v['path'] 135 | print(yaml.dump(index['t/0/1/1'])) 136 | assert index['t/0/1/1'] == yaml.load(T011) 137 | assert index['t'] == state 138 | -------------------------------------------------------------------------------- /seneschald.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Daemon control script. The seneschal daemon handles messages requesting 4 | processing or export of data located within the protected area.""" 5 | 6 | 7 | import argparse 8 | import logging 9 | import logging.config 10 | import os 11 | import signal 12 | import sys 13 | import syslog 14 | import time 15 | 16 | from daemon import DaemonContext 17 | from daemon.runner import is_pidfile_stale, emit_message 18 | from lockfile.pidlockfile import PIDLockFile 19 | import yaml 20 | 21 | from seneschal import Engine 22 | 23 | 24 | logger = logging.getLogger('seneschald') 25 | 26 | 27 | def main(): 28 | args = parse_args() 29 | daemon_command = args.daemon_command 30 | config = load_config_file(args.config_file) 31 | logging_config = config.pop('logging', None) 32 | daemon_config = config.pop('daemon') 33 | seneschal_config = config.pop('seneschal') 34 | try: 35 | if daemon_command == 'start': 36 | start(logging_config, daemon_config, seneschal_config) 37 | else: 38 | config_logging(logging_config) 39 | if daemon_command == 'stop': 40 | stop(daemon_config) 41 | else: 42 | engine = Engine(seneschal_config) 43 | if daemon_command == 'sweep': 44 | engine.sweep() 45 | except Exception as e: 46 | emit_message(e) 47 | sys.exit(1) 48 | finally: 49 | logging.shutdown() 50 | 51 | 52 | def parse_args(): 53 | parser = argparse.ArgumentParser(description=__doc__) 54 | parser.add_argument('config_file', help='path to YAML file') 55 | parser.add_argument('daemon_command', choices=['start', 'stop', 'sweep']) 56 | args = parser.parse_args() 57 | return args 58 | 59 | 60 | def load_config_file(config_file): 61 | with open(config_file) as fin: 62 | config = yaml.load(fin) 63 | return config 64 | 65 | 66 | def start(logging_config, daemon_config, seneschal_config): 67 | syslog.openlog('seneschal', 0, syslog.LOG_USER) 68 | engine = Engine(seneschal_config) 69 | pidfile, daemon_options = check_daemon_options(daemon_config) 70 | if is_pidfile_stale(pidfile): 71 | syslog.syslog(syslog.LOG_NOTICE, 'breaking stale PID file') 72 | pidfile.break_lock() 73 | # The remaining entries in daemon_options will be passed as-is to 74 | # daemon.DaemonContext. 
75 | context = DaemonContext(pidfile=pidfile, **daemon_options) 76 | context.signal_map = make_signal_map() 77 | syslog.syslog(syslog.LOG_NOTICE, 'starting daemon context') 78 | try: 79 | with context: # Will fail if daemon already running 80 | pid = os.getpid() 81 | syslog.syslog(syslog.LOG_NOTICE, 'daemon running as: %s' % pid) 82 | config_logging(logging_config) 83 | logger.debug('========================================') 84 | logger.info('daemon running pid=%s', pid) 85 | logger.debug('args: %r', sys.argv) 86 | logger.debug('daemon_options: %r', daemon_options) 87 | logger.debug('seneschal_config: %r', seneschal_config) 88 | while Engine.running: 89 | engine.sweep() 90 | time.sleep(1) 91 | # TODO: Long polling times, may result in an unacceptable 92 | # delay during daemon shutdown. 93 | except Exception as e: 94 | syslog.syslog(syslog.LOG_ERR, str(e)) 95 | logger.exception(repr(e)) 96 | raise 97 | finally: 98 | syslog.syslog(syslog.LOG_NOTICE, 'exiting') 99 | logger.info('exiting') 100 | 101 | 102 | def stop(daemon_config): 103 | """Standard daemon stop logic.""" 104 | pidfile, _ = check_daemon_options(daemon_config) 105 | if not pidfile.is_locked(): 106 | error = DaemonStopError( 107 | "PID file {pidfile.path!r} not locked".format(pidfile=pidfile) 108 | ) 109 | raise error 110 | if is_pidfile_stale(pidfile): 111 | syslog.syslog(syslog.LOG_NOTICE, 'breaking stale PID file') 112 | pidfile.break_lock() 113 | else: 114 | pid = pidfile.read_pid() 115 | try: 116 | os.kill(pid, signal.SIGTERM) 117 | except OSError as exc: 118 | error = DaemonStopError( 119 | "Failed to terminate {pid:d}: {exc}".format(pid=pid, exc=exc) 120 | ) 121 | raise error 122 | 123 | 124 | def check_daemon_options(daemon_config): 125 | """Returns the pidfile object and non-default daemon settings; 126 | dies if there are any illegal settings.""" 127 | check_for_illegal_daemon_options(daemon_config) 128 | daemon_options = {k: v for k, v in daemon_config.items() if v is not None} 129 | pidfile_path = daemon_options.pop('pidfile') 130 | pidfile = PIDLockFile(pidfile_path) 131 | return pidfile, daemon_options 132 | 133 | 134 | def check_for_illegal_daemon_options(daemon_config): 135 | """Error out and die if any illegal options.""" 136 | LEGAL_DAEMON_OPTIONS = set(''' 137 | pidfile 138 | working_directory 139 | chroot_directory 140 | umask 141 | detach_process 142 | uid 143 | gid 144 | prevent_core 145 | '''.split()) 146 | illegal_options = set(daemon_config) - LEGAL_DAEMON_OPTIONS 147 | if illegal_options: 148 | logger.critical('illegal daemon options in YAML config file: %r', 149 | list(illegal_options)) 150 | sys.exit(1) 151 | 152 | 153 | def make_signal_map(): 154 | result = { 155 | signal.SIGTERM: trigger_shutdown, 156 | signal.SIGHUP: None, 157 | signal.SIGTTIN: None, 158 | signal.SIGTTOU: None, 159 | signal.SIGTSTP: None, 160 | } 161 | return result 162 | 163 | 164 | def config_logging(logging_config_dict): 165 | if logging_config_dict: 166 | config = dict(logging_config_dict, 167 | version=1, 168 | disable_existing_loggers=False) 169 | logging.config.dictConfig(config) 170 | else: 171 | logging.basicConfig(level=logging.NOTSET) 172 | 173 | 174 | def trigger_shutdown(signum, frame): 175 | """Set global `running` to False, to trigger shutdown.""" 176 | syslog.syslog(syslog.LOG_NOTICE, 'term signal') 177 | Engine.running = False 178 | 179 | 180 | class DaemonStopError(RuntimeError): 181 | """Either daemon not running or as OS error.""" 182 | 183 | 184 | if __name__ == "__main__": 185 | main() 186 | 
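# --- Added note (not part of the original file): a minimal usage sketch. ---
# All values below are hypothetical placeholders, not a real deployment; the
# sketch only mirrors how main() feeds a parsed YAML mapping into the helpers
# defined above.
#
#     import yaml
#     from seneschal import Engine
#
#     config = yaml.safe_load('''
#     daemon:
#       pidfile: /tmp/seneschal-test.pid
#       umask:                # blank entries fall back to python-daemon defaults
#     seneschal:
#       paths: {user_messages: /tmp/seneschal-drop}
#     ''')
#     pidfile, options = check_daemon_options(config['daemon'])
#     assert options == {}    # blank settings were filtered out, pidfile was popped
#     Engine(config['seneschal']).sweep()   # one idle pass over the work queue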
-------------------------------------------------------------------------------- /seneschal/messaging.py: -------------------------------------------------------------------------------- 1 | """JSON file based messaging. User client software makes requests by executing 2 | `leave_new_request`. The rest of this module supports the automation engine.""" 3 | 4 | from json import dump, load 5 | import logging 6 | from pathlib import Path 7 | from uuid import uuid4 8 | 9 | 10 | # TODO: Start auditing messages. 11 | 12 | # Channels 13 | REQUEST = 'REQUEST' 14 | JOB = 'JOB' 15 | SUBPROCESS = 'SUBPROCESS' 16 | 17 | # MessageDrop directory names 18 | TEMP = '0_temp' 19 | INBOX = '1_inbox' 20 | RECEIVED = '2_received' 21 | ERROR = '3_error' 22 | 23 | # Message types 24 | NEW = 'NEW' 25 | STARTED = 'STARTED' 26 | SUCCEEDED = 'SUCCEEDED' 27 | FAILED = 'FAILED' 28 | 29 | # JSON message keys 30 | ILLEGAL_JSON_KEYS = {'channel', 'uid', 'user_name'} 31 | REQUIRED_JSON_KEYS = {'message_type', 'target_id', 'uuid_str'} 32 | 33 | 34 | logger = logging.getLogger(__name__) 35 | 36 | 37 | def leave_new_request(directory, workflow, arg_list): 38 | """On behalf of a user, creates a new Request message file, using 39 | `directory` as the root of a message drop. The requested automation is 40 | named by `workflow`. This is the only code in this module that client 41 | software needs to invoke in order to create a request.""" 42 | uuid_str = leave_message(directory, NEW, 43 | workflow=workflow, arg_list=arg_list) 44 | return uuid_str 45 | 46 | 47 | class Message: 48 | """Contains a single message as a JSON file. `channel` is where the message 49 | should go. `target_id` is the ID of a specific object that should get the 50 | message. `message_type` defines the general type of the message. Any 51 | additional attributes are specific to the type of message.""" 52 | def __init__(self, *, channel, target_id, message_type, **kwds): 53 | """Raises an exception if the message fails certain validity checks.""" 54 | self.channel = channel 55 | self.target_id = target_id 56 | self.message_type = message_type 57 | self.__dict__.update(kwds) 58 | pass # TODO 59 | 60 | 61 | class MessageBroker: 62 | """Responsible for creating, receiving, and dispatching messages, which 63 | are serialized as JSON files.""" 64 | def __init__(self, seneschal_config, 65 | request_manager, job_manager, subprocess_manager): 66 | job_messages_path = seneschal_config['paths']['job_messages'] 67 | user_messages_path = seneschal_config['paths']['user_messages'] 68 | self.message_drops = ( 69 | MessageDrop(directory=user_messages_path, channel=REQUEST), 70 | MessageDrop(directory=job_messages_path, channel=JOB) 71 | ) 72 | self.managers = { 73 | REQUEST: request_manager, 74 | JOB: job_manager, 75 | SUBPROCESS: subprocess_manager 76 | } 77 | for manager in self.managers.values(): 78 | manager.set_message_broker(self) 79 | 80 | def attempt_to_deliver_one_left_message(self): 81 | """Check the message drops for messages and if possible, deliver one 82 | message to the corresponding manager. 
Returns True if the MessageBroker 83 | delivered a message.""" 84 | for message_drop in self.message_drops: 85 | message = message_drop.fetch_message() 86 | if message: 87 | self.deliver_one_message(message) 88 | return True 89 | return False 90 | 91 | def deliver_one_message(self, message): 92 | """Deliver the message to the target manager, based on channel.""" 93 | manager = self.managers[message.channel] 94 | manager.receive_message(message) 95 | 96 | 97 | class MessageDrop(object): 98 | """Represents a filesystem directory that contains a `TEMP` directory and 99 | an `INBOX` directory. Messages are placed in the drop by writing a new 100 | JSON file with a unique name to the `TEMP` directory and then moving that 101 | file to the `INBOX` directory. Messages left here should not contain uid, 102 | user_name, or channel. When messages are read back into memory, they are 103 | augmented with these values. The user is the owner of the file.""" 104 | def __init__(self, *, directory, channel, **kwds): 105 | """Parameters: `directory` must contain `TEMP`, `INBOX`, and 106 | `RECEIVED`; `channel` is only used when fetching messages.""" 107 | super().__init__(**kwds) 108 | self.directory = Path(directory) 109 | self.channel = channel 110 | 111 | @property 112 | def inbox(self): 113 | """Returns `self.directory / INBOX`.""" 114 | return self.directory / INBOX 115 | 116 | @property 117 | def received(self): 118 | """Returns `self.directory / RECEIVED`.""" 119 | return self.directory / RECEIVED 120 | 121 | @property 122 | def error(self): 123 | """Returns `self.directory / ERROR`.""" 124 | return self.directory / ERROR 125 | 126 | def leave_message(self, message_type, target_id=None, **kwds): 127 | """Write a new JSON file into the `TEMP` directory and then move that 128 | file into the `INBOX` directory. The JSON file is an object (dict) 129 | that contains the combination of message_type, target_id, kwds, and 130 | a UUID, which is also used to name the file. The UUID is generated 131 | inside this method using `uuid4`, so as to insure that the message and 132 | file have unique names. Returns the UUID as a str. This method is 133 | usually invoked from client software that does not call any other 134 | methods in this module.""" 135 | # TODO: Consider that we may want to create a UUID based on a previous 136 | # UUID, such as events for an existing job or request, using a 137 | # UUID5 algorith taking the existing UUID as the namespace. 138 | leave_message(self.directory, message_type, target_id, **kwds) 139 | 140 | def fetch_message(self): 141 | """Return the next message or `None`. 
142 |         the `INBOX` directory, loads it, moves it into the `RECEIVED`
143 |         directory, and returns the resulting `Message` object."""
144 |         # All messages, oldest first:
145 |         message_paths = sorted(self.inbox.glob('*.json'),
146 |                                key=lambda x: x.stat().st_mtime)
147 |         message = None
148 |         # If there are messages, keep processing until we find a good one:
149 |         for message_path in message_paths:
150 |             name = message_path.name
151 |             try:
152 |                 message = load_message(message_path, self.channel)
153 |             except ValueError as e:
154 |                 logger.exception(f'problem loading {name}')
155 |                 message_path.rename(self.error / name)
156 |             else:
157 |                 message_path.rename(self.received / name)
158 |                 logger.info(f'received {name}')
159 |                 break
160 |         return message
161 | 
162 | 
163 | def load_message(message_path, channel):
164 |     """Return the `Message` object at message_path, filling in `channel`,
165 |     `uid`, and `user_name`. Will raise a subclass of `ValueError` if the file
166 |     has a bad set of keys or the UUID in the file does not match the name of
167 |     the file."""
168 |     message_path = Path(message_path)
169 |     with message_path.open() as fin:
170 |         message_mapping = load(fin)
171 |     message_keys = set(message_mapping)
172 |     illegal_keys = ILLEGAL_JSON_KEYS & message_keys
173 |     missing_keys = REQUIRED_JSON_KEYS - message_keys
174 |     if illegal_keys:
175 |         raise IllegalJSONKeysError(
176 |             f'illegal keys in {message_path.name}: {illegal_keys}'
177 |         )
178 |     if missing_keys:
179 |         raise MissingJSONKeysError(
180 |             f'missing keys in {message_path.name}: {missing_keys}'
181 |         )
182 |     uid = message_path.stat().st_uid
183 |     try:
184 |         user_name = message_path.owner()
185 |     except Exception as e:
186 |         logger.exception(f'problem getting owner name for {message_path.name}')
187 |         user_name = str(uid)
188 |     message = Message(channel=channel,
189 |                       uid=uid,
190 |                       user_name=user_name,
191 |                       **message_mapping)
192 |     if message.uuid_str != message_path.stem:
193 |         raise ValueError(
194 |             f'wrong UUID in {message_path.name}: {message.uuid_str}'
195 |         )
196 |     return message
197 | 
198 | 
199 | def leave_message(directory, message_type, target_id=None, **kwds):
200 |     """Using `directory` as the root of a message drop, write a new JSON file
201 |     into the `TEMP` directory and then move that file into the `INBOX`
202 |     directory. The JSON file is an object (dict) that contains the combination
203 |     of message_type, target_id, kwds, and a UUID, which is also used to name
204 |     the file. The UUID is generated inside this method using `uuid4`, so as to
205 |     ensure that the message and file have unique names. Returns the UUID as a
206 |     str.
This function is usually invoked from client software that does not 207 | call anything else in this module.""" 208 | assert 'uuid_str' not in kwds, kwds 209 | assert 'channel' not in kwds, kwds 210 | directory_path = Path(directory) 211 | uuid_str = str(uuid4()) 212 | file_name = uuid_str + '.json' 213 | initial_path = directory_path / TEMP / file_name 214 | final_path = directory_path / INBOX / file_name 215 | message = dict(uuid_str=uuid_str, 216 | message_type=message_type, 217 | target_id=target_id, 218 | **kwds) 219 | with initial_path.open('w') as fout: 220 | dump(message, fout, sort_keys=True) 221 | initial_path.rename(final_path) 222 | return uuid_str 223 | 224 | 225 | class MissingJSONKeysError(ValueError): 226 | """Some required keys were missing.""" 227 | pass 228 | 229 | 230 | class IllegalJSONKeysError(ValueError): 231 | """Some illegal keys were present in a JSON file.""" 232 | pass 233 | -------------------------------------------------------------------------------- /seneschal/managers.py: -------------------------------------------------------------------------------- 1 | """Implements the managers, which are objects in between the engine and the 2 | workers. There are four domains: user requests, batch jobs, local 3 | subprocesses, and plugins. Everything begins with a user request, but almost 4 | all actual work is defined by a plugin and then executed as either a batch 5 | job or a subprocess. 6 | 7 | Stateful workers are persisted to the filesystem in paths like this: 8 | 9 | SOME_ROOT/by_uuid/12300000-0000-0000-0000-000000000000/0.json 10 | SOME_ROOT/by_uuid/12300000-0000-0000-0000-000000000000/1.json 11 | SOME_ROOT/by_uuid/12300000-0000-0000-0000-000000000000/2.json 12 | 13 | In this example, SOME_ROOT is the directory associated with a manager, 123... 14 | is the UUID or ID of the worker and 2.json is the most recent state for that 15 | worker 16 | 17 | """ 18 | 19 | from json import dump, load 20 | import logging 21 | from pathlib import Path 22 | 23 | from .messaging import NEW, STARTED, SUCCEEDED, FAILED 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | UUID_GLOB = '????????-????-????-????-????????????' 29 | WORKER_GLOB = 'by_uuid/' + UUID_GLOB 30 | 31 | 32 | class Manager: 33 | """Abstract base class that encapsulates the concept of workers, an 34 | associated filesystem directory containing worker-specific subdirectories, 35 | the ability to load workers from their subdirectories during startup, 36 | a registry of workers by ID, and the ability to send messages to a 37 | `messaging.MessageBroker`. Subclasses must implement `load`.""" 38 | 39 | def __init__(self, *, directory, worker_class, **kwds): 40 | """Load state from directory into memory.""" 41 | super().__init__(**kwds) 42 | self.directory = Path(directory) 43 | self.worker_class = worker_class 44 | self.message_broker = None # See set_message_broker 45 | assert self.directory.is_dir() 46 | self.registry = dict() # ID -> worker 47 | for subdir in self.directory.glob(WORKER_GLOB): 48 | worker_params = self.load(subdir) 49 | self.add_worker(worker_params) 50 | 51 | def set_message_broker(self, message_broker): 52 | """Called after construction to install the `messaging.MessageBroker`. 53 | Should only be called once.""" 54 | assert self.message_broker is None 55 | self.message_broker = message_broker 56 | 57 | def load(self, subdir): 58 | """Abstract method to load worker state by reading the filesystem at 59 | `subdir`. 
Returns a `dict` of worker state."""
60 |         raise NotImplementedError('abstract method')
61 | 
62 |     def add_worker(self, worker_params):
63 |         """Construct a new worker, and install it in the `registry`.
64 |         Return the new worker's ID."""
65 |         worker = self.worker_class(worker_params)
66 |         self.registry[worker.id] = worker
67 |         return worker.id
68 | 
69 | 
70 | class MessageReceiver(Manager):
71 |     """Abstract base class for Manager that can receive external messages.
72 |     Subclasses must implement `load`."""
73 | 
74 |     def receive_message(self, message):
75 |         """Called by `messaging.MessageBroker`, returns nothing. If
76 |         `target_id` is None, then `message_type` should be `NEW`, in which
77 |         case we should create a new worker object with the message payload.
78 |         Otherwise, `target_id` should reference an existing object, and we
79 |         should pass the message to that object by method call."""
80 |         worker_params = dict(vars(message))
81 |         # Remove parameters no longer needed.
82 |         worker_params.pop('channel')
83 |         worker_params.pop('message_type')
84 |         worker_params.pop('target_id')
85 |         if message.target_id is None:
86 |             assert message.message_type == NEW
87 |             worker = self.registry[self.add_worker(worker_params)]
88 |         else:
89 |             worker = self.registry[message.target_id]
90 |             worker.receive_message(worker_params)
91 |         worker.save()
92 | 
93 | 
94 | class RequestManager(MessageReceiver):
95 |     """The Manager for all Request objects."""
96 | 
97 |     def __init__(self, **kwds):
98 |         """Load state from directory into memory."""
99 |         super().__init__(**kwds, worker_class=Request)
100 | 
101 |     def load(self, subdir):
102 |         """Required by `Manager`. Delegates to `load_most_recent_state`."""
103 |         return load_most_recent_state(subdir)
104 | 
105 | 
106 | class DictProxy:
107 |     """A class that links its state to an existing dict; a flyweight facade
108 |     wrapping a dict. Any changes to the object are changes to the dict."""
109 |     def __init__(self, mapping):
110 |         self.__dict__ = mapping
111 | 
112 |     def __repr__(self):
113 |         return '%s(%r)' % (self.__class__.__name__, self.__dict__)
114 | 
115 | 
116 | class Request(DictProxy):
117 |     pass # TODO
118 | 
119 | 
120 | class Task(DictProxy):
121 |     """Abstract base class for all tasks. Every `Task` obtains and maintains
122 |     its state in an external dict."""
123 |     # The static method `register_concrete_subclass` is a class decorator
124 |     # that will populate concrete_subclasses, which enables `from_dict`.
125 |     concrete_subclasses = {}
126 | 
127 |     def __init__(self, mapping):
128 |         """Validates mapping and delegates construction to superclass."""
129 |         assert 'type' in mapping
130 |         super().__init__(mapping)
131 | 
132 |     @staticmethod
133 |     def from_dict(mapping):
134 |         """Return the appropriate type of `Task` object based on the contents
135 |         of `mapping`."""
136 |         assert isinstance(mapping, dict)
137 |         subclass_selection = mapping['type']
138 |         subclass = Task.concrete_subclasses[subclass_selection]
139 |         return subclass(mapping)
140 | 
141 |     @staticmethod
142 |     def register_concrete_subclass(cls):
143 |         """A class decorator that registers concrete Task subclasses.
144 |         Subclasses must implement a class member __task_type_id__ as the str
145 |         that will be used during deserialization to select the appropriate
146 |         subclass."""
147 |         Task.concrete_subclasses[cls.__task_type_id__] = cls
148 |         return cls
149 | 
150 |     def start(self, message_broker):
151 |         """Abstract method. Invoked by the `Request` or a parent `CompoundTask`
152 |         when it is time for this `Task` to start. Subclasses should either
Subclasses should either 153 | `start` a sub-task or send a `Message` by invoking 154 | `message_broker.deliver_one_message`.""" 155 | raise NotImplementedError 156 | 157 | 158 | @Task.register_concrete_subclass 159 | class BatchJobTask(Task): 160 | """Executes asynchronously in a batch job.""" 161 | __task_type_id__ = 'batch_job' 162 | pass # TODO 163 | 164 | 165 | @Task.register_concrete_subclass 166 | class SubprocessTask(Task): 167 | """Executes asynchronously in a subprocess.""" 168 | __task_type_id__ = 'subprocess' 169 | pass # TODO 170 | 171 | 172 | class CompoundTask(Task): 173 | """A `Task` that implements `children`, which must be a list of `Tasks`.""" 174 | # We store the children under a key named "zchildren" for a serialized 175 | # state that is easier to read. 176 | 177 | def __getitem__(self, key): 178 | return self.children[key] 179 | 180 | @property 181 | def children(self): 182 | """Returns a list of the appropriate `Task` objects.""" 183 | return [Task.from_dict(child) for child in self.zchildren] 184 | 185 | 186 | @Task.register_concrete_subclass 187 | class SequenceTask(CompoundTask): 188 | """Executes a list of child `Task`s in sequence.""" 189 | __task_type_id__ = 'sequence' 190 | pass # TODO 191 | 192 | 193 | @Task.register_concrete_subclass 194 | class ParallelTask(CompoundTask): 195 | """Executes a list of child `Task`s in parallel.""" 196 | __task_type_id__ = 'parallel' 197 | pass # TODO 198 | 199 | 200 | def load_most_recent_state(state_files_dir): 201 | """Read the highest numbered state file as JSON and return the result.""" 202 | assert isinstance(state_files_dir, Path), state_files_dir 203 | assert state_files_dir.match(UUID_GLOB), state_files_dir 204 | state_files = sorted(enumerate_numbered_json_files(state_files_dir)) 205 | assert state_files, state_files_dir 206 | state_file = state_files[-1][1] 207 | with state_file.open() as fin: 208 | state = load(fin) 209 | return state 210 | 211 | 212 | def enumerate_numbered_json_files(directory): 213 | """Yield pairs of num & json_file_path.""" 214 | for json_file_path in directory.glob('*.json'): 215 | try: 216 | num = int(json_file_path.stem) 217 | yield num, json_file_path 218 | except Exception as e: 219 | pass # TODO: log the unusual file 220 | 221 | 222 | def propagate_inheritance(mapping, path='t'): 223 | """Given a mapping that represents the state of a possibly compound `Task`, 224 | bestows inheritable attributes to children that have not overridden 225 | those attributes. Each `Task` will have a path relative to the `Request` 226 | object. The root `Task` has a path of "t", the zeroth child has a path 227 | of "t/0", and the zeroth grandchild has a path of "t/0/0".""" 228 | mapping['path'] = path 229 | if 'zchildren' not in mapping: 230 | return # Nothing to do 231 | child_type = mapping.get('child_type', None) 232 | keys = set(mapping) # Will be the set of inheritable keys. 
233 | # Children do not inherit these: 234 | keys.discard('zchildren') 235 | keys.discard('child_type') # optional 236 | keys.remove('type') # but they can get type from child_type 237 | # Give the children their inheritances: 238 | for index, child_mapping in enumerate(mapping['zchildren']): 239 | child_mapping['index'] = index 240 | if not 'type' in child_mapping: 241 | assert child_type is not None 242 | child_mapping['type'] = child_type 243 | for key in keys: 244 | if key not in child_mapping: 245 | child_mapping[key] = mapping[key] 246 | # Give the child object a chance to initialize state: 247 | propagate_inheritance(child_mapping, f'{path}/{index}') 248 | 249 | 250 | def index_mappings(mapping): 251 | """Return a `dict` of all nested mappings, including the root, where the 252 | key is the value for "path". Uses `iterate_nested_mappings`, and requires 253 | that every mapping have a key of "path".""" 254 | result = {value['path']: value 255 | for value in iterate_nested_mappings(mapping)} 256 | return result 257 | 258 | 259 | def iterate_nested_mappings(mapping): 260 | """Generator function that top-down iterates all the mapping objects. 261 | Children are expected to be in a iterable under the key `zchildren`. The 262 | iterator always yields something, since the first value is always 263 | the `mapping` parameter.""" 264 | yield mapping 265 | for child_mapping in mapping.get('zchildren', ()): 266 | yield from iterate_nested_mappings(child_mapping) 267 | 268 | 269 | # TODO: Synchonize documentation in messaging.py and tech_specs.md. 270 | -------------------------------------------------------------------------------- /docs/tech_specs.md: -------------------------------------------------------------------------------- 1 | # Seneschal Technical Specifications 2 | 3 | Seneschal is an automation system that serves as a restricted control interface between users and a protected environment. 4 | 5 | Features: 6 | 7 | * accepts requests from users 8 | * executes approved workflows 9 | * will reject requests that violate policy 10 | * creates an audit trail containing: 11 | * requests — who, what, when, input, output 12 | * request rejections 13 | * action start 14 | * action complete 15 | * errors 16 | * is hardened — no database required 17 | * is expandable — Users can write plugins, that define new workflows. 18 | * is change managed — Administrators must install new plugins or versions, allowing change review and auditing. 19 | 20 | ## Main Concepts 21 | 22 | Regular users issue commands that send _request_ _messages_. A _message_ is a small JSON file that specifies the desired action. For _requests_, the files are written to a special publicly writeable directory, the _inbox directory_. 23 | 24 | A standard daemon named _seneschald_ polls the _inbox directory_, applies business and security rules, and then takes appropriate action such as rejecting the request, initiating a direct copy operation in a subprocess, or submitting a job to a batch scheduler. As is typical, the daemon has a PID file and responds to `SIGTERM` by shutting down after a few seconds. 25 | 26 | With the exception of subprocesses executing copies, all state is maintained on the filesystem. This allows the daemon to recover from restart. Of course any subprocesses running local copies would be killed with the daemon, and those local copies would have to be restarted. This is not a problem if the copy is similar to rsync. 
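For illustration only (this helper does not exist in the current codebase), a workflow plugin could make its local copies restartable by delegating to rsync, so that a copy killed along with the daemon simply resumes when the daemon restarts it:

    import subprocess

    def restartable_copy(src, dst):
        """Copy src to dst; safe to rerun after the daemon killed a previous attempt."""
        # --partial keeps partially transferred files, so a rerun resumes cheaply.
        subprocess.run(['rsync', '--archive', '--partial', src, dst], check=True)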
27 | 28 | In addition to the normal logging expected of a daemon, the _seneschald_ daemon writes to a special _audit log_, noting all events that might be security relevant. These events are formatted to optimize ingestion by log analysis systems, such as splunk. 29 | 30 | Batch jobs run inside a _job wrapper_ that sends start and finish _job events_ as _messages_ back to the daemon by writing small text files to a special directory that is different from the one used by users making requests. These messages usually trigger audit logging and progress updates to the _request_. 31 | 32 | The _seneschald_ daemon is just a framework for automation. Almost all of the security logic and workflow machinery is provided by _plugins_. This allows more efficient change management, since the plugins will evolve on different timescales from each other and the daemon. 33 | 34 | Borrowing from git documentation, we use the analogy of plumbing and porcelain. Plumbing is the machinery required for the system to operate: daemon, plugins, etc. Porcelain is the set of user-facing commands that make the system useful: making requests, checking status, etc. Without the porcelain, the plumbing is pointless. 35 | 36 | * message 37 | * request 38 | * inbox directory 39 | * seneschald 40 | * audit log 41 | * job wrapper 42 | * job events 43 | * plugins 44 | * porcelain 45 | 46 | ## Deployment Requirements 47 | 48 | * service account 49 | * daemon host 50 | * daemon directories 51 | * plugins directory 52 | * cluster resources 53 | * software directory 54 | 55 | ### service account 56 | 57 | * The daemon must run with the UID set to a non-root, non-login service account. 58 | 59 | * In order to facilitate fulfilling user requests for data this service account will belong to multiple regular user groups. Otherwise the users would have to create publicly writeable directories to receive the output that they are requesting. 60 | 61 | * Recommendations: 62 | * The name of the service account should include "seneschal". 63 | * There should be an associated group with the same name. 64 | 65 | ### daemon host 66 | 67 | The host running the daemon must: 68 | 69 | * mount all relevant filesystems 70 | * These filesystems may be mounted read-only: 71 | * protected upstream filesystems 72 | * software filesystems 73 | * These must be mounted read-write: 74 | * destination filesystems 75 | * user filesystems 76 | * data transfer filesystems 77 | * filesystems holding: 78 | * inbox directory 79 | * processing directory 80 | * job events directory 81 | * logging 82 | * have enough capacity in cores and bandwidth to support daily data transfer operations 83 | * share the same service account name and numeric ID as the compute cluster 84 | 85 | ### daemon directories 86 | 87 | In order to process requests and track state, the daemon needs an assortment of directories. Two of these directories must be publicly writeable. It is easiest if they are all publicly readable. 
88 | 89 | This is the recommended structure: 90 | 91 | drwxrwxr-x seneschal seneschal ./seneschal 92 | drwxrwxr-x seneschal seneschal ./seneschal/internal_events 93 | drwxrwxr-x seneschal seneschal ./seneschal/job_events 94 | drwxrwxr-x seneschal seneschal ./seneschal/job_events/0_temp 95 | drwxrwxr-x seneschal seneschal ./seneschal/job_events/1_inbox 96 | drwxrwxr-x seneschal seneschal ./seneschal/jobs 97 | drwxrwxr-x seneschal seneschal ./seneschal/outbox 98 | drwxrwxr-x seneschal seneschal ./seneschal/requests 99 | drwxrwxr-x seneschal seneschal ./seneschal/requests/by_num 100 | drwxrwxr-x seneschal seneschal ./seneschal/requests/by_uuid 101 | drwxrwxr-x seneschal seneschal ./seneschal/requests/error 102 | drwxrwxr-x seneschal seneschal ./seneschal/requests/finished 103 | drwxrwxr-x seneschal seneschal ./seneschal/user_events 104 | drwxrwsrwt seneschal seneschal ./seneschal/user_events/0_temp 105 | drwxrwsrwt seneschal seneschal ./seneschal/user_events/1_inbox 106 | 107 | The idea is for the daemon to track state through a clean system reboot (`SIGTERM` + timeout), even restarting interrupted copies that were running locally. 108 | 109 | #### temp and inbox directories 110 | 111 | The inbox directory should be visible to all user login nodes and user compute nodes. They must: 112 | 113 | * be on the same filesystem 114 | * have permissions `drwxrwsrwt` 115 | * be accessible to users (All parent directories are `o+rx`.) 116 | * be owned by _seneschald_ service account 117 | * be in a group that contains the service account and does not include users 118 | * be on the same filesystem as the processing directory 119 | 120 | #### processing directory 121 | 122 | The processing directory is where _seneschald_ maintains most of its state. It contains a moderate number of small files, separated by request into subdirectories. It must: 123 | 124 | * be on the same filesystem as the inbox directory 125 | * be owned by _seneschald_ service account 126 | * be in the same group as the inbox directory 127 | 128 | It should: 129 | 130 | * be readable by developers on login nodes 131 | * be readable by regular users on login nodes so that they can receive status reports (Obscurity is not our friend.) 132 | 133 | #### error and finished directories 134 | 135 | Where _requests_ go at the end. There will be some sort of consolidation (such as tar) and cleanup (such as archiving). TBD 136 | 137 | It must be readable and writeable by the service account on the daemon host. It should be readable by developers on login nodes. 138 | 139 | #### job events directory 140 | 141 | The job events directory is where the batch job wrapper script will write events. It must be readable and writeable by the service account from both the daemon host and the cluster nodes used by seneschal. 142 | 143 | It should be readable by developers on login nodes. 144 | 145 | #### internal events directory 146 | 147 | The internal events directory is for routing information between components of the system. Unlike job events and requests, these events do not come from the outside of the daemon. It must be readable and writeable by the service account. It should be readable by developers on login nodes. 148 | 149 | #### outbox directory 150 | 151 | The outbox directory is where seneschal places public messages intended to be read by user clients. It must: 152 | 153 | * be writeable by the daemon 154 | * be readable by both login and cluster nodes 155 | 156 | It should be readable by developers on login nodes. 
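As a hedged sketch (not part of any shipped deployment tooling), the two publicly writeable drop directories described above could be created like this; the base path is an assumption and the directory names follow the listing above, so adjust both to your site:

    import os

    ROOT = '/var/local/lib/seneschal'   # assumed base path; match your config
    for sub in ('0_temp', '1_inbox'):
        path = os.path.join(ROOT, 'user_events', sub)
        os.makedirs(path, exist_ok=True)
        # drwxrwsrwt, as in the listing above: world-writeable plus setgid and sticky.
        os.chmod(path, 0o3777)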
157 | 158 | ### plugins directory 159 | 160 | Without plugins, the daemon does nothing ... very well. In this mode, all requests are errors. Plugins define whitelisted implementations of workflows. 161 | 162 | The plugins directory only has to be readable by the daemon user on the daemon host. 163 | 164 | Plugins have change management life-cycles independent of each other or the daemon. The goal is to enable each plugin to be auditable largely independently of the rest of the system. The directory for a plugin contains both name and version. Although this allows for limited backwards compatibility by have more than one active version, this capability should be used __very sparingly__. 165 | 166 | Installing a plugin consists of unpacking it into the plugins directory. A plugin is a directory of files, minimally containing a module or executable with a standard name. If the plugin requires configuration, the configuration is stored in the configuration file under `plugins/the_plugin_name_and_version`. 167 | 168 | For initial testing purposes, the core seneschal software will ship with a truly minimal set of plugins that are not quite as useful as `echo` and `ping`. 169 | 170 | The plugins directory must be publicly readable on the _daemon host_ and all login nodes. 171 | 172 | ### cluster resources 173 | 174 | The mounting requirements for compute nodes serving seneschal are a subset of those on the daemon host: 175 | 176 | * These filesystems may be mounted read-only: 177 | * upstream filesystems 178 | * software filesystems 179 | * These must be mounted read-write: 180 | * user filesystems 181 | * the filesystem holding the job events directory 182 | 183 | ### software directory 184 | 185 | In addition to other standard software directories, the seneschal software must be available to all users, the daemon host, and the compute nodes. In particular: 186 | 187 | * Users must have access to the scripts that generate _requests_. 188 | * Compute jobs must have access to the seneschal wrapper. 189 | * The daemon must have access to its own machinery and all the plugins. 190 | 191 | Easiest is to just make everything public. 192 | 193 | ## Deployment and Operating Instructions 194 | 195 | __NOTE:__ Except for temp and inbox, none of these directories or files should be writable by users. 196 | 197 | * Install Python 3.6 somewhere accessible to the daemon host. 198 | * Unpack the seneschal tarball, which contains its third-party dependencies. 199 | * Create the necessary directories. 200 | * Copy all the active plugins into some directory, such as a directory named "plugins" just inside the unpacked tarball. 201 | * Write a config file, which specifies: 202 | * PID file 203 | * logging destinations 204 | * daemon process settings 205 | * directory locations 206 | * any optional or required configuration settings for the active plugins 207 | * Write a systemd unit file that 208 | * knows the location of the PID file 209 | * defines a start command contaning: 210 | * Your Python 3.6 211 | * Your location for `seneschald.py` 212 | * Your config file 213 | * "start" 214 | * Configure and deploy the `seneschal` script so that users can easily execute it. 215 | * Test the `seneschal` script. (It can run without the daemon, and any messages it generates will be detected by the daemon when it runs.) 216 | * Start the daemon like any other. (You could manually invoke `seneschald.py` with "start", but ... why?) 217 | * Stop the daemon by like any other. 
The manual method is either sending a `SIGTERM` or invoking `seneschald.py` with "stop", which just does the same thing. 218 | * When planning ahead for a stop, invoking `seneschald.py` with "drain" will notify to the daemon (by writing a special file) that it should postpone long-running local subprocesses, such as copies, until after "start" or "resume". The "drain" and "resume" events are idempotent. 219 | 220 | Note that if seneschald is submitting a job to a batch scheduler or calling a webservice when `SIGTERM` is sent, then seneschald will not shutdown until after the batch scheduler acknowledges the job (e.g. bsub/msub/qsub exits) or the webservice returns. 221 | 222 | __Question:__ Should the daemon remain in a drained state after restart? Should this be a configuration option? 223 | 224 | ## User Operation 225 | 226 | User's interact with the seneschal system through porcelain commands. These commands do nothing more than scan directories, read files, and for _submit_ write files. 227 | 228 | To list plugins: 229 | 230 | seneschal -l 231 | # A fancy ls of the plugins directory 232 | 233 | To get documentation about a plugin: 234 | 235 | seneschal help PLUGIN_NAME 236 | # Runs the plugin's documentation (if any) through a pager 237 | # If there is no documentation file and the plugin supports it, instead 238 | # executes: 239 | # ${PLUGINS_DIR}/${PLUGIN_NAME}/execute_job -h | ${PAGER} 240 | 241 | To run a command: 242 | 243 | seneschal submit PLUGIN_NAME ARG1 ARG2 ... 244 | # Outputs the unique ID of the request 245 | 246 | To check the status of a request: 247 | 248 | seneschal status UNIQUE_ID 249 | # Queries the seneschald files and generates a report 250 | 251 | ## Writing Plugins 252 | 253 | TBD 254 | 255 | ## Developer Needs 256 | 257 | In order to support and maintain the software, developers need to see recent logs and the contents of all of those directories. 258 | 259 | Updates to either the daemon or a plugin consist of developers delivering checksummed tarballs for review and deployment. 260 | 261 | ## Implementation Design 262 | 263 | The daemon code is written in Python. Messages are JSON files. Plugins can be written in any language that supports reading environment variables, command line argument parsing, and file I/O. 264 | 265 | The daemon uses two third-party packages: 266 | 267 | * [python-daemon](https://pypi.python.org/pypi/python-daemon): implements [PEP 3143](https://www.python.org/dev/peps/pep-3143/) 268 | * [lockfile](https://pypi.python.org/pypi/lockfile): provides PID file support, may switch to [pid](https://pypi.python.org/pypi/pid) 269 | 270 | ### Logging & Auditing 271 | 272 | Logging and auditing is handled through the Python standard library [logging module](https://docs.python.org/3/library/logging.html). Logging is configured using [logging.config.dictConfig](https://docs.python.org/3/library/logging.config.html#logging.config.dictConfig). The necessary dictionary is read from the config file under the "logging" section with these two keys added by `seneschald`: 273 | 274 | version=1 275 | disable_existing_loggers=False 276 | 277 | Much of the internal logging is handled in a manner that is typical for well behaved, long-running Python processes. Logging messages intended for developers or system administrators are human-friendly. The audit log is special. 278 | 279 | There is a special logger named "audit". 
Log messages sent to this logger will be optimized for ingestion by log analysis systems, such as splunk: 280 | 281 | timestamp key1=value1 key2=value2 key3=value3 key4=value4 ... 282 | 283 | Example of the section of the config file that configures logging: 284 | 285 | logging: 286 | formatters: 287 | verbose: 288 | format: '%(asctime)s %(levelname)-8s %(name)s %(module)s %(process)d %(message)s' 289 | audit_format: 290 | format: '%(asctime)s %(message)s' 291 | handlers: 292 | debug_handler: 293 | class : logging.handlers.RotatingFileHandler 294 | formatter: verbose 295 | filename: /var/log/seneschal/debug.log 296 | maxBytes: 40960 297 | backupCount: 1 298 | audit_handler: 299 | class : logging.handlers.RotatingFileHandler 300 | formatter: audit_format 301 | filename: /var/log/seneschal/audit.log 302 | maxBytes: 409600 303 | backupCount: 4 304 | loggers: 305 | audit: 306 | level: INFO 307 | handlers: [audit_handler] 308 | root: 309 | level: DEBUG 310 | handlers: [debug_handler] 311 | 312 | The level of audit messages is always INFO or higher. 313 | 314 | References: 315 | 316 | * [http://dev.splunk.com/view/dev-guide/SP-CAAAE3A](http://dev.splunk.com/view/dev-guide/SP-CAAAE3A) 317 | * [https://answers.splunk.com/answers/1951/what-is-the-best-custom-log-event-format-for-splunk-to-eat.html#answer-1953](https://answers.splunk.com/answers/1951/what-is-the-best-custom-log-event-format-for-splunk-to-eat.html#answer-1953) 318 | 319 | 320 | ### Architecture 321 | 322 | These are the important kinds of objects in the system: 323 | 324 | * daemon 325 | * _Engine_ 326 | * managers 327 | * fertile 328 | * _RequestManager_ 329 | * _JobManager_ 330 | * _SubprocessManager_ 331 | * sterile 332 | * _PluginManager_ 333 | * workers 334 | * stateful 335 | * _Request_ 336 | * _Job_ 337 | * _Subprocess_ 338 | * stateless 339 | * _Plugin_ 340 | * _MessageBroker_ 341 | * _Message_ 342 | * _Event_ 343 | 344 | The _daemon_ reads the configuration file, constructs the _Engine_, and then start its main loop. During this loop, the _daemon_ passes control to the _Engine_. If there was no work for the _Engine_ to do, then the _daemon_ will sleep. The main loop terminates after `SIGTERM`. 345 | 346 | The _Engine_ constructs instances of the _MessageBroker_ each type of manager. The engine will then go into a loop until the _daemon_ receives `SIGTERM`. Each pass through the loop will call upon the _MessageBroker_ to process one message. If there are no messages, then the _Engine_ returns control back to the _daemon_. 347 | 348 | The _MessageBroker_ reads a _Message_ and converts it into an _Event_ for some worker or manager. A _Message_ and an _Event_ are similar, with the main difference being perspective. An object sends a _Message_ with a destination. That destination object receives an _Event_ where the destination is implicit and other contextual information may be added. Both messages and events are really just JSON files. (There's a lot of disk churn going on.) 349 | 350 | Manager objects are responsible for creating and fetching workers. Managers maintain two kinds of state in memory: 351 | 352 | * configuration 353 | * a registry of workers, indexed by ID 354 | 355 | When _seneschald_ starts up, each manager will reload its registry by scanning a directory. 356 | 357 | Stateful workers uses the filesystem to maintain its state. Each stateful worker has a corresponding directory that contains an ordered list of JSON files corresponding to events, starting with an initialization event. 
The state of the worker is loaded into memory by reading each file in sequence. 358 | 359 | Plugins have no state besides the configuration read from the main configuration file. Each time a plugin is invoked, it must be passed all the information it needs to do its job. It is the responsibility of stateful worker objects to hold the necessary state information. Some plugins are special, in that they define how the seneschal system integrates with external system like a compute cluster or permissions system. These plugins are typically aliased to logical resource names, like "batch\_scheduler". Most plugins are workflow plugins. 360 | 361 | Plugins can be implemented as either external executables (usually scripts) or Python modules. There is no semantic difference between the two methods. 362 | 363 | In the case of external executables, plugin configuration is defined in environment variables, and the input for the plugin is JSON data fed into standard input. The output is JSON data fed to standard output. 364 | 365 | In the case of Python modules, a plugin is loaded by importing the module and then passing any configuration in the form of a Python _dict_ into a function in that module named _create_. The return value of that function call is the plugin object, which is callable. The input and output are just Python dictionaries. For some applications, Python modules are the better choice either because of better efficiency or simplicity. 366 | 367 | Worker objects can do these things: 368 | 369 | * __[not plugins]__ notify their manager that they should be purged 370 | * __[not subprocesses]__ receive events 371 | * send messages 372 | * __[plugins and subprocesses only]__ do stuff 373 | 374 | A _Request_ object represents — perhaps indirectly — a user's request to execute an automated workflow with a particular set of inputs and outputs. The owner of the request is the owner of the _Message_ file that originated the _Request_. Every _Request_ has an associated _workflow plugin_ that defines the actual workflow. After initialization, the _Request_ invokes the _workflow plugin_ passing in a JSON object representing the request. The plugin will respond with a _workflow_ — a JSON object containing a list of tasks that will satisfy the _Request_. The _Request_, will then typically trigger other messages, write its state to the filesystem, and exit. (The _Request_ could invoke other plugins as it iterates through the list of tasks.) Eventually the _Request_ will receive messages indicating the end of its various tasks. When there are no more tasks running, the _Request_ will have reached the end of its lifecycle. At that point, the _Request_ will notify the _RequestManager_ that it should be purged. Hopefully along the way, something useful happened. 375 | 376 | A _Job_ represents a batch job submitted to a [job scheduler](https://en.wikipedia.org/wiki/Job_scheduler). A _Job_ is created when the _JobManager_ receives a _Message_ submitting a new _Job_. A _Job_ knows the ID of the originating _Request_. A _Job_ sends a message to the logical "batch\_scheduler" _Plugin_ to submit the job. The _Plugin_ will typically create a custom file for the job and then submit a plugin-defined wrapper script and that custom file to the underlying job scheduler. The wrapper script will then read the job-specific file, execute the required tasks, and send messages back to the _Job_ object upon the start and finish of compute. The _Job_ object will forward information back to the _Request_. 
The _Job_ object will notify the _JobManager_ when its lifecycle is complete. 377 | 378 | A _Subprocess_ is a logical wrapper around an external command. It is much simpler than a _Job_, since there is not need for a plugin to implement it. The implementation is handled by the Python [subprocess module](https://docs.python.org/3/library/subprocess.html). State is also maintained on the filesystem. If the daemon must shutdown, all running subprocesses must be killed. By default, all subprocesses will restart when the daemon restarts. Workflows that use subprocesses are usually file copy operations. They should be coded so as to be restartable. 379 | 380 | #### Example Putting it all Together 381 | 382 | Consider a workflow that computes the MD5 checksum of a file. For the purpose of this discussion, let us agree to the interpretation that the MD5 of a protected file does not constitute protected information. Therefore, no security checks are required. A plugin implementing this workflow has two inputs: the input file path and the output file path. 383 | 384 | This would be the sequence of received messages: 385 | 386 | 1. _RequestManager_: new md5 inputPath outputPath 387 | 2. request001: initialization userName md5 inputPath outputPath 388 | 3. _JobManager_: new request001 step0 /usr/bin/md5sum inputPath outputPath 389 | 4. job001: initialization request001 step0 /usr/bin/md5sum inputPath outputPath 390 | 5. request001: step0 job submitted with ID 123 391 | 6. job001 (sent by wrapper): compute started on node ABC at someTimestamp 392 | 7. request001: step0 compute started on node ABC at someTimestamp 393 | 8. job001 (sent by wrapper): compute succeeded at someTimestamp 394 | 9. request001: step0 compute succeeded at someTimestamp 395 | 10. _JobManager_: purge job001 396 | 11. _RequestManager_: purge request001 397 | 398 | This list does not show plugin invocations, since they are invoked by direct input — either function call or passing JSON into _stdin_. 399 | 400 | Items 3, 10, and 11 are in-memory messages only, really just direct method calls. Even with optimization of those instantaneous messages staying in memory, there are at least 8 files created for this simplest cluster workflow. 401 | --------------------------------------------------------------------------------
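As a closing sketch (not found in the repository), the md5 example above could be expressed as the kind of task mapping that `managers.propagate_inheritance` and the tests expect a workflow plugin to hand back; the paths and the `md5_workflow` name are illustrative placeholders:

    md5_workflow = {
        'type': 'batch_job',          # handled by BatchJobTask
        'cwd': 'outputPath_dir',      # placeholder working directory
        'executable': '/usr/bin/md5sum',
        'arguments': ['inputPath'],
    }
    # A multi-step workflow would instead use type 'sequence' or 'parallel'
    # with the child mappings stored under the 'zchildren' key.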