├── .gitignore
├── seneschal
│   ├── engine.py
│   ├── __init__.py
│   ├── messaging.py
│   └── managers.py
├── seneschal.py
├── docs
│   ├── seneschal_config_sample.yaml
│   └── tech_specs.md
├── test
│   └── test_tasks.py
└── seneschald.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.pyc
3 | 

--------------------------------------------------------------------------------
/seneschal/engine.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """Functions for the lifecycle of messages. The action code lives in other
4 | modules."""
5 | 
6 | 
7 | import logging
8 | 
9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class Engine(object):
14 |     """Object responsible for fetching messages and delegating to
15 |     the business rules."""
16 |     running = True # When False, start shutting down.
17 | 
18 |     def __init__(self, config):
19 |         self.__dict__.update(config) # Absorb config
20 | 
21 |     def sweep(self):
22 |         """Loop over the work queue until it is exhausted, then return."""
23 |         while Engine.running:
24 |             did_work = self.do_one_message()
25 |             if not did_work:
26 |                 logger.debug('no more work')
27 |                 break
28 | 
29 |     def do_one_message(self):
30 |         """Check for incoming messages, and process the first. Return
31 |         True if a message was found, False otherwise."""
32 |         return False # TODO
33 | 

--------------------------------------------------------------------------------
/seneschal/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """Engine for handling messages that represent requests to process
4 | protected data. A request is a JSON file that contains:
5 | 
6 | 1. the desired action
7 | 2. the input data
8 | 3. the destination
9 | 4. (optional) parameters
10 | 
11 | The owner of the file is considered the sender of the message.
12 | 
13 | The lifecycle of a message:
14 | 
15 | 1. User invokes a command that:
16 |    a. writes the message in a temporary directory on the same filesystem as
17 |       the queue directory
18 |    b. moves the message to the queue directory
19 | 2. Automation (daemon):
20 |    a. Select the oldest message in the queue.
21 |    b. Move the message to the processing directory.
22 |    c. Log the message.
23 |    d. Apply business rules.
24 |    e. Execute and log the action.
25 |    f. Move the message to the finished directory.
26 | 3. Cleanup automation (crontab):
27 |    a. Creates a new directory named after a time period.
28 |    b. Moves all messages in the finished directory into the new directory.
29 |    c. Creates a tarball based on the new directory.
30 |    d. Verifies the tarball.
31 |    e. Deletes the new directory and its contents.
32 | 
33 | Some messages simply result in sending a request for human approval. The
34 | arrival of such an approval triggers the matching requested action.
35 | """
36 | 
37 | from .engine import Engine
38 | 
39 | 
40 | __author__ = """Walker Hale IV"""
41 | __email__ = 'walker.hale.iv@gmail.com'
42 | __version__ = '0.1.0'
43 | 
44 | __all__ = ['__author__', '__email__', '__version__', 'Engine']
45 | 

--------------------------------------------------------------------------------
/seneschal.py:
--------------------------------------------------------------------------------
1 | """User interface to the Seneschal automation system. Everything is a
2 | subcommand."""
3 | 
4 | 
5 | import argparse
6 | import json
7 | import sys
8 | 
9 | import yaml
10 | 
11 | from seneschal.messaging import leave_new_request
12 | 
13 | 
14 | emit = lambda *args: None # Do nothing
15 | 
16 | 
17 | def main():
18 |     global emit
19 |     args = parse_args()
20 |     if args.verbose:
21 |         emit = err_output
22 |     config = load_config_file(args.config_file)
23 |     seneschal_config = config['seneschal']
24 |     try:
25 |         args.func(seneschal_config, args)
26 |     except BrokenPipeError:
27 |         pass # Ignore. Something like head is truncating output.
28 |     finally:
29 |         sys.stderr.close() # Needed to suppress meaningless BrokenPipeError.
30 | 
31 | 
32 | def parse_args():
33 |     parser = argparse.ArgumentParser(description=__doc__)
34 |     parser.add_argument('-v', '--verbose', action='store_true')
35 |     parser.add_argument('config_file', help='path to YAML file')
36 |     subparsers = parser.add_subparsers(help='sub-commands')
37 | 
38 |     # create the parser for the "submit" command
39 |     parser_submit = subparsers.add_parser(
40 |         'submit',
41 |         help='Submit a request to the Seneschal automation system.'
42 |     )
43 |     parser_submit.add_argument('workflow', help='what to do')
44 |     parser_submit.add_argument('args', nargs='*', help='workflow-specific')
45 |     parser_submit.set_defaults(func=submit)
46 | 
47 |     # create the parser for the "ls" command
48 |     parser_ls = subparsers.add_parser('ls', help='List workflows')
49 |     parser_ls.set_defaults(func=ls)
50 | 
51 |     args = parser.parse_args()
52 | 
53 |     if not hasattr(args, 'func'):
54 |         parser.error('missing subcommand (submit, ls, etc.)')
55 | 
56 |     return args
57 | 
58 | 
59 | def load_config_file(config_file):
60 |     with open(config_file) as fin:
61 |         config = yaml.load(fin)
62 |     return config
63 | 
64 | 
65 | def submit(seneschal_config, args):
66 |     """Submit a request to the Seneschal automation system."""
67 |     emit('workflow:', args.workflow)
68 |     for arg in args.args:
69 |         emit(arg)
70 |     directory = seneschal_config['paths']['user_messages']
71 |     uuid_str = leave_new_request(directory, args.workflow, args.args)
72 |     print(uuid_str)
73 | 
74 | 
75 | def ls(seneschal_config, args):
76 |     """List workflows."""
77 |     print(args)
78 |     yaml.safe_dump(seneschal_config, sys.stdout, default_flow_style=False)
79 |     pass # TODO
80 | 
81 | 
82 | def err_output(*args):
83 |     """Send args to sys.stderr."""
84 |     print(*args, file=sys.stderr)
85 | 
86 | 
87 | if __name__ == "__main__":
88 |     main()
89 | 

--------------------------------------------------------------------------------
/docs/seneschal_config_sample.yaml:
--------------------------------------------------------------------------------
1 | # This file is typically:
2 | # (1) maintained and deployed by System Administrators.
3 | # (2) assumed to be trusted.
4 | # (3) parsed by PyYAML running as root.
5 | 6 | # senechald config file 7 | # Specifies: 8 | # (1) logging settings 9 | # (2) daemon settings, like PID file location and UID 10 | # (3) Senechal operation settings, like location of event directories 11 | 12 | logging: 13 | formatters: 14 | verbose: 15 | format: '%(asctime)s %(levelname)-8s %(name)s %(module)s %(process)d %(message)s' 16 | audit_format: 17 | format: '%(asctime)s %(message)s' 18 | handlers: 19 | main: 20 | class : logging.handlers.RotatingFileHandler 21 | formatter: verbose 22 | filename: /var/log/seneschal/seneschal.log 23 | maxBytes: 40960 24 | backupCount: 1 25 | audit_handler: 26 | class : logging.handlers.RotatingFileHandler 27 | formatter: audit_format 28 | filename: /var/log/seneschal/audit.log 29 | maxBytes: 409600 30 | backupCount: 4 31 | loggers: 32 | audit: 33 | level: INFO 34 | handlers: 35 | - audit_handler 36 | root: 37 | level: DEBUG 38 | handlers: 39 | - main 40 | daemon: 41 | # Any settings left blank, get the default. 42 | # pidfile is REQUIRED 43 | pidfile: /var/run/seneschal.pid 44 | # working_directory default = '/' 45 | working_directory: 46 | # chroot_directory default = don`t chroot 47 | chroot_directory: 48 | # umask default = 0 49 | umask: 50 | # detach_process default = True unless determined to be already detached 51 | detach_process: 52 | # uid default = real UID as returned by getuid() (often root - 0) 53 | uid: 54 | # gid default = real GID as returned by getgid() (often root - 0) 55 | gid: 56 | # prevent_core default = True; set to False to enable a core dump 57 | prevent_core: 58 | seneschal: 59 | paths: 60 | # Depending on interpretation of the standards, you could drop "local/" or 61 | # replace "lib/" with "spool/" for "*_messages/"... 62 | # Where compute jobs will write events (They are not immediately deleted.) 63 | job_messages: /var/local/lib/seneschal/job_messages 64 | # Where user clients will write events (They are not immediately deleted.) 
65 | user_messages: /var/local/lib/seneschal/user_messages 66 | # Where state is maintained for tracking progress of automation requests 67 | requests: /var/local/lib/seneschal/requests 68 | # Where state is maintained for tracking progress of batch jobs 69 | jobs: /var/local/lib/seneschal/jobs 70 | # Where state is maintained for restarting subprocesses after system reboot 71 | subprocesses: /var/local/lib/seneschal/subprocesses 72 | # Where plugins are installed 73 | plugins: /usr/local/lib/seneschal/plugins 74 | plugins: 75 | md5: 76 | executable: /usr/bin/md5sum 77 | -------------------------------------------------------------------------------- /test/test_tasks.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | from seneschal import managers 4 | 5 | 6 | STATE_1_YAML = ''' 7 | type: sequence 8 | cwd: output_batch_dir 9 | zchildren: 10 | - type: parallel 11 | zchildren: 12 | - type: sequence 13 | executable: .../python3.6 14 | prefix_arguments: [.../topmed-1/scripts/copy_rename] 15 | child_type: subprocess 16 | zchildren: 17 | - arguments: [src_file1, dst_file1] 18 | - arguments: [src_file2, dst_file2] 19 | - type: parallel 20 | cwd: md5_dst_dir 21 | cores: 1 22 | executable: .../python3.6 23 | prefix_arguments: [.../topmed-1/scripts/md5_script] 24 | child_type: batch_job 25 | zchildren: 26 | - arguments: [src_file1, sample1] 27 | - arguments: [src_file2, sample2] 28 | - type: subprocess 29 | executable: .../python3.6 30 | arguments: [.../topmed-1/scripts/make_manifest, md5_dst_dir] 31 | ''' 32 | 33 | 34 | STATE_2_YAML = ''' 35 | cwd: output_batch_dir 36 | path: t 37 | type: sequence 38 | zchildren: 39 | - cwd: output_batch_dir 40 | index: 0 41 | path: t/0 42 | type: parallel 43 | zchildren: 44 | - child_type: subprocess 45 | cwd: output_batch_dir 46 | executable: '.../python3.6' 47 | index: 0 48 | path: t/0/0 49 | prefix_arguments: &id001 ['.../topmed-1/scripts/copy_rename'] 50 | type: sequence 51 | zchildren: 52 | - arguments: [src_file1, dst_file1] 53 | cwd: output_batch_dir 54 | executable: '.../python3.6' 55 | index: 0 56 | path: t/0/0/0 57 | prefix_arguments: *id001 58 | type: subprocess 59 | - arguments: [src_file2, dst_file2] 60 | cwd: output_batch_dir 61 | executable: '.../python3.6' 62 | index: 1 63 | path: t/0/0/1 64 | prefix_arguments: *id001 65 | type: subprocess 66 | - child_type: batch_job 67 | cores: 1 68 | cwd: md5_dst_dir 69 | executable: '.../python3.6' 70 | index: 1 71 | path: t/0/1 72 | prefix_arguments: &id002 ['.../topmed-1/scripts/md5_script'] 73 | type: parallel 74 | zchildren: 75 | - arguments: [src_file1, sample1] 76 | cores: 1 77 | cwd: md5_dst_dir 78 | executable: '.../python3.6' 79 | index: 0 80 | path: t/0/1/0 81 | prefix_arguments: *id002 82 | type: batch_job 83 | - arguments: [src_file2, sample2] 84 | cores: 1 85 | cwd: md5_dst_dir 86 | executable: '.../python3.6' 87 | index: 1 88 | path: t/0/1/1 89 | prefix_arguments: *id002 90 | type: batch_job 91 | - arguments: ['.../topmed-1/scripts/make_manifest', md5_dst_dir] 92 | cwd: output_batch_dir 93 | executable: '.../python3.6' 94 | index: 1 95 | path: t/1 96 | type: subprocess 97 | ''' 98 | 99 | T011 = ''' 100 | arguments: [src_file2, sample2] 101 | cores: 1 102 | cwd: md5_dst_dir 103 | executable: '.../python3.6' 104 | index: 1 105 | path: t/0/1/1 106 | prefix_arguments: ['.../topmed-1/scripts/md5_script'] 107 | type: batch_job 108 | ''' 109 | 110 | PATHS = ''' 111 | t 112 | t/0 113 | t/0/0 114 | t/0/0/0 115 | t/0/0/1 116 | t/0/1 117 | t/0/1/0 118 
| t/0/1/1 119 | t/1 120 | '''.split() 121 | 122 | 123 | def test_task_inheritance(): 124 | state = yaml.load(STATE_1_YAML) 125 | managers.propagate_inheritance(state) 126 | assert yaml.dump(state) == STATE_2_YAML[1:] 127 | 128 | 129 | def test_index_mappings(): 130 | state = yaml.load(STATE_2_YAML) 131 | index = managers.index_mappings(state) 132 | assert sorted(index) == PATHS 133 | for k, v in index.items(): 134 | assert k == v['path'] 135 | print(yaml.dump(index['t/0/1/1'])) 136 | assert index['t/0/1/1'] == yaml.load(T011) 137 | assert index['t'] == state 138 | -------------------------------------------------------------------------------- /seneschald.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Daemon control script. The seneschal daemon handles messages requesting 4 | processing or export of data located within the protected area.""" 5 | 6 | 7 | import argparse 8 | import logging 9 | import logging.config 10 | import os 11 | import signal 12 | import sys 13 | import syslog 14 | import time 15 | 16 | from daemon import DaemonContext 17 | from daemon.runner import is_pidfile_stale, emit_message 18 | from lockfile.pidlockfile import PIDLockFile 19 | import yaml 20 | 21 | from seneschal import Engine 22 | 23 | 24 | logger = logging.getLogger('seneschald') 25 | 26 | 27 | def main(): 28 | args = parse_args() 29 | daemon_command = args.daemon_command 30 | config = load_config_file(args.config_file) 31 | logging_config = config.pop('logging', None) 32 | daemon_config = config.pop('daemon') 33 | seneschal_config = config.pop('seneschal') 34 | try: 35 | if daemon_command == 'start': 36 | start(logging_config, daemon_config, seneschal_config) 37 | else: 38 | config_logging(logging_config) 39 | if daemon_command == 'stop': 40 | stop(daemon_config) 41 | else: 42 | engine = Engine(seneschal_config) 43 | if daemon_command == 'sweep': 44 | engine.sweep() 45 | except Exception as e: 46 | emit_message(e) 47 | sys.exit(1) 48 | finally: 49 | logging.shutdown() 50 | 51 | 52 | def parse_args(): 53 | parser = argparse.ArgumentParser(description=__doc__) 54 | parser.add_argument('config_file', help='path to YAML file') 55 | parser.add_argument('daemon_command', choices=['start', 'stop', 'sweep']) 56 | args = parser.parse_args() 57 | return args 58 | 59 | 60 | def load_config_file(config_file): 61 | with open(config_file) as fin: 62 | config = yaml.load(fin) 63 | return config 64 | 65 | 66 | def start(logging_config, daemon_config, seneschal_config): 67 | syslog.openlog('seneschal', 0, syslog.LOG_USER) 68 | engine = Engine(seneschal_config) 69 | pidfile, daemon_options = check_daemon_options(daemon_config) 70 | if is_pidfile_stale(pidfile): 71 | syslog.syslog(syslog.LOG_NOTICE, 'breaking stale PID file') 72 | pidfile.break_lock() 73 | # The remaining entries in daemon_options will be passed as-is to 74 | # daemon.DaemonContext. 
75 | context = DaemonContext(pidfile=pidfile, **daemon_options) 76 | context.signal_map = make_signal_map() 77 | syslog.syslog(syslog.LOG_NOTICE, 'starting daemon context') 78 | try: 79 | with context: # Will fail if daemon already running 80 | pid = os.getpid() 81 | syslog.syslog(syslog.LOG_NOTICE, 'daemon running as: %s' % pid) 82 | config_logging(logging_config) 83 | logger.debug('========================================') 84 | logger.info('daemon running pid=%s', pid) 85 | logger.debug('args: %r', sys.argv) 86 | logger.debug('daemon_options: %r', daemon_options) 87 | logger.debug('seneschal_config: %r', seneschal_config) 88 | while Engine.running: 89 | engine.sweep() 90 | time.sleep(1) 91 | # TODO: Long polling times, may result in an unacceptable 92 | # delay during daemon shutdown. 93 | except Exception as e: 94 | syslog.syslog(syslog.LOG_ERR, str(e)) 95 | logger.exception(repr(e)) 96 | raise 97 | finally: 98 | syslog.syslog(syslog.LOG_NOTICE, 'exiting') 99 | logger.info('exiting') 100 | 101 | 102 | def stop(daemon_config): 103 | """Standard daemon stop logic.""" 104 | pidfile, _ = check_daemon_options(daemon_config) 105 | if not pidfile.is_locked(): 106 | error = DaemonStopError( 107 | "PID file {pidfile.path!r} not locked".format(pidfile=pidfile) 108 | ) 109 | raise error 110 | if is_pidfile_stale(pidfile): 111 | syslog.syslog(syslog.LOG_NOTICE, 'breaking stale PID file') 112 | pidfile.break_lock() 113 | else: 114 | pid = pidfile.read_pid() 115 | try: 116 | os.kill(pid, signal.SIGTERM) 117 | except OSError as exc: 118 | error = DaemonStopError( 119 | "Failed to terminate {pid:d}: {exc}".format(pid=pid, exc=exc) 120 | ) 121 | raise error 122 | 123 | 124 | def check_daemon_options(daemon_config): 125 | """Returns the pidfile object and non-default daemon settings; 126 | dies if there are any illegal settings.""" 127 | check_for_illegal_daemon_options(daemon_config) 128 | daemon_options = {k: v for k, v in daemon_config.items() if v is not None} 129 | pidfile_path = daemon_options.pop('pidfile') 130 | pidfile = PIDLockFile(pidfile_path) 131 | return pidfile, daemon_options 132 | 133 | 134 | def check_for_illegal_daemon_options(daemon_config): 135 | """Error out and die if any illegal options.""" 136 | LEGAL_DAEMON_OPTIONS = set(''' 137 | pidfile 138 | working_directory 139 | chroot_directory 140 | umask 141 | detach_process 142 | uid 143 | gid 144 | prevent_core 145 | '''.split()) 146 | illegal_options = set(daemon_config) - LEGAL_DAEMON_OPTIONS 147 | if illegal_options: 148 | logger.critical('illegal daemon options in YAML config file: %r', 149 | list(illegal_options)) 150 | sys.exit(1) 151 | 152 | 153 | def make_signal_map(): 154 | result = { 155 | signal.SIGTERM: trigger_shutdown, 156 | signal.SIGHUP: None, 157 | signal.SIGTTIN: None, 158 | signal.SIGTTOU: None, 159 | signal.SIGTSTP: None, 160 | } 161 | return result 162 | 163 | 164 | def config_logging(logging_config_dict): 165 | if logging_config_dict: 166 | config = dict(logging_config_dict, 167 | version=1, 168 | disable_existing_loggers=False) 169 | logging.config.dictConfig(config) 170 | else: 171 | logging.basicConfig(level=logging.NOTSET) 172 | 173 | 174 | def trigger_shutdown(signum, frame): 175 | """Set global `running` to False, to trigger shutdown.""" 176 | syslog.syslog(syslog.LOG_NOTICE, 'term signal') 177 | Engine.running = False 178 | 179 | 180 | class DaemonStopError(RuntimeError): 181 | """Either daemon not running or as OS error.""" 182 | 183 | 184 | if __name__ == "__main__": 185 | main() 186 | 
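# --- Added note (not part of the original file): a minimal usage sketch. ---
# All values below are hypothetical placeholders, not a real deployment; the
# sketch only mirrors how main() feeds a parsed YAML mapping into the helpers
# defined above.
#
#     import yaml
#     from seneschal import Engine
#
#     config = yaml.safe_load('''
#     daemon:
#       pidfile: /tmp/seneschal-test.pid
#       umask:                # blank entries fall back to python-daemon defaults
#     seneschal:
#       paths: {user_messages: /tmp/seneschal-drop}
#     ''')
#     pidfile, options = check_daemon_options(config['daemon'])
#     assert options == {}    # blank settings were filtered out, pidfile was popped
#     Engine(config['seneschal']).sweep()   # one idle pass over the work queue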
-------------------------------------------------------------------------------- /seneschal/messaging.py: -------------------------------------------------------------------------------- 1 | """JSON file based messaging. User client software makes requests by executing 2 | `leave_new_request`. The rest of this module supports the automation engine.""" 3 | 4 | from json import dump, load 5 | import logging 6 | from pathlib import Path 7 | from uuid import uuid4 8 | 9 | 10 | # TODO: Start auditing messages. 11 | 12 | # Channels 13 | REQUEST = 'REQUEST' 14 | JOB = 'JOB' 15 | SUBPROCESS = 'SUBPROCESS' 16 | 17 | # MessageDrop directory names 18 | TEMP = '0_temp' 19 | INBOX = '1_inbox' 20 | RECEIVED = '2_received' 21 | ERROR = '3_error' 22 | 23 | # Message types 24 | NEW = 'NEW' 25 | STARTED = 'STARTED' 26 | SUCCEEDED = 'SUCCEEDED' 27 | FAILED = 'FAILED' 28 | 29 | # JSON message keys 30 | ILLEGAL_JSON_KEYS = {'channel', 'uid', 'user_name'} 31 | REQUIRED_JSON_KEYS = {'message_type', 'target_id', 'uuid_str'} 32 | 33 | 34 | logger = logging.getLogger(__name__) 35 | 36 | 37 | def leave_new_request(directory, workflow, arg_list): 38 | """On behalf of a user, creates a new Request message file, using 39 | `directory` as the root of a message drop. The requested automation is 40 | named by `workflow`. This is the only code in this module that client 41 | software needs to invoke in order to create a request.""" 42 | uuid_str = leave_message(directory, NEW, 43 | workflow=workflow, arg_list=arg_list) 44 | return uuid_str 45 | 46 | 47 | class Message: 48 | """Contains a single message as a JSON file. `channel` is where the message 49 | should go. `target_id` is the ID of a specific object that should get the 50 | message. `message_type` defines the general type of the message. Any 51 | additional attributes are specific to the type of message.""" 52 | def __init__(self, *, channel, target_id, message_type, **kwds): 53 | """Raises an exception if the message fails certain validity checks.""" 54 | self.channel = channel 55 | self.target_id = target_id 56 | self.message_type = message_type 57 | self.__dict__.update(kwds) 58 | pass # TODO 59 | 60 | 61 | class MessageBroker: 62 | """Responsible for creating, receiving, and dispatching messages, which 63 | are serialized as JSON files.""" 64 | def __init__(self, seneschal_config, 65 | request_manager, job_manager, subprocess_manager): 66 | job_messages_path = seneschal_config['paths']['job_messages'] 67 | user_messages_path = seneschal_config['paths']['user_messages'] 68 | self.message_drops = ( 69 | MessageDrop(directory=user_messages_path, channel=REQUEST), 70 | MessageDrop(directory=job_messages_path, channel=JOB) 71 | ) 72 | self.managers = { 73 | REQUEST: request_manager, 74 | JOB: job_manager, 75 | SUBPROCESS: subprocess_manager 76 | } 77 | for manager in self.managers.values(): 78 | manager.set_message_broker(self) 79 | 80 | def attempt_to_deliver_one_left_message(self): 81 | """Check the message drops for messages and if possible, deliver one 82 | message to the corresponding manager. 
Returns True if the MessageBroker 83 | delivered a message.""" 84 | for message_drop in self.message_drops: 85 | message = message_drop.fetch_message() 86 | if message: 87 | self.deliver_one_message(message) 88 | return True 89 | return False 90 | 91 | def deliver_one_message(self, message): 92 | """Deliver the message to the target manager, based on channel.""" 93 | manager = self.managers[message.channel] 94 | manager.receive_message(message) 95 | 96 | 97 | class MessageDrop(object): 98 | """Represents a filesystem directory that contains a `TEMP` directory and 99 | an `INBOX` directory. Messages are placed in the drop by writing a new 100 | JSON file with a unique name to the `TEMP` directory and then moving that 101 | file to the `INBOX` directory. Messages left here should not contain uid, 102 | user_name, or channel. When messages are read back into memory, they are 103 | augmented with these values. The user is the owner of the file.""" 104 | def __init__(self, *, directory, channel, **kwds): 105 | """Parameters: `directory` must contain `TEMP`, `INBOX`, and 106 | `RECEIVED`; `channel` is only used when fetching messages.""" 107 | super().__init__(**kwds) 108 | self.directory = Path(directory) 109 | self.channel = channel 110 | 111 | @property 112 | def inbox(self): 113 | """Returns `self.directory / INBOX`.""" 114 | return self.directory / INBOX 115 | 116 | @property 117 | def received(self): 118 | """Returns `self.directory / RECEIVED`.""" 119 | return self.directory / RECEIVED 120 | 121 | @property 122 | def error(self): 123 | """Returns `self.directory / ERROR`.""" 124 | return self.directory / ERROR 125 | 126 | def leave_message(self, message_type, target_id=None, **kwds): 127 | """Write a new JSON file into the `TEMP` directory and then move that 128 | file into the `INBOX` directory. The JSON file is an object (dict) 129 | that contains the combination of message_type, target_id, kwds, and 130 | a UUID, which is also used to name the file. The UUID is generated 131 | inside this method using `uuid4`, so as to insure that the message and 132 | file have unique names. Returns the UUID as a str. This method is 133 | usually invoked from client software that does not call any other 134 | methods in this module.""" 135 | # TODO: Consider that we may want to create a UUID based on a previous 136 | # UUID, such as events for an existing job or request, using a 137 | # UUID5 algorith taking the existing UUID as the namespace. 138 | leave_message(self.directory, message_type, target_id, **kwds) 139 | 140 | def fetch_message(self): 141 | """Return the next message or `None`. 
142 |         the `INBOX` directory, loads it, moves it into the `RECEIVED`
143 |         directory, and returns the resulting `Message` object."""
144 |         # All messages, oldest first:
145 |         message_paths = sorted(self.inbox.glob('*.json'),
146 |                                key=lambda x: x.stat().st_mtime)
147 |         message = None
148 |         # If there are messages, keep processing until we find a good one:
149 |         for message_path in message_paths:
150 |             name = message_path.name
151 |             try:
152 |                 message = load_message(message_path, self.channel)
153 |             except ValueError as e:
154 |                 logger.exception(f'problem loading {name}')
155 |                 message_path.rename(self.error / name)
156 |             else:
157 |                 message_path.rename(self.received / name)
158 |                 logger.info(f'received {name}')
159 |                 break
160 |         return message
161 | 
162 | 
163 | def load_message(message_path, channel):
164 |     """Return the `Message` object at message_path, filling in `channel`,
165 |     `uid`, and `user_name`. Will raise a subclass of `ValueError` if the file
166 |     has a bad set of keys or the UUID in the file does not match the name of
167 |     the file."""
168 |     message_path = Path(message_path)
169 |     with message_path.open() as fin:
170 |         message_mapping = load(fin)
171 |     message_keys = set(message_mapping)
172 |     illegal_keys = ILLEGAL_JSON_KEYS & message_keys
173 |     missing_keys = REQUIRED_JSON_KEYS - message_keys
174 |     if illegal_keys:
175 |         raise IllegalJSONKeysError(
176 |             f'illegal keys in {message_path.name}: {illegal_keys}'
177 |         )
178 |     if missing_keys:
179 |         raise MissingJSONKeysError(
180 |             f'missing keys in {message_path.name}: {missing_keys}'
181 |         )
182 |     uid = message_path.stat().st_uid
183 |     try:
184 |         user_name = message_path.owner()
185 |     except Exception as e:
186 |         logger.exception(f'problem getting owner name for {message_path.name}')
187 |         user_name = str(uid)
188 |     message = Message(channel=channel,
189 |                       uid=uid,
190 |                       user_name=user_name,
191 |                       **message_mapping)
192 |     if message.uuid_str != message_path.stem:
193 |         raise ValueError(
194 |             f'wrong UUID in {message_path.name}: {message.uuid_str}'
195 |         )
196 |     return message
197 | 
198 | 
199 | def leave_message(directory, message_type, target_id=None, **kwds):
200 |     """Using `directory` as the root of a message drop, write a new JSON file
201 |     into the `TEMP` directory and then move that file into the `INBOX`
202 |     directory. The JSON file is an object (dict) that contains the combination
203 |     of message_type, target_id, kwds, and a UUID, which is also used to name
204 |     the file. The UUID is generated inside this method using `uuid4`, so as to
205 |     ensure that the message and file have unique names. Returns the UUID as a
206 |     str.
This function is usually invoked from client software that does not 207 | call anything else in this module.""" 208 | assert 'uuid_str' not in kwds, kwds 209 | assert 'channel' not in kwds, kwds 210 | directory_path = Path(directory) 211 | uuid_str = str(uuid4()) 212 | file_name = uuid_str + '.json' 213 | initial_path = directory_path / TEMP / file_name 214 | final_path = directory_path / INBOX / file_name 215 | message = dict(uuid_str=uuid_str, 216 | message_type=message_type, 217 | target_id=target_id, 218 | **kwds) 219 | with initial_path.open('w') as fout: 220 | dump(message, fout, sort_keys=True) 221 | initial_path.rename(final_path) 222 | return uuid_str 223 | 224 | 225 | class MissingJSONKeysError(ValueError): 226 | """Some required keys were missing.""" 227 | pass 228 | 229 | 230 | class IllegalJSONKeysError(ValueError): 231 | """Some illegal keys were present in a JSON file.""" 232 | pass 233 | -------------------------------------------------------------------------------- /seneschal/managers.py: -------------------------------------------------------------------------------- 1 | """Implements the managers, which are objects in between the engine and the 2 | workers. There are four domains: user requests, batch jobs, local 3 | subprocesses, and plugins. Everything begins with a user request, but almost 4 | all actual work is defined by a plugin and then executed as either a batch 5 | job or a subprocess. 6 | 7 | Stateful workers are persisted to the filesystem in paths like this: 8 | 9 | SOME_ROOT/by_uuid/12300000-0000-0000-0000-000000000000/0.json 10 | SOME_ROOT/by_uuid/12300000-0000-0000-0000-000000000000/1.json 11 | SOME_ROOT/by_uuid/12300000-0000-0000-0000-000000000000/2.json 12 | 13 | In this example, SOME_ROOT is the directory associated with a manager, 123... 14 | is the UUID or ID of the worker and 2.json is the most recent state for that 15 | worker 16 | 17 | """ 18 | 19 | from json import dump, load 20 | import logging 21 | from pathlib import Path 22 | 23 | from .messaging import NEW, STARTED, SUCCEEDED, FAILED 24 | 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | UUID_GLOB = '????????-????-????-????-????????????' 29 | WORKER_GLOB = 'by_uuid/' + UUID_GLOB 30 | 31 | 32 | class Manager: 33 | """Abstract base class that encapsulates the concept of workers, an 34 | associated filesystem directory containing worker-specific subdirectories, 35 | the ability to load workers from their subdirectories during startup, 36 | a registry of workers by ID, and the ability to send messages to a 37 | `messaging.MessageBroker`. Subclasses must implement `load`.""" 38 | 39 | def __init__(self, *, directory, worker_class, **kwds): 40 | """Load state from directory into memory.""" 41 | super().__init__(**kwds) 42 | self.directory = Path(directory) 43 | self.worker_class = worker_class 44 | self.message_broker = None # See set_message_broker 45 | assert self.directory.is_dir() 46 | self.registry = dict() # ID -> worker 47 | for subdir in self.directory.glob(WORKER_GLOB): 48 | worker_params = self.load(subdir) 49 | self.add_worker(worker_params) 50 | 51 | def set_message_broker(self, message_broker): 52 | """Called after construction to install the `messaging.MessageBroker`. 53 | Should only be called once.""" 54 | assert self.message_broker is None 55 | self.message_broker = message_broker 56 | 57 | def load(self, subdir): 58 | """Abstract method to load worker state by reading the filesystem at 59 | `subdir`. 
Returns a `dict` of worker state."""
60 |         raise NotImplementedError('abstract method')
61 | 
62 |     def add_worker(self, worker_params):
63 |         """Construct a new worker, and install it in the `registry`.
64 |         Return the new worker's ID."""
65 |         worker = self.worker_class(worker_params)
66 |         self.registry[worker.id] = worker
67 |         return worker.id
68 | 
69 | 
70 | class MessageReceiver(Manager):
71 |     """Abstract base class for Manager that can receive external messages.
72 |     Subclasses must implement `load`."""
73 | 
74 |     def receive_message(self, message):
75 |         """Called by `messaging.MessageBroker`, returns nothing. If
76 |         `target_id` is None, then `message_type` should be `NEW`, in which
77 |         case we should create a new worker object with the message payload.
78 |         Otherwise, `target_id` should reference an existing object, and we
79 |         should pass the message to that object by method call."""
80 |         worker_params = dict(vars(message))
81 |         # Remove parameters no longer needed.
82 |         worker_params.pop('channel')
83 |         worker_params.pop('message_type')
84 |         worker_params.pop('target_id')
85 |         if message.target_id is None:
86 |             assert message.message_type == NEW
87 |             worker = self.registry[self.add_worker(worker_params)]
88 |         else:
89 |             worker = self.registry[message.target_id]
90 |             worker.receive_message(worker_params)
91 |         worker.save()
92 | 
93 | 
94 | class RequestManager(MessageReceiver):
95 |     """The Manager for all Request objects."""
96 | 
97 |     def __init__(self, **kwds):
98 |         """Load state from directory into memory."""
99 |         super().__init__(**kwds, worker_class=Request)
100 | 
101 |     def load(self, subdir):
102 |         """Required by `Manager`. Delegates to `load_most_recent_state`."""
103 |         return load_most_recent_state(subdir)
104 | 
105 | 
106 | class DictProxy:
107 |     """A class that links its state to an existing dict; a flyweight facade
108 |     wrapping a dict. Any changes to the object are changes to the dict."""
109 |     def __init__(self, mapping):
110 |         self.__dict__ = mapping
111 | 
112 |     def __repr__(self):
113 |         return '%s(%r)' % (self.__class__.__name__, self.__dict__)
114 | 
115 | 
116 | class Request(DictProxy):
117 |     pass # TODO
118 | 
119 | 
120 | class Task(DictProxy):
121 |     """Abstract base class for all tasks. Every `Task` obtains and maintains
122 |     its state in an external dict."""
123 |     # The static method `register_concrete_subclass` is a class decorator
124 |     # that will populate concrete_subclasses, which enables `from_dict`.
125 |     concrete_subclasses = {}
126 | 
127 |     def __init__(self, mapping):
128 |         """Validates mapping and delegates construction to superclass."""
129 |         assert 'type' in mapping
130 |         super().__init__(mapping)
131 | 
132 |     @staticmethod
133 |     def from_dict(mapping):
134 |         """Return the appropriate type of `Task` object based on the contents
135 |         of `mapping`."""
136 |         assert isinstance(mapping, dict)
137 |         subclass_selection = mapping['type']
138 |         subclass = Task.concrete_subclasses[subclass_selection]
139 |         return subclass(mapping)
140 | 
141 |     @staticmethod
142 |     def register_concrete_subclass(cls):
143 |         """A class decorator that registers concrete Task subclasses.
144 |         Subclasses must implement a class member __task_type_id__ as the str
145 |         that will be used during deserialization to select the appropriate
146 |         subclass."""
147 |         Task.concrete_subclasses[cls.__task_type_id__] = cls
148 |         return cls
149 | 
150 |     def start(self, message_broker):
151 |         """Abstract method. Invoked by the `Request` or a parent `CompoundTask`
152 |         when it is time for this `Task` to start. Subclasses should either
Subclasses should either 153 | `start` a sub-task or send a `Message` by invoking 154 | `message_broker.deliver_one_message`.""" 155 | raise NotImplementedError 156 | 157 | 158 | @Task.register_concrete_subclass 159 | class BatchJobTask(Task): 160 | """Executes asynchronously in a batch job.""" 161 | __task_type_id__ = 'batch_job' 162 | pass # TODO 163 | 164 | 165 | @Task.register_concrete_subclass 166 | class SubprocessTask(Task): 167 | """Executes asynchronously in a subprocess.""" 168 | __task_type_id__ = 'subprocess' 169 | pass # TODO 170 | 171 | 172 | class CompoundTask(Task): 173 | """A `Task` that implements `children`, which must be a list of `Tasks`.""" 174 | # We store the children under a key named "zchildren" for a serialized 175 | # state that is easier to read. 176 | 177 | def __getitem__(self, key): 178 | return self.children[key] 179 | 180 | @property 181 | def children(self): 182 | """Returns a list of the appropriate `Task` objects.""" 183 | return [Task.from_dict(child) for child in self.zchildren] 184 | 185 | 186 | @Task.register_concrete_subclass 187 | class SequenceTask(CompoundTask): 188 | """Executes a list of child `Task`s in sequence.""" 189 | __task_type_id__ = 'sequence' 190 | pass # TODO 191 | 192 | 193 | @Task.register_concrete_subclass 194 | class ParallelTask(CompoundTask): 195 | """Executes a list of child `Task`s in parallel.""" 196 | __task_type_id__ = 'parallel' 197 | pass # TODO 198 | 199 | 200 | def load_most_recent_state(state_files_dir): 201 | """Read the highest numbered state file as JSON and return the result.""" 202 | assert isinstance(state_files_dir, Path), state_files_dir 203 | assert state_files_dir.match(UUID_GLOB), state_files_dir 204 | state_files = sorted(enumerate_numbered_json_files(state_files_dir)) 205 | assert state_files, state_files_dir 206 | state_file = state_files[-1][1] 207 | with state_file.open() as fin: 208 | state = load(fin) 209 | return state 210 | 211 | 212 | def enumerate_numbered_json_files(directory): 213 | """Yield pairs of num & json_file_path.""" 214 | for json_file_path in directory.glob('*.json'): 215 | try: 216 | num = int(json_file_path.stem) 217 | yield num, json_file_path 218 | except Exception as e: 219 | pass # TODO: log the unusual file 220 | 221 | 222 | def propagate_inheritance(mapping, path='t'): 223 | """Given a mapping that represents the state of a possibly compound `Task`, 224 | bestows inheritable attributes to children that have not overridden 225 | those attributes. Each `Task` will have a path relative to the `Request` 226 | object. The root `Task` has a path of "t", the zeroth child has a path 227 | of "t/0", and the zeroth grandchild has a path of "t/0/0".""" 228 | mapping['path'] = path 229 | if 'zchildren' not in mapping: 230 | return # Nothing to do 231 | child_type = mapping.get('child_type', None) 232 | keys = set(mapping) # Will be the set of inheritable keys. 
233 | # Children do not inherit these: 234 | keys.discard('zchildren') 235 | keys.discard('child_type') # optional 236 | keys.remove('type') # but they can get type from child_type 237 | # Give the children their inheritances: 238 | for index, child_mapping in enumerate(mapping['zchildren']): 239 | child_mapping['index'] = index 240 | if not 'type' in child_mapping: 241 | assert child_type is not None 242 | child_mapping['type'] = child_type 243 | for key in keys: 244 | if key not in child_mapping: 245 | child_mapping[key] = mapping[key] 246 | # Give the child object a chance to initialize state: 247 | propagate_inheritance(child_mapping, f'{path}/{index}') 248 | 249 | 250 | def index_mappings(mapping): 251 | """Return a `dict` of all nested mappings, including the root, where the 252 | key is the value for "path". Uses `iterate_nested_mappings`, and requires 253 | that every mapping have a key of "path".""" 254 | result = {value['path']: value 255 | for value in iterate_nested_mappings(mapping)} 256 | return result 257 | 258 | 259 | def iterate_nested_mappings(mapping): 260 | """Generator function that top-down iterates all the mapping objects. 261 | Children are expected to be in a iterable under the key `zchildren`. The 262 | iterator always yields something, since the first value is always 263 | the `mapping` parameter.""" 264 | yield mapping 265 | for child_mapping in mapping.get('zchildren', ()): 266 | yield from iterate_nested_mappings(child_mapping) 267 | 268 | 269 | # TODO: Synchonize documentation in messaging.py and tech_specs.md. 270 | -------------------------------------------------------------------------------- /docs/tech_specs.md: -------------------------------------------------------------------------------- 1 | # Seneschal Technical Specifications 2 | 3 | Seneschal is an automation system that serves as a restricted control interface between users and a protected environment. 4 | 5 | Features: 6 | 7 | * accepts requests from users 8 | * executes approved workflows 9 | * will reject requests that violate policy 10 | * creates an audit trail containing: 11 | * requests — who, what, when, input, output 12 | * request rejections 13 | * action start 14 | * action complete 15 | * errors 16 | * is hardened — no database required 17 | * is expandable — Users can write plugins, that define new workflows. 18 | * is change managed — Administrators must install new plugins or versions, allowing change review and auditing. 19 | 20 | ## Main Concepts 21 | 22 | Regular users issue commands that send _request_ _messages_. A _message_ is a small JSON file that specifies the desired action. For _requests_, the files are written to a special publicly writeable directory, the _inbox directory_. 23 | 24 | A standard daemon named _seneschald_ polls the _inbox directory_, applies business and security rules, and then takes appropriate action such as rejecting the request, initiating a direct copy operation in a subprocess, or submitting a job to a batch scheduler. As is typical, the daemon has a PID file and responds to `SIGTERM` by shutting down after a few seconds. 25 | 26 | With the exception of subprocesses executing copies, all state is maintained on the filesystem. This allows the daemon to recover from restart. Of course any subprocesses running local copies would be killed with the daemon, and those local copies would have to be restarted. This is not a problem if the copy is similar to rsync. 
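For illustration only (this helper does not exist in the current codebase), a workflow plugin could make its local copies restartable by delegating to rsync, so that a copy killed along with the daemon simply resumes when the daemon restarts it:

    import subprocess

    def restartable_copy(src, dst):
        """Copy src to dst; safe to rerun after the daemon killed a previous attempt."""
        # --partial keeps partially transferred files, so a rerun resumes cheaply.
        subprocess.run(['rsync', '--archive', '--partial', src, dst], check=True)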
27 | 28 | In addition to the normal logging expected of a daemon, the _seneschald_ daemon writes to a special _audit log_, noting all events that might be security relevant. These events are formatted to optimize ingestion by log analysis systems, such as splunk. 29 | 30 | Batch jobs run inside a _job wrapper_ that sends start and finish _job events_ as _messages_ back to the daemon by writing small text files to a special directory that is different from the one used by users making requests. These messages usually trigger audit logging and progress updates to the _request_. 31 | 32 | The _seneschald_ daemon is just a framework for automation. Almost all of the security logic and workflow machinery is provided by _plugins_. This allows more efficient change management, since the plugins will evolve on different timescales from each other and the daemon. 33 | 34 | Borrowing from git documentation, we use the analogy of plumbing and porcelain. Plumbing is the machinery required for the system to operate: daemon, plugins, etc. Porcelain is the set of user-facing commands that make the system useful: making requests, checking status, etc. Without the porcelain, the plumbing is pointless. 35 | 36 | * message 37 | * request 38 | * inbox directory 39 | * seneschald 40 | * audit log 41 | * job wrapper 42 | * job events 43 | * plugins 44 | * porcelain 45 | 46 | ## Deployment Requirements 47 | 48 | * service account 49 | * daemon host 50 | * daemon directories 51 | * plugins directory 52 | * cluster resources 53 | * software directory 54 | 55 | ### service account 56 | 57 | * The daemon must run with the UID set to a non-root, non-login service account. 58 | 59 | * In order to facilitate fulfilling user requests for data this service account will belong to multiple regular user groups. Otherwise the users would have to create publicly writeable directories to receive the output that they are requesting. 60 | 61 | * Recommendations: 62 | * The name of the service account should include "seneschal". 63 | * There should be an associated group with the same name. 64 | 65 | ### daemon host 66 | 67 | The host running the daemon must: 68 | 69 | * mount all relevant filesystems 70 | * These filesystems may be mounted read-only: 71 | * protected upstream filesystems 72 | * software filesystems 73 | * These must be mounted read-write: 74 | * destination filesystems 75 | * user filesystems 76 | * data transfer filesystems 77 | * filesystems holding: 78 | * inbox directory 79 | * processing directory 80 | * job events directory 81 | * logging 82 | * have enough capacity in cores and bandwidth to support daily data transfer operations 83 | * share the same service account name and numeric ID as the compute cluster 84 | 85 | ### daemon directories 86 | 87 | In order to process requests and track state, the daemon needs an assortment of directories. Two of these directories must be publicly writeable. It is easiest if they are all publicly readable. 
88 | 89 | This is the recommended structure: 90 | 91 | drwxrwxr-x seneschal seneschal ./seneschal 92 | drwxrwxr-x seneschal seneschal ./seneschal/internal_events 93 | drwxrwxr-x seneschal seneschal ./seneschal/job_events 94 | drwxrwxr-x seneschal seneschal ./seneschal/job_events/0_temp 95 | drwxrwxr-x seneschal seneschal ./seneschal/job_events/1_inbox 96 | drwxrwxr-x seneschal seneschal ./seneschal/jobs 97 | drwxrwxr-x seneschal seneschal ./seneschal/outbox 98 | drwxrwxr-x seneschal seneschal ./seneschal/requests 99 | drwxrwxr-x seneschal seneschal ./seneschal/requests/by_num 100 | drwxrwxr-x seneschal seneschal ./seneschal/requests/by_uuid 101 | drwxrwxr-x seneschal seneschal ./seneschal/requests/error 102 | drwxrwxr-x seneschal seneschal ./seneschal/requests/finished 103 | drwxrwxr-x seneschal seneschal ./seneschal/user_events 104 | drwxrwsrwt seneschal seneschal ./seneschal/user_events/0_temp 105 | drwxrwsrwt seneschal seneschal ./seneschal/user_events/1_inbox 106 | 107 | The idea is for the daemon to track state through a clean system reboot (`SIGTERM` + timeout), even restarting interrupted copies that were running locally. 108 | 109 | #### temp and inbox directories 110 | 111 | The inbox directory should be visible to all user login nodes and user compute nodes. They must: 112 | 113 | * be on the same filesystem 114 | * have permissions `drwxrwsrwt` 115 | * be accessible to users (All parent directories are `o+rx`.) 116 | * be owned by _seneschald_ service account 117 | * be in a group that contains the service account and does not include users 118 | * be on the same filesystem as the processing directory 119 | 120 | #### processing directory 121 | 122 | The processing directory is where _seneschald_ maintains most of its state. It contains a moderate number of small files, separated by request into subdirectories. It must: 123 | 124 | * be on the same filesystem as the inbox directory 125 | * be owned by _seneschald_ service account 126 | * be in the same group as the inbox directory 127 | 128 | It should: 129 | 130 | * be readable by developers on login nodes 131 | * be readable by regular users on login nodes so that they can receive status reports (Obscurity is not our friend.) 132 | 133 | #### error and finished directories 134 | 135 | Where _requests_ go at the end. There will be some sort of consolidation (such as tar) and cleanup (such as archiving). TBD 136 | 137 | It must be readable and writeable by the service account on the daemon host. It should be readable by developers on login nodes. 138 | 139 | #### job events directory 140 | 141 | The job events directory is where the batch job wrapper script will write events. It must be readable and writeable by the service account from both the daemon host and the cluster nodes used by seneschal. 142 | 143 | It should be readable by developers on login nodes. 144 | 145 | #### internal events directory 146 | 147 | The internal events directory is for routing information between components of the system. Unlike job events and requests, these events do not come from the outside of the daemon. It must be readable and writeable by the service account. It should be readable by developers on login nodes. 148 | 149 | #### outbox directory 150 | 151 | The outbox directory is where seneschal places public messages intended to be read by user clients. It must: 152 | 153 | * be writeable by the daemon 154 | * be readable by both login and cluster nodes 155 | 156 | It should be readable by developers on login nodes. 
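As a hedged sketch (not part of any shipped deployment tooling), the two publicly writeable drop directories described above could be created like this; the base path is an assumption and the directory names follow the listing above, so adjust both to your site:

    import os

    ROOT = '/var/local/lib/seneschal'   # assumed base path; match your config
    for sub in ('0_temp', '1_inbox'):
        path = os.path.join(ROOT, 'user_events', sub)
        os.makedirs(path, exist_ok=True)
        # drwxrwsrwt, as in the listing above: world-writeable plus setgid and sticky.
        os.chmod(path, 0o3777)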
157 | 158 | ### plugins directory 159 | 160 | Without plugins, the daemon does nothing ... very well. In this mode, all requests are errors. Plugins define whitelisted implementations of workflows. 161 | 162 | The plugins directory only has to be readable by the daemon user on the daemon host. 163 | 164 | Plugins have change management life-cycles independent of each other or the daemon. The goal is to enable each plugin to be auditable largely independently of the rest of the system. The directory for a plugin contains both name and version. Although this allows for limited backwards compatibility by have more than one active version, this capability should be used __very sparingly__. 165 | 166 | Installing a plugin consists of unpacking it into the plugins directory. A plugin is a directory of files, minimally containing a module or executable with a standard name. If the plugin requires configuration, the configuration is stored in the configuration file under `plugins/the_plugin_name_and_version`. 167 | 168 | For initial testing purposes, the core seneschal software will ship with a truly minimal set of plugins that are not quite as useful as `echo` and `ping`. 169 | 170 | The plugins directory must be publicly readable on the _daemon host_ and all login nodes. 171 | 172 | ### cluster resources 173 | 174 | The mounting requirements for compute nodes serving seneschal are a subset of those on the daemon host: 175 | 176 | * These filesystems may be mounted read-only: 177 | * upstream filesystems 178 | * software filesystems 179 | * These must be mounted read-write: 180 | * user filesystems 181 | * the filesystem holding the job events directory 182 | 183 | ### software directory 184 | 185 | In addition to other standard software directories, the seneschal software must be available to all users, the daemon host, and the compute nodes. In particular: 186 | 187 | * Users must have access to the scripts that generate _requests_. 188 | * Compute jobs must have access to the seneschal wrapper. 189 | * The daemon must have access to its own machinery and all the plugins. 190 | 191 | Easiest is to just make everything public. 192 | 193 | ## Deployment and Operating Instructions 194 | 195 | __NOTE:__ Except for temp and inbox, none of these directories or files should be writable by users. 196 | 197 | * Install Python 3.6 somewhere accessible to the daemon host. 198 | * Unpack the seneschal tarball, which contains its third-party dependencies. 199 | * Create the necessary directories. 200 | * Copy all the active plugins into some directory, such as a directory named "plugins" just inside the unpacked tarball. 201 | * Write a config file, which specifies: 202 | * PID file 203 | * logging destinations 204 | * daemon process settings 205 | * directory locations 206 | * any optional or required configuration settings for the active plugins 207 | * Write a systemd unit file that 208 | * knows the location of the PID file 209 | * defines a start command contaning: 210 | * Your Python 3.6 211 | * Your location for `seneschald.py` 212 | * Your config file 213 | * "start" 214 | * Configure and deploy the `seneschal` script so that users can easily execute it. 215 | * Test the `seneschal` script. (It can run without the daemon, and any messages it generates will be detected by the daemon when it runs.) 216 | * Start the daemon like any other. (You could manually invoke `seneschald.py` with "start", but ... why?) 217 | * Stop the daemon by like any other. 
The manual method is either sending a `SIGTERM` or invoking `seneschald.py` with "stop", which just does the same thing. 218 | * When planning ahead for a stop, invoking `seneschald.py` with "drain" will notify to the daemon (by writing a special file) that it should postpone long-running local subprocesses, such as copies, until after "start" or "resume". The "drain" and "resume" events are idempotent. 219 | 220 | Note that if seneschald is submitting a job to a batch scheduler or calling a webservice when `SIGTERM` is sent, then seneschald will not shutdown until after the batch scheduler acknowledges the job (e.g. bsub/msub/qsub exits) or the webservice returns. 221 | 222 | __Question:__ Should the daemon remain in a drained state after restart? Should this be a configuration option? 223 | 224 | ## User Operation 225 | 226 | User's interact with the seneschal system through porcelain commands. These commands do nothing more than scan directories, read files, and for _submit_ write files. 227 | 228 | To list plugins: 229 | 230 | seneschal -l 231 | # A fancy ls of the plugins directory 232 | 233 | To get documentation about a plugin: 234 | 235 | seneschal help PLUGIN_NAME 236 | # Runs the plugin's documentation (if any) through a pager 237 | # If there is no documentation file and the plugin supports it, instead 238 | # executes: 239 | # ${PLUGINS_DIR}/${PLUGIN_NAME}/execute_job -h | ${PAGER} 240 | 241 | To run a command: 242 | 243 | seneschal submit PLUGIN_NAME ARG1 ARG2 ... 244 | # Outputs the unique ID of the request 245 | 246 | To check the status of a request: 247 | 248 | seneschal status UNIQUE_ID 249 | # Queries the seneschald files and generates a report 250 | 251 | ## Writing Plugins 252 | 253 | TBD 254 | 255 | ## Developer Needs 256 | 257 | In order to support and maintain the software, developers need to see recent logs and the contents of all of those directories. 258 | 259 | Updates to either the daemon or a plugin consist of developers delivering checksummed tarballs for review and deployment. 260 | 261 | ## Implementation Design 262 | 263 | The daemon code is written in Python. Messages are JSON files. Plugins can be written in any language that supports reading environment variables, command line argument parsing, and file I/O. 264 | 265 | The daemon uses two third-party packages: 266 | 267 | * [python-daemon](https://pypi.python.org/pypi/python-daemon): implements [PEP 3143](https://www.python.org/dev/peps/pep-3143/) 268 | * [lockfile](https://pypi.python.org/pypi/lockfile): provides PID file support, may switch to [pid](https://pypi.python.org/pypi/pid) 269 | 270 | ### Logging & Auditing 271 | 272 | Logging and auditing is handled through the Python standard library [logging module](https://docs.python.org/3/library/logging.html). Logging is configured using [logging.config.dictConfig](https://docs.python.org/3/library/logging.config.html#logging.config.dictConfig). The necessary dictionary is read from the config file under the "logging" section with these two keys added by `seneschald`: 273 | 274 | version=1 275 | disable_existing_loggers=False 276 | 277 | Much of the internal logging is handled in a manner that is typical for well behaved, long-running Python processes. Logging messages intended for developers or system administrators are human-friendly. The audit log is special. 278 | 279 | There is a special logger named "audit". 
Log messages sent to this logger will be optimized for ingestion by log analysis systems, such as splunk: 280 | 281 | timestamp key1=value1 key2=value2 key3=value3 key4=value4 ... 282 | 283 | Example of the section of the config file that configures logging: 284 | 285 | logging: 286 | formatters: 287 | verbose: 288 | format: '%(asctime)s %(levelname)-8s %(name)s %(module)s %(process)d %(message)s' 289 | audit_format: 290 | format: '%(asctime)s %(message)s' 291 | handlers: 292 | debug_handler: 293 | class : logging.handlers.RotatingFileHandler 294 | formatter: verbose 295 | filename: /var/log/seneschal/debug.log 296 | maxBytes: 40960 297 | backupCount: 1 298 | audit_handler: 299 | class : logging.handlers.RotatingFileHandler 300 | formatter: audit_format 301 | filename: /var/log/seneschal/audit.log 302 | maxBytes: 409600 303 | backupCount: 4 304 | loggers: 305 | audit: 306 | level: INFO 307 | handlers: [audit_handler] 308 | root: 309 | level: DEBUG 310 | handlers: [debug_handler] 311 | 312 | The level of audit messages is always INFO or higher. 313 | 314 | References: 315 | 316 | * [http://dev.splunk.com/view/dev-guide/SP-CAAAE3A](http://dev.splunk.com/view/dev-guide/SP-CAAAE3A) 317 | * [https://answers.splunk.com/answers/1951/what-is-the-best-custom-log-event-format-for-splunk-to-eat.html#answer-1953](https://answers.splunk.com/answers/1951/what-is-the-best-custom-log-event-format-for-splunk-to-eat.html#answer-1953) 318 | 319 | 320 | ### Architecture 321 | 322 | These are the important kinds of objects in the system: 323 | 324 | * daemon 325 | * _Engine_ 326 | * managers 327 | * fertile 328 | * _RequestManager_ 329 | * _JobManager_ 330 | * _SubprocessManager_ 331 | * sterile 332 | * _PluginManager_ 333 | * workers 334 | * stateful 335 | * _Request_ 336 | * _Job_ 337 | * _Subprocess_ 338 | * stateless 339 | * _Plugin_ 340 | * _MessageBroker_ 341 | * _Message_ 342 | * _Event_ 343 | 344 | The _daemon_ reads the configuration file, constructs the _Engine_, and then start its main loop. During this loop, the _daemon_ passes control to the _Engine_. If there was no work for the _Engine_ to do, then the _daemon_ will sleep. The main loop terminates after `SIGTERM`. 345 | 346 | The _Engine_ constructs instances of the _MessageBroker_ each type of manager. The engine will then go into a loop until the _daemon_ receives `SIGTERM`. Each pass through the loop will call upon the _MessageBroker_ to process one message. If there are no messages, then the _Engine_ returns control back to the _daemon_. 347 | 348 | The _MessageBroker_ reads a _Message_ and converts it into an _Event_ for some worker or manager. A _Message_ and an _Event_ are similar, with the main difference being perspective. An object sends a _Message_ with a destination. That destination object receives an _Event_ where the destination is implicit and other contextual information may be added. Both messages and events are really just JSON files. (There's a lot of disk churn going on.) 349 | 350 | Manager objects are responsible for creating and fetching workers. Managers maintain two kinds of state in memory: 351 | 352 | * configuration 353 | * a registry of workers, indexed by ID 354 | 355 | When _seneschald_ starts up, each manager will reload its registry by scanning a directory. 356 | 357 | Stateful workers uses the filesystem to maintain its state. Each stateful worker has a corresponding directory that contains an ordered list of JSON files corresponding to events, starting with an initialization event. 
The state of the worker is loaded into memory by reading each file in sequence. 358 | 359 | Plugins have no state besides the configuration read from the main configuration file. Each time a plugin is invoked, it must be passed all the information it needs to do its job. It is the responsibility of stateful worker objects to hold the necessary state information. Some plugins are special, in that they define how the seneschal system integrates with external system like a compute cluster or permissions system. These plugins are typically aliased to logical resource names, like "batch\_scheduler". Most plugins are workflow plugins. 360 | 361 | Plugins can be implemented as either external executables (usually scripts) or Python modules. There is no semantic difference between the two methods. 362 | 363 | In the case of external executables, plugin configuration is defined in environment variables, and the input for the plugin is JSON data fed into standard input. The output is JSON data fed to standard output. 364 | 365 | In the case of Python modules, a plugin is loaded by importing the module and then passing any configuration in the form of a Python _dict_ into a function in that module named _create_. The return value of that function call is the plugin object, which is callable. The input and output are just Python dictionaries. For some applications, Python modules are the better choice either because of better efficiency or simplicity. 366 | 367 | Worker objects can do these things: 368 | 369 | * __[not plugins]__ notify their manager that they should be purged 370 | * __[not subprocesses]__ receive events 371 | * send messages 372 | * __[plugins and subprocesses only]__ do stuff 373 | 374 | A _Request_ object represents — perhaps indirectly — a user's request to execute an automated workflow with a particular set of inputs and outputs. The owner of the request is the owner of the _Message_ file that originated the _Request_. Every _Request_ has an associated _workflow plugin_ that defines the actual workflow. After initialization, the _Request_ invokes the _workflow plugin_ passing in a JSON object representing the request. The plugin will respond with a _workflow_ — a JSON object containing a list of tasks that will satisfy the _Request_. The _Request_, will then typically trigger other messages, write its state to the filesystem, and exit. (The _Request_ could invoke other plugins as it iterates through the list of tasks.) Eventually the _Request_ will receive messages indicating the end of its various tasks. When there are no more tasks running, the _Request_ will have reached the end of its lifecycle. At that point, the _Request_ will notify the _RequestManager_ that it should be purged. Hopefully along the way, something useful happened. 375 | 376 | A _Job_ represents a batch job submitted to a [job scheduler](https://en.wikipedia.org/wiki/Job_scheduler). A _Job_ is created when the _JobManager_ receives a _Message_ submitting a new _Job_. A _Job_ knows the ID of the originating _Request_. A _Job_ sends a message to the logical "batch\_scheduler" _Plugin_ to submit the job. The _Plugin_ will typically create a custom file for the job and then submit a plugin-defined wrapper script and that custom file to the underlying job scheduler. The wrapper script will then read the job-specific file, execute the required tasks, and send messages back to the _Job_ object upon the start and finish of compute. The _Job_ object will forward information back to the _Request_. 
The _Job_ object will notify the _JobManager_ when its lifecycle is complete. 377 | 378 | A _Subprocess_ is a logical wrapper around an external command. It is much simpler than a _Job_, since there is not need for a plugin to implement it. The implementation is handled by the Python [subprocess module](https://docs.python.org/3/library/subprocess.html). State is also maintained on the filesystem. If the daemon must shutdown, all running subprocesses must be killed. By default, all subprocesses will restart when the daemon restarts. Workflows that use subprocesses are usually file copy operations. They should be coded so as to be restartable. 379 | 380 | #### Example Putting it all Together 381 | 382 | Consider a workflow that computes the MD5 checksum of a file. For the purpose of this discussion, let us agree to the interpretation that the MD5 of a protected file does not constitute protected information. Therefore, no security checks are required. A plugin implementing this workflow has two inputs: the input file path and the output file path. 383 | 384 | This would be the sequence of received messages: 385 | 386 | 1. _RequestManager_: new md5 inputPath outputPath 387 | 2. request001: initialization userName md5 inputPath outputPath 388 | 3. _JobManager_: new request001 step0 /usr/bin/md5sum inputPath outputPath 389 | 4. job001: initialization request001 step0 /usr/bin/md5sum inputPath outputPath 390 | 5. request001: step0 job submitted with ID 123 391 | 6. job001 (sent by wrapper): compute started on node ABC at someTimestamp 392 | 7. request001: step0 compute started on node ABC at someTimestamp 393 | 8. job001 (sent by wrapper): compute succeeded at someTimestamp 394 | 9. request001: step0 compute succeeded at someTimestamp 395 | 10. _JobManager_: purge job001 396 | 11. _RequestManager_: purge request001 397 | 398 | This list does not show plugin invocations, since they are invoked by direct input — either function call or passing JSON into _stdin_. 399 | 400 | Items 3, 10, and 11 are in-memory messages only, really just direct method calls. Even with optimization of those instantaneous messages staying in memory, there are at least 8 files created for this simplest cluster workflow. 401 | --------------------------------------------------------------------------------
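As a closing sketch (not found in the repository), the md5 example above could be expressed as the kind of task mapping that `managers.propagate_inheritance` and the tests expect a workflow plugin to hand back; the paths and the `md5_workflow` name are illustrative placeholders:

    md5_workflow = {
        'type': 'batch_job',          # handled by BatchJobTask
        'cwd': 'outputPath_dir',      # placeholder working directory
        'executable': '/usr/bin/md5sum',
        'arguments': ['inputPath'],
    }
    # A multi-step workflow would instead use type 'sequence' or 'parallel'
    # with the child mappings stored under the 'zchildren' key.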