├── debian
│   ├── docs
│   ├── compat
│   ├── mastermind.examples
│   ├── python-mastermind.install
│   ├── mastermind-utils.install
│   ├── mastermind-utils.bash-completion
│   ├── mastermind.install
│   ├── rules
│   ├── postinst
│   ├── copyright
│   └── control
├── src
│   ├── cocaine-app
│   │   ├── db
│   │   │   ├── __init__.py
│   │   │   └── mongo
│   │   │       ├── __init__.py
│   │   │       └── pool.py
│   │   ├── mastermind_core
│   │   │   ├── __init__.py
│   │   │   ├── errors.py
│   │   │   ├── helpers.py
│   │   │   └── response.py
│   │   ├── mastermind.manifest
│   │   ├── mastermind-cache.manifest
│   │   ├── mastermind-inventory.manifest
│   │   ├── mastermind.profile
│   │   ├── mastermind-cache.profile
│   │   ├── mastermind-inventory.profile
│   │   ├── importer.py
│   │   ├── config.py
│   │   ├── errors.py
│   │   ├── jobs
│   │   │   ├── error.py
│   │   │   ├── job_factory.py
│   │   │   ├── job_types.py
│   │   │   ├── tasks
│   │   │   │   ├── remove_group.py
│   │   │   │   ├── create_group.py
│   │   │   │   ├── recover_group_dc.py
│   │   │   │   ├── node_backend_defrag.py
│   │   │   │   ├── dnet_client_backend_cmd.py
│   │   │   │   ├── __init__.py
│   │   │   │   ├── task.py
│   │   │   │   ├── write_meta_key.py
│   │   │   │   ├── minion_cmd.py
│   │   │   │   ├── couple_defrag_state_check.py
│   │   │   │   ├── node_stop.py
│   │   │   │   ├── history_remove_node.py
│   │   │   │   └── rsync_backend.py
│   │   │   ├── couple_defrag.py
│   │   │   └── recover_dc.py
│   │   ├── cache_transport
│   │   │   ├── fake_transport.py
│   │   │   └── __init__.py
│   │   ├── sync
│   │   │   ├── __init__.py
│   │   │   ├── error.py
│   │   │   └── fake_sync.py
│   │   ├── timer.py
│   │   ├── inventory.py
│   │   ├── keys.py
│   │   ├── inv.py
│   │   ├── monitor_pool.py
│   │   ├── inventory_worker.py
│   │   ├── manual_locks.py
│   │   ├── log.py
│   │   ├── timed_queue.py
│   │   ├── indexes.py
│   │   ├── couple_records.py
│   │   ├── fake_inventory.py
│   │   ├── cache_worker.py
│   │   ├── coll.py
│   │   └── load_manager.py
│   ├── python-mastermind
│   │   └── src
│   │       └── mastermind
│   │           ├── utils
│   │           │   └── __init__.py
│   │           ├── __init__.py
│   │           ├── query
│   │           │   ├── namespaces_states.py
│   │           │   ├── stats.py
│   │           │   ├── __init__.py
│   │           │   ├── history.py
│   │           │   ├── node_backends.py
│   │           │   ├── groupsets.py
│   │           │   └── groups.py
│   │           ├── helpers.py
│   │           ├── errors.py
│   │           ├── client.py
│   │           ├── monitor_pool.py
│   │           └── service.py
│   └── create_group_ids
├── .gitignore
├── make_tree.sh
├── tests
│   ├── conftest.py
│   ├── fixtures
│   │   ├── __init__.py
│   │   ├── pool_workers.py
│   │   ├── util.py
│   │   └── monitor_stat_worker.py
│   ├── test_tree_picker.py
│   ├── test_monitor_pool.py
│   └── test_pool.py
├── usr
│   └── bin
│       ├── mastermind_app_name.sh
│       └── mastermind_deploy.sh
├── setup.py
├── etc
│   └── bash_completion.d
│       └── mastermind
└── scripts
    ├── 07-move-gatlinggun-tasks.py
    ├── 06-clean-metadb-jobs.py
    ├── 05-group-history-unification.py
    ├── 08-create-couples-free-eff-space-coll.py
    └── 03-namespace-settings-service-flags.py
/debian/docs:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/debian/compat:
--------------------------------------------------------------------------------
 1 | 8
 2 | 
--------------------------------------------------------------------------------
/src/cocaine-app/db/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/src/cocaine-app/mastermind_core/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/debian/mastermind.examples:
--------------------------------------------------------------------------------
 1 | examples/mastermind.conf
 2 | 
--------------------------------------------------------------------------------
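The examples/mastermind.conf referenced above is not part of this listing. The sketch below is assembled only from configuration keys that the sources further down actually read (config.py, inventory.py, sync/__init__.py, cache_transport/__init__.py, monitor_pool.py, inventory_worker.py, usr/bin/mastermind_app_name.sh, scripts/07-move-gatlinggun-tasks.py); all values are illustrative placeholders, not shipped defaults:

{
    "app_name": "mastermind2.26",
    "disown_timeout": 2,
    "inventory": "fake_inventory",
    "sync": {
        "class": "sync.fake_sync.SyncManager"
    },
    "cache": {
        "manager": {
            "class": "cache_transport.fake_transport.Transport",
            "host": "zk1.example.net:2181",
            "lock_path_prefix": "/mastermind/cache/"
        }
    },
    "elliptics": {
        "monitor_port": 10025
    },
    "monitor": {
        "pool_size": 5,
        "max_http_clients": 30,
        "connect_timeout": 5.0,
        "request_timeout": 5.0
    }
}
--------------------------------------------------------------------------------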
/debian/python-mastermind.install: -------------------------------------------------------------------------------- 1 | usr/lib/python2.7 2 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /debian/mastermind-utils.install: -------------------------------------------------------------------------------- 1 | src/mastermind usr/bin/ 2 | -------------------------------------------------------------------------------- /debian/mastermind-utils.bash-completion: -------------------------------------------------------------------------------- 1 | etc/bash_completion.d/mastermind 2 | -------------------------------------------------------------------------------- /src/cocaine-app/mastermind.manifest: -------------------------------------------------------------------------------- 1 | { 2 | "slave" : "__init__.py" 3 | } 4 | 5 | -------------------------------------------------------------------------------- /src/cocaine-app/mastermind-cache.manifest: -------------------------------------------------------------------------------- 1 | { 2 | "slave" : "cache_worker.py" 3 | } 4 | 5 | -------------------------------------------------------------------------------- /src/cocaine-app/mastermind-inventory.manifest: -------------------------------------------------------------------------------- 1 | { 2 | "slave" : "inventory_worker.py" 3 | } 4 | 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build/* 3 | mastermind.egg-info 4 | 5 | # in case someone uses fuse for development 6 | .fuse* 7 | -------------------------------------------------------------------------------- /src/cocaine-app/mastermind.profile: -------------------------------------------------------------------------------- 1 | { 2 | "pool-limit": 5, 3 | "startup-timeout": 600, 4 | "heartbeat-timeout": 240 5 | } 6 | -------------------------------------------------------------------------------- /src/cocaine-app/mastermind-cache.profile: -------------------------------------------------------------------------------- 1 | { 2 | "pool-limit": 1, 3 | "startup-timeout": 600, 4 | "heartbeat-timeout": 120 5 | } 6 | -------------------------------------------------------------------------------- /src/cocaine-app/mastermind-inventory.profile: -------------------------------------------------------------------------------- 1 | { 2 | "pool-limit": 1, 3 | "startup-timeout": 10, 4 | "heartbeat-timeout": 20 5 | } 6 | -------------------------------------------------------------------------------- /make_tree.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | CURDIR=$PWD 4 | 5 | #---------- mastermind code 6 | mkdir -p $CURDIR/debian/tmp 7 | find . 
-type f -name \*.py -print0 | tar czvf $CURDIR/debian/tmp/mastermind.tar.gz --null -T -
 8 | 
--------------------------------------------------------------------------------
/src/python-mastermind/src/mastermind/__init__.py:
--------------------------------------------------------------------------------
 1 | from mastermind.client import MastermindClient
 2 | from mastermind.service import ReconnectableService
 3 | 
 4 | __all__ = ['ReconnectableService', 'MastermindClient']
 5 | 
--------------------------------------------------------------------------------
/src/python-mastermind/src/mastermind/query/namespaces_states.py:
--------------------------------------------------------------------------------
 1 | from mastermind.query import Query
 2 | 
 3 | 
 4 | class NamespacesStatesQuery(Query):
 5 |     def update(self):
 6 |         self.client.request('force_update_namespaces_states', None)
 7 | 
--------------------------------------------------------------------------------
/debian/mastermind.install:
--------------------------------------------------------------------------------
 1 | usr/bin/mastermind_deploy.sh usr/bin
 2 | usr/bin/mastermind_app_name.sh usr/bin
 3 | src/cocaine-app/*.manifest usr/lib/mastermind/cocaine-app/
 4 | src/cocaine-app/*.profile usr/lib/mastermind/cocaine-app/
 5 | mastermind.tar.gz usr/lib/mastermind/cocaine-app/
 6 | 
--------------------------------------------------------------------------------
/src/cocaine-app/importer.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | def import_object(s):
 4 |     parts = [p.encode('utf-8') for p in s.rsplit('.', 1)]
 5 |     if len(parts) == 1:
 6 |         return __import__(s)
 7 |     else:
 8 |         mod = __import__(parts[0], fromlist=[parts[1]])
 9 |         return getattr(mod, parts[1])
10 | 
--------------------------------------------------------------------------------
/src/cocaine-app/config.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | CONFIG_PATH = '/etc/elliptics/mastermind.conf'
 4 | 
 5 | try:
 6 | 
 7 |     with open(CONFIG_PATH, 'r') as config_file:
 8 |         config = json.load(config_file)
 9 | 
10 | except Exception as e:
11 |     raise ValueError('Failed to load config file %s: %s' % (CONFIG_PATH, e))
12 | 
--------------------------------------------------------------------------------
/src/cocaine-app/errors.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | class MinionApiError(Exception):
 4 |     pass
 5 | 
 6 | class NotReadyError(Exception):
 7 |     pass
 8 | 
 9 | class CacheUpstreamError(Exception):
10 |     """
11 |     Indicates that upstream request failed.
12 |     By default original exception is logged by caching facilities.
13 |     """
14 |     pass
15 | 
--------------------------------------------------------------------------------
/src/cocaine-app/mastermind_core/errors.py:
--------------------------------------------------------------------------------
 1 | class CacheUpstreamError(Exception):
 2 |     """
 3 |     Indicates that upstream request failed.
 4 |     By default original exception is logged by caching facilities.
5 | """ 6 | pass 7 | 8 | 9 | ELLIPTICS_NOT_FOUND = -2 10 | ELLIPTICS_GROUP_NOT_IN_ROUTE_LIST = -6 11 | ELLIPTICS_TIMEOUT = -110 12 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from fixtures import * 2 | 3 | 4 | def pytest_addoption(parser): 5 | parser.addoption('--clean', action='store_true', dest='clean', 6 | help='reset all bench entities to their init state') 7 | 8 | 9 | def pytest_configure(config): 10 | print "Clean option: {}".format(config.getoption('clean')) 11 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/error.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class JobBrokenError(Exception): 4 | pass 5 | 6 | 7 | class RetryError(Exception): 8 | def __init__(self, attempts, e): 9 | self.attempts = attempts 10 | self.original_e = e 11 | 12 | def __str__(self): 13 | return 'error at attempt {0}: {1}'.format(self.attempts, self.original_e) 14 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/helpers.py: -------------------------------------------------------------------------------- 1 | 2 | def elliptics_time_to_ts(t): 3 | if isinstance(t, dict) and 'tv_sec' in t: 4 | return t['tv_sec'] + t.get('tv_usec', 0) / float(10 ** 6) 5 | elif hasattr(t, 'tsec'): 6 | # instance of elliptics.Time 7 | return t.tsec + t.tnsec / float(10 ** 9) 8 | raise TypeError('Invalid elliptics time object: {}'.format(t)) 9 | -------------------------------------------------------------------------------- /tests/fixtures/__init__.py: -------------------------------------------------------------------------------- 1 | from pool_workers import delay_task_worker_pool 2 | from util import ascii_data 3 | from monitor_stat_worker import ( 4 | monitor_pool, 5 | monitor_server, 6 | monitor_port, 7 | ) 8 | 9 | 10 | __all__ = [ 11 | 'ascii_data', 12 | 'delay_task_worker_pool', 13 | 'monitor_pool', 14 | 'monitor_server', 15 | 'monitor_port', 16 | ] 17 | -------------------------------------------------------------------------------- /src/cocaine-app/cache_transport/fake_transport.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Transport(object): 4 | 5 | def __init__(self, *args, **kwargs): 6 | self.tasks = [] 7 | 8 | def put_task(self, task): 9 | self.tasks.append(task) 10 | 11 | def put_all(self, tasks): 12 | for task in tasks: 13 | self.put_task(task) 14 | 15 | def list(self): 16 | return self.tasks 17 | -------------------------------------------------------------------------------- /usr/bin/mastermind_app_name.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import json 3 | import sys 4 | 5 | 6 | CONFIG_PATH = '/etc/elliptics/mastermind.conf' 7 | 8 | 9 | def main(): 10 | try: 11 | 12 | with open(CONFIG_PATH, 'r') as config_file: 13 | config = json.load(config_file) 14 | 15 | except Exception as e: 16 | # raise ValueError('Failed to load config file %s: %s' % (CONFIG_PATH, e)) 17 | config = {} 18 | 19 | sys.stdout.write(config.get('app_name', 'mastermind2.26')) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /src/cocaine-app/sync/__init__.py: 
-------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from copy import deepcopy 3 | import logging 4 | 5 | from config import config 6 | from importer import import_object 7 | 8 | 9 | logger = logging.getLogger('mm.sync') 10 | 11 | params = {} 12 | 13 | try: 14 | params = deepcopy(config['sync']) 15 | SyncManager = import_object(params.pop('class')) 16 | except (ImportError, KeyError) as e: 17 | logger.error(e) 18 | from fake_sync import SyncManager 19 | 20 | 21 | logger.info('Sync manager being used: {0}'.format(SyncManager)) 22 | sync_manager = SyncManager(**params) 23 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/errors.py: -------------------------------------------------------------------------------- 1 | class MastermindError(Exception): 2 | @property 3 | def code(self): 4 | return MASTERMIND_ERROR_CODES[type(self)] 5 | 6 | @staticmethod 7 | def make_error(code, msg): 8 | if code not in MASTERMIND_ERROR_CLS: 9 | raise ValueError('Unknown error code {}'.format(code)) 10 | return MASTERMIND_ERROR_CLS[code](msg) 11 | 12 | GENERAL_ERROR_CODE = 1024 13 | 14 | MASTERMIND_ERROR_CODES = { 15 | MastermindError: GENERAL_ERROR_CODE 16 | } 17 | 18 | MASTERMIND_ERROR_CLS = dict((v, k) for k, v in MASTERMIND_ERROR_CODES.iteritems()) 19 | -------------------------------------------------------------------------------- /src/cocaine-app/cache_transport/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | from copy import deepcopy 3 | import logging 4 | 5 | from config import config 6 | from importer import import_object 7 | 8 | 9 | logger = logging.getLogger('mm.sync') 10 | 11 | params = {} 12 | 13 | try: 14 | params = deepcopy(config['cache']['manager']) 15 | CacheTaskManager = import_object(params.pop('class')) 16 | except (ImportError, KeyError) as e: 17 | logger.error(e) 18 | from fake_transport import Transport as CacheTaskManager 19 | 20 | 21 | logger.info('Cache task manager being used: {0}'.format(CacheTaskManager)) 22 | cache_task_manager = CacheTaskManager(**params) 23 | -------------------------------------------------------------------------------- /usr/bin/mastermind_deploy.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | DEPLOY_DIR=$1 4 | APP_NAME=$2 5 | MANIFEST=$3 6 | PROFILE=$4 7 | 8 | echo "Cleaning old version of application $APP_NAME..." 
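# Example invocation, using the values debian/postinst supplies for the main app
# (DEPLOY_DIR comes from postinst, the app name from mastermind_app_name.sh):
#   mastermind_deploy.sh /usr/lib/mastermind mastermind2.26 mastermind.manifest mastermind.profile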
 9 | 
10 | rm -rf /var/lib/cocaine/apps/$APP_NAME
11 | rm -f /var/lib/cocaine/manifests/$APP_NAME
12 | rm -rf /var/spool/cocaine/$APP_NAME
13 | rm -rf /var/cache/cocaine/apps/$APP_NAME
14 | rm -f /var/cache/cocaine/manifests/$APP_NAME
15 | 
16 | echo "Deploying new application $APP_NAME"
17 | cocaine-tool app upload --manifest $DEPLOY_DIR/cocaine-app/$MANIFEST --package $DEPLOY_DIR/cocaine-app/mastermind.tar.gz -n $APP_NAME
18 | cocaine-tool profile upload -n $APP_NAME --profile $DEPLOY_DIR/cocaine-app/$PROFILE
19 | 
--------------------------------------------------------------------------------
/src/cocaine-app/timer.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | import math
 3 | from time import time
 4 | 
 5 | 
 6 | MINUTE = 60
 7 | HOUR = 60 * 60
 8 | DAY = 24 * 60 * 60
 9 | 
10 | 
11 | def periodic_timer(seconds=0, minutes=0, hours=0, days=0):
12 |     if not seconds and not minutes and not hours and not days:
13 |         raise ValueError('Timer period should be configured')
14 |     period = (days * DAY + hours * HOUR + minutes * MINUTE + seconds)
15 |     while True:
16 |         yield math.ceil(time() / period) * period
17 | 
18 | 
19 | def periodic_datetimer(**kwargs):
20 |     timer = periodic_timer(**kwargs)
21 |     while True:
22 |         yield datetime.fromtimestamp(timer.next())
23 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     name="mastermind",
 5 |     version="2.25",
 6 |     author="Andrey Vasilenkov",
 7 |     author_email="indigo@yandex-team.ru",
 8 |     url="https://github.com/yandex/mastermind",
 9 |     description="Common components and a client library for Mastermind",
10 |     long_description="",
11 |     license="LGPLv3+",
12 |     packages=[
13 |         "mastermind",
14 |         "mastermind.query",
15 |         "mastermind.utils",
16 |     ],
17 |     package_dir={'mastermind': 'src/python-mastermind/src/mastermind',
18 |                  'mastermind.query': 'src/python-mastermind/src/mastermind/query',
19 |                  'mastermind.utils': 'src/python-mastermind/src/mastermind/utils',
20 |                  },
21 | )
22 | 
--------------------------------------------------------------------------------
/debian/rules:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/make -f
 2 | # Uncomment this to turn on verbose mode.
 3 | #export DH_VERBOSE=1
 4 | override_dh_builddeb:
 5 | 	dh_builddeb -- -Zgzip
 6 | 
 7 | PY_BUILD_DIR=build/python-lib
 8 | TESTS_DIR=tests
 9 | 
10 | override_dh_auto_build:
11 | 	dh_auto_build -- --build-lib $(CURDIR)/$(PY_BUILD_DIR)
12 | 
13 | override_dh_auto_test:
14 | 	cp -r $(CURDIR)/$(TESTS_DIR) $(CURDIR)/$(PY_BUILD_DIR) && \
15 | 	cd $(CURDIR)/$(PY_BUILD_DIR) && \
16 | 	python -m pytest -v -l -x --duration 20 $(TESTS_DIR)
17 | 
18 | override_dh_auto_clean:
19 | 	rm -rf $(CURDIR)/$(PY_BUILD_DIR)
20 | 	dh_auto_clean
21 | 
22 | override_dh_auto_install:
23 | 	mkdir -p $(CURDIR)/debian/tmp
24 | 	cd $(CURDIR)/src/cocaine-app/; find .
-type f -name \*.py -print0 | tar czvf $(CURDIR)/debian/tmp/mastermind.tar.gz --null -T -
25 | 	dh_auto_install
26 | 
27 | %:
28 | 	dh $@ --with python2,bash-completion
29 | 
--------------------------------------------------------------------------------
/src/python-mastermind/src/mastermind/query/stats.py:
--------------------------------------------------------------------------------
 1 | class Stats(object):
 2 |     def __init__(self, raw_data):
 3 |         self.effective_space = raw_data['effective_space']
 4 |         self.free_effective_space = raw_data['free_effective_space']
 5 |         self.free_reserved_space = raw_data['free_reserved_space']
 6 | 
 7 |     def __repr__(self):
 8 |         # TODO: convert numbers to human-readable form
 9 |         return (
10 |             '<Stats: '
11 |             'effective_space: {effective_space}, '
12 |             'free_effective_space: {free_effective_space}, '
13 |             'free_reserved_space: {free_reserved_space}'
14 |             '>'
15 |         ).format(
16 |             effective_space=self.effective_space,
17 |             free_effective_space=self.free_effective_space,
18 |             free_reserved_space=self.free_reserved_space,
19 |         )
20 | 
--------------------------------------------------------------------------------
/src/cocaine-app/inventory.py:
--------------------------------------------------------------------------------
 1 | from config import config
 2 | from importer import import_object
 3 | 
 4 | 
 5 | try:
 6 |     inv = import_object(config['inventory'])
 7 | except (ImportError, KeyError):
 8 |     import fake_inventory as inv
 9 | 
10 | get_dc_by_host = inv.get_dc_by_host
11 | get_host_tree = inv.get_host_tree
12 | node_shutdown_command = inv.node_shutdown_command
13 | node_start_command = inv.node_start_command
14 | node_reconfigure = inv.node_reconfigure
15 | get_balancer_node_types = inv.get_balancer_node_types
16 | get_dc_node_type = inv.get_dc_node_type
17 | set_net_monitoring_downtime = inv.set_net_monitoring_downtime
18 | remove_net_monitoring_downtime = inv.remove_net_monitoring_downtime
19 | get_host_ip_addresses = inv.get_host_ip_addresses
20 | get_new_group_files = inv.get_new_group_files
21 | get_node_config_path = inv.get_node_config_path
22 | get_node_types = inv.get_node_types
23 | 
--------------------------------------------------------------------------------
/src/cocaine-app/db/mongo/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | logger = logging.getLogger('mm.mongo')
 4 | 
 5 | 
 6 | class MongoObject(object):
 7 | 
 8 |     PRIMARY_ID_KEY = 'id'
 9 | 
10 |     def __init__(self, *args, **kwargs):
11 |         super(MongoObject, self).__init__(*args, **kwargs)
12 |         self._dirty = False
13 | 
14 |     @classmethod
15 |     def new(cls, *args, **kwargs):
16 |         pass
17 | 
18 |     def save(self):
19 |         if not self._dirty:
20 |             logger.debug('Object with id {0} has no _dirty flag set'.format(self.id))
21 |             return
22 | 
23 |         res = self.collection.update({self.PRIMARY_ID_KEY: self.id}, self.dump(), upsert=True)
24 |         if res['ok'] != 1:
25 |             logger.error('Unexpected mongo response: {0}, saving object {1}'.format(res, self.dump()))
26 |             raise RuntimeError('Mongo operation result: {0}'.format(res['ok']))
27 |         self._dirty = False
28 | 
--------------------------------------------------------------------------------
/src/cocaine-app/keys.py:
--------------------------------------------------------------------------------
 1 | 
 2 | SYMMETRIC_GROUPS_KEY = 'metabalancer\0symmetric_groups'
 3 | 
 4 | MASTERMIND_MAX_GROUP_KEY = 'mastermind:max_group'
 5 | MASTERMIND_COUPLE_META_KEY = 'mastermind:couple_meta:%s'
 6 | 
 7 | MM_GROUPS_IDX = 'mastermind:groups_idx'
 8 | MM_ISTRUCT_GROUP = 'mastermind:group_%d'
 9 | 
10 | MM_DC_CACHE_IDX = 'mastermind:dc_cache'
11 | MM_DC_CACHE_HOST = 'mastermind:dc_cache_%s'
12 | 
13 | 
MM_HOSTNAME_CACHE_IDX = 'mastermind:hostname_cache' 14 | MM_HOSTNAME_CACHE_HOST = 'mastermind:hostname_cache_%s' 15 | 16 | MM_HOSTTREE_CACHE_IDX = 'mastermind:hosttree_cache' 17 | MM_HOSTTREE_CACHE_HOST = 'mastermind:hosttree_cache_%s' 18 | 19 | MM_IPADDRESSES_CACHE_IDX = 'mastermind:ipaddresses_cache' 20 | MM_IPADDRESSES_CACHE_HOST = 'mastermind:ipaddresses_cache_%s' 21 | 22 | MM_NAMESPACE_SETTINGS_IDX = 'mastermind:ns_settings_idx' 23 | MM_NAMESPACE_SETTINGS_KEY_TPL = 'mastermind:ns_setttings:%s' 24 | 25 | MINION_HISTORY_KEY = 'minion_cmd_log:%s' 26 | MINION_HISTORY_ENTRY_KEY = 'minion_cmd_entry:%s' 27 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/job_factory.py: -------------------------------------------------------------------------------- 1 | from job_types import JobTypes 2 | from move import MoveJob 3 | from recover_dc import RecoverDcJob 4 | from couple_defrag import CoupleDefragJob 5 | from restore_group import RestoreGroupJob 6 | from make_lrc_groups import MakeLrcGroupsJob 7 | 8 | 9 | class JobFactory(object): 10 | 11 | @staticmethod 12 | def make_job(data): 13 | job_type = data.get('type') 14 | if job_type == JobTypes.TYPE_MOVE_JOB: 15 | return MoveJob.from_data(data) 16 | elif job_type == JobTypes.TYPE_RECOVER_DC_JOB: 17 | return RecoverDcJob.from_data(data) 18 | elif job_type == JobTypes.TYPE_COUPLE_DEFRAG_JOB: 19 | return CoupleDefragJob.from_data(data) 20 | elif job_type == JobTypes.TYPE_RESTORE_GROUP_JOB: 21 | return RestoreGroupJob.from_data(data) 22 | elif job_type == JobTypes.TYPE_MAKE_LRC_GROUPS_JOB: 23 | return MakeLrcGroupsJob.from_data(data) 24 | raise ValueError('Unknown job type {0}'.format(job_type)) 25 | -------------------------------------------------------------------------------- /tests/fixtures/pool_workers.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from tornado import gen 3 | 4 | from mastermind import pool 5 | 6 | 7 | @pytest.fixture 8 | def delay_task_worker_pool(processes, task_delay): 9 | """Pool of DelayTaskWorker processes""" 10 | 11 | class DelayTaskWorker(pool.PoolWorker): 12 | """Worker emulating async tasks execution 13 | 14 | Returns task that was passed to it as a result. 
15 | """ 16 | def __init__(self, 17 | ioloop=None, 18 | task_delay=0.0, 19 | **kwds): 20 | super(DelayTaskWorker, self).__init__(ioloop=ioloop, **kwds) 21 | self._task_delay = task_delay 22 | 23 | @gen.coroutine 24 | def process(self, task): 25 | yield gen.sleep(self._task_delay) 26 | raise gen.Return(task) 27 | 28 | return pool.Pool( 29 | processes=processes, 30 | worker=DelayTaskWorker, 31 | w_initkwds={ 32 | 'task_delay': task_delay, 33 | 'tasks_fetch_period': 0.001, 34 | } 35 | ) 36 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/job_types.py: -------------------------------------------------------------------------------- 1 | 2 | class JobTypes(object): 3 | TYPE_MOVE_JOB = 'move_job' 4 | TYPE_RECOVER_DC_JOB = 'recover_dc_job' 5 | TYPE_COUPLE_DEFRAG_JOB = 'couple_defrag_job' 6 | TYPE_RESTORE_GROUP_JOB = 'restore_group_job' 7 | TYPE_MAKE_LRC_GROUPS_JOB = 'make_lrc_groups_job' 8 | 9 | AVAILABLE_TYPES = ( 10 | TYPE_MOVE_JOB, 11 | TYPE_RECOVER_DC_JOB, 12 | TYPE_COUPLE_DEFRAG_JOB, 13 | TYPE_RESTORE_GROUP_JOB, 14 | TYPE_MAKE_LRC_GROUPS_JOB, 15 | ) 16 | 17 | 18 | class TaskTypes(object): 19 | TYPE_MINION_CMD = 'minion_cmd' 20 | TYPE_NODE_STOP_TASK = 'node_stop_task' 21 | TYPE_RECOVER_DC_GROUP_TASK = 'recover_dc_group_task' 22 | TYPE_HISTORY_REMOVE_NODE = 'history_remove_node' 23 | TYPE_NODE_BACKEND_DEFRAG_TASK = 'node_backend_defrag_task' 24 | TYPE_COUPLE_DEFRAG_STATE_CHECK_TASK = 'couple_defrag_state_check' 25 | TYPE_RSYNC_BACKEND_TASK = 'rsync_backend_task' 26 | TYPE_CREATE_GROUP = 'create_group' 27 | TYPE_REMOVE_GROUP = 'remove_group' 28 | TYPE_WRITE_META_KEY = 'write_meta_key' 29 | TYPE_DNET_CLIENT_BACKEND_CMD = 'dnet_client_backend_cmd' 30 | -------------------------------------------------------------------------------- /src/cocaine-app/inv.py: -------------------------------------------------------------------------------- 1 | from config import config 2 | from importer import import_object 3 | 4 | import helpers as h 5 | 6 | 7 | try: 8 | inv = import_object(config['inventory']) 9 | except (ImportError, KeyError): 10 | import fake_inventory as inv 11 | 12 | 13 | class Inventory(object): 14 | """General interface to inventory specification 15 | 16 | Inventory is designed as a general tool to provide 17 | information about specific infrastructure environment 18 | to mastermind. 19 | 20 | Inventory implementation can be provided using 21 | 'inventory' config section using module name as a value 22 | (NB: this module should be importable by python interpreter). 23 | If either custom inventory is not provided or is not importable 24 | mastermind will fall back to default implementation of 25 | @fake_inventory module. 26 | 27 | Inventory API is fully described in @fake_inventory module. 28 | 29 | TODO: move API description to @Inventory class; 30 | TODO: add support of all inventory methods as worker handles. 
31 | """ 32 | @staticmethod 33 | @h.source 34 | def get_dc_by_host(host): 35 | yield inv.get_dc_by_host(host) 36 | -------------------------------------------------------------------------------- /src/cocaine-app/monitor_pool.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import elliptics 3 | 4 | from mastermind.pool import Pool 5 | from mastermind.monitor_pool import MonitorStatParseWorker 6 | 7 | from config import config 8 | 9 | 10 | MONITOR_CFG = config.get('monitor', {}) 11 | 12 | MONITOR_STAT_CATEGORIES = ( 13 | elliptics.monitor_stat_categories.procfs | 14 | elliptics.monitor_stat_categories.backend | 15 | elliptics.monitor_stat_categories.io | 16 | elliptics.monitor_stat_categories.stats | 17 | elliptics.monitor_stat_categories.commands 18 | ) 19 | 20 | monitor_pool = Pool( 21 | worker=MonitorStatParseWorker, 22 | w_initkwds={ 23 | 'max_http_clients': MONITOR_CFG.get('max_http_clients', 30), 24 | 'monitor_stat_categories': MONITOR_STAT_CATEGORIES, 25 | 'monitor_port': config.get('elliptics', {}).get('monitor_port', 10025), 26 | 'connect_timeout': MONITOR_CFG.get('connect_timeout', 5.0), 27 | 'request_timeout': MONITOR_CFG.get('request_timeout', 5.0), 28 | }, 29 | processes=MONITOR_CFG.get('pool_size', 5), 30 | ) 31 | 32 | 33 | @atexit.register 34 | def stop_pool(): 35 | """Stop pool workers on main process shutdown""" 36 | monitor_pool.close() 37 | monitor_pool.join() 38 | -------------------------------------------------------------------------------- /src/cocaine-app/mastermind_core/helpers.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import copy 3 | import cStringIO 4 | import gzip 5 | 6 | 7 | def gzip_compress(data, compression_level=1): 8 | with contextlib.closing(cStringIO.StringIO()) as buf: 9 | gz = gzip.GzipFile( 10 | fileobj=buf, 11 | mode='wb', 12 | compresslevel=compression_level 13 | ) 14 | with gz: 15 | gz.write(data) 16 | return buf.getvalue() 17 | 18 | 19 | def encode(s, encoding='utf-8', errors='strict'): 20 | """Returns an encoded version of string 21 | 22 | NB: s is encoded only if it was a unicode string, otherwise 23 | it is returned as is. 
24 | """ 25 | if isinstance(s, unicode): 26 | return s.encode(encoding, errors=errors) 27 | return s 28 | 29 | 30 | def merge_dict(dst, src): 31 | """ Merges two dicts updating 'dst' keys with those from 'src' 32 | """ 33 | res = copy.deepcopy(dst) 34 | for k, val in src.iteritems(): 35 | if k not in dst: 36 | res[k] = val 37 | else: 38 | if not isinstance(val, dict): 39 | res[k] = val 40 | else: 41 | res[k] = merge_dict(res[k], src[k]) 42 | return res 43 | -------------------------------------------------------------------------------- /debian/postinst: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | 4 | set -e 5 | 6 | #DEBHELPER# 7 | 8 | case "$1" in 9 | configure) 10 | 11 | DEPLOY_DIR="/usr/lib/mastermind" 12 | 13 | mkdir -p /var/log/mastermind 14 | chown cocaine -R $DEPLOY_DIR 15 | chown cocaine -R /var/log/mastermind 16 | 17 | BASE_APP_NAME=`/usr/bin/mastermind_app_name.sh` 18 | 19 | /usr/bin/mastermind_deploy.sh $DEPLOY_DIR $BASE_APP_NAME-inventory mastermind-inventory.manifest mastermind-inventory.profile 20 | cocaine-tool runlist add-app -n default --app $BASE_APP_NAME-inventory --profile $BASE_APP_NAME-inventory --force 21 | 22 | /usr/bin/mastermind_deploy.sh $DEPLOY_DIR $BASE_APP_NAME mastermind.manifest mastermind.profile 23 | cocaine-tool runlist add-app -n default --app $BASE_APP_NAME --profile $BASE_APP_NAME --force 24 | 25 | /usr/bin/mastermind_deploy.sh $DEPLOY_DIR $BASE_APP_NAME-cache mastermind-cache.manifest mastermind-cache.profile 26 | cocaine-tool runlist add-app -n default --app $BASE_APP_NAME-cache --profile $BASE_APP_NAME-cache --force 27 | 28 | ;; 29 | abort-upgrade|abort-remove|abort-deconfigure) 30 | ;; 31 | 32 | *) 33 | echo "postinst called with unknown argument \`$1'" >&2 34 | exit 1 35 | ;; 36 | esac 37 | 38 | exit 0 39 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: http://dep.debian.net/deps/dep5 2 | Upstream-Name: mastermind 3 | Source: https://github.com/toshic/mastermind 4 | 5 | Files: * 6 | Copyright: 2012 Anton Kortunov 7 | 2012 Andrey Godin 8 | License: GPL-3.0+ 9 | 10 | Files: debian/* 11 | Copyright: 2012 Andrey Godin 12 | License: GPL-3.0+ 13 | 14 | License: GPL-3.0+ 15 | This program is free software: you can redistribute it and/or modify 16 | it under the terms of the GNU General Public License as published by 17 | the Free Software Foundation, either version 3 of the License, or 18 | (at your option) any later version. 19 | . 20 | This package is distributed in the hope that it will be useful, 21 | but WITHOUT ANY WARRANTY; without even the implied warranty of 22 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 23 | GNU General Public License for more details. 24 | . 25 | You should have received a copy of the GNU General Public License 26 | along with this program. If not, see . 27 | . 28 | On Debian systems, the complete text of the GNU General 29 | Public License version 3 can be found in "/usr/share/common-licenses/GPL-3". 30 | 31 | # Please also look if there are files or directories which have a 32 | # different copyright/license attached and list them here. 
33 | -------------------------------------------------------------------------------- /etc/bash_completion.d/mastermind: -------------------------------------------------------------------------------- 1 | # mastermind(1) completion 2 | 3 | _mastermind() 4 | { 5 | local cur prev 6 | 7 | HELPER="mastermind " 8 | COMP_CWORD_PREV=`expr ${COMP_CWORD} - 1` 9 | for opt in `seq 1 ${COMP_CWORD_PREV}` 10 | do 11 | HELPER=${HELPER}" "${COMP_WORDS[opt]} 12 | done 13 | HELPER=$HELPER" --commands" 14 | 15 | COMPREPLY=() 16 | cur="${COMP_WORDS[COMP_CWORD]}" 17 | prev="${COMP_WORDS[COMP_CWORD-1]}" 18 | 19 | # if [[ ${COMP_CWORD} -le 2 ]] ; then 20 | COMPREPLY=( $(compgen -W "`$HELPER`" -- ${cur}) ) 21 | return 0 22 | # fi 23 | 24 | # if [[ ${COMP_WORDS[1]} = 'cmd' ]] ; then 25 | # 26 | # if [[ ${COMP_WORDS[2]} = 'restore' ]] ; then 27 | # 28 | # case "$prev" in 29 | # -u|--user) 30 | # COMPREPLY=( $(compgen -W "`lastlog | cut -d' ' -f1`" -- ${cur}) ) 31 | # return 0 32 | # ;; 33 | # -f|--group-file|-d|--dest) 34 | # _filedir 35 | # return 0 36 | # ;; 37 | # esac 38 | # fi 39 | # fi 40 | 41 | } && 42 | complete -F _mastermind mastermind 43 | 44 | # Local variables: 45 | # mode: shell-script 46 | # sh-basic-offset: 4 47 | # sh-indent-comment: t 48 | # indent-tabs-mode: nil 49 | # End: 50 | # ex: ts=4 sw=4 et filetype=sh -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/remove_group.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from jobs import JobBrokenError, TaskTypes, RetryError 4 | from minion_cmd import MinionCmdTask 5 | import storage 6 | 7 | 8 | logger = logging.getLogger('mm.jobs') 9 | 10 | 11 | class RemoveGroupTask(MinionCmdTask): 12 | """ 13 | Minion task to remove storage group 14 | 15 | Current implementation just renames the backend base path 16 | so that automatic configuration could skip backend when 17 | the node is being started. 
18 | """ 19 | 20 | PARAMS = MinionCmdTask.PARAMS 21 | 22 | def __init__(self, job): 23 | super(RemoveGroupTask, self).__init__(job) 24 | self.cmd = TaskTypes.TYPE_REMOVE_GROUP 25 | self.type = TaskTypes.TYPE_REMOVE_GROUP 26 | 27 | def execute(self, processor): 28 | if self.group not in storage.groups: 29 | raise JobBrokenError( 30 | 'Group {group_id} is not found in storage'.format( 31 | group_id=self.group, 32 | ) 33 | ) 34 | try: 35 | minion_response = processor.minions.remove_group( 36 | self.host, 37 | self.params 38 | ) 39 | except RuntimeError as e: 40 | raise RetryError(self.attempts, e) 41 | self._set_minion_task_parameters(minion_response) 42 | -------------------------------------------------------------------------------- /src/cocaine-app/sync/error.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class API_ERROR_CODE(object): 4 | 5 | LOCK = 20 6 | LOCK_FAILED = 21 7 | LOCK_ALREADY_ACQUIRED = 22 8 | LOCK_INCONSISTENT = 23 9 | 10 | 11 | class LockError(Exception): 12 | 13 | PARAMS = () 14 | 15 | def __init__(self, *args, **kwargs): 16 | super(LockError, self).__init__(*args) 17 | for param in self.PARAMS: 18 | setattr(self, param, kwargs.get(param)) 19 | 20 | @property 21 | def code(self): 22 | return API_ERROR_CODE.LOCK 23 | 24 | def dump(self): 25 | res = { 26 | 'msg': str(self), 27 | 'code': self.code 28 | } 29 | for param in self.PARAMS: 30 | res[param] = getattr(self, param) 31 | return res 32 | 33 | class LockFailedError(LockError): 34 | 35 | PARAMS = ('lock_id',) 36 | 37 | @property 38 | def code(self): 39 | return API_ERROR_CODE.LOCK_FAILED 40 | 41 | def __str__(self): 42 | return 'Failed to acquire lock {0}'.format(self.lock_id) 43 | 44 | 45 | class LockAlreadyAcquiredError(LockFailedError): 46 | 47 | PARAMS = ('lock_id', 'holder_id', 'lock_ids', 'holders_ids') 48 | 49 | @property 50 | def code(self): 51 | return API_ERROR_CODE.LOCK_ALREADY_ACQUIRED 52 | 53 | 54 | class InconsistentLockError(LockAlreadyAcquiredError): 55 | @property 56 | def code(self): 57 | return API_ERROR_CODE.LOCK_INCONSISTENT 58 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/create_group.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import inventory 4 | from jobs import JobBrokenError, TaskTypes, RetryError 5 | from minion_cmd import MinionCmdTask 6 | import storage 7 | 8 | 9 | logger = logging.getLogger('mm.jobs') 10 | 11 | 12 | class CreateGroupTask(MinionCmdTask): 13 | """ 14 | Minion task to create storage group 15 | 16 | Creates custom file structure on a file system 17 | for a node to find new group and run its backend. 
18 | """ 19 | 20 | PARAMS = MinionCmdTask.PARAMS 21 | 22 | def __init__(self, job): 23 | super(CreateGroupTask, self).__init__(job) 24 | self.cmd = TaskTypes.TYPE_CREATE_GROUP 25 | self.type = TaskTypes.TYPE_CREATE_GROUP 26 | 27 | def execute(self, processor): 28 | if self.group in storage.groups: 29 | raise JobBrokenError( 30 | 'Group {group_id} already exists'.format( 31 | group_id=self.group, 32 | ) 33 | ) 34 | try: 35 | minion_response = processor.minions.create_group( 36 | self.host, 37 | self.params, 38 | files=inventory.get_new_group_files( 39 | group_id=self.group, 40 | total_space=self.params['total_space'], 41 | ) 42 | ) 43 | except RuntimeError as e: 44 | raise RetryError(self.attempts, e) 45 | self._set_minion_task_parameters(minion_response) 46 | -------------------------------------------------------------------------------- /src/cocaine-app/inventory_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import signal 4 | import sys 5 | 6 | from cocaine.asio.exceptions import LocatorResolveError 7 | from cocaine.worker import Worker 8 | 9 | import log 10 | 11 | try: 12 | log.setup_logger('mm_inventory_logging') 13 | logger = logging.getLogger('mm.init') 14 | except LocatorResolveError: 15 | log.setup_logger() 16 | logger = logging.getLogger('mm.init') 17 | logger.warn( 18 | 'mm_inventory_logging is not set up properly in ' 19 | 'cocaine.conf, fallback to default logging service' 20 | ) 21 | 22 | from config import config 23 | # TODO: rename inv module to 'inventory' when switched to using inventory worker 24 | import inv as inventory 25 | import helpers 26 | 27 | 28 | def init_inventory_worker(worker): 29 | helpers.register_handle_wne(worker, inventory.Inventory.get_dc_by_host) 30 | 31 | 32 | DEFAULT_DISOWN_TIMEOUT = 2 33 | 34 | if __name__ == '__main__': 35 | 36 | def term_handler(signo, frame): 37 | # required to guarantee execution of cleanup functions registered 38 | # with atexit.register 39 | sys.exit(0) 40 | 41 | signal.signal(signal.SIGTERM, term_handler) 42 | 43 | logger.info("before creating inventory worker") 44 | worker = Worker(disown_timeout=config.get('disown_timeout', DEFAULT_DISOWN_TIMEOUT)) 45 | logger.info("after creating inventory worker") 46 | 47 | init_inventory_worker(worker) 48 | 49 | logger.info("Starting inventory worker") 50 | worker.run() 51 | logger.info("Inventory worker stopped") 52 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: mastermind 2 | Section: net 3 | Priority: extra 4 | Maintainer: Andrey Godin 5 | Build-Depends: 6 | debhelper (>= 8.0.0), 7 | bash-completion, 8 | python-all, 9 | python-setuptools, 10 | python-pytest, 11 | python-msgpack | msgpack-python, 12 | python-tornado (>= 4.0), 13 | cocaine-framework-python, 14 | python-simplejson, 15 | Standards-Version: 3.9.2 16 | Homepage: https://github.com/toshic/mastermind 17 | Vcs-Git: git://github.com/toshic/mastermind.git 18 | X-Python-Version: >= 2.7 19 | 20 | Package: mastermind 21 | Architecture: amd64 22 | Depends: 23 | ${shlibs:Depends}, 24 | ${misc:Depends}, 25 | cocaine-framework-python (<< 0.12), 26 | python-tornado (>= 4.0), 27 | cocaine-tools (<< 0.12), 28 | python-pymongo (<< 3), 29 | python-bson (<< 3), 30 | cocaine-runtime (<< 0.12), 31 | python-requests, 32 | python-mastermind (= ${binary:Version}), 33 | Recommends: python-kazoo 34 | 
Description: Metabalancer for elliptics storage 35 | 36 | Package: mastermind-utils 37 | Architecture: amd64 38 | Depends: 39 | ${shlibs:Depends}, 40 | ${misc:Depends}, 41 | python-opster (>= 4.0), 42 | cocaine-framework-python (<< 0.12), 43 | python-msgpack | msgpack-python, 44 | python-tornado (>= 4.0), 45 | python-mastermind (= ${binary:Version}), 46 | Description: Metabalancer CLI for elliptics storage 47 | 48 | Package: python-mastermind 49 | Architecture: amd64 50 | Section: python 51 | Depends: 52 | ${shlibs:Depends}, 53 | ${misc:Depends}, 54 | ${python:Depends}, 55 | python-tornado (>= 4.0), 56 | cocaine-framework-python (<< 0.12), 57 | python-simplejson, 58 | Description: Common components and a client library for Mastermind 59 | -------------------------------------------------------------------------------- /scripts/07-move-gatlinggun-tasks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import json 3 | import logging 4 | import sys 5 | 6 | import msgpack 7 | from kazoo.client import KazooClient 8 | from mastermind.utils.queue import LockingQueue 9 | 10 | 11 | logger = logging.getLogger('mm.convert') 12 | 13 | 14 | CONFIG_PATH = '/etc/elliptics/mastermind.conf' 15 | 16 | try: 17 | 18 | with open(CONFIG_PATH, 'r') as config_file: 19 | config = json.load(config_file) 20 | 21 | except Exception as e: 22 | raise ValueError('Failed to load config file %s: %s' % (CONFIG_PATH, e)) 23 | 24 | 25 | def make_cache_kazoo_client(): 26 | kz = KazooClient(config['cache']['manager']['host']) 27 | kz.start() 28 | return kz 29 | 30 | 31 | def move_tasks(kz): 32 | base_path = config['cache']['manager']['lock_path_prefix'] 33 | for path in kz.get_children(base_path + '/entries'): 34 | item = kz.get('{}/entries/{}'.format( 35 | base_path, path))[0] 36 | if not item: 37 | continue 38 | task = msgpack.unpackb(item) 39 | q = LockingQueue(kz, base_path, task['group']) 40 | q.put(item) 41 | 42 | 43 | def clean_old_tasks(kz): 44 | base_path = config['cache']['manager']['lock_path_prefix'] 45 | kz.delete(base_path + '/entries', recursive=True) 46 | kz.delete(base_path + '/taken', recursive=True) 47 | 48 | 49 | if __name__ == '__main__': 50 | 51 | if len(sys.argv) < 2 or sys.argv[1] not in ('move', 'clean'): 52 | print "Usage: {0} move|clean".format(sys.argv[0]) 53 | sys.exit(1) 54 | 55 | if sys.argv[1] == 'move': 56 | 57 | kz = make_cache_kazoo_client() 58 | move_tasks(kz) 59 | 60 | print 61 | elif sys.argv[1] == 'clean': 62 | kz = make_cache_kazoo_client() 63 | clean_old_tasks(kz) 64 | 65 | print 66 | 67 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/recover_group_dc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import storage 3 | 4 | from jobs import JobBrokenError, TaskTypes 5 | from minion_cmd import MinionCmdTask 6 | 7 | 8 | logger = logging.getLogger('mm.jobs') 9 | 10 | 11 | class RecoverGroupDcTask(MinionCmdTask): 12 | 13 | PARAMS = MinionCmdTask.PARAMS + ('couple',) 14 | 15 | def __init__(self, job): 16 | super(RecoverGroupDcTask, self).__init__(job) 17 | self.type = TaskTypes.TYPE_RECOVER_DC_GROUP_TASK 18 | 19 | @classmethod 20 | def new(cls, job, **kwargs): 21 | task = super(RecoverGroupDcTask, cls).new(job, **kwargs) 22 | task.check(task.group) 23 | task.couple = storage.groups[task.group].couple.as_tuple() 24 | return task 25 | 26 | def check(self, group_id): 27 | if not group_id in storage.groups: 28 | 
            raise JobBrokenError('Group {0} is not found'.format(group_id))
29 | 
30 |         group = storage.groups[group_id]
31 | 
32 |         if group.status != storage.Status.COUPLED:
33 |             raise JobBrokenError('Task {0}: group {1} has status {2}, '
34 |                                  'should be {3}'.format(self, self.group,
35 |                                                         group.status, storage.Status.COUPLED))
36 | 
37 |     def execute(self, processor):
38 | 
39 |         # checking if task still applicable
40 |         logger.info('Job {0}, task {1}: checking group {2} and couple {3} '
41 |                     'consistency'.format(self.parent_job.id, self.id, self.group, self.couple))
42 | 
43 |         self.check(self.group)
44 |         group = storage.groups[self.group]
45 | 
46 |         if set(self.couple) != set(group.couple.as_tuple()):
47 |             raise JobBrokenError('Task {0}: group {1} has changed couple to {2}, '
48 |                                  'expected {3}'.format(self, self.group,
49 |                                                        group.couple, self.couple))
50 | 
51 |         super(RecoverGroupDcTask, self).execute(processor)
52 | 
--------------------------------------------------------------------------------
/src/python-mastermind/src/mastermind/query/__init__.py:
--------------------------------------------------------------------------------
 1 | import functools
 2 | 
 3 | import mastermind.client
 4 | 
 5 | 
 6 | class Query(object):
 7 |     def __init__(self, client):
 8 |         self.client = client or mastermind.client.DummyClient()
 9 | 
10 |     @classmethod
11 |     def _object(cls, object_or_id, client):
12 |         if isinstance(object_or_id, cls):
13 |             return object_or_id
14 |         return cls(object_or_id, client)
15 | 
16 |     @staticmethod
17 |     def not_idempotent(method):
18 |         @functools.wraps(method)
19 |         def wrapper(self, *args, **kwargs):
20 |             if 'attempts' not in kwargs:
21 |                 kwargs['attempts'] = 1
22 |             return method(self, *args, **kwargs)
23 |         return wrapper
24 | 
25 |     @classmethod
26 |     def from_data(cls, data, client):
27 |         obj = cls(cls._raw_id(data), client)
28 |         obj._set_raw_data(data)
29 |         return obj
30 | 
31 | 
32 | class LazyDataObject(object):
33 | 
34 |     @staticmethod
35 |     def _lazy_load(method):
36 |         @functools.wraps(method)
37 |         def wrapper(self, *args, **kwargs):
38 |             self._fetch_and_set_raw_data()
39 |             return method(self, *args, **kwargs)
40 |         return wrapper
41 | 
42 |     def _fetch_and_set_raw_data(self):
43 |         if not hasattr(self, '_data'):
44 |             self._set_raw_data(self._fetch_data())
45 | 
46 |     def _set_raw_data(self, data):
47 |         data = self._preprocess_raw_data(data)
48 |         self._data = data
49 | 
50 |     def _expire(self):
51 |         if hasattr(self, '_data'):
52 |             delattr(self, '_data')
53 | 
54 |     def _fetch_data(self):
55 |         raise NotImplementedError()
56 | 
57 |     def _raw_id(self):
58 |         raise NotImplementedError()
59 | 
60 |     def _preprocess_raw_data(self, data):
61 |         return data
62 | 
63 |     def serialize(self):
64 |         self._fetch_and_set_raw_data()
65 |         return self._data
66 | 
--------------------------------------------------------------------------------
/src/cocaine-app/jobs/tasks/node_backend_defrag.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from jobs import JobBrokenError, RetryError, TaskTypes
 4 | from minion_cmd import MinionCmdTask
 5 | import storage
 6 | 
 7 | 
 8 | logger = logging.getLogger('mm.jobs')
 9 | 
10 | 
11 | class NodeBackendDefragTask(MinionCmdTask):
12 | 
13 |     PARAMS = MinionCmdTask.PARAMS + ('node_backend', 'group')
14 | 
15 |     def __init__(self, job):
16 |         super(NodeBackendDefragTask, self).__init__(job)
17 |         self.type = TaskTypes.TYPE_NODE_BACKEND_DEFRAG_TASK
18 | 
19 |     def execute(self, processor):
20 |         # checking if task still applicable
21 |         logger.info('Job {0}, task {1}: checking
group {2} and node backend {3} ' 22 | 'consistency'.format( 23 | self.parent_job.id, self.id, self.group, self.node_backend)) 24 | 25 | if self.group not in storage.groups: 26 | raise JobBrokenError('Group {0} is not found'.format(self.group)) 27 | if self.node_backend not in storage.node_backends: 28 | raise JobBrokenError('Node backend {0} is not found'.format(self.node_backend)) 29 | 30 | group = storage.groups[self.group] 31 | node_backend = storage.node_backends[self.node_backend] 32 | 33 | if group.couple is None: 34 | raise JobBrokenError('Task {0}: group {1} does not belong ' 35 | 'to any couple'.format(self, self.group)) 36 | 37 | if group.couple.status not in storage.GOOD_STATUSES: 38 | raise RetryError(10, JobBrokenError('Task {}: group {} couple status is {}'.format( 39 | self, self.group, group.couple.status))) 40 | 41 | if node_backend not in group.node_backends: 42 | raise JobBrokenError('Task {0}: node backend {1} does not belong to ' 43 | 'group {2}'.format(self, self.node_backend, self.group)) 44 | 45 | super(NodeBackendDefragTask, self).execute(processor) 46 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/dnet_client_backend_cmd.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import infrastructure 4 | from jobs import TaskTypes, RetryError 5 | from minion_cmd import MinionCmdTask 6 | 7 | 8 | logger = logging.getLogger('mm.jobs') 9 | 10 | 11 | class DnetClientBackendCmdTask(MinionCmdTask): 12 | """ 13 | Minion task for executing dnet_client backend command 14 | 15 | Commands 'params' can contain the following keys: 16 | host: ip address of the elliptics node 17 | port: port of the elliptics node 18 | family: family of the elliptics node 19 | command: dnet_client subcommand (enable, disable, etc.) 20 | backend_id: dnet_client command target backend id 21 | 22 | All these params are required to run the command, but if 23 | not all of them are available to mastermind, it can supply 24 | additional parameters so that minion is able to get 25 | required ones from them. 26 | 27 | For example, when a newly created group should be run, 28 | mastermind has no way of knowing the backend id, but it 29 | can provide minion with config path and group id. Minion 30 | will open config file and find corresponding backend id by 31 | group id, and will then successfully run the command. 
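
    Example 'params' payload (all values here are illustrative):

        {
            'host': '192.168.1.1',
            'port': 1025,
            'family': 2,
            'command': 'enable',
            'backend_id': 5,
        }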
32 | """ 33 | 34 | PARAMS = MinionCmdTask.PARAMS 35 | 36 | def __init__(self, job): 37 | super(DnetClientBackendCmdTask, self).__init__(job) 38 | self.cmd = TaskTypes.TYPE_DNET_CLIENT_BACKEND_CMD 39 | self.type = TaskTypes.TYPE_DNET_CLIENT_BACKEND_CMD 40 | 41 | def execute(self, processor): 42 | self.params['cmd_tpl'] = infrastructure.DNET_CLIENT_BACKEND_CMD_TPL 43 | self.params['subcommand'] = 'backend' 44 | try: 45 | minion_response = processor.minions.dnet_client_cmd( 46 | self.host, 47 | self.params 48 | ) 49 | except RuntimeError as e: 50 | raise RetryError(self.attempts, e) 51 | self._set_minion_task_parameters(minion_response) 52 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/query/history.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | DT_FORMAT = '%Y-%m-%d %H:%M:%S' 5 | 6 | 7 | class GroupHistory(object): 8 | def __init__(self, couples=None, nodes=None): 9 | self.couples = [CoupleHistoryRecord(c) for c in couples or []] 10 | self.nodes = [NodeBackendSetHistoryRecord(c) for c in nodes or []] 11 | 12 | 13 | class CoupleHistoryRecord(object): 14 | def __init__(self, data): 15 | self.couple = data['couple'] 16 | self.timestamp = data['timestamp'] 17 | 18 | def __str__(self): 19 | return '[{}] {}'.format(datetime.fromtimestamp(self.timestamp).strftime(DT_FORMAT), 20 | self.couple) 21 | 22 | def __repr__(self): 23 | return '<{}: {}>'.format(type(self).__name__, str(self)) 24 | 25 | 26 | class NodeBackendSetHistoryRecord(object): 27 | def __init__(self, data): 28 | self.set = [NodeBackendHistoryRecord(**ns) for ns in data['set']] 29 | self.timestamp = data['timestamp'] 30 | self.type = data['type'] 31 | 32 | def __str__(self): 33 | return '[{}] ({})'.format(datetime.fromtimestamp(self.timestamp).strftime(DT_FORMAT), 34 | ','.join(str(hr) for hr in self.set)) 35 | 36 | def __repr__(self): 37 | return '<{}: {}>'.format(type(self).__name__, str(self)) 38 | 39 | 40 | class NodeBackendHistoryRecord(object): 41 | def __init__(self, hostname, port, family, backend_id, path): 42 | self.hostname = hostname 43 | self.port = port 44 | self.family = family 45 | self.backend_id = backend_id 46 | self.path = path 47 | 48 | def __str__(self): 49 | return '{hostname}:{port}:{family}/{backend_id} {path}'.format( 50 | hostname=self.hostname, 51 | port=self.port, 52 | family=self.family, 53 | backend_id=self.backend_id, 54 | path=self.path) 55 | 56 | def __repr__(self): 57 | return '<{}: {}>'.format(type(self).__name__, str(self)) 58 | -------------------------------------------------------------------------------- /src/cocaine-app/sync/fake_sync.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | import logging 3 | from threading import Lock 4 | 5 | from sync.error import LockAlreadyAcquiredError 6 | 7 | 8 | logger = logging.getLogger('mm.sync') 9 | 10 | 11 | class SyncManager(object): 12 | 13 | def __init__(self, *args, **kwargs): 14 | self.locks = {} 15 | self.__locks_lock = Lock() 16 | 17 | @contextmanager 18 | def lock(self, lockid, blocking=True, timeout=None): 19 | """ Locks mastermind jobs list. 20 | This is just a demo implementation that provides locking among 21 | different threads of the same process, you should provide your 22 | own implemetation using locking primitives available in your 23 | infrastructure. 
24 | """ 25 | with self.__locks_lock: 26 | lock = self.locks.setdefault(lockid, Lock()) 27 | with lock: 28 | yield 29 | 30 | def persistent_locks_acquire(self, locks, data=''): 31 | with self.__locks_lock: 32 | already_locked = [] 33 | for lockid in locks: 34 | lock = self.locks.setdefault(lockid, Lock()) 35 | if lock.locked(): 36 | already_locked.append(lockid) 37 | if already_locked: 38 | raise LockAlreadyAcquiredError(lock_id=already_locked[0], 39 | holder_id='', 40 | lock_ids=already_locked, 41 | holders_ids=['']) 42 | for lockid in locks: 43 | self.locks[lockid].acquire() 44 | return True 45 | 46 | def persistent_locks_release(self, locks, check=''): 47 | with self.__locks_lock: 48 | for lockid in locks: 49 | lock = self.locks.get(lockid) 50 | if lock and lock.locked(): 51 | lock.release() 52 | else: 53 | logger.warn('Persistent lock {0} is already removed'.format(lockid)) 54 | 55 | def get_children_locks(self, lock_prefix): 56 | return [lock_id for lock_id in self.locks 57 | if lock_id.startswith(lock_prefix)] 58 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from jobs.job_types import TaskTypes 2 | 3 | from task import Task 4 | from node_stop import NodeStopTask 5 | from minion_cmd import MinionCmdTask 6 | from history_remove_node import HistoryRemoveNodeTask 7 | from recover_group_dc import RecoverGroupDcTask 8 | from node_backend_defrag import NodeBackendDefragTask 9 | from couple_defrag_state_check import CoupleDefragStateCheckTask 10 | from rsync_backend import RsyncBackendTask 11 | from create_group import CreateGroupTask 12 | from remove_group import RemoveGroupTask 13 | from dnet_client_backend_cmd import DnetClientBackendCmdTask 14 | from write_meta_key import WriteMetaKeyTask 15 | 16 | 17 | class TaskFactory(object): 18 | 19 | @staticmethod 20 | def make_task(data, job): 21 | task_type = data.get('type') 22 | if task_type == TaskTypes.TYPE_NODE_STOP_TASK: 23 | return NodeStopTask.from_data(data, job) 24 | if task_type == TaskTypes.TYPE_MINION_CMD: 25 | return MinionCmdTask.from_data(data, job) 26 | if task_type == TaskTypes.TYPE_HISTORY_REMOVE_NODE: 27 | return HistoryRemoveNodeTask.from_data(data, job) 28 | if task_type == TaskTypes.TYPE_RECOVER_DC_GROUP_TASK: 29 | return RecoverGroupDcTask.from_data(data, job) 30 | if task_type == TaskTypes.TYPE_NODE_BACKEND_DEFRAG_TASK: 31 | return NodeBackendDefragTask.from_data(data, job) 32 | if task_type == TaskTypes.TYPE_COUPLE_DEFRAG_STATE_CHECK_TASK: 33 | return CoupleDefragStateCheckTask.from_data(data, job) 34 | if task_type == TaskTypes.TYPE_RSYNC_BACKEND_TASK: 35 | return RsyncBackendTask.from_data(data, job) 36 | if task_type == TaskTypes.TYPE_CREATE_GROUP: 37 | return CreateGroupTask.from_data(data, job) 38 | if task_type == TaskTypes.TYPE_REMOVE_GROUP: 39 | return RemoveGroupTask.from_data(data, job) 40 | if task_type == TaskTypes.TYPE_DNET_CLIENT_BACKEND_CMD: 41 | return DnetClientBackendCmdTask.from_data(data, job) 42 | if task_type == TaskTypes.TYPE_WRITE_META_KEY: 43 | return WriteMetaKeyTask.from_data(data, job) 44 | raise ValueError('Unknown task type {0}'.format(task_type)) 45 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/query/node_backends.py: -------------------------------------------------------------------------------- 1 | from mastermind.query import Query, LazyDataObject 2 | 3 
| 4 | class NodeBackendQuery(Query): 5 | pass 6 | 7 | 8 | class NodeBackendDataObject(LazyDataObject): 9 | 10 | @staticmethod 11 | def _raw_id(raw_data): 12 | return raw_data['id'] 13 | 14 | @property 15 | @LazyDataObject._lazy_load 16 | def host(self): 17 | """ Node backend's host IP address 18 | """ 19 | return self._data['host'] 20 | 21 | @property 22 | @LazyDataObject._lazy_load 23 | def hostname(self): 24 | """ Node backend's host FQDN 25 | """ 26 | return self._data['hostname'] 27 | 28 | @property 29 | @LazyDataObject._lazy_load 30 | def port(self): 31 | """ Node backend's port 32 | """ 33 | return self._data['port'] 34 | 35 | @property 36 | @LazyDataObject._lazy_load 37 | def family(self): 38 | """ Node backend's family 39 | """ 40 | return self._data['family'] 41 | 42 | @property 43 | @LazyDataObject._lazy_load 44 | def backend_id(self): 45 | """ Node backend's id 46 | """ 47 | return self._data['backend_id'] 48 | 49 | @property 50 | @LazyDataObject._lazy_load 51 | def status(self): 52 | return self._data['status'] 53 | 54 | @property 55 | @LazyDataObject._lazy_load 56 | def status_text(self): 57 | return self._data['status_text'] 58 | 59 | @property 60 | @LazyDataObject._lazy_load 61 | def path(self): 62 | """ Node backend's root filesystem path 63 | """ 64 | return self._data['path'] 65 | 66 | # TODO: backward-compatibility with dictionary object, 67 | # remove when all clients use 'node_backends' array as array of 68 | # 'NodeBackend' objects 69 | def __getitem__(self, key): 70 | return self._data[key] 71 | 72 | 73 | class NodeBackend(NodeBackendQuery, NodeBackendDataObject): 74 | def __init__(self, id, client): 75 | super(NodeBackend, self).__init__(client) 76 | self.id = id 77 | 78 | def __repr__(self): 79 | return '<NodeBackend {0}: status {1} ({2})>'.format( 80 | self.id, 81 | self.status, 82 | self.status_text 83 | ) 84 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/client.py: -------------------------------------------------------------------------------- 1 | import msgpack 2 | 3 | from mastermind.query import groups, namespaces, couples, groupsets, namespaces_states 4 | from mastermind.service import ReconnectableService 5 | 6 | 7 | class MastermindClient(object): 8 | """Provides a Python binding to the mastermind cocaine application. 9 | 10 | Args: 11 | app_name: 12 | Cocaine application name, defaults to "mastermind2.26". 13 | **kwargs: 14 | Parameters for constructing the ReconnectableService object which 15 | is used for cocaine requests. 16 | """ 17 | 18 | DEFAULT_APP_NAME = 'mastermind2.26' 19 | 20 | def __init__(self, app_name=None, **kwargs): 21 | self.service = ReconnectableService(app_name=app_name or self.DEFAULT_APP_NAME, 22 | **kwargs) 23 | 24 | def request(self, handle, data, attempts=None, timeout=None): 25 | """Performs a synchronous request to the mastermind cocaine application. 26 | 27 | Args: 28 | handle: API handle name. 29 | data: request data that will be serialized and sent.
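attempts: number of attempts for the request; falls back to the
    service-level default when None.
timeout: per-request timeout in seconds; falls back to the
    service-level default when None.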
30 | """ 31 | data = self.service.enqueue( 32 | handle, msgpack.packb(data), 33 | attempts=attempts, timeout=timeout).get() 34 | if isinstance(data, dict): 35 | if 'Error' in data: 36 | raise RuntimeError(data['Error']) 37 | if 'Balancer error' in data: 38 | raise RuntimeError(data['Balancer error']) 39 | return data 40 | 41 | @property 42 | def groups(self): 43 | return groups.GroupsQuery(self) 44 | 45 | @property 46 | def namespaces(self): 47 | return namespaces.NamespacesQuery(self) 48 | 49 | @property 50 | def couples(self): 51 | return couples.CouplesQuery(self) 52 | 53 | @property 54 | def groupsets(self): 55 | return groupsets.GroupsetsQuery(self) 56 | 57 | @property 58 | def namespaces_states(self): 59 | return namespaces_states.NamespacesStatesQuery(self) 60 | 61 | @property 62 | def remotes(self): 63 | return self.service.enqueue('get_config_remotes', '') 64 | 65 | 66 | class DummyClient(object): 67 | def __getattribute__(self, attr): 68 | raise RuntimeError('Mastermind client should be bound to query object') 69 | -------------------------------------------------------------------------------- /src/cocaine-app/manual_locks.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import socket 3 | 4 | from errors import CacheUpstreamError 5 | import helpers as h 6 | import storage 7 | from sync import sync_manager 8 | from sync.error import LockError 9 | 10 | 11 | logger = logging.getLogger('mm.locker') 12 | 13 | 14 | class ManualLocker(object): 15 | 16 | HOST_LOCK_PREFIX = 'manual/host/' 17 | HOST_LOCK = HOST_LOCK_PREFIX + '{0}' 18 | 19 | @h.concurrent_handler 20 | def host_acquire_lock(self, request): 21 | 22 | try: 23 | host = request[0] 24 | except IndexError: 25 | raise ValueError('Host is required') 26 | 27 | check_host(host) 28 | 29 | lock_id = self.HOST_LOCK.format(host) 30 | sync_manager.persistent_locks_acquire([lock_id]) 31 | 32 | return lock_id 33 | 34 | @h.concurrent_handler 35 | def host_release_lock(self, request): 36 | 37 | try: 38 | host = request[0] 39 | except IndexError: 40 | raise ValueError('Host is required') 41 | 42 | check_host(host) 43 | 44 | lock_id = self.HOST_LOCK.format(host) 45 | sync_manager.persistent_locks_release([lock_id]) 46 | 47 | return lock_id 48 | 49 | def get_locked_hosts(self): 50 | locks = sync_manager.get_children_locks(self.HOST_LOCK_PREFIX) 51 | hostnames = set(lock[len(self.HOST_LOCK_PREFIX):] for lock in locks) 52 | hosts = set() 53 | logger.debug('hostnames: {0}'.format(hostnames)) 54 | for host in storage.hosts: 55 | try: 56 | if host.hostname in hostnames: 57 | hosts.add(host) 58 | except CacheUpstreamError: 59 | continue 60 | return hosts 61 | 62 | 63 | def check_host(host): 64 | try: 65 | resolved_host = socket.gethostbyaddr(host)[0] 66 | except Exception as e: 67 | error_msg = 'Failed to resolve host {0}'.format(host) 68 | logger.error(error_msg) 69 | raise ValueError(error_msg) 70 | 71 | if host != resolved_host: 72 | error_msg = ('Hostname is required ' 73 | '(host {0} is resolved to hostname {1})'.format( 74 | host, resolved_host)) 75 | logger.error(error_msg) 76 | raise ValueError(error_msg) 77 | 78 | 79 | manual_locker = ManualLocker() 80 | -------------------------------------------------------------------------------- /tests/fixtures/util.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | import pytest 5 | 6 | 7 | @pytest.fixture(scope='function') 8 | def ascii_data(data_size): 9 | return 
''.join( 10 | random.choice( 11 | string.ascii_letters + string.digits 12 | ) for _ in xrange(data_size) 13 | ) 14 | 15 | 16 | def parametrize(argnames, argvalues, arglabels=None, **kwds): 17 | """Overrides 'pytest.mark.parametrize' implementation with 18 | automatic argvalue labels generation. 19 | 20 | Arguments: 21 | argnames - a comma-separated string denoting one or more argument names, 22 | or a list/tuple of argument strings. 23 | argvalues - the list of argvalues determines how often a test is invoked with 24 | different argument values. If only one argname was specified argvalues 25 | is a list of values. If N argnames were specified, argvalues must be 26 | a list of N-tuples, where each tuple-element specifies a value 27 | for its respective argname. 28 | arglabels - maps argname to a label that will be printed out along with 29 | corresponding argvalue on a test run. This argument helps to construct 30 | @ids parameter for underlying @pytest.mark.parametrize call. If argname 31 | is not present in arglabels it will be used instead of label. 32 | If @ids is supplied it will be passed as is to @pytest.mark.parametrize. 33 | **kwds - any other arguments for @pytest.mark.parametrize. 34 | """ 35 | if arglabels is None: 36 | arglabels = {} 37 | 38 | if 'ids' in kwds: 39 | ids = kwds.pop('ids') 40 | else: 41 | args = [a.strip() for a in argnames.split(',')] 42 | ids_tpls = ['{arg} {{value}}'.format(arg=arglabels.get(a, a)) for a in args] 43 | ids = [] 44 | for argvalue in argvalues: 45 | if not isinstance(argvalue, (list, tuple, basestring)): 46 | argvalue = [argvalue] 47 | ids.append( 48 | ', '.join( 49 | ids_tpl.format(value=argvalue[i]) 50 | for i, ids_tpl in enumerate(ids_tpls) 51 | ) 52 | ) 53 | 54 | def wrapper(func): 55 | return pytest.mark.parametrize( 56 | argnames, 57 | argvalues, 58 | ids=ids, 59 | **kwds 60 | )(func) 61 | 62 | return wrapper 63 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/task.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | 4 | class Task(object): 5 | 6 | STATUS_QUEUED = 'queued' 7 | STATUS_EXECUTING = 'executing' 8 | STATUS_FAILED = 'failed' 9 | STATUS_SKIPPED = 'skipped' 10 | STATUS_COMPLETED = 'completed' 11 | 12 | def __init__(self, job): 13 | self.status = self.STATUS_QUEUED 14 | self.id = uuid.uuid4().hex 15 | self.type = None 16 | self.start_ts = None 17 | self.finish_ts = None 18 | self.attempts = 0 19 | self.error_msg = [] 20 | self.parent_job = job 21 | 22 | def on_exec_start(self, processor): 23 | """ 24 | Called every time task changes status from 'queued' to 'executing' 25 | """ 26 | pass 27 | 28 | def on_exec_stop(self, processor): 29 | """ 30 | Called every time task changes status from 'executing' to anything else 31 | """ 32 | pass 33 | 34 | @classmethod 35 | def new(cls, job, **kwargs): 36 | task = cls(job) 37 | for param in cls.PARAMS: 38 | setattr(task, param, kwargs.get(param)) 39 | return task 40 | 41 | @classmethod 42 | def from_data(cls, data, job): 43 | task = cls(job) 44 | task.load(data) 45 | return task 46 | 47 | def load(self, data): 48 | # TODO: remove 'or' part 49 | self.id = data['id'] or uuid.uuid4().hex 50 | self.status = data['status'] 51 | self.type = data['type'] 52 | self.start_ts = data['start_ts'] 53 | self.finish_ts = data['finish_ts'] 54 | self.error_msg = data['error_msg'] 55 | self.attempts = data.get('attempts', 0) 56 | 57 | for param in self.PARAMS: 58 | val = data.get(param) 59 | if 
isinstance(val, unicode): 60 | val = val.encode('utf-8') 61 | setattr(self, param, val) 62 | 63 | def dump(self): 64 | res = {'status': self.status, 65 | 'id': self.id, 66 | 'type': self.type, 67 | 'start_ts': self.start_ts, 68 | 'finish_ts': self.finish_ts, 69 | 'error_msg': self.error_msg, 70 | 'attempts': self.attempts} 71 | res.update({ 72 | k: getattr(self, k) 73 | for k in self.PARAMS 74 | }) 75 | return res 76 | 77 | def human_dump(self): 78 | return self.dump() 79 | 80 | def __str__(self): 81 | raise RuntimeError('__str__ method should be implemented in ' 82 | 'derived class') 83 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/write_meta_key.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import msgpack 5 | 6 | import helpers 7 | from jobs import TaskTypes 8 | import keys 9 | from task import Task 10 | 11 | 12 | logger = logging.getLogger('mm.jobs') 13 | 14 | 15 | class WriteMetaKeyTask(Task): 16 | 17 | PARAMS = ('group', 'metakey') 18 | TASK_TIMEOUT = 600 19 | 20 | def __init__(self, job): 21 | super(WriteMetaKeyTask, self).__init__(job) 22 | self.type = TaskTypes.TYPE_WRITE_META_KEY 23 | self._meta_key_written = False 24 | 25 | def update_status(self): 26 | # state update is not required. 27 | pass 28 | 29 | def execute(self): 30 | # this task execution does not rely on common task workflow 31 | # of executing a command and waiting till it's finished, 32 | # rather it tries to execute action on 'finished' check 33 | # till it completes without an error. 34 | pass 35 | 36 | def finished(self, processor): 37 | self._meta_key_written = self._try_write_meta_key(processor.session) 38 | return ( 39 | self._meta_key_written or 40 | time.time() - self.start_ts > self.TASK_TIMEOUT 41 | ) 42 | 43 | def _try_write_meta_key(self, session): 44 | s = session.clone() 45 | s.add_groups([self.group]) 46 | _, failed_groups = helpers.write_retry( 47 | s, 48 | keys.SYMMETRIC_GROUPS_KEY, 49 | msgpack.packb(self.metakey), 50 | retries=1, 51 | ) 52 | if failed_groups: 53 | logger.error( 54 | 'Job {job_id}, task {task_id}: failed to write metakey to group {group}'.format( 55 | job_id=self.parent_job.id, 56 | task_id=self.id, 57 | group=self.group, 58 | ) 59 | ) 60 | else: 61 | logger.debug( 62 | 'Job {job_id}, task {task_id}: metakey is successfully written ' 63 | 'to group {group}'.format( 64 | job_id=self.parent_job.id, 65 | task_id=self.id, 66 | group=self.group, 67 | ) 68 | ) 69 | 70 | return not failed_groups 71 | 72 | def failed(self, processor): 73 | return (time.time() - self.start_ts > self.TASK_TIMEOUT and 74 | not self._meta_key_written) 75 | 76 | def __str__(self): 77 | return 'WriteMetaKeyTask[id: {task_id}]'.format( 78 | task_id=self.id, 79 | group=self.group, 80 | ) 81 | -------------------------------------------------------------------------------- /scripts/06-clean-metadb-jobs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import datetime 3 | import json 4 | import logging 5 | from Queue import Queue 6 | import sys 7 | import time 8 | 9 | import elliptics 10 | import msgpack 11 | 12 | 13 | logger = logging.getLogger('mm.convert') 14 | 15 | 16 | CONFIG_PATH = '/etc/elliptics/mastermind.conf' 17 | 18 | try: 19 | 20 | with open(CONFIG_PATH, 'r') as config_file: 21 | config = json.load(config_file) 22 | 23 | except Exception as e: 24 | raise ValueError('Failed to load config file 
%s: %s' % (CONFIG_PATH, e)) 25 | 26 | 27 | def make_meta_session(): 28 | log = elliptics.Logger('/tmp/ell-namespace-convert.log', config["dnet_log_mask"]) 29 | node_config = elliptics.Config() 30 | meta_node = elliptics.Node(log, node_config) 31 | 32 | addresses = [elliptics.Address(host=str(node[0]), port=node[1], family=node[2]) 33 | for node in config["metadata"]["nodes"]] 34 | logger.info('Connecting to meta nodes: {0}'.format(config["metadata"]["nodes"])) 35 | meta_wait_timeout = config['metadata'].get('wait_timeout', 5) 36 | 37 | try: 38 | meta_node.add_remotes(addresses) 39 | except Exception as e: 40 | logger.error('Failed to connect to any elliptics meta storage node: {0}'.format( 41 | e)) 42 | raise ValueError('Failed to connect to any elliptics storage META node') 43 | 44 | meta_session = elliptics.Session(meta_node) 45 | meta_session.set_timeout(meta_wait_timeout) 46 | meta_session.add_groups(list(config["metadata"]["groups"])) 47 | 48 | time.sleep(5) 49 | 50 | return meta_session 51 | 52 | 53 | def process_jobs(s): 54 | 55 | s.set_namespace('jobs') 56 | s.set_timeout(600) 57 | 58 | indexes = ['mastermind:jobs_idx'] 59 | 60 | dt = datetime.datetime(2014, 01, 01) 61 | while dt < datetime.datetime.now(): 62 | indexes.append('mastermind:jobs_idx:%s' % dt.strftime('%Y-%m')) 63 | if dt.month == 12: 64 | dt = dt.replace(year=dt.year + 1, month=1) 65 | else: 66 | dt = dt.replace(month=dt.month + 1) 67 | 68 | for index in indexes: 69 | print index 70 | res = s.find_all_indexes([index]).get() 71 | print "Index {0}, keys {1}".format(index, len(res)) 72 | if not len(res): 73 | continue 74 | try: 75 | s.remove_index(res[0].indexes[0].index, True).get() 76 | except Exception as e: 77 | print "Failed to clean index {0}: {1}".format(index, e) 78 | 79 | if __name__ == '__main__': 80 | 81 | if len(sys.argv) < 2 or sys.argv[1] not in ('clean',): 82 | print "Usage: {0} clean".format(sys.argv[0]) 83 | sys.exit(1) 84 | 85 | if sys.argv[1] == 'clean': 86 | 87 | s = make_meta_session() 88 | process_jobs(s) 89 | 90 | print 91 | -------------------------------------------------------------------------------- /src/cocaine-app/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | import threading 4 | 5 | from cocaine.logging.log_message import Message 6 | from cocaine.logging.logger import VERBOSITY_LEVELS 7 | from cocaine.services import Service 8 | 9 | 10 | class Logger(Service): 11 | 12 | __instances_lock = threading.Lock() 13 | _instances = {} 14 | 15 | def __new__(cls, logger_name='logging'): 16 | with cls.__instances_lock: 17 | if logger_name not in cls._instances: 18 | instance = object.__new__(cls) 19 | 20 | cls._instances[logger_name] = instance 21 | instance._counter = 0 22 | instance._lock = threading.Lock() 23 | try: 24 | instance.target = "app/%s" % sys.argv[sys.argv.index("--app") + 1] 25 | except ValueError: 26 | instance.target = "app/standalone" 27 | 28 | def wrapper(level): 29 | target = instance.target 30 | 31 | def on_emit(message): 32 | with instance._lock: 33 | instance._counter += 1 34 | instance._writableStream.write(Message("Message", 35 | instance._counter, level, target, str(message)).pack()) 36 | 37 | return on_emit 38 | 39 | for level_name, level in VERBOSITY_LEVELS.items(): 40 | setattr(instance, level_name, wrapper(level)) 41 | 42 | return cls._instances[logger_name] 43 | 44 | def _on_message(self, args): 45 | # This is ESSENTIAL for logger to work properly 46 | pass 47 | 48 | 49 | class 
CocaineHandler(logging.Handler): 50 | def __init__(self, logger=None, *args, **kwargs): 51 | logging.Handler.__init__(self) 52 | 53 | self._logger = logger or Logger() 54 | self.level_binds = { 55 | logging.DEBUG: self._logger.debug, 56 | logging.INFO: self._logger.info, 57 | logging.WARNING: self._logger.warn, 58 | logging.ERROR: self._logger.error 59 | } 60 | 61 | def emit(self, record): 62 | def dummy(*args): # pragma: no cover 63 | pass 64 | msg = self.format(record) 65 | self.level_binds.get(record.levelno, dummy)(msg) 66 | 67 | 68 | def setup_logger(logger_name='logging'): 69 | cocaine_logger = Logger(logger_name) 70 | 71 | root_logger = logging.getLogger('mm') 72 | root_logger.propagate = False 73 | tornado_logger = logging.getLogger('tornado') 74 | tornado_logger.propagate = False 75 | 76 | _handler = CocaineHandler(cocaine_logger) 77 | _handler.setFormatter(logging.Formatter(fmt='[%(name)s] [%(process)d] %(message)s')) 78 | 79 | root_logger.addHandler(_handler) 80 | tornado_logger.addHandler(_handler) 81 | 82 | # cocaine.Logger will take care of low-level messages filtering 83 | root_logger.setLevel(logging.DEBUG) 84 | tornado_logger.setLevel(logging.DEBUG) 85 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/minion_cmd.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import elliptics 5 | from tornado.httpclient import HTTPError 6 | 7 | from infrastructure import infrastructure 8 | from infrastructure_cache import cache 9 | from jobs import TaskTypes, RetryError 10 | from task import Task 11 | 12 | 13 | logger = logging.getLogger('mm.jobs') 14 | 15 | 16 | class MinionCmdTask(Task): 17 | 18 | PARAMS = ('group', 'host', 'cmd', 'params', 'minion_cmd_id') 19 | TASK_TIMEOUT = 6000 20 | 21 | def __init__(self, job): 22 | super(MinionCmdTask, self).__init__(job) 23 | self.minion_cmd = None 24 | self.minion_cmd_id = None 25 | self.type = TaskTypes.TYPE_MINION_CMD 26 | 27 | @classmethod 28 | def new(cls, job, **kwargs): 29 | task = super(MinionCmdTask, cls).new(job, **kwargs) 30 | task.params['task_id'] = task.id 31 | return task 32 | 33 | def update_status(self, processor): 34 | try: 35 | self.minion_cmd = processor.minions._get_command(self.minion_cmd_id) 36 | logger.debug('Job {0}, task {1}, minion command status was updated: {2}'.format( 37 | self.parent_job.id, self.id, self.minion_cmd)) 38 | except elliptics.Error as e: 39 | logger.warn('Job {0}, task {1}, minion command status {2} failed to fetch ' 40 | 'from metadb: {3}'.format(self.parent_job.id, self.id, 41 | self.minion_cmd_id, e)) 42 | pass 43 | 44 | def execute(self, processor): 45 | try: 46 | minion_response = processor.minions._execute_cmd( 47 | self.host, 48 | self.cmd, 49 | self.params 50 | ) 51 | except HTTPError as e: 52 | raise RetryError(self.attempts, e) 53 | self._set_minion_task_parameters(minion_response.values()[0]) 54 | 55 | def _set_minion_task_parameters(self, minion_cmd): 56 | self.minion_cmd = minion_cmd 57 | self.minion_cmd_id = self.minion_cmd['uid'] 58 | logger.info( 59 | 'Job {job_id}, task {task_id}, minions task ' 60 | 'execution: {command}'.format( 61 | job_id=self.parent_job.id, 62 | task_id=self.id, 63 | command=self.minion_cmd 64 | ) 65 | ) 66 | 67 | def human_dump(self): 68 | data = super(MinionCmdTask, self).human_dump() 69 | data['hostname'] = cache.get_hostname_by_addr(data['host'], strict=False) 70 | return data 71 | 72 | def finished(self, processor): 73 | 
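"""A minion command counts as finished when the minion reports full
progress (1.0), or when its status could not be fetched within
TASK_TIMEOUT."""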
return ((self.minion_cmd is None and 74 | time.time() - self.start_ts > self.TASK_TIMEOUT) or 75 | (self.minion_cmd and self.minion_cmd['progress'] == 1.0)) 76 | 77 | def failed(self, processor): 78 | if self.minion_cmd is None: 79 | return True 80 | return (self.minion_cmd['exit_code'] != 0 and 81 | self.minion_cmd.get('command_code') not in 82 | self.params.get('success_codes', [])) 83 | 84 | def __str__(self): 85 | return 'MinionCmdTask[id: {0}]<{1}>'.format(self.id, self.cmd) 86 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/couple_defrag_state_check.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from jobs import JobBrokenError, TaskTypes 5 | import storage 6 | from task import Task 7 | 8 | 9 | logger = logging.getLogger('mm.jobs') 10 | 11 | 12 | class CoupleDefragStateCheckTask(Task): 13 | 14 | PARAMS = ('couple', 'stats_ts') 15 | TASK_TIMEOUT = 60 * 60 * 24 * 14 # 14 days 16 | 17 | def __init__(self, job): 18 | super(CoupleDefragStateCheckTask, self).__init__(job) 19 | self.type = TaskTypes.TYPE_COUPLE_DEFRAG_STATE_CHECK_TASK 20 | 21 | def update_status(self): 22 | # infrastructure state is updated by itself via task queue 23 | pass 24 | 25 | def execute(self): 26 | # TODO: use 'couples' container 27 | couples = (storage.cache_couples 28 | if self.parent_job.is_cache_couple else 29 | storage.replicas_groupsets) 30 | 31 | couple = couples[self.couple] 32 | 33 | stats = [] 34 | for group in couple.groups: 35 | for nb in group.node_backends: 36 | if not nb.stat: 37 | continue 38 | stats.append(nb.stat) 39 | stats_ts = [int(time.time())] 40 | if stats: 41 | stats_ts.extend(s.ts for s in stats) 42 | self.stats_ts = max(stats_ts) 43 | 44 | def finished(self, processor): 45 | return (self.__couple_defraged() or 46 | time.time() - self.start_ts > self.TASK_TIMEOUT) 47 | 48 | def failed(self, processor): 49 | return (time.time() - self.start_ts > self.TASK_TIMEOUT and 50 | not self.__couple_defraged()) 51 | 52 | def __couple_defraged(self): 53 | # TODO: use 'couples' container 54 | couples = (storage.cache_couples 55 | if self.parent_job.is_cache_couple else 56 | storage.replicas_groupsets) 57 | couple = couples[self.couple] 58 | stats = [] 59 | for group in couple.groups: 60 | if not group.node_backends: 61 | return False 62 | for nb in group.node_backends: 63 | if not nb.stat: 64 | return False 65 | stats.append(nb.stat) 66 | cur_stats_ts = min(s.ts for s in stats) 67 | if cur_stats_ts <= self.stats_ts: 68 | logger.info('Job {0}, task {1}: defrag status not updated since {2}'.format( 69 | self.parent_job.id, self.id, self.stats_ts)) 70 | return False 71 | 72 | if all(s.defrag_state == 0 for s in stats): 73 | logger.debug('Job {0}, task {1}: defrag finished, start_ts {2}, current ts {3}, ' 74 | 'defrag statuses {4}'.format(self.parent_job.id, self.id, self.stats_ts, 75 | cur_stats_ts, [s.defrag_state for s in stats])) 76 | return True 77 | 78 | logger.info('Job {0}, task {1}: defrag not finished, defrag statuses {2}'.format( 79 | self.parent_job.id, self.id, [s.defrag_state for s in stats])) 80 | return False 81 | 82 | def __str__(self): 83 | return 'CoupleDefragStateCheckTask[id: {0}]'.format( 84 | self.id, self.couple) 85 | -------------------------------------------------------------------------------- /tests/test_tree_picker.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | import 
pytest 4 | 5 | from mastermind.utils.tree_picker import TreePicker 6 | 7 | 8 | class TestTreePicker(object): 9 | def test_one_branch_tree(self): 10 | tp = TreePicker([1]) 11 | assert next(tp) == 1 12 | 13 | def test_one_branch_multi_level_tree(self): 14 | tp = TreePicker([[[1]]]) 15 | assert next(tp) == 1 16 | 17 | def test_one_branch_tree_exhausts(self): 18 | tp = TreePicker([[1]]) 19 | assert next(tp) == 1 20 | with pytest.raises(StopIteration): 21 | next(tp) 22 | 23 | def test_multi_branch_tree(self): 24 | tp = TreePicker([[1, 1], [2, 2]]) 25 | assert set(itertools.islice(tp, 2)) == set([1, 2]) 26 | 27 | def test_multi_branch_tree_exhausts(self): 28 | tp = TreePicker([[1, 1], [2, 2]]) 29 | assert set(itertools.islice(tp, 4)) == set([1, 2]) 30 | with pytest.raises(StopIteration): 31 | next(tp) 32 | 33 | def test_less_dependent_branch(self): 34 | """ Test that less dependent leaf nodes will 35 | be returned earlier than more dependent ones. 36 | root 37 | / \ 38 | / \ \ 39 | / / / \ 40 | 1 1 2 2 41 | """ 42 | tp = TreePicker([ 43 | [ 44 | [ 45 | 1, 46 | ], 47 | [ 48 | 1, 49 | ], 50 | ], 51 | [ 52 | [ 53 | 2, 54 | 2, 55 | ] 56 | ], 57 | ]) 58 | assert sorted(itertools.islice(tp, 3)) == [1, 1, 2] 59 | 60 | def test_independent_branches(self): 61 | """ Test that elements will be preferably picked from 62 | independent branches> 63 | root 64 | / | \ 65 | / / \ / | \ 66 | 1 2 2 3 3 3 67 | """ 68 | tp = TreePicker([ 69 | [ 70 | 1, 71 | ], 72 | [ 73 | 2, 74 | 2, 75 | ], 76 | [ 77 | 3, 78 | 3, 79 | 3, 80 | ], 81 | ]) 82 | assert set(itertools.islice(tp, 3)) == set([1, 2, 3]) 83 | 84 | def test_custom_select_algorithm(self): 85 | """ Test that elements will be picked according 86 | to custom select algorithm provided by user 87 | root 88 | / | \ 89 | / / \ / | \ 90 | 1 2 2 3 3 3 91 | """ 92 | tp = TreePicker( 93 | [ 94 | [ 95 | 1, 96 | ], 97 | [ 98 | 2, 99 | 2, 100 | ], 101 | [ 102 | 3, 103 | 3, 104 | 3, 105 | ], 106 | ], 107 | select=max, 108 | ) 109 | assert list(itertools.islice(tp, 3)) == [3, 2, 1] 110 | -------------------------------------------------------------------------------- /tests/fixtures/monitor_stat_worker.py: -------------------------------------------------------------------------------- 1 | import json 2 | import threading 3 | import zlib 4 | 5 | import pytest 6 | from tornado import gen 7 | import tornado.ioloop 8 | import tornado.httpserver 9 | import tornado.netutil 10 | import tornado.web 11 | 12 | from mastermind.pool import Pool 13 | from mastermind.monitor_pool import MonitorStatParseWorker 14 | 15 | 16 | @pytest.fixture 17 | def monitor_pool(monitor_port, request_timeout): 18 | """Pool of monitor stat workers""" 19 | pool = Pool( 20 | processes=1, 21 | worker=MonitorStatParseWorker, 22 | w_initkwds={ 23 | 'monitor_port': monitor_port, 24 | 'request_timeout': request_timeout, 25 | } 26 | ) 27 | return pool 28 | 29 | 30 | @pytest.yield_fixture 31 | def monitor_server(family, 32 | ascii_data, 33 | response_code, 34 | response_processing_time, 35 | valid_json, 36 | encode_content): 37 | """Run monitor server emulating elliptics in a separate thread""" 38 | 39 | class DataHandler(tornado.web.RequestHandler): 40 | @gen.coroutine 41 | def get(self): 42 | """Emulate elliptics monitor stat compression""" 43 | if response_code != 200: 44 | self.set_status(response_code) 45 | self.finish() 46 | return 47 | data = ascii_data 48 | if valid_json: 49 | self.set_header('Content-Type', 'application/json') 50 | data = json.dumps({'data': data}) 51 | if encode_content: 52 | 
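# Emulate elliptics' compressed monitor output: compress the body with
# zlib and advertise it as 'deflate', which is exactly what
# MonitorStatParseWorker undoes with zlib.decompress().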
self.set_header('Content-Encoding', 'deflate') 53 | data = zlib.compress(data) 54 | if response_processing_time: 55 | yield gen.sleep(response_processing_time) 56 | self.write(data) 57 | 58 | sockets = tornado.netutil.bind_sockets(port=0, family=family) 59 | 60 | def run_server(c): 61 | thread_io_loop = tornado.ioloop.IOLoop() 62 | thread_io_loop.make_current() 63 | app = tornado.web.Application([ 64 | (r'/', DataHandler) 65 | ]) 66 | server = tornado.httpserver.HTTPServer(app, io_loop=thread_io_loop) 67 | server.add_sockets(sockets) 68 | 69 | t = threading.current_thread() 70 | t.server = server 71 | t.io_loop = thread_io_loop 72 | with c: 73 | c.notify() 74 | thread_io_loop.start() 75 | thread_io_loop.close() 76 | 77 | c = threading.Condition() 78 | with c: 79 | t = threading.Thread(target=run_server, args=(c,)) 80 | t.start() 81 | # wait until server is initialized 82 | c.wait() 83 | 84 | yield t.server 85 | 86 | def stop_server(): 87 | t = threading.current_thread() 88 | t.server.stop() 89 | t.io_loop.stop() 90 | 91 | # ioloop should be stopped from the thread itself, therefore callback 92 | t.io_loop.add_callback(stop_server) 93 | t.join() 94 | 95 | 96 | @pytest.fixture 97 | def monitor_port(monitor_server): 98 | """Port of the running monitor server""" 99 | _, monitor_port = monitor_server._sockets.values()[0].getsockname()[:2] 100 | return monitor_port 101 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/node_stop.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import storage 3 | 4 | from jobs import JobBrokenError, TaskTypes 5 | from minion_cmd import MinionCmdTask 6 | 7 | 8 | logger = logging.getLogger('mm.jobs') 9 | 10 | 11 | class NodeStopTask(MinionCmdTask): 12 | 13 | PARAMS = MinionCmdTask.PARAMS + ('uncoupled',) 14 | 15 | def __init__(self, job): 16 | super(NodeStopTask, self).__init__(job) 17 | self.type = TaskTypes.TYPE_NODE_STOP_TASK 18 | 19 | def execute(self, processor): 20 | 21 | if self.group: 22 | # checking if task still applicable 23 | logger.info( 24 | 'Job {job_id}, task {task_id}: checking group {group} and host {host} ' 25 | 'consistency'.format( 26 | job_id=self.parent_job.id, 27 | task_id=self.id, 28 | group=self.group, 29 | host=self.host 30 | ) 31 | ) 32 | 33 | if self.group not in storage.groups: 34 | raise JobBrokenError( 35 | 'Task {task_id}: group {group} is not found'.format( 36 | task_id=self.id, 37 | group=self.group 38 | ) 39 | ) 40 | 41 | group = storage.groups[self.group] 42 | 43 | nb_addresses = set(nb.node.host.addr for nb in group.node_backends) 44 | nb_addresses.discard(self.host) 45 | 46 | if nb_addresses: 47 | raise JobBrokenError( 48 | 'Task {task_id}: group {group} has unexpected node backends: ' 49 | '{node_backends}, expected one backend on host {host}'.format( 50 | task_id=self.id, 51 | group=self.group, 52 | node_backends=list(nb_addresses), 53 | host=self.host 54 | ) 55 | ) 56 | 57 | valid_statuses = (storage.Status.OK, storage.Status.RO, storage.Status.STALLED) 58 | if group.node_backends and group.node_backends[0].status not in valid_statuses: 59 | raise JobBrokenError( 60 | 'Task {task_id}: node of group {group} has ' 61 | 'status {status}, should be one of {valid_statuses}'.format( 62 | task_id=self.id, 63 | group=self.group, 64 | status=group.node_backends[0].status, 65 | valid_statuses=valid_statuses 66 | ) 67 | ) 68 | 69 | if self.uncoupled: 70 | if group.couple: 71 | raise JobBrokenError( 72 | 'Task 
{task_id}: group {group} happens to be already coupled'.format( 73 | task_id=self.id, 74 | group=self.group 75 | ) 76 | ) 77 | else: 78 | if not group.couple: 79 | raise JobBrokenError( 80 | 'Task {task_id}: group {group} is not coupled'.format( 81 | task_id=self.id, 82 | group=self.group 83 | ) 84 | ) 85 | 86 | super(NodeStopTask, self).execute(processor) 87 | -------------------------------------------------------------------------------- /src/cocaine-app/timed_queue.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import threading 3 | import heapq 4 | import time 5 | 6 | 7 | class Task(object): 8 | 9 | def __init__(self, task_id, function, args, kwargs): 10 | self.__id = task_id 11 | self.__function = function 12 | self.__args = args 13 | self.__kwargs = kwargs 14 | self.__done = False 15 | 16 | def execute(self): 17 | try: 18 | self.__function(*self.__args, **self.__kwargs) 19 | finally: 20 | self.__done = True 21 | 22 | def done(self): 23 | return self.__done 24 | 25 | def id(self): 26 | return self.__id 27 | 28 | 29 | class TimedQueue(object): 30 | 31 | def __init__(self): 32 | self.__shutting_down = False 33 | self.__shutdown_lock = threading.Lock() 34 | self.__heap = [] 35 | self.__hurry = [] 36 | self.__task_by_id = {} 37 | self.__heap_lock = threading.Lock() 38 | self.__loop_thread = threading.Thread(target=TimedQueue.loop, args=(self,)) 39 | self.__loop_thread.setDaemon(True) 40 | 41 | def start(self): 42 | self.__loop_thread.start() 43 | 44 | def __del__(self): 45 | if not self._is_shutting_down(): 46 | self.shutdown() 47 | 48 | def _is_shutting_down(self): 49 | with self.__shutdown_lock: 50 | shutting_down = self.__shutting_down 51 | return shutting_down 52 | 53 | def loop(self): 54 | time.sleep(3) 55 | while not self._is_shutting_down(): 56 | task = None 57 | with self.__heap_lock: 58 | if self.__hurry: 59 | task = self.__hurry.pop() 60 | elif self.__heap and time.time() >= self.__heap[0][0]: 61 | task = heapq.heappop(self.__heap)[1] 62 | 63 | if task is None: 64 | time.sleep(1) 65 | continue 66 | 67 | with self.__heap_lock: 68 | id_ = task.id() 69 | self.__task_by_id.pop(id_, None) 70 | if not task.done(): 71 | try: 72 | task.execute() 73 | except Exception: 74 | # Task should handle its exceptions. If it doesn't, will lose it here. 75 | # The loop should not stop because of it. 
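# (Swallowed deliberately: a task that needs failure reporting has to
# catch and log exceptions inside its own function.)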
76 | pass 77 | 78 | def add_task_in(self, task_id, secs, function, *args, **kwargs): 79 | self.add_task_at(task_id, time.time() + secs, function, *args, **kwargs) 80 | 81 | def add_task_at(self, task_id, at, function, *args, **kwargs): 82 | if self._is_shutting_down(): 83 | return 84 | with self.__heap_lock: 85 | if task_id in self.__task_by_id: 86 | raise ValueError("Task with ID %s already exists" % task_id) 87 | task = Task(task_id, function, args, kwargs) 88 | heapq.heappush(self.__heap, (at, task)) 89 | self.__task_by_id[task_id] = task 90 | 91 | def hurry(self, task_id): 92 | with self.__heap_lock: 93 | if task_id in self.__task_by_id: 94 | self.__hurry.append(self.__task_by_id[task_id]) 95 | return True 96 | return False 97 | 98 | def shutdown(self): 99 | with self.__shutdown_lock: 100 | self.__shutting_down = True 101 | self.__loop_thread.join() 102 | -------------------------------------------------------------------------------- /tests/test_monitor_pool.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | import msgpack 4 | 5 | from fixtures.util import parametrize 6 | 7 | 8 | @parametrize('family', (socket.AF_INET, socket.AF_INET6)) 9 | @parametrize('data_size', (1024,), arglabels={'data_size': 'data size'}) 10 | @parametrize( 11 | 'valid_json', 12 | (True, False,), 13 | arglabels={'valid_json': 'response in json'}, 14 | ) 15 | @parametrize( 16 | 'encode_content', 17 | (True, False,), 18 | arglabels={'encode_content': 'zlib encoding'} 19 | ) 20 | @parametrize( 21 | 'request_timeout', 22 | (0.2,), 23 | arglabels={'request_timeout': 'request timeout'} 24 | ) 25 | class TestMonitorStatParseWorker(object): 26 | """Test the MonitorStatParseWorker 27 | 28 | This suite validates: 29 | - acting upon various http response statuses; 30 | - support of 'deflate' encoding; 31 | - timeout tolerance.
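
Each test spins up a local tornado server (see
fixtures/monitor_stat_worker.py) and submits tasks to the worker pool
as (host, port, family) tuples.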
32 | """ 33 | @parametrize( 34 | 'response_code', 35 | (200,), 36 | arglabels={'response_code': 'http code'}, 37 | ) 38 | @parametrize( 39 | 'response_processing_time', 40 | (0.0,), 41 | arglabels={'response_processing_time': 'processing time'}, 42 | ) 43 | def test_200_response(self, 44 | monitor_pool, 45 | ascii_data, 46 | family, 47 | response_code, 48 | valid_json): 49 | """Valid response fetching check with enabled or disable encoding""" 50 | task = ('localhost', 1025, family) 51 | result = msgpack.unpackb(monitor_pool.apply(None, (task,))) 52 | assert response_code == result['code'] 53 | if not valid_json: 54 | assert result['error'].startswith('Failed to parse json') 55 | if valid_json: 56 | assert result['content']['data'] == ascii_data 57 | 58 | @parametrize( 59 | 'response_code', 60 | (404, 502,), 61 | arglabels={'response_code': 'http code'}, 62 | ) 63 | @parametrize( 64 | 'response_processing_time', 65 | (0.0,), 66 | arglabels={'response_processing_time': 'processing time'}, 67 | ) 68 | def test_bad_response(self, 69 | monitor_pool, 70 | ascii_data, 71 | family, 72 | response_code, 73 | valid_json): 74 | """Tolerance for any http status other than 200""" 75 | task = ('localhost', 1025, family) 76 | result = msgpack.unpackb(monitor_pool.apply(None, (task,))) 77 | assert response_code == result['code'] 78 | 79 | @parametrize( 80 | 'response_code', 81 | (200,), 82 | arglabels={'response_code': 'http code'}, 83 | ) 84 | @parametrize( 85 | 'response_processing_time', 86 | (0.5,), 87 | arglabels={'response_processing_time': 'processing time'}, 88 | ) 89 | def test_timeout(self, 90 | monitor_pool, 91 | ascii_data, 92 | family, 93 | response_code, 94 | valid_json): 95 | """Request timeout tolerance""" 96 | task = ('localhost', 1025, family) 97 | result = msgpack.unpackb(monitor_pool.apply(None, (task,))) 98 | # From tornado docs: 99 | # "Error code 599 is used when no HTTP response was received, e.g. for a timeout." 100 | # 101 | # http://www.tornadoweb.org/en/stable/httpclient.html#tornado.httpclient.HTTPError 102 | assert result['code'] == 599 103 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/monitor_pool.py: -------------------------------------------------------------------------------- 1 | try: 2 | import simplejson as json 3 | except ImportError: 4 | import json 5 | import zlib 6 | 7 | import msgpack 8 | from tornado import gen 9 | from tornado.httpclient import AsyncHTTPClient, HTTPRequest 10 | 11 | from mastermind.pool import PoolWorker 12 | 13 | 14 | class MonitorStatParseWorker(PoolWorker): 15 | """Fetch and parse monitor stat from elliptics monitor port 16 | 17 | Performs heavy task of json parsing and packs it back using fast msgpack. 
18 | 19 | Arguments: 20 | ioloop: tornado IOLoop instance; 21 | max_http_clients: number of concurrent requests that can be in progress; 22 | monitor_stat_categories: elliptics statistics categories that should be fetched; 23 | monitor_port: elliptics monitor port; 24 | connect_timeout: timeout for initial connection in seconds; 25 | request_timeout: timeout for entire request in seconds; 26 | **kwds: passed through to base PoolWorker class; 27 | """ 28 | 29 | HTTPClient = AsyncHTTPClient 30 | 31 | def __init__(self, 32 | ioloop=None, 33 | max_http_clients=1, 34 | monitor_stat_categories=0, 35 | monitor_port=10025, 36 | connect_timeout=5.0, 37 | request_timeout=5.0, 38 | **kwds): 39 | super(MonitorStatParseWorker, self).__init__(ioloop=ioloop, **kwds) 40 | self._monitor_stat_categories = monitor_stat_categories 41 | self._monitor_port = monitor_port 42 | self._connect_timeout = connect_timeout 43 | self._request_timeout = request_timeout 44 | self.http_client = MonitorStatParseWorker.HTTPClient( 45 | self._ioloop, 46 | max_clients=max_http_clients, 47 | ) 48 | 49 | def url(self, host): 50 | return 'http://{host}:{port}/?categories={categories}'.format( 51 | host=host, 52 | port=self._monitor_port, 53 | categories=self._monitor_stat_categories, 54 | ) 55 | 56 | @gen.coroutine 57 | def process(self, (host, port, family)): 58 | http_request = HTTPRequest( 59 | self.url(host=host), 60 | connect_timeout=self._connect_timeout, 61 | request_timeout=self._request_timeout, 62 | ) 63 | response = yield self.http_client.fetch(http_request, raise_error=False) 64 | result = self._parse_response(host, port, family, response) 65 | 66 | raise gen.Return(msgpack.packb(result)) 67 | 68 | def _parse_response(self, host, port, family, response): 69 | error = None 70 | content = '' 71 | if response.error: 72 | error = str(response.error) 73 | else: 74 | try: 75 | content_type = response.headers.get('Content-Type') 76 | if content_type != 'application/json': 77 | raise ValueError( 78 | 'unsupported content-type "{}"'.format(content_type) 79 | ) 80 | content = response.body 81 | if response.headers.get('Content-Encoding') == 'deflate': 82 | content = zlib.decompress(content) 83 | content = json.loads(content) 84 | except Exception as e: 85 | error = 'Failed to parse json: {}'.format(e) 86 | return { 87 | 'host': host, 88 | 'port': port, 89 | 'family': family, 90 | 'code': response.code, 91 | 'request_time': response.request_time, 92 | 'url': response.effective_url, 93 | 'error': error, 94 | 'content': content, 95 | } 96 | -------------------------------------------------------------------------------- /src/cocaine-app/indexes.py: -------------------------------------------------------------------------------- 1 | from Queue import Queue 2 | 3 | import elliptics 4 | 5 | 6 | class SecondaryIndex(object): 7 | def __init__(self, idx, key_tpl, meta_session): 8 | self.idx = idx 9 | self.key_tpl = key_tpl 10 | self.meta_session = meta_session 11 | 12 | def __iter__(self): 13 | for idx in self.meta_session.find_all_indexes([self.idx]): 14 | yield idx.indexes[0].data 15 | 16 | def __setitem__(self, key, val): 17 | eid = self.meta_session.transform(self.key_tpl % key) 18 | self.meta_session.set_indexes(eid, [self.idx], [val]) 19 | 20 | def __getitem__(self, key): 21 | eid = self.meta_session.transform(self.key_tpl % key) 22 | return self.meta_session.list_indexes(eid).get()[0].data 23 | 24 | def __delitem__(self, key): 25 | eid = self.meta_session.transform(self.key_tpl % key) 26 | self.meta_session.set_indexes(eid, [], 
[]) 27 | 28 | 29 | class TagSecondaryIndex(object): 30 | 31 | BATCH_SIZE = 500 32 | 33 | def __init__(self, main_idx, idx_tpl, key_tpl, meta_session, logger=None, namespace=None, batch_size=BATCH_SIZE): 34 | self.main_idx = main_idx 35 | self.idx_tpl = idx_tpl 36 | self.key_tpl = key_tpl 37 | self.meta_session = meta_session.clone() 38 | if namespace: 39 | self.meta_session.set_namespace(namespace) 40 | self.batch_size = batch_size 41 | self.logger = logger 42 | 43 | def __iter__(self): 44 | idxes = [idx.id for idx in 45 | self.meta_session.clone().find_all_indexes([self.main_idx]).get()] 46 | 47 | for data in self._iter_keys(idxes): 48 | yield data 49 | 50 | def tagged(self, tag): 51 | idxes = [idx.id for idx in 52 | self.meta_session.clone().find_all_indexes([self.main_idx, self.idx_tpl % tag])] 53 | 54 | self.logger.info('Received {0} records from tagged index {1}'.format( 55 | len(idxes), self.idx_tpl % tag)) 56 | 57 | processed = 0 58 | for data in self._iter_keys(idxes): 59 | processed += 1 60 | yield data 61 | 62 | self.logger.info('Processed {0} records from tagged index {1}'.format( 63 | processed, self.idx_tpl % tag)) 64 | 65 | def __setitem__(self, key, val): 66 | eid = self.meta_session.transform(self.key_tpl % key) 67 | self.meta_session.clone().write_data(eid, val).get() 68 | 69 | def __getitem__(self, key): 70 | eid = self.meta_session.transform(self.key_tpl % key) 71 | return self.meta_session.clone().read_latest(eid).get()[0].data 72 | 73 | def set_tag(self, key, tag=None): 74 | eid = self.meta_session.transform(self.key_tpl % key) 75 | tags = [self.main_idx] 76 | if tag: 77 | tags.append(self.idx_tpl % tag) 78 | self.meta_session.clone().set_indexes(eid, tags, [''] * len(tags)) 79 | 80 | def _fetch_response_data(self, req): 81 | data = None 82 | try: 83 | result = req[1] 84 | result.wait() 85 | data = result.get()[0].data 86 | except Exception as e: 87 | self.logger.error('Failed to fetch record from tagged index: {0} ({1})'.format(req[0], e)) 88 | return data 89 | 90 | def _iter_keys(self, keys): 91 | if not keys: 92 | return 93 | 94 | q = Queue(self.batch_size) 95 | s = self.meta_session.clone() 96 | 97 | for k in keys: 98 | if not q.full(): 99 | q.put((k, s.read_latest(k))) 100 | else: 101 | data = self._fetch_response_data(q.get()) 102 | q.put((k, s.read_latest(k))) 103 | if data: 104 | yield data 105 | 106 | while q.qsize(): 107 | data = self._fetch_response_data(q.get()) 108 | if data: 109 | yield data 110 | -------------------------------------------------------------------------------- /src/cocaine-app/mastermind_core/response.py: -------------------------------------------------------------------------------- 1 | import json 2 | import threading 3 | 4 | from config import config 5 | from mastermind_core import helpers 6 | 7 | 8 | class CachedResponse(object): 9 | """Mastermind cached response 10 | 11 | Use CachedResponse to store mastermind handle's response data that requires 12 | significant calculation. 13 | 14 | Instance's reentrant ``lock`` object is designed to provide atomicity when deriving 15 | from CachedResponse class for implementation of 'set_result', 16 | 'set_exception' and 'get_result' methods. 17 | 18 | NB: this object is not able to cache responses for parametrized requests. 
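
A typical refresh cycle (the handler and calculation names here are
hypothetical) might look like:

    cached = CachedResponse()

    def refresh():
        try:
            cached.set_result(build_expensive_response())
        except Exception as e:
            cached.set_exception(e)

    def handle(request):
        return cached.get_result()  # raises if the last refresh failed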
19 | 20 | TODO: add unit tests 21 | """ 22 | def __init__(self): 23 | self.lock = threading.RLock() 24 | self._result = None 25 | self._exception = None 26 | 27 | def get_result(self, *args, **kwargs): 28 | """Get cached result or raise stored exception 29 | 30 | Parameters: None 31 | """ 32 | with self.lock: 33 | if self.exception: 34 | raise self.exception 35 | return self._result 36 | 37 | def set_result(self, result): 38 | """Set result for cached response 39 | 40 | Parameters: 41 | result - result of calculation; 42 | """ 43 | with self.lock: 44 | self._result = result 45 | self._exception = None 46 | 47 | @property 48 | def exception(self): 49 | return self._exception 50 | 51 | def set_exception(self, exception): 52 | """Set exception that will be thrown on 'get_result' call 53 | 54 | Parameters: 55 | exception - exception to throw; 56 | """ 57 | with self.lock: 58 | self._result = None 59 | self._exception = exception 60 | 61 | 62 | GZIP_CONFIG = config.get('gzip', {}) 63 | DEFAULT_GZIP_COMPRESSION_LEVEL = GZIP_CONFIG.get('compression_level', 1) 64 | 65 | 66 | class CachedGzipResponse(CachedResponse): 67 | """Mastermind cached response with additionally stored compressed version 68 | 69 | Compression is performed using gzip on serialized json string, so be sure 70 | that result can be dumped to json. 71 | """ 72 | 73 | def __init__(self, compression_level=DEFAULT_GZIP_COMPRESSION_LEVEL): 74 | super(CachedGzipResponse, self).__init__() 75 | self._compressed_result = None 76 | self._compression_level = compression_level 77 | 78 | def set_result(self, result): 79 | """Set result for cached response and also store its compressed version 80 | """ 81 | compressed_result = self._compress(result) 82 | with self.lock: 83 | super(CachedGzipResponse, self).set_result(result) 84 | self._compressed_result = compressed_result 85 | 86 | def set_exception(self, exception): 87 | """Set exception that will be thrown on 'get_result' call 88 | 89 | Parameters: 90 | exception - exception to throw; 91 | """ 92 | with self.lock: 93 | super(CachedGzipResponse, self).set_exception(exception) 94 | self._compressed_result = None 95 | 96 | def get_result(self, compressed=True): 97 | """Get cached result or raise stored exception 98 | 99 | Parameters: 100 | compressed - boolean flag which determines if compressed version 101 | of cached response should be returned; 102 | """ 103 | with self.lock: 104 | if not compressed: 105 | return super(CachedGzipResponse, self).get_result() 106 | if self.exception: 107 | raise self.exception 108 | return self._compressed_result 109 | 110 | def _compress(self, result): 111 | try: 112 | data = json.dumps(result) 113 | except (TypeError, ValueError): 114 | raise TypeError( 115 | 'Cached gzip response does not support objects ' 116 | 'that cannot be dumped to json' 117 | ) 118 | return helpers.gzip_compress(data, compression_level=self._compression_level) 119 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/couple_defrag.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | 4 | from error import JobBrokenError 5 | from infrastructure import infrastructure 6 | from job import Job 7 | from job_types import JobTypes 8 | from tasks import Task, NodeBackendDefragTask, CoupleDefragStateCheckTask 9 | import storage 10 | 11 | 12 | logger = logging.getLogger('mm.jobs') 13 | 14 | 15 | class CoupleDefragJob(Job): 16 | 17 | PARAMS = ('couple', 'fragmentation', 
'is_cache_couple', 'resources') 18 | 19 | def __init__(self, **kwargs): 20 | super(CoupleDefragJob, self).__init__(**kwargs) 21 | self.type = JobTypes.TYPE_COUPLE_DEFRAG_JOB 22 | 23 | @classmethod 24 | def new(cls, *args, **kwargs): 25 | job = super(CoupleDefragJob, cls).new(*args, **kwargs) 26 | try: 27 | # TODO: use 'couples' container 28 | couples = (storage.cache_couples 29 | if kwargs.get('is_cache_couple', False) else 30 | storage.replicas_groupsets) 31 | couple = couples[kwargs['couple']] 32 | fragmentation = [] 33 | for g in couple.groups: 34 | fragmentation.append(g.get_stat().fragmentation) 35 | fragmentation.sort(reverse=True) 36 | job.fragmentation = fragmentation 37 | except Exception: 38 | job.release_locks() 39 | raise 40 | return job 41 | 42 | def _set_resources(self): 43 | resources = {Job.RESOURCE_FS: []} 44 | 45 | # TODO: use 'couples' container 46 | couples = (storage.cache_couples 47 | if self.is_cache_couple else 48 | storage.replicas_groupsets) 49 | couple = couples[self.couple] 50 | 51 | for g in couple.groups: 52 | resources[Job.RESOURCE_FS].append( 53 | (g.node_backends[0].node.host.addr, str(g.node_backends[0].fs.fsid))) 54 | self.resources = resources 55 | 56 | def create_tasks(self): 57 | # TODO: use 'couples' container 58 | couples = (storage.cache_couples 59 | if self.is_cache_couple else 60 | storage.replicas_groupsets) 61 | if self.couple not in couples: 62 | raise JobBrokenError('Couple {0} is not found'.format(self.couple)) 63 | 64 | couple = couples[self.couple] 65 | 66 | def make_defrag_tasks(nb): 67 | cmd = infrastructure._defrag_node_backend_cmd( 68 | nb.node.host.addr, nb.node.port, nb.node.family, nb.backend_id) 69 | 70 | node_backend = self.node_backend( 71 | nb.node.host.addr, nb.node.port, nb.backend_id) 72 | 73 | task = NodeBackendDefragTask.new(self, 74 | host=nb.node.host.addr, 75 | cmd=cmd, 76 | node_backend=node_backend, 77 | group=group.group_id, 78 | params={'group': str(group.group_id), 79 | 'node_backend': node_backend.encode('utf-8'), 80 | 'success_codes': [self.DNET_CLIENT_ALREADY_IN_PROGRESS]}) 81 | 82 | self.tasks.append(task) 83 | 84 | defrag_tasks = False 85 | for group in couple.groups: 86 | for nb in group.node_backends: 87 | if nb.stat.want_defrag <= 1: 88 | continue 89 | make_defrag_tasks(nb) 90 | task = CoupleDefragStateCheckTask.new(self, couple=str(couple)) 91 | self.tasks.append(task) 92 | defrag_tasks = True 93 | 94 | if not defrag_tasks: 95 | raise ValueError("Couple's {} backends do not require " 96 | "defragmentation".format(self.couple)) 97 | 98 | @property 99 | def group(self): 100 | group = self._involved_groups[0] 101 | for task in self.tasks: 102 | if task.type != 'node_backend_defrag_task': 103 | continue 104 | if task.status == Task.STATUS_QUEUED: 105 | break 106 | group = task.group 107 | return group 108 | 109 | @property 110 | def _involved_groups(self): 111 | return [int(gid) for gid in self.couple.split(':')] 112 | 113 | @property 114 | def _involved_couples(self): 115 | return [self.couple] 116 | -------------------------------------------------------------------------------- /scripts/05-group-history-unification.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from copy import deepcopy 3 | import datetime 4 | import json 5 | import logging 6 | import msgpack 7 | import os.path 8 | import re 9 | import sys 10 | import time 11 | 12 | import elliptics 13 | 14 | 15 | logger = logging.getLogger('mm.convert') 16 | 17 | 18 | 19 | CONFIG_PATH =
'/etc/elliptics/mastermind.conf' 20 | 21 | try: 22 | 23 | with open(CONFIG_PATH, 'r') as config_file: 24 | config = json.load(config_file) 25 | 26 | except Exception as e: 27 | raise ValueError('Failed to load config file %s: %s' % (CONFIG_PATH, e)) 28 | 29 | 30 | log = elliptics.Logger('/tmp/group-history-convert.log', config["dnet_log_mask"]) 31 | node_config = elliptics.Config() 32 | meta_node = elliptics.Node(log, node_config) 33 | 34 | addresses = [elliptics.Address(host=str(node[0]), port=node[1], family=node[2]) 35 | for node in config["metadata"]["nodes"]] 36 | logger.info('Connecting to meta nodes: {0}'.format(config["metadata"]["nodes"])) 37 | meta_wait_timeout = config['metadata'].get('wait_timeout', 5) 38 | 39 | try: 40 | meta_node.add_remotes(addresses) 41 | except Exception as e: 42 | logger.error('Failed to connect to any elliptics meta storage node: {0}'.format( 43 | e)) 44 | raise ValueError('Failed to connect to any elliptics storage META node') 45 | 46 | meta_session = elliptics.Session(meta_node) 47 | meta_session.set_timeout(meta_wait_timeout) 48 | meta_session.add_groups(list(config["metadata"]["groups"])) 49 | 50 | time.sleep(5) 51 | 52 | # def get_max_group_id(): 53 | # max_group = int(meta_session.read_data( 54 | # 'mastermind:max_group').get()[0].data) 55 | # return max_group 56 | 57 | BASE_PORT = config.get('elliptics_base_port', 1024) 58 | 59 | def port_to_path(port): 60 | if port == 9999: 61 | return '/srv/cache/' 62 | return os.path.join(config.get('elliptics_base_storage_path', '/srv/storage/'), 63 | port_to_dir(port) + '/') 64 | 65 | def port_to_dir(port): 66 | return str(port - BASE_PORT) 67 | 68 | IPV4_RE = re.compile('\d{0,3}\.\d{0,3}\.\d{0,3}\.\d{0,3}') 69 | 70 | def ip_to_family(ip): 71 | if IPV4_RE.match(ip) is not None: 72 | return 2 73 | else: 74 | return 10 75 | 76 | def try_convert_group_history(index): 77 | try: 78 | data = msgpack.unpackb(index.data) 79 | for node_set in data['nodes']: 80 | updated_node_set = [] 81 | for node in node_set['set']: 82 | record = [] 83 | if len(node) == 2: 84 | # old-style record 85 | record.extend(node) 86 | record.append(ip_to_family(node[0])) 87 | record.append(None) # backend_id didn't exist 88 | record.append(port_to_path(node[1])) 89 | elif len(node) == 4: 90 | record.extend(node[0:2]) 91 | record.append(ip_to_family(node[0])) 92 | record.extend(node[2:4]) 93 | elif len(node) == 5: 94 | record.extend(node) 95 | else: 96 | raise ValueError('Group {0} history record is strange: {1}'.format(data['id'], data)) 97 | updated_node_set.append(tuple(record)) 98 | node_set['set'] = tuple(updated_node_set) 99 | if not 'type' in node_set: 100 | node_set['type'] = 'automatic' 101 | 102 | # print data 103 | eid = meta_session.transform('mastermind:group_%d' % data['id']) 104 | meta_session.update_indexes(eid, ['mastermind:groups_idx'], 105 | [msgpack.packb(data)]).get() 106 | print "Converted group {0}".format(data['id']) 107 | except Exception as e: 108 | print "Failed to convert index record: {0}, data: {1}".format(e, index.data) 109 | 110 | def convert_groups_history(): 111 | for res in meta_session.find_all_indexes(['mastermind:groups_idx']).get(): 112 | for idx in res.indexes: 113 | try_convert_group_history(idx) 114 | 115 | if __name__ == '__main__': 116 | 117 | if len(sys.argv) < 2 or sys.argv[1] not in ('convert',): 118 | print "Usage: {0} convert".format(sys.argv[0]) 119 | sys.exit(1) 120 | 121 | convert_groups_history() 122 | 123 | 124 | 125 | 
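# Usage sketch: run on a host that can reach the elliptics meta storage
# configured in /etc/elliptics/mastermind.conf:
#
#     python 05-group-history-unification.py convert
#
# Note that every 'mastermind:groups_idx' record is rewritten in place
# via update_indexes; the script has no dry-run mode.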
-------------------------------------------------------------------------------- /src/cocaine-app/couple_records.py: -------------------------------------------------------------------------------- 1 | from config import config 2 | from db.mongo import MongoObject 3 | from db.mongo.pool import Collection 4 | from mastermind_core import helpers 5 | import storage 6 | 7 | 8 | class CoupleRecord(MongoObject): 9 | 10 | ID = 'id' 11 | SETTINGS = 'settings' 12 | 13 | PRIMARY_ID_KEY = ID 14 | 15 | def __init__(self, **init_params): 16 | super(CoupleRecord, self).__init__() 17 | 18 | self.couple_id = init_params[self.ID] 19 | self.settings = init_params.get(self.SETTINGS, {}) 20 | 21 | @property 22 | def id(self): 23 | return self.couple_id 24 | 25 | @classmethod 26 | def new(cls, **kwargs): 27 | couple_record = cls(**kwargs) 28 | couple_record._dirty = True 29 | return couple_record 30 | 31 | def set_settings(self, settings, update=True): 32 | if update: 33 | old_settings = self.settings 34 | else: 35 | old_settings = {} 36 | new_settings = helpers.merge_dict(old_settings, settings) 37 | CoupleSettingsValidator.validate_settings(new_settings) 38 | self.settings = new_settings 39 | self._dirty = True 40 | 41 | def dump(self): 42 | return { 43 | 'id': self.id, 44 | 'settings': self.settings, 45 | } 46 | 47 | 48 | class CoupleSettingsValidator(object): 49 | 50 | @staticmethod 51 | def validate_settings(settings): 52 | if storage.Couple.READ_PREFERENCE not in settings: 53 | raise ValueError('Couple requires "read_preference" setting') 54 | for setting_name, setting in settings.iteritems(): 55 | if setting_name == storage.Couple.READ_PREFERENCE: 56 | CoupleSettingsValidator._validate_read_preference( 57 | settings[storage.Couple.READ_PREFERENCE] 58 | ) 59 | else: 60 | raise ValueError('Invalid couple setting: "{}"'.format(setting_name)) 61 | 62 | @staticmethod 63 | def _validate_read_preference(rp_settings): 64 | if not isinstance(rp_settings, (tuple, list)): 65 | raise ValueError('"read_preference" is "{type}", expected a list or a tuple'.format( 66 | type=type(rp_settings).__name__, 67 | )) 68 | 69 | unique_rp = set() 70 | for read_preference in rp_settings: 71 | if read_preference not in storage.GROUPSET_IDS: 72 | raise ValueError('Invalid groupset id: {}'.format(read_preference)) 73 | if read_preference in unique_rp: 74 | raise ValueError( 75 | 'Read preference should contain unique list of groupset ids, ' 76 | 'value "{value}" is found more than once'.format( 77 | value=read_preference, 78 | ) 79 | ) 80 | unique_rp.add(read_preference) 81 | 82 | 83 | class CoupleRecordNotFoundError(Exception): 84 | pass 85 | 86 | 87 | class CoupleRecordFinder(object): 88 | def __init__(self, db): 89 | self.collection = Collection(db[config['metadata']['couples']['db']], 'couples') 90 | 91 | def _get_couple_record(self, couple): 92 | couple_id = couple.as_tuple()[0] 93 | couple_records_list = ( 94 | self.collection 95 | .list(**{CoupleRecord.ID: couple_id}) 96 | .limit(1) 97 | ) 98 | try: 99 | couple_record = CoupleRecord(**couple_records_list[0]) 100 | except IndexError: 101 | raise CoupleRecordNotFoundError( 102 | 'Couple record for couple {} is not found'.format(couple_id) 103 | ) 104 | couple_record.collection = self.collection 105 | return couple_record 106 | 107 | def couple_record(self, couple): 108 | try: 109 | return self._get_couple_record(couple) 110 | except CoupleRecordNotFoundError: 111 | couple_id = couple.as_tuple()[0] 112 | couple_record = CoupleRecord.new(**{CoupleRecord.ID: couple_id}) 113 | 
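# No stored record for this couple yet: fall back to a fresh record
# (created dirty, see CoupleRecord.new) bound to the same collection,
# so a later save will persist it.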
couple_record.collection = self.collection 114 | return couple_record 115 | 116 | def couple_records(self, ids=None): 117 | records = [ 118 | CoupleRecord(**gh) 119 | for gh in self.collection.list(id=ids) 120 | ] 121 | for r in records: 122 | r.collection = self.collection 123 | r._dirty = False 124 | return records 125 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/service.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import itertools 3 | 4 | from cocaine.asio.exceptions import CommunicationError, DisconnectionError, IllegalStateError 5 | from cocaine.futures import chain, Deferred 6 | from cocaine.logging import Logger 7 | from cocaine.services import Service 8 | from tornado import ioloop 9 | 10 | 11 | class ReconnectableService(object): 12 | 13 | DEFAULT_HOST = 'localhost' 14 | DEFAULT_PORT = 10053 15 | DEFAULT_ADDRESS = '{host}:{port}'.format(host=DEFAULT_HOST, 16 | port=DEFAULT_PORT) 17 | 18 | def __init__(self, 19 | app_name, 20 | addresses=None, 21 | attempts=3, 22 | delay=0.1, max_delay=60.0, delay_exp=2.0, 23 | connect_timeout=None, 24 | timeout=None, 25 | logger=None): 26 | self.delay = delay 27 | self.max_delay = max_delay 28 | self.delay_exp = delay_exp 29 | self.connect_timeout = connect_timeout 30 | self.timeout = timeout 31 | self.attempts = attempts 32 | self.logger = logger or Logger() 33 | self._reset() 34 | 35 | addresses = addresses or ReconnectableService.DEFAULT_ADDRESS 36 | pairs = [] 37 | for address in addresses.split(','): 38 | address_parts = address.split(':') 39 | host = address_parts[0] 40 | port = (len(address_parts) > 1 and int(address_parts[1]) or 41 | ReconnectableService.DEFAULT_PORT) 42 | pairs.append((host, port)) 43 | self.addresses = itertools.cycle(pairs) 44 | 45 | self.app_name = app_name 46 | self.upstream = None 47 | 48 | def _reset(self): 49 | self._cur_delay = self.delay 50 | 51 | @chain.source 52 | def enqueue(self, handler, data, attempts=None, timeout=None): 53 | attempt = 1 54 | request_attempts = attempts or self.attempts 55 | while True: 56 | try: 57 | yield self._reconnect_if_needed() 58 | yield self.upstream.enqueue(handler, data, timeout=timeout or self.timeout) 59 | self._reset() 60 | break 61 | except Exception as e: 62 | error_str = 'Upstream service request failed (attempt {}/{}): {}'.format( 63 | attempt, request_attempts, e) 64 | if isinstance(e, CommunicationError): 65 | self.logger.error(error_str) 66 | if isinstance(e, DisconnectionError): 67 | self.logger.debug('Disconnection from upstream service, ' 68 | 'will reconnect on next attempt') 69 | self.upstream = None 70 | else: 71 | self.logger.error(error_str) 72 | if attempt >= request_attempts: 73 | self._reset() 74 | raise 75 | attempt += 1 76 | yield self._delay() 77 | 78 | @chain.source 79 | def _delay(self): 80 | d = Deferred() 81 | ioloop.IOLoop.current().add_timeout(timedelta(seconds=self._cur_delay), 82 | lambda: d.trigger(None)) 83 | self.logger.debug('Delaying for {:.2f} s'.format(self._cur_delay)) 84 | yield d 85 | self.logger.debug('Resuming from delay...') 86 | self._cur_delay = min(self._cur_delay * self.delay_exp, self.max_delay) 87 | 88 | @chain.source 89 | def _reconnect_if_needed(self): 90 | if not self.upstream: 91 | host, port = self.addresses.next() 92 | self.upstream = Service(self.app_name, blockingConnect=False) 93 | self.logger.debug('Connecting to upstream service "{}", host={}, ' 94 | 
'port={}'.format(self.app_name, host, port)) 95 | yield self.upstream.connect(host=host, port=port, 96 | timeout=self.connect_timeout, 97 | blocking=False) 98 | 99 | if not self.upstream.isConnected(): 100 | try: 101 | self.logger.debug( 102 | 'Reconnecting to upstream service "{}"'.format( 103 | self.app_name)) 104 | yield self.upstream.reconnect(timeout=self.connect_timeout, 105 | blocking=False) 106 | except IllegalStateError: 107 | # seems to be in connecting state 108 | pass 109 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/recover_dc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from error import JobBrokenError 5 | from infrastructure import infrastructure 6 | from infrastructure_cache import cache 7 | from job import Job 8 | from job_types import JobTypes 9 | from tasks import NodeBackendDefragTask, CoupleDefragStateCheckTask, RecoverGroupDcTask 10 | import storage 11 | from sync import sync_manager 12 | from sync.error import ( 13 | LockError, 14 | LockFailedError, 15 | LockAlreadyAcquiredError, 16 | InconsistentLockError, 17 | API_ERROR_CODE 18 | ) 19 | 20 | 21 | logger = logging.getLogger('mm.jobs') 22 | 23 | 24 | class RecoverDcJob(Job): 25 | 26 | PARAMS = ('group', 'couple', 27 | 'resources', 28 | 'keys', 'host', 'port', 'family', 'backend_id' # read-only parameters 29 | ) 30 | 31 | def __init__(self, **kwargs): 32 | super(RecoverDcJob, self).__init__(**kwargs) 33 | self.type = JobTypes.TYPE_RECOVER_DC_JOB 34 | 35 | @classmethod 36 | def new(cls, *args, **kwargs): 37 | job = super(RecoverDcJob, cls).new(*args, **kwargs) 38 | try: 39 | couple = storage.replicas_groupsets[kwargs['couple']] 40 | keys = [] 41 | 42 | for g in couple.groups: 43 | if not g.node_backends: 44 | raise JobBrokenError('Group {0} has no active backends, ' 45 | 'cannot create recover job'.format(g.group_id)) 46 | keys.append(g.get_stat().files) 47 | keys.sort(reverse=True) 48 | job.keys = keys 49 | 50 | min_keys_group = job.__min_keys_group(couple) 51 | nb = min_keys_group.node_backends[0] 52 | job.group = min_keys_group.group_id 53 | job.host = nb.node.host.addr 54 | job.port = nb.node.port 55 | job.backend_id = nb.backend_id 56 | job.family = nb.node.family 57 | except Exception: 58 | job.release_locks() 59 | raise 60 | return job 61 | 62 | def _set_resources(self): 63 | resources = { 64 | Job.RESOURCE_HOST_IN: [], 65 | Job.RESOURCE_HOST_OUT: [], 66 | Job.RESOURCE_FS: [], 67 | } 68 | 69 | couple = storage.replicas_groupsets[self.couple] 70 | for g in couple.groups: 71 | resources[Job.RESOURCE_HOST_IN].append(g.node_backends[0].node.host.addr) 72 | resources[Job.RESOURCE_HOST_OUT].append(g.node_backends[0].node.host.addr) 73 | resources[Job.RESOURCE_FS].append((g.node_backends[0].node.host.addr, str(g.node_backends[0].fs.fsid))) 74 | self.resources = resources 75 | 76 | def human_dump(self): 77 | data = super(RecoverDcJob, self).human_dump() 78 | data['hostname'] = cache.get_hostname_by_addr(data['host'], strict=False) 79 | return data 80 | 81 | def __min_keys_group(self, couple): 82 | return sorted(couple.groups, key=lambda g: g.get_stat().files)[0] 83 | 84 | def create_tasks(self): 85 | 86 | if not self.couple in storage.replicas_groupsets: 87 | raise JobBrokenError('Couple {0} is not found'.format(self.couple)) 88 | 89 | couple = storage.replicas_groupsets[self.couple] 90 | 91 | recover_cmd = infrastructure._recover_group_cmd( 92 | self.group, 93 | 
trace_id=self.id[:16], 94 | ) 95 | task = RecoverGroupDcTask.new(self, 96 | group=self.group, 97 | host=self.host, 98 | cmd=recover_cmd, 99 | params={'node_backend': self.node_backend( 100 | self.host, self.port, self.backend_id).encode('utf-8'), 101 | 'group': str(self.group)}) 102 | self.tasks.append(task) 103 | 104 | def on_complete(self, processor): 105 | processor.planner.update_recover_ts(self.couple, time.time()) 106 | 107 | @property 108 | def _involved_groups(self): 109 | if self.couple is None: 110 | # fallback to old recover dc job format 111 | group = storage.groups[self.group] 112 | else: 113 | # get couple from group, because couple id could have been altered 114 | # (sad but true) 115 | group_id = int(self.couple.split(':')[0]) 116 | group = storage.groups[group_id] 117 | couple = group.couple 118 | if self.couple != str(couple): 119 | self.couple = str(couple) 120 | 121 | group_ids = [g.group_id for g in couple.groups] 122 | 123 | return group_ids 124 | 125 | @property 126 | def _involved_couples(self): 127 | if self.couple is None: 128 | # fallback to old recover dc job format 129 | group = storage.groups[self.group] 130 | else: 131 | group_id = int(self.couple.split(':')[0]) 132 | group = storage.groups[group_id] 133 | couple = group.couple 134 | 135 | return [str(couple)] 136 | -------------------------------------------------------------------------------- /scripts/08-create-couples-free-eff-space-coll.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import datetime 3 | import json 4 | import sys 5 | 6 | import pymongo 7 | 8 | 9 | CONFIG_PATH = '/etc/elliptics/mastermind.conf' 10 | 11 | try: 12 | 13 | with open(CONFIG_PATH, 'r') as config_file: 14 | config = json.load(config_file) 15 | 16 | except Exception as e: 17 | raise ValueError('Failed to load config file %s: %s' % (CONFIG_PATH, e)) 18 | 19 | 20 | def get_mongo_client(): 21 | if not config.get('metadata', {}).get('url', ''): 22 | raise ValueError('Mongo db url is not set') 23 | return pymongo.mongo_replica_set_client.MongoReplicaSetClient(config['metadata']['url']) 24 | 25 | 26 | def create_collection(db, coll_name): 27 | try: 28 | db.create_collection(coll_name) 29 | except pymongo.errors.CollectionInvalid: 30 | raise RuntimeError('Collection {} already exists'.format(coll_name)) 31 | 32 | 33 | record_ttl = config.get('metadata', {}).get('statistics', {}).get( 34 | 'couple_free_effective_space', {}).get('record_ttl', 60 * 60 * 24 * 365) 35 | 36 | 37 | def create_indexes(coll): 38 | coll.ensure_index([ 39 | ('namespace', pymongo.ASCENDING), 40 | ('ts', pymongo.ASCENDING), 41 | ]) 42 | try: 43 | coll.ensure_index( 44 | [ 45 | ('utc_date', pymongo.ASCENDING), 46 | ], 47 | expireAfterSeconds=record_ttl, 48 | background=True 49 | ) 50 | except pymongo.errors.OperationFailure as e: 51 | if e.code == 85: 52 | # this happens when the index already exists but its expireAfterSeconds option 53 | # value differs from the one set in config file. In this case it will be updated 54 | # with a collMod command 55 | pass 56 | else: 57 | raise 58 | # this call is required to update collection's expireAfterSeconds parameter 59 | # as ensure_index does not set it if the index already exists 60 | coll.database.command( 61 | 'collMod', 62 | coll.name, 63 | index={ 64 | 'keyPattern': {'utc_date': pymongo.ASCENDING}, 65 | 'expireAfterSeconds': record_ttl, 66 | } 67 | ) 68 | 69 |
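# A hedged aside on the collMod call above: re-running ensure_index with a
# different expireAfterSeconds does not update the option (MongoDB answers
# with OperationFailure code 85), so the TTL has to be pushed explicitly via
# collMod. The helper below is an illustration only and is not called by
# this script:
def _update_ttl_example(coll, ttl):
    try:
        coll.ensure_index(
            [('utc_date', pymongo.ASCENDING)],
            expireAfterSeconds=ttl,
        )
    except pymongo.errors.OperationFailure as e:
        if e.code != 85:
            raise
    # collMod is the only way to change expireAfterSeconds on a live index
    coll.database.command(
        'collMod',
        coll.name,
        index={
            'keyPattern': {'utc_date': pymongo.ASCENDING},
            'expireAfterSeconds': ttl,
        },
    )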
70 | def check_period(coll): 71 | period = config.get('metadata', {}).get('statistics', {}).get( 72 | 'couple_free_effective_space', {}).get('collect_period', 300) 73 | for ns in coll.distinct('namespace'): 74 | ts = None 75 | for rec in coll.find({'namespace': ns}, sort=[('ts', pymongo.ASCENDING)]): 76 | if ts: 77 | diff_ts = rec['ts'] - ts 78 | 79 | if diff_ts < period * 0.9: 80 | print ( 81 | 'Ns {}: record detected at {}, previous was at {}, ' 82 | 'diff is {}s (too fast)'.format( 83 | ns, 84 | datetime.datetime.fromtimestamp(rec['ts']), 85 | datetime.datetime.fromtimestamp(ts), 86 | int(diff_ts) 87 | ) 88 | ) 89 | elif diff_ts > period * 1.1: 90 | print ( 91 | 'Ns {}: record detected at {}, previous was at {}, ' 92 | 'diff is {}s (too slow)'.format( 93 | ns, 94 | datetime.datetime.fromtimestamp(rec['ts']), 95 | datetime.datetime.fromtimestamp(ts), 96 | int(diff_ts) 97 | ) 98 | ) 99 | ts = rec['ts'] 100 | 101 | 102 | if __name__ == '__main__': 103 | 104 | if len(sys.argv) < 2 or sys.argv[1] not in ('create', 'check', 'indexes'): 105 | print "Usage: {0} create|check|indexes".format(sys.argv[0]) 106 | sys.exit(1) 107 | 108 | coll_name = 'couple_free_effective_space' 109 | 110 | if sys.argv[1] == 'create': 111 | mc = get_mongo_client() 112 | db_name = config.get('metadata', {}).get('statistics', {}).get('db', '') 113 | if not db_name: 114 | print 'Statistics database name is not found in config' 115 | sys.exit(1) 116 | 117 | create_collection(mc[db_name], coll_name) 118 | coll = mc[db_name][coll_name] 119 | create_indexes(coll) 120 | 121 | print 'Successfully created collection {} and its indexes'.format(coll_name) 122 | 123 | elif sys.argv[1] == 'indexes': 124 | mc = get_mongo_client() 125 | db_name = config.get('metadata', {}).get('statistics', {}).get('db', '') 126 | if not db_name: 127 | print 'Statistics database name is not found in config' 128 | sys.exit(1) 129 | 130 | coll = mc[db_name][coll_name] 131 | create_indexes(coll) 132 | 133 | print 'Successfully created indexes for collection {}'.format(coll_name) 134 | 135 | elif sys.argv[1] == 'check': 136 | mc = get_mongo_client() 137 | 138 | db_name = config.get('metadata', {}).get('statistics', {}).get('db', '') 139 | if not db_name: 140 | print 'Statistics database name is not found in config' 141 | sys.exit(1) 142 | 143 | check_period(mc[db_name][coll_name]) 144 | 145 | print 146 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/history_remove_node.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | from errors import CacheUpstreamError 5 | import history 6 | from infrastructure import infrastructure 7 | from infrastructure_cache import cache 8 | from jobs import TaskTypes 9 | import storage 10 | from sync import sync_manager 11 | from task import Task 12 | 13 | 14 | logger = logging.getLogger('mm.jobs') 15 | 16 | 17 | class HistoryRemoveNodeTask(Task): 18 | 19 | PARAMS = ('group', 'host', 'port', 'family', 'backend_id') 20 | TASK_TIMEOUT = 600 21 | 22 | def __init__(self, job): 23 |
super(HistoryRemoveNodeTask, self).__init__(job) 24 | self.type = TaskTypes.TYPE_HISTORY_REMOVE_NODE 25 | 26 | def update_status(self): 27 | # infrastructure state is updated by itself via task queue 28 | pass 29 | 30 | def execute(self): 31 | try: 32 | hostname = cache.get_hostname_by_addr(self.host) 33 | except CacheUpstreamError: 34 | raise ValueError('Failed to resolve job host {}'.format(self.host)) 35 | 36 | nb_hostname_str = '{0}:{1}/{2}'.format( 37 | hostname, self.port, self.backend_id 38 | ).encode('utf-8') 39 | try: 40 | logger.info('Job {0}, task {1}: removing node backend {2} ' 41 | 'from group {3} history'.format( 42 | self.parent_job.id, self.id, nb_hostname_str, self.group)) 43 | infrastructure.detach_node( 44 | group_id=self.group, 45 | hostname=hostname, 46 | port=self.port, 47 | family=self.family, 48 | backend_id=self.backend_id, 49 | record_type=history.GroupStateRecord.HISTORY_RECORD_JOB, 50 | ) 51 | logger.info('Job {0}, task {1}: removed node backend {2} ' 52 | 'from group {3} history'.format( 53 | self.parent_job.id, self.id, nb_hostname_str, self.group)) 54 | except ValueError as e: 55 | # TODO: Think about changing ValueError to some dedicated exception 56 | # to differentiate between event when there is no such node in group 57 | # and an actual ValueError being raised 58 | logger.error('Job {0}, task {1}: failed to remove node backend {2} ' 59 | 'from group {3} history: {4}'.format( 60 | self.parent_job.id, self.id, nb_hostname_str, self.group, e)) 61 | pass 62 | 63 | group = self.group in storage.groups and storage.groups[self.group] or None 64 | 65 | nb_str = '{0}:{1}/{2}'.format(self.host, self.port, self.backend_id).encode('utf-8') 66 | node_backend = nb_str in storage.node_backends and storage.node_backends[nb_str] or None 67 | if group and node_backend and node_backend in group.node_backends: 68 | logger.info('Job {0}, task {1}: removing node backend {2} ' 69 | 'from group {3} node backends'.format( 70 | self.parent_job.id, self.id, node_backend, group)) 71 | group.remove_node_backend(node_backend) 72 | group.update_status_recursive() 73 | logger.info('Job {0}, task {1}: removed node backend {2} ' 74 | 'from group {3} node backends'.format( 75 | self.parent_job.id, self.id, node_backend, group)) 76 | 77 | def human_dump(self): 78 | data = super(HistoryRemoveNodeTask, self).human_dump() 79 | data['hostname'] = cache.get_hostname_by_addr(data['host'], strict=False) 80 | return data 81 | 82 | def finished(self, processor): 83 | return (not self.__node_in_group() or 84 | time.time() - self.start_ts > self.TASK_TIMEOUT) 85 | 86 | def failed(self, processor): 87 | return (time.time() - self.start_ts > self.TASK_TIMEOUT and 88 | self.__node_in_group()) 89 | 90 | def __node_in_group(self): 91 | group = self.group in storage.groups and storage.groups[self.group] or None 92 | nb_str = '{0}:{1}/{2}'.format(self.host, self.port, self.backend_id).encode('utf-8') 93 | node_backend = nb_str in storage.node_backends and storage.node_backends[nb_str] or None 94 | 95 | if group and node_backend: 96 | logger.debug('Job {0}, task {1}: checking node backend {2} ' 97 | 'with group {3} node backends: {4}'.format( 98 | self.parent_job.id, self.id, node_backend, self.group, group.node_backends)) 99 | nb_in_group = node_backend.group is group 100 | else: 101 | nb_in_group = False 102 | 103 | try: 104 | hostname = cache.get_hostname_by_addr(self.host) 105 | except CacheUpstreamError: 106 | raise ValueError('Failed to resolve job host {}'.format(self.host)) 107 | 108 | nb_in_history 
= infrastructure.node_backend_in_last_history_state( 109 | self.group, hostname, self.port, self.backend_id) 110 | logger.debug('Job {0}, task {1}: checking node backend {2} ' 111 | 'in group {3} history set: {4}'.format( 112 | self.parent_job.id, self.id, nb_str, self.group, nb_in_history)) 113 | 114 | if nb_in_group: 115 | logger.info('Job {0}, task {1}: node backend {2} is still ' 116 | 'in group {3}'.format(self.parent_job.id, self.id, nb_str, self.group)) 117 | if nb_in_history: 118 | logger.info('Job {0}, task {1}: node backend {2} is still ' 119 | 'in group\'s {3} history'.format( 120 | self.parent_job.id, self.id, nb_str, self.group)) 121 | 122 | return nb_in_group or nb_in_history 123 | 124 | def __str__(self): 125 | return ( 126 | 'HistoryRemoveNodeTask[id: {id}, host: {host}, port: {port}, ' 127 | 'family: {family}, backend_id: {backend_id}, group: {group}]'.format( 128 | id=self.id, 129 | host=self.host, 130 | port=self.port, 131 | family=self.family, 132 | backend_id=self.backend_id, 133 | group=self.group, 134 | ) 135 | ) 136 | -------------------------------------------------------------------------------- /src/cocaine-app/fake_inventory.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | import socket 3 | 4 | 5 | def get_dc_by_host(addr): 6 | ''' 7 | This is a fake implementation that always returns hostname. 8 | Please provide your own version that uses your server management framework. 9 | ''' 10 | host = socket.gethostbyaddr(addr)[0] 11 | return host 12 | 13 | 14 | def get_host_tree(host): 15 | ''' 16 | This is a fake implementation that always returns a one-level host infrastructure tree. 17 | Please provide your own version that uses your server management framework. 18 | 19 | Return format example: 20 | { 21 | 'name': 'hostname.domain.com', 22 | 'type': 'host', 23 | 'parent': { 24 | 'name': 'alpha', 25 | 'type': 'dc', 26 | } 27 | } 28 | 29 | The outer level type 'host' is mandatory, parents' types are optional. 30 | ''' 31 | return { 32 | 'name': host, 33 | 'type': 'host', 34 | } 35 | 36 | 37 | DC_NODE_TYPE = 'host' 38 | BALANCER_NODE_TYPES = [DC_NODE_TYPE] 39 | 40 | 41 | def get_balancer_node_types(): 42 | ''' 43 | A list of node types that are used by the balancer to create fault-tolerant 44 | namespaces. When creating a new couple for a namespace, the balancer takes into 45 | account the current distribution of open couples and tries to use 46 | cluster nodes that are least used by the namespace. 47 | 48 | All node types used should be present in the host tree of a host 49 | (see the inventory get_host_tree function). 50 | 51 | Example: ['dc', 'host'] 52 | ''' 53 | return BALANCER_NODE_TYPES 54 | 55 | 56 | def get_node_types(): 57 | ''' 58 | A list of infrastructure node types. 59 | Node types represent hardware hierarchy and are used to build the cluster tree. 60 | Each node type represents a corresponding level of the cluster tree. 61 | NOTE: node types should be sorted in top-to-bottom order, e.g. ['dc', 'router', 'host']. 62 | ''' 63 | return [DC_NODE_TYPE] 64 | 65 | 66 | def get_dc_node_type(): 67 | ''' 68 | Returns the dc node type. 69 | Mastermind should know the dc node type identifier to prevent 70 | dc sharing among couples if the corresponding setting is on. 71 | 72 | Example: 'dc' 73 | ''' 74 | return DC_NODE_TYPE 75 | 76 | 77 | def node_start_command(host, port, family): 78 | ''' 79 | Starting an elliptics node is too complex to provide a fake implementation for. 80 | If you really want to be able to use this functionality, you should 81 | provide your own implementation that uses your server management framework. 82 | ''' 83 | return None 84 | 85 |
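# A hedged sketch of a non-fake get_host_tree, kept under a different name
# so the stub above stays in effect; 'cmdb' is an assumption standing in for
# whatever server management framework is available:
def get_host_tree_dc_aware(host, cmdb):
    # hypothetical CMDB lookup resolving the host's datacenter
    dc = cmdb.get_dc_of(host)
    return {
        'name': host,
        'type': 'host',
        'parent': {
            'name': dc,
            'type': 'dc',
        },
    }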
86 | def node_shutdown_command(host, port, family): 87 | ''' 88 | This is a fake implementation that shuts a node down via the dnet_ioclient command. 89 | Please provide your own version that uses your server management framework. 90 | ''' 91 | cmd = 'dnet_ioclient -r {host}:{port}:{family} -U 1' 92 | return cmd.format(host=host, port=port, family=family) 93 | 94 | 95 | def node_reconfigure(host, port, family): 96 | ''' 97 | Command that is executed on an elliptics node to regenerate its elliptics configs. 98 | E.g., reconfiguration is required for a backend restart with an updated group id. 99 | ''' 100 | return None 101 | 102 | 103 | def set_net_monitoring_downtime(host): 104 | ''' 105 | If your infrastructure monitors network activity, it can cause alerts 106 | on every move/restore job that involves a certain host. This inventory 107 | function allows you to implement network activity downtime setting 108 | for the running time of the rsync command. 109 | NB: This function should not throw an exception if net monitoring downtime 110 | is already set. 111 | ''' 112 | return None 113 | 114 | 115 | def remove_net_monitoring_downtime(host): 116 | ''' 117 | See the "set_net_monitoring_downtime" doc string. 118 | ''' 119 | return None 120 | 121 | 122 | def get_host_ip_addresses(hostname): 123 | ''' 124 | Resolves hostname to IPv4/IPv6 addresses 125 | 126 | Mastermind will preferably use an address with a family corresponding 127 | to elliptics client connection settings. 128 | 129 | Returns: 130 | { 131 | socket.AF_INET: [ 132 | '1.2.3.4', 133 | '5.6.7.8', 134 | ], 135 | socket.AF_INET6: [ 136 | '2001:db8::1', 137 | ] 138 | } 139 | ''' 140 | ip_addresses = {} 141 | host, port, family, socktype = hostname, None, socket.AF_UNSPEC, socket.SOL_TCP 142 | records = socket.getaddrinfo(host, port, family, socktype) 143 | for record in records: 144 | # record format is (family, socktype, proto, canonname, sockaddr), 145 | # sockaddr format depends on family of the socket: 146 | # socket.AF_INET - (address, port), 147 | # socket.AF_INET6 - (address, port, flow info, scope id). 148 | # See docs for more info: https://docs.python.org/2/library/socket.html#socket.getaddrinfo 149 | family, sockaddr = record[0], record[4] 150 | ip_address = sockaddr[0] 151 | ip_addresses.setdefault(family, []).append(ip_address) 152 | return ip_addresses 153 | 154 | 155 | def get_new_group_files(group_id, total_space): 156 | ''' 157 | Get files required for the new group to be created 158 | 159 | Files will be created on a filesystem in group's base directory by mastermind-minion. 160 | They can be helpful if elliptics is configured by automatic scripts 161 | that examine the contents of group's base directory. 162 | Filenames should be relative to the group's base directory. 163 | 164 | Returns: 165 | { 166 | <filename>: <file content>, 167 | <filename>: <file content>, 168 | ... 169 | } 170 | ''' 171 | return {} 172 | 173 | 174 | def get_node_config_path(node): 175 | ''' 176 | Get the path to the config file of a node 177 | 178 | This config path can be used by mastermind-minion for fetching 179 | any elliptics config parameters.
180 | ''' 181 | return '/etc/elliptics/elliptics.conf' 182 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/query/groupsets.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from mastermind import query 4 | from mastermind.query import Query, LazyDataObject 5 | # imported as a module because of cross-dependencies with 'groupset' module 6 | import mastermind.query.groups 7 | 8 | 9 | class GroupsetDataObject(LazyDataObject): 10 | def _fetch_data(self): 11 | return self.client.request('get_groupset_by_id', self.id) 12 | 13 | @staticmethod 14 | def _raw_id(raw_data): 15 | return raw_data['id'] 16 | 17 | @property 18 | @LazyDataObject._lazy_load 19 | def status(self): 20 | """Current status of the groupset. 21 | 22 | Possible values: 23 | 'INIT' - newly created groupset or any of the groups has INIT status; 24 | 'OK' - groupset is up and ready for write requests; 25 | 'FULL' - groupset is up but has no available space for write requests; 26 | 'FROZEN' - groupset was frozen and should not participate in write requests; 27 | 'RO' - any of groupset's groups is in read-only state or migrating; 28 | 'SERVICE_ACTIVE' - some of groupset's groups are being processed in a move or restore job, 29 | the job is executing; 30 | 'SERVICE_STALLED' - some of groupset's groups are being processed in a move or restore job, 31 | but the job is in pending state and requires attention; 32 | 'ARCHIVED' - groupset's couple is in archived state, does not accept new data and 33 | has an lrc groupset with a copy of groupset's data; 34 | 'BROKEN' - groupset's configuration is invalid, text description is available through 35 | the status_text attribute; 36 | 'BAD' - represents an error state, text description is available through 37 | the status_text attribute; 38 | """ 39 | return self._data['status'] 40 | 41 | @property 42 | @LazyDataObject._lazy_load 43 | def status_text(self): 44 | """Human-readable and clarifying version of status. 45 | """ 46 | return self._data['status_text'] 47 | 48 | @property 49 | @LazyDataObject._lazy_load 50 | def group_ids(self): 51 | """ List of ids of groups participating in the groupset. 52 | """ 53 | return self._data['group_ids'] 54 |
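    # Note on the pattern above: every property is wrapped in
    # LazyDataObject._lazy_load, so constructing a Groupset by id costs
    # nothing and the first attribute access issues a single
    # 'get_groupset_by_id' request. A hedged usage sketch (the client
    # construction and its 'groupsets' accessor are assumptions):
    #
    #     gs = client.groupsets[42]           # no request is sent yet
    #     if gs.status not in GOOD_STATUSES:  # data is fetched here, once
    #         print gs.status_text            # served from the cached data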
55 | @property 56 | @LazyDataObject._lazy_load 57 | def groups(self): 58 | """ Groups in a groupset 59 | """ 60 | return self._data['groups'] 61 | 62 | @property 63 | @LazyDataObject._lazy_load 64 | def type(self): 65 | """ Groupset type 66 | """ 67 | return self._data['type'] 68 | 69 | @property 70 | @LazyDataObject._lazy_load 71 | def settings(self): 72 | """ Groupset settings 73 | """ 74 | return self._data['settings'] 75 | 76 | @property 77 | @LazyDataObject._lazy_load 78 | def couple_id(self): 79 | return self._data['couple'] 80 | 81 | def _preprocess_raw_data(self, data): 82 | groups = [] 83 | for g_data in data['groups']: 84 | groups.append(mastermind.query.groups.Group.from_data(g_data, self.client)) 85 | data['groups'] = groups 86 | return data 87 | 88 | def serialize(self): 89 | data = super(GroupsetDataObject, self).serialize() 90 | groups = [group.serialize() for group in data['groups']] 91 | data['groups'] = groups 92 | return data 93 | 94 | 95 | GOOD_STATUSES = set(['OK', 'FULL', 'FROZEN']) 96 | 97 | 98 | class GroupsetQuery(Query): 99 | @property 100 | def couple(self): 101 | if self.couple_id is None: 102 | return None 103 | return mastermind.query.couples.Couple(self.couple_id, client=self.client) 104 | 105 | 106 | class Groupset(GroupsetQuery, GroupsetDataObject): 107 | def __init__(self, id, client=None): 108 | super(Groupset, self).__init__(client) 109 | self.id = id 110 | 111 | def __repr__(self): 112 | return '<Groupset {}: status: {} ({})>'.format(self.id, self.status, self.status_text) 113 | 114 | 115 | class GroupsetsQuery(Query): 116 | def __init__(self, client, filter=None): 117 | super(GroupsetsQuery, self).__init__(client) 118 | self._filter = filter or {} 119 | 120 | def __getitem__(self, key): 121 | return Groupset(key, self.client) 122 | 123 | def __iter__(self): 124 | groupsets = self.client.request('get_groupsets_list', {'filter': self._filter}) 125 | for gs_data in groupsets: 126 | gs = Groupset(GroupsetDataObject._raw_id(gs_data), self.client) 127 | gs._set_raw_data(gs_data) 128 | yield gs 129 | 130 | def __contains__(self, key): 131 | # TODO: this should be implemented better than in CouplesQuery object 132 | raise NotImplementedError 133 | 134 | @property 135 | def replicas(self): 136 | """ Get replicas groupsets 137 | """ 138 | return self.filter(type='replicas') 139 | 140 | @property 141 | def lrc(self): 142 | """ Get lrc groupsets 143 | """ 144 | return self.filter(type='lrc') 145 | 146 | def filter(self, **kwargs): 147 | """Filter groupsets list. 148 | 149 | Keyword args: 150 | namespace: get groupsets belonging to a certain namespace. 151 | state: mostly the same as groupsets status, but one state can actually 152 | combine several statuses. Represents groupset state from admin's point of view. 153 | States to groupset statuses: 154 | good: OK 155 | full: FULL 156 | frozen: FROZEN 157 | bad: INIT, BAD 158 | broken: BROKEN 159 | service-active: SERVICE_ACTIVE 160 | service-stalled: SERVICE_STALLED 161 | archived: ARCHIVED 162 | type: specific groupset type (by default there are no type restrictions): 163 | replicas: regular replicas groupsets 164 | lrc: lrc groupsets 165 | 166 | Returns: 167 | New groupsets query object with selected filter parameters.
168 | """ 169 | updated_filter = copy.copy(self._filter) 170 | if 'namespace' in kwargs: 171 | # TODO: rename _object method to 'to_object' or something similar 172 | updated_filter['namespace'] = query.namespaces.Namespace._object( 173 | kwargs['namespace'], 174 | self.client 175 | ).id 176 | if 'state' in kwargs: 177 | updated_filter['state'] = kwargs['state'] 178 | if 'type' in kwargs: 179 | updated_filter['type'] = kwargs['type'] 180 | return GroupsetsQuery(self.client, filter=updated_filter) 181 | 182 | def __delitem__(self, key): 183 | return Groupset._object(key, self.client).remove() 184 | -------------------------------------------------------------------------------- /src/python-mastermind/src/mastermind/query/groups.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from mastermind.query import Query, LazyDataObject 4 | import mastermind.query.groupsets 5 | from mastermind.query.node_backends import NodeBackend 6 | from mastermind.query.history import GroupHistory 7 | 8 | 9 | class GroupsQuery(Query): 10 | def __init__(self, client, filter=None): 11 | super(GroupsQuery, self).__init__(client) 12 | self._filter = filter or {} 13 | 14 | def __getitem__(self, key): 15 | return Group(key, self.client) 16 | 17 | def next_group_ids(self, count=1): 18 | """Fetch some free group ids. 19 | 20 | Elliptics groups are identified by integer group ids. Mastermind provides 21 | a sequence of increasing group ids for assigning to new groups added to storage. 22 | 23 | Args: 24 | count: number of group ids to fetch. 25 | """ 26 | return self.client.request('get_next_group_number', count) 27 | 28 | def __iter__(self): 29 | groups = self.client.request('get_groups_list', [self._filter]) 30 | for g_data in groups: 31 | gq = Group(GroupDataObject._raw_id(g_data), self.client) 32 | gq._set_raw_data(g_data) 33 | yield gq 34 | 35 | def filter(self, **kwargs): 36 | """Filter groups list. 37 | 38 | Keyword args: 39 | uncoupled (deprecated - use 'type' == 'uncoupled'): 40 | get groups that are not assigned to any couple. 41 | type: 42 | filter groups by type: 43 | 44 | uncoupled: uncoupled data groups (no metakey); 45 | data: simple data groups; 46 | cache: cache groups for popular keys; 47 | uncoupled_cache: cache groups that are not yet marked by mastermind; 48 | lrc-8-2-2-v1: lrc data groups (scheme 8-2-2 version 1); 49 | uncoupled_lrc-8-2-2-v1: uncoupled lrc groups prepared 50 | for data convert (scheme 8-2-2 version 1); 51 | 52 | in_jobs: 53 | get groups that are participating in any active jobs. 54 | state: 55 | mostly the same as group status, but one state can actually 56 | combine several statuses. Represents group state from admin's point of view. 57 | States to group statuses: 58 | 59 | init: INIT 60 | good: COUPLED 61 | bad: INIT, BAD 62 | broken: BROKEN 63 | ro: RO 64 | migrating: MIGRATING 65 | 66 | Returns: 67 | New groups query object with selected filter parameters. 
68 | """ 69 | updated_filter = copy.copy(self._filter) 70 | if 'uncoupled' in kwargs: 71 | updated_filter['uncoupled'] = kwargs['uncoupled'] 72 | if 'in_jobs' in kwargs: 73 | updated_filter['in_jobs'] = kwargs['in_jobs'] 74 | if 'state' in kwargs: 75 | updated_filter['state'] = kwargs['state'] 76 | if 'type' in kwargs: 77 | updated_filter['type'] = kwargs['type'] 78 | return GroupsQuery(self.client, filter=updated_filter) 79 | 80 | 81 | class GroupDataObject(LazyDataObject): 82 | def _fetch_data(self): 83 | return self.client.request('get_group_info', self.id) 84 | 85 | @staticmethod 86 | def _raw_id(raw_data): 87 | return raw_data['id'] 88 | 89 | @property 90 | @LazyDataObject._lazy_load 91 | def status(self): 92 | return self._data['status'] 93 | 94 | @property 95 | @LazyDataObject._lazy_load 96 | def status_text(self): 97 | return self._data['status_text'] 98 | 99 | @property 100 | @LazyDataObject._lazy_load 101 | def node_backends(self): 102 | return self._data['node_backends'] 103 | 104 | @property 105 | @LazyDataObject._lazy_load 106 | def groupset_id(self): 107 | return self._data['groupset'] 108 | 109 | @property 110 | @LazyDataObject._lazy_load 111 | def couple_id(self): 112 | return self._data['couple'] 113 | 114 | def _preprocess_raw_data(self, data): 115 | node_backends = [] 116 | for nb_data in data['node_backends']: 117 | node_backends.append(NodeBackend.from_data(nb_data, self.client)) 118 | data['node_backends'] = node_backends 119 | 120 | return data 121 | 122 | def serialize(self): 123 | data = super(GroupDataObject, self).serialize() 124 | node_backends = [nb.serialize() for nb in data['node_backends']] 125 | data['node_backends'] = node_backends 126 | return data 127 | 128 | 129 | class GroupQuery(Query): 130 | @property 131 | def meta(self): 132 | """Reads metakey for group. 133 | 134 | Returns: 135 | Group metakey, already unpacked. 136 | """ 137 | return self.client.request('get_group_meta', [self.id, None, True])['data'] 138 | 139 | def move(self, uncoupled_groups=None, force=False): 140 | """Create a group move job. 141 | 142 | The job will move the group's node backend to an uncoupled group's node backend. 143 | The uncoupled group will be replaced, and the source group's node backend will be disabled. 144 | 145 | Args: 146 | uncoupled_groups: list of uncoupled groups that should be merged together 147 | and replaced by the source group. 148 | force: cancel all pending jobs of low priority (e.g. recover-dc and defragmentation). 149 | 150 | Returns: 151 | A json of the created job (or a dict with a single error key and value). 152 | """ 153 | uncoupled_groups = [GroupQuery._object(self.client, g) for g in uncoupled_groups or []] 154 | return self.client.request('move_group', 155 | [self.id, 156 | {'uncoupled_groups': [g.id for g in uncoupled_groups]}, 157 | force]) 158 |
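    # A hedged usage sketch for move() (the group ids are hypothetical): move
    # group 101 onto the space of uncoupled groups 201 and 202 merged
    # together, cancelling low-priority jobs that stand in the way:
    #
    #     job = client.groups[101].move(uncoupled_groups=[201, 202],
    #                                   force=True)
    #
    # The returned value is the dump of the created job, as produced by the
    # 'move_group' handle.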
159 | @property 160 | def history(self): 161 | history_data = self.client.request('get_group_history', [self.id]) 162 | return GroupHistory(couples=history_data['couples'], 163 | nodes=history_data['nodes']) 164 | 165 | @property 166 | def groupset(self): 167 | if self.groupset_id is None: 168 | return None 169 | return mastermind.query.groupsets.Groupset(self.groupset_id, client=self.client) 170 | 171 | @property 172 | def couple(self): 173 | if self.couple_id is None: 174 | return None 175 | return mastermind.query.couples.Couple(self.couple_id, client=self.client) 176 | 177 | 178 | class Group(GroupQuery, GroupDataObject): 179 | def __init__(self, id, client=None): 180 | super(Group, self).__init__(client) 181 | self.id = id 182 | 183 | def __repr__(self): 184 | return '<Group {}: status: {} ({})>'.format(self.id, self.status, self.status_text) 185 | -------------------------------------------------------------------------------- /src/cocaine-app/jobs/tasks/rsync_backend.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from errors import CacheUpstreamError 4 | from infrastructure_cache import cache 5 | import inventory 6 | import jobs 7 | from jobs import JobBrokenError, TaskTypes 8 | from minion_cmd import MinionCmdTask 9 | import storage 10 | 11 | 12 | logger = logging.getLogger('mm.jobs') 13 | 14 | 15 | class RsyncBackendTask(MinionCmdTask): 16 | 17 | PARAMS = MinionCmdTask.PARAMS + ('node_backend', 'src_host') 18 | 19 | def __init__(self, job): 20 | super(RsyncBackendTask, self).__init__(job) 21 | self.type = TaskTypes.TYPE_RSYNC_BACKEND_TASK 22 | 23 | def execute(self, processor): 24 | logger.info( 25 | 'Job {job_id}, task {task_id}: checking group {group_id} ' 26 | 'and node backend {nb} state'.format( 27 | job_id=self.parent_job.id, 28 | task_id=self.id, 29 | group_id=self.group, 30 | nb=self.node_backend 31 | ) 32 | ) 33 | 34 | if self.node_backend: 35 | # Check if the old backend is down or if the group is already running on 36 | # different node backends 37 | # This check is not applied to move jobs 38 | current_group_node_backends = set() 39 | if self.group in storage.groups: 40 | group = storage.groups[self.group] 41 | current_group_node_backends = set(group.node_backends) 42 | 43 | if self.node_backend in storage.node_backends: 44 | old_node_backend = storage.node_backends[self.node_backend] 45 | expected_statuses = (storage.Status.STALLED, storage.Status.INIT, storage.Status.RO) 46 | old_group_node_backend_is_up = ( 47 | old_node_backend in current_group_node_backends and 48 | old_node_backend.status not in expected_statuses 49 | ) 50 | if old_group_node_backend_is_up: 51 | raise JobBrokenError( 52 | 'Node backend {nb} has status {status}, ' 53 | 'expected {expected_statuses}'.format( 54 | nb=old_node_backend, 55 | status=old_node_backend.status, 56 | expected_statuses=expected_statuses 57 | ) 58 | ) 59 | current_group_node_backends.discard(old_node_backend) 60 | 61 | if current_group_node_backends: 62 | raise JobBrokenError( 63 | 'Group {} is running on unexpected backends {}'.format( 64 | self.group, 65 | [str(nb) for nb in current_group_node_backends] 66 | ) 67 | ) 68 | 69 | super(RsyncBackendTask, self).execute(processor) 70 | 71 | def __hostnames(self, hosts): 72 | hostnames = [] 73 | for host
in hosts: 74 | try: 75 | hostnames.append(cache.get_hostname_by_addr(host)) 76 | except CacheUpstreamError: 77 | raise RuntimeError('Failed to resolve host {0}'.format(host)) 78 | return hostnames 79 | 80 | def on_exec_start(self, processor): 81 | hostnames = set(self.__hostnames([self.host, self.src_host])) 82 | 83 | dl = jobs.Job.list(processor.downtimes, 84 | host=list(hostnames), type='network_load') 85 | 86 | set_hostnames = set(record['host'] for record in dl) 87 | not_set_hostnames = hostnames - set_hostnames 88 | 89 | if not_set_hostnames: 90 | try: 91 | for hostname in not_set_hostnames: 92 | inventory.set_net_monitoring_downtime(hostname) 93 | except Exception as e: 94 | logger.error( 95 | 'Job {job_id}, task {task_id}: failed to set net monitoring downtime: ' 96 | '{error}'.format( 97 | job_id=self.parent_job.id, 98 | task_id=self.id, 99 | error=e 100 | ) 101 | ) 102 | raise 103 | 104 | try: 105 | bulk_op = processor.downtimes.initialize_unordered_bulk_op() 106 | for hostname in hostnames: 107 | bulk_op.insert({'job_id': self.parent_job.id, 108 | 'host': hostname, 109 | 'type': 'network_load'}) 110 | res = bulk_op.execute() 111 | if res['nInserted'] != len(hostnames): 112 | raise ValueError('failed to set all downtimes: {0}/{1}'.format( 113 | res['nInserted'], len(hostnames))) 114 | except Exception as e: 115 | logger.error( 116 | 'Job {job_id}, task {task_id}: unexpected mongo error: {error}'.format( 117 | job_id=self.parent_job.id, 118 | task_id=self.id, 119 | error=e 120 | ) 121 | ) 122 | raise 123 | 124 | def on_exec_stop(self, processor): 125 | hostnames = set(self.__hostnames([self.host, self.src_host])) 126 | 127 | dl = jobs.Job.list(processor.downtimes, 128 | host=list(hostnames), type='network_load') 129 | 130 | busy_hostnames = set() 131 | for rec in dl: 132 | if rec['job_id'] != self.parent_job.id: 133 | busy_hostnames.add(rec['host']) 134 | 135 | release_hostnames = hostnames - busy_hostnames 136 | if release_hostnames: 137 | try: 138 | for hostname in release_hostnames: 139 | inventory.remove_net_monitoring_downtime(hostname) 140 | except Exception as e: 141 | logger.error( 142 | 'Job {job_id}, task {task_id}: failed to remove net monitoring downtime: ' 143 | '{error}'.format( 144 | job_id=self.parent_job.id, 145 | task_id=self.id, 146 | error=e 147 | ) 148 | ) 149 | raise 150 | 151 | try: 152 | res = processor.downtimes.remove({'job_id': self.parent_job.id, 153 | 'host': {'$in': list(hostnames)}, 154 | 'type': 'network_load'}) 155 | if res['ok'] != 1: 156 | raise ValueError('bad response: {0}'.format(res)) 157 | except Exception as e: 158 | logger.error( 159 | 'Job {job_id}, task {task_id}: unexpected mongo error: {error}'.format( 160 | job_id=self.parent_job.id, 161 | task_id=self.id, 162 | error=e 163 | ) 164 | ) 165 | raise 166 | -------------------------------------------------------------------------------- /tests/test_pool.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import time 3 | 4 | import pytest 5 | 6 | from mastermind import pool 7 | from fixtures.util import parametrize 8 | 9 | 10 | class TimingWrapper(object): 11 | """Executes function and stores execution elapsed time 12 | """ 13 | 14 | def __init__(self, func): 15 | self.func = func 16 | self.elapsed = None 17 | 18 | def __call__(self, *args, **kwds): 19 | t = time.time() 20 | try: 21 | return self.func(*args, **kwds) 22 | finally: 23 | self.elapsed = time.time() - t 24 | 25 | 26 | class TestPool(object): 27 | """Test basic 
functionality of overridden multiprocessing.Pool implementation 28 | """ 29 | 30 | def test_make_pool(self): 31 | """Pool initialization""" 32 | p = pool.Pool(processes=2) 33 | assert len(p._pool) == 2 34 | 35 | @parametrize( 36 | 'processes, task_delay', 37 | [(4, 5.0)], 38 | arglabels={ 39 | 'task_delay': 'task delay' 40 | }, 41 | ) 42 | def test_terminate(self, delay_task_worker_pool): 43 | """Pool termination""" 44 | delay_task_worker_pool.imap_unordered(None, xrange(100)) 45 | delay_task_worker_pool.terminate() 46 | 47 | join = TimingWrapper(delay_task_worker_pool.join) 48 | join() 49 | 50 | assert 0 <= join.elapsed <= 0.2 51 | 52 | @parametrize( 53 | 'processes, task_delay', 54 | [(2, 0.5)], 55 | arglabels={ 56 | 'task_delay': 'task delay' 57 | }, 58 | ) 59 | def test_close(self, delay_task_worker_pool): 60 | """Pool soft close""" 61 | delay_task_worker_pool.imap_unordered(None, xrange(2)) 62 | delay_task_worker_pool.close() 63 | 64 | join = TimingWrapper(delay_task_worker_pool.join) 65 | join() 66 | 67 | assert 0.5 <= join.elapsed <= 0.8 68 | 69 | @parametrize( 70 | 'processes, task_delay', 71 | [(2, 1.0)], 72 | arglabels={ 73 | 'task_delay': 'task delay' 74 | }, 75 | ) 76 | def test_restore_pool(self, delay_task_worker_pool): 77 | """Resurrection of workers in case of worker process termination""" 78 | delay_task_worker_pool._pool[0].terminate() 79 | time.sleep(0.2) 80 | assert len(delay_task_worker_pool._pool) == 2 81 | for w in delay_task_worker_pool._pool: 82 | assert w.is_alive() is True 83 | assert w.exitcode is None 84 | 85 | @parametrize( 86 | 'processes, task_delay', 87 | [(4, 0.0)], 88 | arglabels={ 89 | 'task_delay': 'task delay' 90 | }, 91 | ) 92 | def test_imap_unordered(self, delay_task_worker_pool): 93 | """imap_unordered with default chunksize (1)""" 94 | RESULTS_NUM = 21 95 | res = delay_task_worker_pool.imap_unordered(None, xrange(RESULTS_NUM)) 96 | assert range(RESULTS_NUM) == sorted(list(res)) 97 | delay_task_worker_pool.close() 98 | 99 | @pytest.mark.xfail 100 | @parametrize( 101 | 'processes, task_delay', 102 | [(4, 0.0)], 103 | arglabels={ 104 | 'task_delay': 'task delay' 105 | }, 106 | ) 107 | def test_imap_unordered_chunk(self, delay_task_worker_pool): 108 | """imap_unordered with chunksize > 1""" 109 | RESULTS_NUM = 21 110 | res = delay_task_worker_pool.imap_unordered( 111 | None, 112 | xrange(RESULTS_NUM), 113 | chunksize=4 114 | ) 115 | assert range(RESULTS_NUM) == sorted(list(res)) 116 | delay_task_worker_pool.close() 117 | 118 | @parametrize( 119 | 'processes, task_delay', 120 | [(2, 0.0)], 121 | arglabels={ 122 | 'task_delay': 'task delay' 123 | }, 124 | ) 125 | def test_apply(self, delay_task_worker_pool): 126 | assert delay_task_worker_pool.apply(None, (1,)) == 1 127 | 128 | @pytest.mark.xfail 129 | @parametrize( 130 | 'processes, task_delay', 131 | [(2, 0.0)], 132 | arglabels={ 133 | 'task_delay': 'task delay' 134 | }, 135 | ) 136 | def test_map(self, delay_task_worker_pool): 137 | """map with default chunksize (None)""" 138 | RESULTS_NUM = 5 139 | assert delay_task_worker_pool.map(None, xrange(RESULTS_NUM)) == range(RESULTS_NUM) 140 | 141 | @pytest.mark.xfail 142 | @parametrize( 143 | 'processes, task_delay', 144 | [(2, 0.0)], 145 | arglabels={ 146 | 'task_delay': 'task delay' 147 | }, 148 | ) 149 | def test_map_chunk(self, delay_task_worker_pool): 150 | """map with chunksize 1""" 151 | RESULTS_NUM = 5 152 | assert delay_task_worker_pool.map( 153 | None, xrange(RESULTS_NUM), chunksize=1 154 | ) == range(RESULTS_NUM) 155 | 156 | @parametrize( 157 | 
'processes, task_delay', 158 | [(2, 0.0)], 159 | arglabels={ 160 | 'task_delay': 'task delay' 161 | }, 162 | ) 163 | def test_apply_async(self, delay_task_worker_pool): 164 | """Blocking apply_async""" 165 | assert delay_task_worker_pool.apply_async(None, (1,)).get() == 1 166 | 167 | @parametrize( 168 | 'processes, task_delay', 169 | [(2, 1.0)], 170 | arglabels={ 171 | 'task_delay': 'task delay' 172 | }, 173 | ) 174 | def test_apply_async_timeout(self, delay_task_worker_pool): 175 | """Timeout on apply_async call""" 176 | with pytest.raises(multiprocessing.TimeoutError): 177 | delay_task_worker_pool.apply_async(None, (1,)).get(0.5) 178 | 179 | @pytest.mark.xfail 180 | @parametrize( 181 | 'processes, task_delay', 182 | [(1, 0.0)], 183 | arglabels={ 184 | 'task_delay': 'task delay' 185 | }, 186 | ) 187 | def test_apply_map_async(self, delay_task_worker_pool): 188 | """Blocking map_async""" 189 | RESULTS_NUM = 5 190 | assert delay_task_worker_pool.map_async( 191 | None, 192 | xrange(RESULTS_NUM) 193 | ).get() == range(RESULTS_NUM) 194 | 195 | @parametrize( 196 | 'processes, task_delay', 197 | [(2, 1.0)], 198 | arglabels={ 199 | 'task_delay': 'task delay' 200 | }, 201 | ) 202 | def test_apply_map_async_timeout(self, delay_task_worker_pool): 203 | """Timeout on map_async call""" 204 | RESULTS_NUM = 5 205 | with pytest.raises(multiprocessing.TimeoutError): 206 | delay_task_worker_pool.map_async(None, xrange(RESULTS_NUM)).get(0.5) 207 | 208 | @parametrize( 209 | 'processes, task_delay', 210 | [(2, 0.0)], 211 | arglabels={ 212 | 'task_delay': 'task delay' 213 | }, 214 | ) 215 | def test_imap(self, delay_task_worker_pool): 216 | """imap in iterator and list mode""" 217 | RESULTS_NUM = 50 218 | assert list( 219 | delay_task_worker_pool.imap(None, xrange(RESULTS_NUM)) 220 | ) == range(RESULTS_NUM) 221 | it = delay_task_worker_pool.imap(None, xrange(RESULTS_NUM)) 222 | for i in xrange(RESULTS_NUM): 223 | assert it.next() == i 224 | -------------------------------------------------------------------------------- /src/cocaine-app/cache_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import logging 3 | import sys 4 | import signal 5 | import time 6 | 7 | # NB: pool should be initialized before importing 8 | # any of cocaine-framework-python modules to avoid 9 | # tornado ioloop dispatcher issues 10 | import monitor_pool 11 | 12 | from cocaine.asio.exceptions import LocatorResolveError 13 | from cocaine.worker import Worker 14 | import elliptics 15 | 16 | import log 17 | 18 | try: 19 | log.setup_logger('mm_cache_logging') 20 | logger = logging.getLogger('mm.init') 21 | except LocatorResolveError: 22 | log.setup_logger() 23 | logger = logging.getLogger('mm.init') 24 | logger.warn('mm_cache_logging is not set up properly in ' 25 | 'cocaine.conf, fallback to default logging service') 26 | 27 | from config import config 28 | import storage 29 | import cache 30 | from db.mongo.pool import MongoReplicaSetClient 31 | import infrastructure 32 | import infrastructure_cache 33 | import jobs 34 | import node_info_updater 35 | import helpers as h 36 | 37 | 38 | def init_elliptics_node(): 39 | nodes = config.get('elliptics', {}).get('nodes', []) or config["elliptics_nodes"] 40 | logger.debug("config: %s" % str(nodes)) 41 | 42 | log = elliptics.Logger(str(config["dnet_log"]), config["dnet_log_mask"]) 43 | 44 | node_config = elliptics.Config() 45 | node_config.io_thread_num = config.get('io_thread_num', 1) 46 | node_config.nonblocking_io_thread_num =
config.get('nonblocking_io_thread_num', 1) 47 | node_config.net_thread_num = config.get('net_thread_num', 1) 48 | 49 | logger.info('Node config: io_thread_num {0}, nonblocking_io_thread_num {1}, ' 50 | 'net_thread_num {2}'.format( 51 | node_config.io_thread_num, node_config.nonblocking_io_thread_num, 52 | node_config.net_thread_num)) 53 | 54 | n = elliptics.Node(log, node_config) 55 | 56 | addresses = [] 57 | for node in nodes: 58 | try: 59 | addresses.append(elliptics.Address( 60 | host=str(node[0]), port=node[1], family=node[2])) 61 | except Exception as e: 62 | logger.error('Failed to connect to storage node: {0}:{1}:{2}'.format( 63 | node[0], node[1], node[2])) 64 | pass 65 | 66 | try: 67 | n.add_remotes(addresses) 68 | except Exception as e: 69 | logger.error('Failed to connect to any elliptics storage node: {0}'.format( 70 | e)) 71 | raise ValueError('Failed to connect to any elliptics storage node') 72 | 73 | meta_node = elliptics.Node(log, node_config) 74 | 75 | addresses = [] 76 | for node in config["metadata"]["nodes"]: 77 | try: 78 | addresses.append(elliptics.Address( 79 | host=str(node[0]), port=node[1], family=node[2])) 80 | except Exception as e: 81 | logger.error('Failed to connect to meta node: {0}:{1}:{2}'.format( 82 | node[0], node[1], node[2])) 83 | pass 84 | 85 | logger.info('Connecting to meta nodes: {0}'.format(config["metadata"]["nodes"])) 86 | 87 | try: 88 | meta_node.add_remotes(addresses) 89 | except Exception as e: 90 | logger.error('Failed to connect to any elliptics meta storage node: {0}'.format( 91 | e)) 92 | raise ValueError('Failed to connect to any elliptics storage META node') 93 | 94 | meta_wait_timeout = config['metadata'].get('wait_timeout', 5) 95 | 96 | meta_session = elliptics.Session(meta_node) 97 | meta_session.set_timeout(meta_wait_timeout) 98 | meta_session.add_groups(list(config["metadata"]["groups"])) 99 | n.meta_session = meta_session 100 | 101 | wait_timeout = config.get('elliptics', {}).get('wait_timeout', 5) 102 | time.sleep(wait_timeout) 103 | 104 | return n 105 | 106 | 107 | def init_meta_db(): 108 | meta_db = None 109 | 110 | mrsc_options = config['metadata'].get('options', {}) 111 | 112 | if config['metadata'].get('url'): 113 | meta_db = MongoReplicaSetClient(config['metadata']['url'], **mrsc_options) 114 | return meta_db 115 | 116 | 117 | def init_infrastructure_cache_manager(W, n): 118 | icm = infrastructure_cache.InfrastructureCacheManager(n.meta_session) 119 | return icm 120 | 121 | 122 | def init_node_info_updater(n): 123 | return node_info_updater.NodeInfoUpdater(n, None) 124 | 125 | 126 | def init_infrastructure(W, n): 127 | infstruct = infrastructure.infrastructure 128 | infstruct.init(n, None, None) 129 | return infstruct 130 | 131 | 132 | def init_cache_worker(W, n, niu, j, meta_db): 133 | if not config.get("cache"): 134 | logger.error('Cache is not set up in config ("cache" key), ' 135 | 'will not be initialized') 136 | return None 137 | if not config.get('metadata', {}).get('cache', {}).get('db'): 138 | logger.error('Cache metadata db is not set up ("metadata.cache.db" key), ' 139 | 'will not be initialized') 140 | return None 141 | c = cache.CacheManager(n, niu, j, meta_db) 142 | h.register_handle(W, c.get_top_keys) 143 | h.register_handle(W, c.cache_statistics) 144 | h.register_handle(W, c.cache_clean) 145 | h.register_handle(W, c.cache_groups) 146 | h.register_handle(W, c.get_cached_keys) 147 | h.register_handle(W, c.update_cache_key_upload_status) 148 | h.register_handle(W, c.update_cache_key_removal_status) 149 | 150 | 
return c 151 | 152 | 153 | def init_job_finder(meta_db): 154 | if not config['metadata'].get('jobs', {}).get('db'): 155 | logger.error('Job finder metadb is not set up ' 156 | '("metadata.jobs.db" key), will not be initialized') 157 | return None 158 | jf = jobs.JobFinder(meta_db) 159 | return jf 160 | 161 | 162 | def init_job_processor(jf, meta_db, niu): 163 | if jf is None: 164 | logger.error('Job processor will not be initialized because ' 165 | 'job finder is not initialized') 166 | return None 167 | j = jobs.JobProcessor(jf, n, meta_db, niu, minions=None) 168 | return j 169 | 170 | 171 | if __name__ == '__main__': 172 | 173 | def term_handler(signo, frame): 174 | # required to guarantee execution of cleanup functions registered 175 | # with atexit.register 176 | sys.exit(0) 177 | 178 | signal.signal(signal.SIGTERM, term_handler) 179 | 180 | n = init_elliptics_node() 181 | 182 | logger.info("before creating worker") 183 | W = Worker(disown_timeout=config.get('disown_timeout', 2)) 184 | logger.info("after creating worker") 185 | 186 | meta_db = init_meta_db() 187 | if meta_db is None: 188 | s = 'Meta db should be configured in "metadata" config section' 189 | logger.error(s) 190 | raise RuntimeError(s) 191 | 192 | i = init_infrastructure(W, n) 193 | icm = init_infrastructure_cache_manager(W, n) 194 | 195 | niu = init_node_info_updater(n) 196 | jf = init_job_finder(meta_db) 197 | j = init_job_processor(jf, meta_db, niu) 198 | 199 | c = init_cache_worker(W, n, niu, j, meta_db) 200 | 201 | icm._start_tq() 202 | c and c._start_tq() 203 | 204 | logger.info("Starting cache worker") 205 | W.run() 206 | logger.info("Cache worker initialized") 207 | -------------------------------------------------------------------------------- /src/create_group_ids: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from opster import command 4 | from ConfigParser import * 5 | from cocaine.services import Service 6 | import time 7 | import re 8 | import os 9 | import msgpack 10 | 11 | 12 | class EllipticsUbicConfigParser(ConfigParser): 13 | 14 | SECTCRE = re.compile( 15 | r'\s*<(?P<closing>\/)?' # < 16 | r'(?P<header>[^>]+)' # very permissive! 17 | r'>' # > 18 | ) 19 | 20 | OPTCRE = re.compile( 21 | r'\s*(?P