├── .gitignore ├── LICENSE ├── README.md ├── dev_requirements.txt ├── doctor ├── __init__.py ├── checker.py ├── configs.py ├── metrics.py └── plugins │ ├── __init__.py │ └── archer.py ├── setup.py └── tests ├── test_checker.py ├── test_configs.py └── test_metrics.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | __pycache__/ 3 | *-info/ 4 | build/ 5 | dist/ 6 | .cache/ 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) <2016> 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## doctor 2 | 3 | Health is described with current errors percentage, if the health status turns bad, actions like “refuse service” should be taken, mainly to protect our backend databases. 4 | 5 | You must invoke `on_*` like methods of `doctor.checker.HealthTester.metrics`(`doctor.metrics.Metrics`) to record metrics, then `HealthTester.is_healthy` calculate api call status by thresholds, and `HealthTester.test` based the flowing policy to decide whether the current request can be passed. 6 | 7 | ### Install 8 | 9 | pip install git+https://github.com/eleme/doctor.git 10 | 11 | ### Policy 12 | 13 | Current detail policy to test health description: 14 | 15 | - if current api is heavily under errors, disallow it to pass the test(), and further incoming requests should be refused ( in at least MIN_RECOVERY_TIME). 16 | - if current api has recoveried from bad health, allow it to pass the test() gradually (via random.random() with priority). 17 | 18 | Current errors threholds: 19 | 20 | - Errors contains system errors and gevent timeouts. 21 | - Threholds are percentages: errors / requests. 22 | - Errors threholds are checked only if the current requests count is greater than THRESHOLD_REQUEST. 23 | 24 | Health check interval: 25 | 26 | - Calculated by METRICS_GRANULARITY * METRICS_ROLLINGSIZE, in seconds. 27 | 28 | ### Settings 29 | 30 | ``` 31 | MIN_RECOVERY_TIME min recovery time (in seconds) 32 | MAX_RECOVERY_TIME max recovery time (in seconds) 33 | THRESHOLD_REQUEST min requests to trigger a health check. (per INTERVAL) # noqa 34 | THRESHOLD_TIMEOUT gevent timeout count threshold (per INTERVAL) 35 | THRESHOLD_SYS_EXC sys_exc count threshold (per INTERVAL) 36 | THRESHOLD_UNKWN_EXC unkwn_exc count threshold (per INTERVAL) 37 | ``` 38 | 39 | ### Examples 40 | 41 | ```Python 42 | # callbacks, take *doctor.checker.APIHealthTestCtx* as arguments 43 | def on_api_health_locked(result): 44 | pass 45 | def on_api_health_unlocked(result): 46 | pass 47 | def on_api_health_tested(result): 48 | pass 49 | def on_api_health_tested_bad(result): 50 | pass 51 | def on_api_health_tested_ok(result): 52 | pass 53 | 54 | # you can custom the settings, see doctor/configs.py 55 | configs = Configs() 56 | 57 | # callbacks order matters. 58 | tester = HealthTester( 59 | configs, 60 | on_api_health_locked, 61 | on_api_health_unlocked, 62 | on_api_health_tested, 63 | on_api_health_tested_bad, 64 | on_api_health_tested_ok, 65 | ) 66 | 67 | 68 | def api_decorator(func): 69 | @functools.wraps(func) 70 | def _wrapper(service, *args, **kwargs): 71 | service_name, func_name = service.name, func.__name__ 72 | if not tester.test(service_name, func_name): 73 | print('Oh! No!!!') 74 | return 75 | 76 | result = None 77 | try: 78 | result = func(service, *args, **kwargs) 79 | except UserError: 80 | tester.metrics.on_api_called_user_exc(service_name, func_name) 81 | except TimeoutError: 82 | tester.metrics.on_api_called_timeout(service_name, func_name) 83 | except SysError: 84 | tester.metrics.on_api_called_sys_exc(service_name, func_name) 85 | except Exception: 86 | tester.metrics.on_api_called_unkwn_exc(service_name, func_name) 87 | else: 88 | tester.metrics.on_api_called_ok(service_name, func_name) 89 | finally: 90 | tester.metrics.on_api_called(service_name, func_name) 91 | 92 | return result 93 | return _wrapper 94 | 95 | 96 | @api_decorator 97 | def api(service): 98 | client.connect(service.addr) 99 | ``` 100 | 101 | ### Ports 102 | 103 | - [Go](https://github.com/eleme/circuitbreaker) 104 | 105 | ### Authors 106 | 107 | * @Damnever 108 | * @xiangyu.wang 109 | * @hit9 110 | -------------------------------------------------------------------------------- /dev_requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==2.9.1 2 | mock==1.3.0 3 | -------------------------------------------------------------------------------- /doctor/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | from .configs import Configs 6 | from .metrics import Metrics 7 | from .checker import HealthTester 8 | 9 | 10 | __version__ = '0.2.1' 11 | -------------------------------------------------------------------------------- /doctor/checker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | import time 6 | import random 7 | import logging 8 | from collections import defaultdict 9 | 10 | from .metrics import Metrics 11 | 12 | 13 | MODE_UNLOCKED = 0 14 | MODE_LOCKED = 1 15 | MODE_RECOVER = 2 16 | 17 | 18 | class APIHealthTestCtx(object): 19 | """ 20 | `API call` context to hold data:: 21 | 22 | func api function to be called. 23 | service the service this api belongs to. 24 | result the `test` result, (True or False). 25 | lock current api lock information, dict, 26 | keys: ``locked_at``, ``locked_status``. 27 | health_ok_now if the api is ok now, True for ok. 28 | start_at timestamp when the test starts. 29 | end_at timestamp when the test ends. 30 | logger service logger 31 | """ 32 | __slots__ = ['func_name', 'service_name', 'result', 'lock', 33 | 'health_ok_now', 'start_at', 'end_at', 'logger'] 34 | 35 | def __init__(self): 36 | for attr in self.__slots__: 37 | setattr(self, attr, None) 38 | 39 | 40 | class HealthTester(object): 41 | """ 42 | Parameters:: 43 | 44 | * configs: ``Configs`` object. 45 | """ 46 | _NON_CALLBACK = lambda ctx: None 47 | 48 | def __init__(self, configs, 49 | on_api_health_locked=_NON_CALLBACK, 50 | on_api_health_unlocked=_NON_CALLBACK, 51 | on_api_health_tested=_NON_CALLBACK, 52 | on_api_health_tested_bad=_NON_CALLBACK, 53 | on_api_health_tested_ok=_NON_CALLBACK): 54 | self._metrics = Metrics(configs) 55 | 56 | # init settings 57 | self._min_recovery_time = configs.HEALTH_MIN_RECOVERY_TIME 58 | self._max_recovery_time = configs.HEALTH_MAX_RECOVERY_TIME 59 | self._threshold_request = configs.HEALTH_THRESHOLD_REQUEST 60 | self._threshold_timeout = configs.HEALTH_THRESHOLD_TIMEOUT 61 | self._threshold_sys_exc = configs.HEALTH_THRESHOLD_SYS_EXC 62 | self._threshold_unkwn_exc = configs.HEALTH_THRESHOLD_UNKWN_EXC 63 | 64 | granularity = configs.METRICS_GRANULARITY 65 | rollingsize = configs.METRICS_ROLLINGSIZE 66 | self._interval = granularity * rollingsize 67 | 68 | # callbacks 69 | self._on_api_health_locked = on_api_health_locked 70 | self._on_api_health_unlocked = on_api_health_unlocked 71 | self._on_api_health_tested = on_api_health_tested 72 | self._on_api_health_tested_bad = on_api_health_tested_bad 73 | self._on_api_health_tested_ok = on_api_health_tested_ok 74 | 75 | self._locks = defaultdict(dict) 76 | 77 | @property 78 | def metrics(self): 79 | """``Metrics`` object.""" 80 | return self._metrics 81 | 82 | @property 83 | def locks(self): 84 | """ 85 | `locks` is a dict to hold :meth:`test` runtime data, its schema:: 86 | 87 | {func_slug: {locked_at: locked_time, 88 | locked_status: status of lock}} 89 | 90 | * locked_at: the time when the fun is locked 91 | * locked_status: the status of lock 92 | #. locked: the func is locked 93 | #. unlocked: the func is unlocked 94 | #. recover: state between locked and unlocked in which the 95 | circuit breaker is recovering based on the 96 | healthy status of func 97 | """ 98 | return self._locks 99 | 100 | def test(self, service_name, func_name, logger=None): 101 | """ 102 | Test current api health before the request is processed, returns 103 | ``True`` for OK, logic notes: 104 | 105 | * If current api is `unlocked`, lock it until `not is_healthy()`. 106 | * If current api is `locked`, recover it until `is_healthy()` (and 107 | locked time span > `MIN_RECOVERY_TIME`), one request will be 108 | released for health checking once this api enters recover mode. 109 | * If current api is in `recover` mode, try to unlock it if the latest 110 | request (the request just released) executed without errors. 111 | Requests on an api are unlocked gradually, but not immediately. It 112 | allows more requests to pass as the time becomes longer from the 113 | time turns to health OK, but it will be unlock anyway when the time 114 | span is over `MAX_RECOVERY_TIME`. If the latest request failed with 115 | any errors exccept 116 | `too_busy_exception`, it will be locked again. 117 | """ 118 | key = '{0}.{1}'.format(service_name, func_name) 119 | 120 | lock = self._get_api_lock(key) 121 | locked_at = lock['locked_at'] 122 | locked_status = lock['locked_status'] 123 | 124 | health_ok_now = self.is_healthy(service_name, func_name) 125 | time_now = time.time() 126 | 127 | if not logger: 128 | logger = logging.getLogger(__name__) 129 | ctx = APIHealthTestCtx() 130 | ctx.start_at = time_now 131 | ctx.func_name = func_name 132 | ctx.service_name = service_name 133 | ctx.health_ok_now = health_ok_now 134 | ctx.logger = logger 135 | 136 | lock_changed = None 137 | result = None 138 | 139 | if locked_status == MODE_LOCKED: 140 | if health_ok_now: 141 | # turns OK 142 | locked_span = time_now - locked_at 143 | if locked_span < self._min_recovery_time: 144 | # should be locked for at least MIN_RECOVERY_TIME 145 | result = False 146 | else: 147 | # enter into recover mode 148 | lock['locked_status'] = MODE_RECOVER 149 | lock_changed = MODE_RECOVER 150 | # release this request for health check 151 | result = True 152 | else: 153 | result = False 154 | elif locked_status == MODE_RECOVER: 155 | if self._metrics.api_latest_state.get(key, False): 156 | locked_span = time_now - locked_at 157 | if locked_span >= self._max_recovery_time: 158 | lock['locked_at'] = 0 159 | lock['locked_status'] = MODE_UNLOCKED 160 | lock_changed = MODE_UNLOCKED 161 | result = True 162 | else: 163 | if (random.random() < 164 | float(locked_span) / self._max_recovery_time): 165 | # allow pass gradually 166 | result = True 167 | else: 168 | # not lucky 169 | result = False 170 | else: 171 | # still suffering, lock it again 172 | lock['locked_at'] = time_now 173 | lock['locked_status'] = MODE_LOCKED 174 | lock_changed = MODE_LOCKED 175 | result = False 176 | else: 177 | # not in locked mode now 178 | if not health_ok_now: 179 | # turns BAD 180 | lock['locked_at'] = time_now 181 | lock['locked_status'] = MODE_LOCKED 182 | lock_changed = MODE_LOCKED 183 | result = False 184 | else: 185 | # still OK 186 | result = True 187 | 188 | ctx.end_at = time.time() 189 | ctx.result = result 190 | ctx.lock = lock.copy() 191 | # call callbacks. 192 | self._send_test_call_ctx(ctx, result, lock_changed) 193 | return result 194 | 195 | def _get_api_lock(self, key): 196 | if key not in self._locks: 197 | self._locks[key]['locked_at'] = 0 198 | self._locks[key]['locked_status'] = MODE_UNLOCKED 199 | return self._locks[key] 200 | 201 | def _send_test_call_ctx(self, ctx, result, lock_changed): 202 | if lock_changed == MODE_LOCKED: 203 | self._on_api_health_locked(ctx) 204 | elif lock_changed == MODE_UNLOCKED: 205 | self._on_api_health_unlocked(ctx) 206 | 207 | self._on_api_health_tested(ctx) 208 | if result: 209 | self._on_api_health_tested_ok(ctx) 210 | else: 211 | self._on_api_health_tested_bad(ctx) 212 | 213 | def is_healthy(self, service_name, func_name): 214 | """ 215 | Check current api health status by metrics, returns `True` 216 | for status OK:: 217 | 218 | if requests > THRESHOLD_REQUEST: 219 | if timeouts / requests > THRESHOLD_TIMEOUT or 220 | sys_excs / requests > THRESHOLD_SYS_EXC: 221 | return False 222 | return True 223 | """ 224 | key_request = '{0}.{1}'.format(service_name, func_name) 225 | key_timeout = '{0}.timeout'.format(key_request) 226 | key_sys_exc = '{0}.sys_exc'.format(key_request) 227 | key_unkwn_exc = '{0}.unkwn_exc'.format(key_request) 228 | 229 | requests = self._metrics.get(key_request) 230 | timeouts = self._metrics.get(key_timeout) 231 | sys_excs = self._metrics.get(key_sys_exc) 232 | unkwn_exc = self._metrics.get(key_unkwn_exc) 233 | 234 | if requests > self._threshold_request: 235 | return (((timeouts / float(requests)) < self._threshold_timeout) and 236 | ((sys_excs / float(requests)) < self._threshold_sys_exc) and 237 | ((unkwn_exc / float(requests)) < self._threshold_unkwn_exc)) 238 | return True 239 | -------------------------------------------------------------------------------- /doctor/configs.py: -------------------------------------------------------------------------------- 1 | # -*- coding = utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | 6 | class Configs(dict): 7 | """ 8 | Configs for ``Metrics`` and ``HealthTester``. 9 | """ 10 | 11 | def __init__(self, settings=None): 12 | defaults = dict( 13 | # Metrics settings. 14 | METRICS_GRANULARITY=20, # sec 15 | METRICS_ROLLINGSIZE=20, 16 | # Health settings. 17 | HEALTH_MIN_RECOVERY_TIME=20, # sec 18 | HEALTH_MAX_RECOVERY_TIME=2 * 60, # sec 19 | HEALTH_THRESHOLD_REQUEST=10 * 1, # per `INTERVAL` 20 | HEALTH_THRESHOLD_TIMEOUT=0.5, # percentage per `INTERVAL` 21 | HEALTH_THRESHOLD_SYS_EXC=0.5, # percentage per `INTERVAL` 22 | HEALTH_THRESHOLD_UNKWN_EXC=0.5, # percentage per `INTERVAL` 23 | ) 24 | super(self.__class__, self).__init__(**defaults) 25 | 26 | if settings is not None: 27 | self.load(settings) 28 | 29 | def load(self, obj): 30 | if isinstance(obj, dict): 31 | items = obj.iteritems() 32 | else: 33 | items = obj.__dict__.iteritems() 34 | 35 | for k, v in items: 36 | if k in self: 37 | self[k] = v 38 | 39 | def __setattr__(self, k, v): 40 | if k in self: 41 | self[k] = v 42 | 43 | def __getattr__(self, k): 44 | return super(self.__class__, self).get(k, None) 45 | -------------------------------------------------------------------------------- /doctor/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import absolute_import 4 | 5 | """ 6 | Metrics 7 | ======= 8 | 9 | In process metrics. 10 | 11 | Features 12 | --------- 13 | 14 | 1. Behaves like a statsd client, but in process. 15 | 2. Currently only support counters. 16 | 3. Counters are implemented in ``RollingNumber``, a rolling number 17 | is like a sliding window on timestamp sequence. 18 | """ 19 | 20 | import time 21 | 22 | 23 | class RollingNumber(object): 24 | """ 25 | RollingNumber behaves like a FIFO queue with fixed length, or a 26 | sliding window on timestamp sequence:: 27 | 28 | 1 2 0 3 [4 5 1 2 4 2] 3 4 ... (<= time passing) 29 | +--- 18 ---+ 30 | 31 | A rolling number's value is the ``sum`` of the queue elements, the 32 | last element's value will roll into previous position once the clock 33 | passed 1 ``granularity`` (default ``1s``). 34 | 35 | Rolling number dosen't use an event loop (i.e. via gevent) to roll elements 36 | on time goes on, it uses passive clock checking instead. All read/write 37 | actions like ``incr()``, ``value()`` will shift current rolling number to 38 | align its internel ``_clock`` with timestamp now. The shift will pop 39 | elements on the left and fill ``0`` on the right, so if there is a long 40 | time no data incoming, the rolling number will change to a all zero queue. 41 | (aka, with its value as ``0``). 42 | 43 | Attributes: 44 | rolling_size the sliding window length 45 | rolling_granularity the shifting timestamp granularity (default: 1s) 46 | """ 47 | 48 | def __init__(self, rolling_size, rolling_granularity=1): 49 | """ 50 | Init a rolling number to 0 with size. 51 | """ 52 | self.rolling_size = rolling_size 53 | self.rolling_granularity = rolling_granularity 54 | 55 | self._clock = time.time() 56 | self._values = [0] * rolling_size 57 | 58 | def clear(self): 59 | """ 60 | Clear the value to all zeros. 61 | 62 | *Note*: :meth:`clear` dosen't shift the `clock`, it will certainly 63 | set the rolling number to zero. 64 | """ 65 | self._values = [0] * self.rolling_size 66 | 67 | def value(self): 68 | """ 69 | Return the value this rolling number present, actually the ``sum()`` 70 | value of this queue. 71 | """ 72 | self.shift_on_clock_changes() 73 | return sum(self._values) 74 | 75 | __int__ = value 76 | 77 | def increment(self, value): 78 | """ 79 | Increment this number by `value`, will increment the last element by `` 80 | value``. 81 | """ 82 | self.shift_on_clock_changes() 83 | self._values[-1] += value 84 | 85 | incr = increment 86 | 87 | def shift(self, length): 88 | """ 89 | Shift the rolling number to the right by ``length``, will pop elements 90 | on the left and fill ``0`` on the right. 91 | """ 92 | if length <= 0: 93 | return 94 | 95 | if length > self.rolling_size: 96 | return self.clear() 97 | 98 | end = [0] * length 99 | self._values = self._values[length:] + end 100 | 101 | def shift_on_clock_changes(self): 102 | """ 103 | Shift the rolling number if its ``_clock`` is bebind the timestamp 104 | ``now`` by at least 1 timestamp granularity, and synchronous its 105 | ``_clock`` to ``now``. 106 | """ 107 | now = time.time() 108 | length = int((now - self._clock) // self.rolling_granularity) 109 | if length > 0: 110 | self.shift(length) 111 | self._clock = now 112 | 113 | def __repr__(self): 114 | """ 115 | Python presentation: `` 116 | """ 117 | return ''.format(self.value(), self._values) 118 | 119 | 120 | class Metrics(object): 121 | 122 | def __init__(self, settings): 123 | self._granularity = settings.METRICS_GRANULARITY 124 | self._rollingsize = settings.METRICS_ROLLINGSIZE 125 | 126 | self._api_latest_state = dict() 127 | self._counters = dict() 128 | 129 | @property 130 | def counters(self): 131 | """``RollingNumber`` object.""" 132 | return self._counters 133 | 134 | @property 135 | def api_latest_state(self): 136 | """ 137 | A dict to record the latest api call result, schema: `{api_name: True/False}`. 138 | If the latest call on this api succeeds without any errors (except the 139 | `too_busy_exception`), the value in this dict will be set to be `True`, else 140 | `False`. 141 | """ 142 | return self._api_latest_state 143 | 144 | def incr(self, key, value=1): 145 | """increment the counter value by ``value``, if the 146 | counter was not found, create one and increment it. 147 | """ 148 | if key not in self._counters: 149 | self._counters[key] = RollingNumber( 150 | self._rollingsize, rolling_granularity=self._granularity) 151 | counter = self._counters[key] 152 | counter.incr(value) 153 | 154 | def get(self, key, default=0): 155 | """Get metric value by `key`, if not found ,return default.""" 156 | v = self._counters.get(key, None) 157 | return (v and v.value()) or default 158 | 159 | def on_api_called(self, service_name, func_name): 160 | self.incr('{0}.{1}'.format(service_name, func_name)) 161 | 162 | def on_api_called_ok(self, service_name, func_name): 163 | self._api_latest_state['{0}.{1}'.format(service_name, 164 | func_name)] = True 165 | 166 | def on_api_called_user_exc(self, service_name, func_name): 167 | self._api_latest_state['{0}.{1}'.format(service_name, 168 | func_name)] = True 169 | 170 | def on_api_called_timeout(self, service_name, func_name): 171 | self.incr('{0}.{1}.timeout'.format(service_name, func_name)) 172 | 173 | def on_api_called_sys_exc(self, service_name, func_name): 174 | self.incr('{0}.{1}.sys_exc'.format(service_name, func_name)) 175 | self._api_latest_state['{0}.{1}'.format(service_name, 176 | func_name)] = False 177 | 178 | def on_api_called_unkwn_exc(self, service_name, func_name): 179 | self.incr('{0}.{1}.unkwn_exc'.format(service_name, func_name)) 180 | self._api_latest_state['{0}.{1}'.format(service_name, 181 | func_name)] = False 182 | -------------------------------------------------------------------------------- /doctor/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/eleme/doctor/a78f0446e9a85423bcb0cc8e1e96d48f6014a1b0/doctor/plugins/__init__.py -------------------------------------------------------------------------------- /doctor/plugins/archer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | from .. import HealthTester, Configs 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | EXPORTED_CALLBACKS = [ 10 | "on_api_health_locked", 11 | "on_api_health_unlocked", 12 | "on_api_health_tested", 13 | "on_api_health_tested_bad", 14 | "on_api_health_tested_ok", 15 | ] 16 | 17 | class Doctor(object): 18 | def __init__(self, failure_exception, settings=None): 19 | self.configs = Configs(settings) 20 | self.app = None 21 | self.tester = None 22 | self.failure_exception = failure_exception 23 | 24 | def test(self, app_meta): 25 | app, func_name = app_meta.app, app_meta.name 26 | if not self.tester.test(app.service_name, func_name): 27 | raise self.failure_exception 28 | 29 | def init_app(self, app): 30 | if self.app is not None: 31 | raise RuntimeError("Plugin is alread registered") 32 | self.tester = HealthTester( 33 | self.configs, 34 | self.on_api_health_locked, 35 | self.on_api_health_unlocked, 36 | self.on_api_health_tested, 37 | self.on_api_health_tested_bad, 38 | self.on_api_health_tested_ok, 39 | ) 40 | self.app = app 41 | self.app.before_api_call(self.test) 42 | self.app.tear_down_api_call(self.collect_api_call_result) 43 | 44 | def collect_api_call_result(self, api_meta, result_meta): 45 | service_name, func_name = api_meta.app.service_name, api_meta.name 46 | self.tester.metrics.on_api_called(service_name, func_name) 47 | if result_meta.error is None: 48 | self.tester.metrics.on_api_called_ok(service_name, func_name) 49 | else: 50 | self.tester.metrics.on_api_called_unkwn_exc(service_name, 51 | func_name) 52 | 53 | 54 | def set_handler(self, name, func): 55 | if name not in EXPORTED_CALLBACKS: 56 | raise RuntimeError("name should be only in %r", EXPORTED_CALLBACKS) 57 | setattr(self, name, func) 58 | 59 | def on_api_health_locked(self, result): 60 | pass 61 | 62 | def on_api_health_unlocked(self, result): 63 | pass 64 | 65 | def on_api_health_tested(self, result): 66 | pass 67 | 68 | def on_api_health_tested_bad(self, result): 69 | pass 70 | 71 | def on_api_health_tested_ok(self, result): 72 | pass 73 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | from setuptools import setup 6 | 7 | 8 | with open('doctor/__init__.py', 'rb') as f: 9 | version = re.search(r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', 10 | f.read(), re.M).group(1) 11 | 12 | with open('./README.md', 'rb') as f: 13 | description = f.read() 14 | 15 | 16 | setup( 17 | name='doctor', 18 | version=version, 19 | description=description, 20 | url='https://github.com/eleme/doctor', 21 | author='WangChao', 22 | author_email='hit9@ele.me, xiangyu.wang@ele.me', 23 | packages=['doctor'], 24 | ) 25 | -------------------------------------------------------------------------------- /tests/test_checker.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import time 4 | 5 | import mock 6 | import pytest 7 | 8 | from doctor import HealthTester, Configs 9 | from doctor.checker import MODE_LOCKED, MODE_UNLOCKED, MODE_RECOVER 10 | 11 | 12 | @pytest.fixture(scope='function') 13 | def configs(): 14 | configs = Configs() 15 | configs.HEALTH_THRESHOLD_REQUEST = 9 16 | configs.HEALTH_MIN_RECOVERY_TIME = 1 17 | configs.HEALTH_MAX_RECOVERY_TIME = 1 18 | return configs 19 | 20 | @pytest.fixture(scope='function') 21 | def key(): 22 | return ('hello', 'world') 23 | 24 | @pytest.fixture(scope='function') 25 | def f_locked(): 26 | return mock.Mock() 27 | 28 | @pytest.fixture(scope='function') 29 | def f_unlocked(): 30 | return mock.Mock() 31 | 32 | @pytest.fixture(scope='function') 33 | def f_tested(): 34 | return mock.Mock() 35 | 36 | @pytest.fixture(scope='function') 37 | def f_tested_bad(): 38 | return mock.Mock() 39 | 40 | @pytest.fixture(scope='function') 41 | def f_tested_ok(): 42 | return mock.Mock() 43 | 44 | 45 | def test_non_callbacks(configs): 46 | HealthTester(configs)._send_test_call_ctx(None, None, None) 47 | 48 | 49 | def test_requests_all_ok(configs, key, f_locked, f_unlocked, 50 | f_tested, f_tested_bad, f_tested_ok): 51 | """All requests ok, still UNLOCK.""" 52 | tester = HealthTester(configs, f_locked, f_unlocked, 53 | f_tested, f_tested_bad, f_tested_ok) 54 | 55 | for i in range(configs.HEALTH_THRESHOLD_REQUEST + 1): 56 | tester.metrics.on_api_called(*key) 57 | tester.metrics.on_api_called_ok(*key) 58 | 59 | assert tester.is_healthy(*key) 60 | assert tester.test(*key) 61 | assert tester.locks['.'.join(key)]['locked_status'] == MODE_UNLOCKED 62 | assert not f_locked.called 63 | assert not f_unlocked.called 64 | assert f_tested.called 65 | assert f_tested_ok.called 66 | assert not f_tested_bad.called 67 | 68 | 69 | def test_timeouts_over_threshold(configs, key, f_locked, f_unlocked, 70 | f_tested, f_tested_bad, f_tested_ok): 71 | """timeouts / requests > THRESHOLD_TIMEOUT, LOCK.""" 72 | tester = HealthTester(configs, f_locked, f_unlocked, 73 | f_tested, f_tested_bad, f_tested_ok) 74 | 75 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 76 | for i in range(requests): 77 | tester.metrics.on_api_called(*key) 78 | tester.metrics.on_api_called_ok(*key) 79 | 80 | for i in range(requests // 2 + 1): 81 | tester.metrics.on_api_called_timeout(*key) 82 | 83 | assert not tester.is_healthy(*key) 84 | assert not tester.test(*key) 85 | assert tester.locks['.'.join(key)]['locked_status'] == MODE_LOCKED 86 | assert f_locked.called 87 | assert not f_unlocked.called 88 | assert f_tested.called 89 | assert not f_tested_ok.called 90 | assert f_tested_bad.called 91 | 92 | 93 | def test_sys_excs_over_threshold(configs, key, f_locked, f_unlocked, 94 | f_tested, f_tested_bad, f_tested_ok): 95 | """sys_excs / requests > THRESHOLD_TIMEOUT, LOCK.""" 96 | tester = HealthTester(configs, f_locked, f_unlocked, 97 | f_tested, f_tested_bad, f_tested_ok) 98 | 99 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 100 | for i in range(requests): 101 | tester.metrics.on_api_called(*key) 102 | tester.metrics.on_api_called_ok(*key) 103 | 104 | for i in range(requests // 2 + 1): 105 | tester.metrics.on_api_called_sys_exc(*key) 106 | 107 | assert not tester.is_healthy(*key) 108 | assert not tester.test(*key) 109 | assert tester.locks['.'.join(key)]['locked_status'] == MODE_LOCKED 110 | assert f_locked.called 111 | assert not f_unlocked.called 112 | assert f_tested.called 113 | assert not f_tested_ok.called 114 | assert f_tested_bad.called 115 | 116 | 117 | def test_unkwn_excs_over_threshold(configs, key, f_locked, f_unlocked, 118 | f_tested, f_tested_bad, f_tested_ok): 119 | """unkwn_excs / requests > THRESHOLD_TIMEOUT, LOCK.""" 120 | tester = HealthTester(configs, f_locked, f_unlocked, 121 | f_tested, f_tested_bad, f_tested_ok) 122 | 123 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 124 | for i in range(requests): 125 | tester.metrics.on_api_called(*key) 126 | tester.metrics.on_api_called_ok(*key) 127 | 128 | for i in range(requests // 2 + 1): 129 | tester.metrics.on_api_called_unkwn_exc(*key) 130 | 131 | assert not tester.is_healthy(*key) 132 | assert not tester.test(*key) 133 | assert tester.locks['.'.join(key)]['locked_status'] == MODE_LOCKED 134 | assert f_locked.called 135 | assert not f_unlocked.called 136 | assert f_tested.called 137 | assert not f_tested_ok.called 138 | assert f_tested_bad.called 139 | 140 | 141 | def _set_lock_mode(tester, key, mode): 142 | lock = tester._get_api_lock('.'.join(key)) 143 | lock['locked_at'] = time.time() 144 | lock['locked_status'] = mode 145 | return lock 146 | 147 | 148 | def test_in_min_recovery_time_health_not_ok(configs, key, 149 | f_locked, f_unlocked, 150 | f_tested, f_tested_bad, 151 | f_tested_ok): 152 | """In MIN_RECOVERY_TIME, health is not ok, still LOCK.""" 153 | tester = HealthTester(configs, f_locked, f_unlocked, 154 | f_tested, f_tested_bad, f_tested_ok) 155 | lock = _set_lock_mode(tester, key, MODE_LOCKED) 156 | 157 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 158 | for i in range(requests): 159 | tester.metrics.on_api_called(*key) 160 | tester.metrics.on_api_called_timeout(*key) 161 | 162 | assert not tester.is_healthy(*key) 163 | assert not tester.test(*key) 164 | assert lock['locked_status'] == MODE_LOCKED 165 | assert not f_locked.called 166 | assert not f_unlocked.called 167 | assert f_tested.called 168 | assert not f_tested_ok.called 169 | assert f_tested_bad.called 170 | 171 | 172 | def test_in_min_recovery_time_health_ok(configs, key, 173 | f_locked, f_unlocked, 174 | f_tested, f_tested_bad, 175 | f_tested_ok): 176 | """Does not locked at least MIN_RECOVERY_TIME, even health is ok, 177 | still LOCK.""" 178 | tester = HealthTester(configs, f_locked, f_unlocked, 179 | f_tested, f_tested_bad, f_tested_ok) 180 | lock = _set_lock_mode(tester, key, MODE_LOCKED) 181 | 182 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 183 | for i in range(requests): 184 | tester.metrics.on_api_called(*key) 185 | tester.metrics.on_api_called_ok(*key) 186 | 187 | assert tester.is_healthy(*key) 188 | assert not tester.test(*key) 189 | assert lock['locked_status'] == MODE_LOCKED 190 | assert not f_locked.called 191 | assert not f_unlocked.called 192 | assert f_tested.called 193 | assert not f_tested_ok.called 194 | assert f_tested_bad.called 195 | 196 | 197 | def test_min_recovery_time_passed_health_ok(configs, key, 198 | f_locked, f_unlocked, 199 | f_tested, f_tested_bad, 200 | f_tested_ok): 201 | """Already locked at least MIN_RECOVERY_TIME, LOCK -> RECOVER.""" 202 | tester = HealthTester(configs, f_locked, f_unlocked, 203 | f_tested, f_tested_bad, f_tested_ok) 204 | lock = _set_lock_mode(tester, key, MODE_LOCKED) 205 | 206 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 207 | for i in range(requests): 208 | tester.metrics.on_api_called(*key) 209 | tester.metrics.on_api_called_ok(*key) 210 | 211 | time.sleep(configs.HEALTH_MIN_RECOVERY_TIME) 212 | assert tester.is_healthy(*key) 213 | assert tester.test(*key) 214 | assert lock['locked_status'] == MODE_RECOVER 215 | assert not f_locked.called 216 | assert not f_unlocked.called 217 | assert f_tested.called 218 | assert f_tested_ok.called 219 | assert not f_tested_bad.called 220 | 221 | 222 | def test_in_max_recovery_time_latest_state_not_ok(configs, key, 223 | f_locked, f_unlocked, 224 | f_tested, f_tested_bad, 225 | f_tested_ok): 226 | """In MAX_RECOVERY_TIME, latest_state is not ok, RECOVER -> LOCK. 227 | """ 228 | tester = HealthTester(configs, f_locked, f_unlocked, 229 | f_tested, f_tested_bad, f_tested_ok) 230 | lock = _set_lock_mode(tester, key, MODE_RECOVER) 231 | 232 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 233 | for i in range(requests): 234 | tester.metrics.on_api_called(*key) 235 | tester.metrics.on_api_called_sys_exc(*key) 236 | 237 | assert not tester.metrics.api_latest_state['.'.join(key)] 238 | assert not tester.test(*key) 239 | assert lock['locked_status'] == MODE_LOCKED 240 | assert f_locked.called 241 | assert not f_unlocked.called 242 | assert f_tested.called 243 | assert not f_tested_ok.called 244 | assert f_tested_bad.called 245 | 246 | 247 | def test_in_max_recovery_time_latest_state_ok(configs, key, 248 | f_locked, f_unlocked, 249 | f_tested, f_tested_bad, 250 | f_tested_ok): 251 | """Does not recover at least MAX_RECOVERY_TIME, even latest state 252 | is ok, still RECOVER, only random release request.""" 253 | tester = HealthTester(configs, f_locked, f_unlocked, 254 | f_tested, f_tested_bad, f_tested_ok) 255 | lock = _set_lock_mode(tester, key, MODE_RECOVER) 256 | 257 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 258 | for i in range(requests): 259 | tester.metrics.on_api_called(*key) 260 | tester.metrics.on_api_called_ok(*key) 261 | 262 | tester.test(*key) 263 | assert tester.metrics.api_latest_state['.'.join(key)] 264 | assert lock['locked_status'] == MODE_RECOVER 265 | assert not f_locked.called 266 | assert not f_unlocked.called 267 | assert f_tested.called 268 | 269 | 270 | def test_max_recovery_time_passed_latest_state_ok(configs, key, 271 | f_locked, f_unlocked, 272 | f_tested, f_tested_bad, 273 | f_tested_ok): 274 | """Latest state is ok, and already recovered at least 275 | MAX_RECOVERY_TIME, RECOVER -> UNLOCK. 276 | """ 277 | tester = HealthTester(configs, f_locked, f_unlocked, 278 | f_tested, f_tested_bad, f_tested_ok) 279 | lock = _set_lock_mode(tester, key, MODE_RECOVER) 280 | 281 | requests = configs.HEALTH_THRESHOLD_REQUEST + 1 282 | for i in range(requests): 283 | tester.metrics.on_api_called(*key) 284 | tester.metrics.on_api_called_ok(*key) 285 | 286 | time.sleep(configs.HEALTH_MAX_RECOVERY_TIME) 287 | assert tester.metrics.api_latest_state['.'.join(key)] 288 | assert tester.test(*key) 289 | assert lock['locked_status'] == MODE_UNLOCKED 290 | assert not f_locked.called 291 | assert f_unlocked.called 292 | assert f_tested.called 293 | assert f_tested_ok.called 294 | assert not f_tested_bad.called 295 | -------------------------------------------------------------------------------- /tests/test_configs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pytest 4 | 5 | from doctor import Configs 6 | 7 | 8 | def test_init_with_defaults(): 9 | configs = Configs() 10 | 11 | assert configs['METRICS_ROLLINGSIZE'] == 20 12 | assert configs['METRICS_ROLLINGSIZE'] == 20 13 | 14 | assert configs['HEALTH_MIN_RECOVERY_TIME'] == 20 15 | assert configs['HEALTH_MAX_RECOVERY_TIME'] == 2 * 60 16 | assert configs['HEALTH_THRESHOLD_REQUEST'] == 10 17 | assert configs['HEALTH_THRESHOLD_TIMEOUT'] == 0.5 18 | assert configs['HEALTH_THRESHOLD_SYS_EXC'] == 0.5 19 | assert configs['HEALTH_THRESHOLD_UNKWN_EXC'] == 0.5 20 | 21 | 22 | def test_setattr(): 23 | configs = Configs() 24 | 25 | configs.METRICS_ROLLINGSIZE = 100 26 | assert configs['METRICS_ROLLINGSIZE'] == 100 27 | 28 | configs.TEST = 200 29 | assert 'TEST' not in configs 30 | with pytest.raises(KeyError): 31 | configs['TEST'] 32 | 33 | 34 | def test_getattr(): 35 | configs = Configs() 36 | 37 | configs['METRICS_ROLLINGSIZE'] = 100 38 | assert configs.METRICS_ROLLINGSIZE == 100 39 | 40 | assert 'TEST' not in configs 41 | assert configs.TEST is None 42 | 43 | 44 | def test_load(): 45 | configs = Configs() 46 | 47 | configs.load({'METRICS_GRANULARITY': 10}) 48 | assert configs.METRICS_GRANULARITY == 10 49 | 50 | class Settings(object): 51 | HEALTH_MIN_RECOVERY_TIME = 100 52 | TEST = 'test' 53 | 54 | configs.load(Settings) 55 | assert configs.TEST is None 56 | assert configs.HEALTH_MIN_RECOVERY_TIME == 100 57 | -------------------------------------------------------------------------------- /tests/test_metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import time 4 | 5 | from doctor.metrics import RollingNumber, Metrics 6 | from doctor.configs import Configs 7 | 8 | 9 | def test_rollingnumber(): 10 | rn = RollingNumber(2, 1) 11 | 12 | rn.incr(1) 13 | assert rn._values == [0, 1] 14 | rn.incr(2) 15 | assert rn._values == [0, 3] 16 | 17 | time.sleep(1) 18 | rn.incr(1) 19 | assert rn._values == [3, 1] 20 | rn.incr(3) 21 | assert rn._values == [3, 4] 22 | 23 | time.sleep(2) 24 | assert rn.value() == 0 25 | 26 | 27 | def test_metrics(): 28 | metrics = Metrics(Configs()) 29 | 30 | metrics.incr('foo.bar', 1) 31 | assert metrics.get('foo.bar') == 1 32 | 33 | metrics.on_api_called('foo', 'bar') 34 | assert metrics.get('foo.bar') == 2 35 | 36 | metrics.on_api_called_ok('bar', 'foo') 37 | assert metrics.api_latest_state['bar.foo'] 38 | 39 | metrics.on_api_called_user_exc('foo', 'baz') 40 | assert metrics.api_latest_state['foo.baz'] 41 | 42 | metrics.on_api_called_timeout('bar', 'baz') 43 | assert metrics.get('bar.baz.timeout') == 1 44 | 45 | metrics.on_api_called_sys_exc('baz', 'foo') 46 | assert metrics.get('baz.foo.sys_exc') == 1 47 | assert not metrics.api_latest_state['baz.foo'] 48 | --------------------------------------------------------------------------------