├── scripts ├── __init__.py ├── .python-version ├── data │ ├── autoscaler-parameters.json │ ├── stress-parameters.json │ ├── stress_tester_app.json │ └── marathon_autoscaler_app.json ├── create_marathon_extensions.py ├── render_template.py ├── deploy_to_marathon.py ├── test_autoscaler.py └── deploy_autoscaler_to_marathon.py ├── integration-requirements.txt ├── lib └── marathon_autoscaler │ ├── settings.py │ ├── __init__.py │ ├── utils.py │ ├── constants.py │ ├── logging_config.json │ ├── application_definition.py │ ├── apiclientbase.py │ ├── mesosagent.py │ ├── mesosmaster.py │ ├── __main__.py │ ├── marathon.py │ ├── scaler.py │ ├── datadog_metrics.py │ ├── history_manager.py │ ├── poller.py │ └── rules_manager.py ├── tests ├── stress_tester_app │ ├── run-stress.sh │ ├── Dockerfile │ ├── README.md │ └── stress.py ├── minimesos │ ├── run_minimesos_mac.sh │ ├── registry.json │ └── marathon-autoscaler-app-def.json ├── simulation_data │ ├── app_definition.json │ ├── app_recommendations.json │ └── app_metric_summaries.json ├── test_autoscaler.py ├── test_rules_engine.py └── test_history_manager.py ├── requirements.txt ├── .github └── CODEOWNERS ├── supervisord.conf ├── catalog-info.yaml ├── Dockerfile ├── tox.ini ├── Makefile ├── .gitignore ├── .travis.yml ├── minimesosFile ├── README.md ├── LICENSE └── CONTRIBUTING.md /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /integration-requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2 -------------------------------------------------------------------------------- /lib/marathon_autoscaler/settings.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/.python-version: 
-------------------------------------------------------------------------------- 1 | 2.7.12 2 | -------------------------------------------------------------------------------- /tests/stress_tester_app/run-stress.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | 4 | python /app/stress.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datadog 2 | requests 3 | aniso8601 4 | supervisor-stdout 5 | tox 6 | mock 7 | pytest 8 | pytest-cov 9 | pydocstyle 10 | pycodestyle 11 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # See https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners for help 2 | * @tendrilinc/SRE 3 | -------------------------------------------------------------------------------- /supervisord.conf: -------------------------------------------------------------------------------- 1 | [supervisord] 2 | nodaemon=true 3 | 4 | [program:marathon_autoscaler] 5 | command=/usr/bin/env python /app/__main__.py 6 | autorestart=unexpected 7 | stderr_logfile=/dev/stdout 8 | stderr_logfile_maxbytes=0 9 | stdout_logfile=/dev/stdout 10 | stdout_logfile_maxbytes=0 11 | -------------------------------------------------------------------------------- /tests/stress_tester_app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:latest 2 | MAINTAINER techops@tendrilinc.com 3 | 4 | RUN apt-get update && apt-get install -q -y python python-pip stress 5 | RUN mkdir -p /app 6 | COPY stress.py /app/stress.py 7 | 8 | ADD run-stress.sh /app/run-stress.sh 9 | RUN chmod a+x /app/run-stress.sh 10 | CMD /app/run-stress.sh 11 | 
-------------------------------------------------------------------------------- /catalog-info.yaml: -------------------------------------------------------------------------------- 1 | # See https://backstage.io/docs/features/software-catalog/descriptor-format/ 2 | 3 | apiVersion: backstage.io/v1alpha1 4 | kind: Component 5 | metadata: 6 | name: marathon-autoscaler 7 | annotations: 8 | github.com/project-slug: tendrilinc/marathon-autoscaler 9 | spec: 10 | owner: group:tendrilinc/SRE 11 | type: infrastructure 12 | lifecycle: production 13 | -------------------------------------------------------------------------------- /tests/minimesos/run_minimesos_mac.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | eval $(minimesos info | tail -n +3) 3 | curl http://$(docker-machine ip default):8080/v2/apps 4 | python scripts/render_template.py tests/minimesos/marathon-autoscaler-app-def.json -o autoscaler-app.json 5 | # python scripts/deploy_to_marathon.py autoscaler-app.json --marathon-uri http://$(docker-machine ip default):8080 6 | minimesos install --marathonFile autoscaler-app.json -------------------------------------------------------------------------------- /lib/marathon_autoscaler/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import * 2 | 3 | __all__ = ["apiclientbase", 4 | "application_definition", 5 | "constants", 6 | "datadog_metrics", 7 | "history_manager", 8 | "marathon", 9 | "mesosagent", 10 | "mesosmaster", 11 | "poller", 12 | "rule_manager", 13 | "scaler", 14 | "settings", 15 | "utils"] 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gliderlabs/alpine:3.4 2 | 3 | RUN apk-install \ 4 | python \ 5 | supervisor && \ 6 | python -m ensurepip && \ 7 | rm -r /usr/lib/python*/ensurepip && \ 8 | pip install --upgrade pip setuptools && \ 9 | rm -r /root/.cache 10 | 11 | RUN mkdir -p /app 12 | 13 | COPY requirements.txt lib/marathon_autoscaler/ /app/ 14 | COPY supervisord.conf /etc/supervisor.d/marathon_autoscaler.ini 15 | RUN pip install -r /app/requirements.txt 16 | CMD ["/usr/bin/supervisord"] 17 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py35 3 | skipsdist = True 4 | 5 | [pytest] 6 | norecursedirs = .git .tox .cache scripts 7 | 8 | [testenv] 9 | passenv = * 10 | deps= 11 | datadog 12 | requests 13 | aniso8601 14 | supervisor-stdout 15 | mock 16 | pytest 17 | pytest-cov 18 | pydocstyle 19 | pycodestyle 20 | 21 | commands= 22 | py.test --cov={toxinidir}/lib/marathon_autoscaler/ -s --junitxml=junit-{envname}.xml \ 23 | {posargs} 24 | 25 | [pycodestyle] 26 | max-line-length = 130 27 | -------------------------------------------------------------------------------- /scripts/data/autoscaler-parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "min_instances": 1, 3 | "max_instances": 5, 4 | "upper_threshold": { 5 | "cpu_avg_usage": ">=50", 6 | "memory_avg_usage": ">=70", 7 | "scale_factor": 1, 8 | "tolerance": "PT30S", 9 | "backoff": "PT15S", 10 | 
"exclusive": 0 11 | }, 12 | "lower_threshold": { 13 | "cpu_avg_usage": "<50", 14 | "memory_avg_usage": "<0.5", 15 | "scale_factor": 1, 16 | "tolerance": "PT30S", 17 | "backoff": "PT15S", 18 | "exclusive": 0 19 | } 20 | } -------------------------------------------------------------------------------- /scripts/data/stress-parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "instructions": [ 3 | { 4 | "cmd": "sleep", 5 | "args": ["15"] 6 | }, 7 | { 8 | "cmd": "stress", 9 | "switches": { 10 | "--cpu": "1", 11 | "--timeout": "30s" 12 | } 13 | }, 14 | { 15 | "cmd": "stress", 16 | "switches": { 17 | "--cpu": "3", 18 | "--timeout": "90s" 19 | } 20 | }, 21 | { 22 | "cmd": "sleep", 23 | "args": ["900"] 24 | } 25 | ], 26 | "play_mode": "single" 27 | } -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | .PHONY: build clean build-test 3 | 4 | build: clean check-env get-version 5 | docker build -t $(REGISTRY)/marathon_autoscaler:$(VERSION) . 6 | 7 | build-test: 8 | cd tests/stress_test_app && docker build -t $(REGISTRY)/stress_test_app . 
9 | 10 | check-env: 11 | ifndef REGISTRY 12 | $(error REGISTRY is undefined) 13 | endif 14 | 15 | clean: 16 | rm -rf ./build 17 | 18 | deploy: build 19 | docker push $(REGISTRY)/marathon_autoscaler:$(VERSION) 20 | 21 | get-version: 22 | VERSION=`python -c "import os, sys; sys.path.append(os.path.abspath('lib/marathon_autoscaler')); from constants import __version__; print(__version__)"` 23 | -------------------------------------------------------------------------------- /tests/minimesos/registry.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "registry", 3 | "container": { 4 | "docker": { 5 | "image": "registry:2", 6 | "network": "BRIDGE", 7 | "portMappings": [ 8 | { 9 | "containerPort": 5000, 10 | "hostPort": 5000, 11 | "servicePort": 0, 12 | "protocol": "tcp" 13 | } 14 | ], 15 | "parameters": [], 16 | "privileged": false 17 | }, 18 | "type": "DOCKER", 19 | "volumes": [] 20 | }, 21 | "cpus": 0.1, 22 | "disk": 0, 23 | "env": {}, 24 | "healthChecks": [], 25 | "instances": 1, 26 | "maxLaunchDelaySeconds": 3600, 27 | "mem": 64, 28 | "upgradeStrategy": { 29 | "maximumOverCapacity": 1, 30 | "minimumHealthCapacity": 1 31 | } 32 | } -------------------------------------------------------------------------------- /lib/marathon_autoscaler/utils.py: -------------------------------------------------------------------------------- 1 | 2 | def clamp(num, smallest, largest): 3 | """ 4 | Propose a number and a range (smallest, largest) to receive a number that is clamped within that range. 5 | :param num: a number to propose 6 | :param smallest: minimum of range 7 | :param largest: maximum of range 8 | :return: number in range 9 | """ 10 | return max(smallest, min(num, largest)) 11 | 12 | 13 | def list_get(lst, index, default=None): 14 | """ 15 | A safety mechanism for accessing uncharted indexes of a list. Always remember: safety first! 
16 | :param lst: list 17 | :param index: int 18 | :param default: A default value 19 | :return: Value of list at index -or- default value 20 | """ 21 | assert type(lst) == list, "Requires a list type" 22 | return_value = default 23 | try: 24 | return_value = lst[index] 25 | except IndexError: 26 | pass 27 | 28 | return return_value 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | .hypothesis/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | 60 | #Ipython Notebook 61 | .ipynb_checkpoints 62 | 63 | # Intellij-type files 64 | **/.idea 65 | 66 | # Minimesos working directory 67 | .minimesos -------------------------------------------------------------------------------- /lib/marathon_autoscaler/constants.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | __version__ = "0.0.3" 4 | 5 | compare = { 6 | ">=": lambda a, b: a >= b, 7 | "<=": lambda a, b: a <= b, 8 | "<": lambda a, b: a < b, 9 | ">": lambda a, b: a > b, 10 | "=": lambda a, b: a == b, 11 | "==": lambda a, b: a == b 12 | } 13 | 14 | DOWN = -1 15 | UP = 1 16 | IDLE = 0 17 | 18 | TRUTHINESS = ["true", "t", "yes", "y", "1"] 19 | 20 | FLAP_SIGNATURES = [ 21 | [-1, 1, -1, 1], 22 | [1, -1, 1, -1], 23 | [-1, 0, 1, 0, -1, 0, 1], 24 | [1, 0, -1, 0, 1, 0, -1] 25 | ] 26 | 27 | 28 | RE_VERSION_CHECK = re.compile(r"^\d+\.\d+\.\d+") 29 | RE_DELIMITERS = re.compile(r"[\s,|/]+") 30 | RE_THRESHOLD = re.compile(r"(?P[=><]{1,2})\s*(?P[+-]?\d+(:?\.\d*)?(:?[eE][+-]?\d+)?)") 31 | # capture group op ^ ^ ^ 32 | # any 1 or 2 variations ^^^^^^^^^^ of =, >, < 33 | # spaces 0 or more ^^^ 34 | # capture group val ^ ^ ^ 35 | # complex signed decimal w/ scientific notation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 36 | # if you hate this explanation or it's just not enough... 
https://regex101.com/ 37 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | dist: trusty 4 | 5 | python: 6 | - "2.7" 7 | - "3.5" 8 | 9 | services: 10 | - docker 11 | 12 | env: 13 | global: 14 | - PATH=$PATH:$HOME/.minimesos/bin 15 | 16 | install: 17 | - curl -sSL https://minimesos.org/install | sh 18 | - pip install -r requirements.txt 19 | - pip install jinja2 20 | - minimesos up 21 | - eval $(minimesos info | tail -n +3) 22 | - export DOCKER_REGISTRY=127.0.0.1:5000 23 | - docker build -t marathon-autoscaler:localbuild . 24 | - docker build -t stress-tester-app:localbuild tests/stress_tester_app 25 | - docker tag marathon-autoscaler:localbuild ${DOCKER_REGISTRY}/marathon-autoscaler:localbuild 26 | - docker tag stress-tester-app:localbuild ${DOCKER_REGISTRY}/stress-tester-app:localbuild 27 | - docker push ${DOCKER_REGISTRY}/marathon-autoscaler:localbuild 28 | - docker push ${DOCKER_REGISTRY}/stress-tester-app:localbuild 29 | - python scripts/render_template.py tests/minimesos/marathon-autoscaler-app-def.json -o autoscaler-app.json 30 | - minimesos install --marathonFile autoscaler-app.json 31 | 32 | script: 33 | - py.test --cov=lib/marathon_autoscaler/ 34 | - curl $MINIMESOS_MARATHON/v2/apps 35 | 36 | after_script: 37 | - minimesos destroy 38 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/logging_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "disable_existing_loggers": true, 4 | "formatters": { 5 | "info_format": { 6 | "format": "%(asctime)s | %(levelname)s | %(message)s" 7 | }, 8 | "debug_format": { 9 | "format": "%(asctime)s | %(levelname)s | %(message)s (%(funcName)s:%(lineno)d)" 10 | } 11 | }, 12 | "handlers": { 13 | "console": { 14 | "class": "logging.StreamHandler", 15 | 
"formatter": "info_format", 16 | "stream": "ext://sys.stdout" 17 | } 18 | }, 19 | "loggers": { 20 | "urllib3": { 21 | "level": "CRITICAL", 22 | "handlers": ["console"], 23 | "propagate": 1 24 | }, 25 | "dd.datadogpy": { 26 | "level": "CRITICAL", 27 | "handlers": ["console"], 28 | "propagate": 1 29 | }, 30 | "scaler": { 31 | "level": "INFO", 32 | "handlers": ["console"], 33 | "propagate": 0 34 | }, 35 | "rules_manager": { 36 | "level": "INFO", 37 | "handlers": ["console"], 38 | "propagate": 0 39 | } 40 | }, 41 | "root": { 42 | "level": "INFO", 43 | "handlers": ["console"] 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /minimesosFile: -------------------------------------------------------------------------------- 1 | minimesos { 2 | clusterName = "for_mas_cluster" 3 | loggingLevel = "INFO" 4 | mapAgentSandboxVolume = true 5 | mapPortsToHost = true 6 | mesosVersion = "1.0.0" 7 | timeout = 60 8 | 9 | agent { 10 | imageName = "containersol/mesos-agent" 11 | imageTag = "1.0.0-0.1.0" 12 | loggingLevel = "# INHERIT FROM CLUSTER" 13 | portNumber = 5051 14 | 15 | resources { 16 | 17 | cpu { 18 | role = "*" 19 | value = 1 20 | } 21 | 22 | disk { 23 | role = "*" 24 | value = 200 25 | } 26 | 27 | mem { 28 | role = "*" 29 | value = 256 30 | } 31 | 32 | ports { 33 | role = "*" 34 | value = "[4000-32000]" 35 | } 36 | } 37 | } 38 | 39 | marathon { 40 | imageName = "mesosphere/marathon" 41 | imageTag = "v1.3.6" 42 | 43 | app { 44 | marathonJson = "tests/minimesos/registry.json" 45 | } 46 | } 47 | 48 | master { 49 | aclJson = null 50 | authenticate = false 51 | imageName = "containersol/mesos-master" 52 | imageTag = "1.0.0-0.1.0" 53 | loggingLevel = "# INHERIT FROM CLUSTER" 54 | } 55 | 56 | zookeeper { 57 | imageName = "jplock/zookeeper" 58 | imageTag = "3.4.6" 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /tests/minimesos/marathon-autoscaler-app-def.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "id": "marathon-autoscaler", 3 | "container": { 4 | "docker": { 5 | "image": "{{ '127.0.0.1' if TRAVIS else '192.168.99.100' }}:5000/marathon-autoscaler:localbuild", 6 | "network": "BRIDGE", 7 | "parameters": [], 8 | "privileged": false 9 | }, 10 | "type": "DOCKER", 11 | "volumes": [ 12 | { 13 | "containerPath": "/var/run/docker.sock", 14 | "hostPath": "/run/docker.sock", 15 | "mode": "RW" 16 | } 17 | ] 18 | }, 19 | "cpus": 0.2, 20 | "disk": 0, 21 | "env": { 22 | "INTERVAL": "5", 23 | "MESOS_URI": "{{ MINIMESOS_MASTER }}", 24 | "MARATHON_URI": "{{ MINIMESOS_MARATHON }}" 25 | }, 26 | "healthChecks": [ 27 | { 28 | "protocol": "COMMAND", 29 | "command": { 30 | "value": "/usr/bin/supervisorctl status marathon_autoscaler" 31 | }, 32 | "gracePeriodSeconds": 30, 33 | "intervalSeconds": 60, 34 | "timeoutSeconds": 20, 35 | "maxConsecutiveFailures": 3 36 | } 37 | ], 38 | "instances": 1, 39 | "maxLaunchDelaySeconds": 3600, 40 | "mem": 64, 41 | "upgradeStrategy": { 42 | "maximumOverCapacity": 1, 43 | "minimumHealthCapacity": 1 44 | }, 45 | "fetch": [ 46 | { 47 | "uri": "https://gist.githubusercontent.com/kernelpanek-tendril/161c41e972b37eb23dbea48f9cfe94e4/raw/835bfce527acdfa6bec14c87142e5c7a74823bdd/logging_config.json", 48 | "executable": false, 49 | "extract": false, 50 | "cache": true 51 | } 52 | ] 53 | } -------------------------------------------------------------------------------- /scripts/data/stress_tester_app.json: -------------------------------------------------------------------------------- 1 | { 2 | "acceptedResourceRoles": null, 3 | "args": null, 4 | "backoffFactor": 1.15, 5 | "backoffSeconds": 1, 6 | "cmd": null, 7 | "constraints": [], 8 | "container": { 9 | "docker": { 10 | "forcePullImage": true, 11 | "image": "autoscale_test_app:latest", 12 | "network": "BRIDGE", 13 | "parameters": [], 14 | "privileged": false 15 | }, 16 | "type": "DOCKER", 17 | "volumes": [ 18 | { 
19 | "containerPath": "/var/log/docker", 20 | "hostPath": "/var/lib/docker", 21 | "mode": "RO" 22 | }, 23 | { 24 | "containerPath": "/var/run/docker.sock", 25 | "hostPath": "/run/docker.sock", 26 | "mode": "RW" 27 | } 28 | ] 29 | }, 30 | "cpus": 0.2, 31 | "dependencies": [], 32 | "deployments": [], 33 | "disk": 0, 34 | "env": { 35 | }, 36 | "executor": "", 37 | "fetch": [ 38 | { 39 | "cache": false, 40 | "executable": false, 41 | "extract": true, 42 | "uri": "file:///root/.dockercfg" 43 | } 44 | ], 45 | "healthChecks": [], 46 | "id": "/autoscale-test-app", 47 | "instances": 1, 48 | "ipAddress": null, 49 | "labels": { 50 | }, 51 | "maxLaunchDelaySeconds": 3600, 52 | "mem": 64, 53 | "ports": [], 54 | "requirePorts": false, 55 | "storeUrls": [], 56 | "tasksHealthy": 0, 57 | "tasksRunning": 1, 58 | "tasksStaged": 0, 59 | "tasksUnhealthy": 0, 60 | "upgradeStrategy": { 61 | "maximumOverCapacity": 1, 62 | "minimumHealthCapacity": 1 63 | }, 64 | "user": null 65 | } -------------------------------------------------------------------------------- /scripts/data/marathon_autoscaler_app.json: -------------------------------------------------------------------------------- 1 | { 2 | "acceptedResourceRoles": null, 3 | "args": null, 4 | "backoffFactor": 1.15, 5 | "backoffSeconds": 1, 6 | "cmd": null, 7 | "constraints": [], 8 | "container": { 9 | "docker": { 10 | "forcePullImage": true, 11 | "image": "marathon_autoscaler:latest", 12 | "network": "BRIDGE", 13 | "parameters": [], 14 | "privileged": false 15 | }, 16 | "type": "DOCKER", 17 | "volumes": [ 18 | { 19 | "containerPath": "/var/log/docker", 20 | "hostPath": "/var/lib/docker", 21 | "mode": "RO" 22 | }, 23 | { 24 | "containerPath": "/var/run/docker.sock", 25 | "hostPath": "/run/docker.sock", 26 | "mode": "RW" 27 | } 28 | ] 29 | }, 30 | "cpus": 0.5, 31 | "dependencies": [], 32 | "deployments": [], 33 | "disk": 0, 34 | "env": { 35 | "INTERVAL": "", 36 | "MESOS_URI": "", 37 | "MARATHON_URI": "", 38 | "MARATHON_USER": "", 39 | 
"MARATHON_PASS": "", 40 | "LOG_VERBOSITY": "", 41 | "CPU_FAN_OUT": "" 42 | }, 43 | "executor": "", 44 | "fetch": [ 45 | { 46 | "cache": false, 47 | "executable": false, 48 | "extract": true, 49 | "uri": "file:///root/.dockercfg" 50 | } 51 | ], 52 | "healthChecks": [ 53 | { 54 | "protocol": "COMMAND", 55 | "command": { "value": "/usr/bin/supervisorctl status marathon_autoscaler" }, 56 | "gracePeriodSeconds": 30, 57 | "intervalSeconds": 60, 58 | "timeoutSeconds": 20, 59 | "maxConsecutiveFailures": 3 60 | } 61 | ], 62 | "id": "/marathon-autoscaler", 63 | "instances": 1, 64 | "ipAddress": null, 65 | "labels": {}, 66 | "maxLaunchDelaySeconds": 3600, 67 | "mem": 512, 68 | "ports": [], 69 | "requirePorts": false, 70 | "storeUrls": [], 71 | "tasksHealthy": 0, 72 | "tasksRunning": 1, 73 | "tasksStaged": 0, 74 | "tasksUnhealthy": 0, 75 | "upgradeStrategy": { 76 | "maximumOverCapacity": 1, 77 | "minimumHealthCapacity": 1 78 | }, 79 | "user": null 80 | } -------------------------------------------------------------------------------- /scripts/create_marathon_extensions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import json 4 | import sys 5 | sys.path.append('../../marathon_autoscaler') 6 | from marathon_autoscaler.marathon import Marathon 7 | 8 | """ 9 | This script constructs an Marathon application definition for the Marathon Autoscaler container. 10 | 11 | Be sure to deploy the latest Marathon Autoscaler docker image to the registry before running this. 
12 | """ 13 | 14 | 15 | def load_extension_file(file_path): 16 | with open(file_path, 'r') as f: 17 | scaler_extensions_data = json.load(f) 18 | return scaler_extensions_data 19 | 20 | 21 | def parse_cli_args(): 22 | parser = argparse.ArgumentParser(description="Deploy Marathon Autoscaler") 23 | parser.add_argument("--marathon-uri", dest="marathon_uri", type=str, 24 | required=True, help="The Marathon Endpoint") 25 | parser.add_argument("--marathon-user", dest="marathon_user", type=str, 26 | required=True, help="Username for Marathon access") 27 | parser.add_argument("--marathon-pass", dest="marathon_pass", type=str, 28 | required=True, help="Password for Marathon access") 29 | parser.add_argument("--app", dest="marathon_app", type=str, 30 | required=True, help="Password for Marathon access") 31 | parser.add_argument("--ext-file", dest="extensions_file", type=str, 32 | required=True, help="Password for Marathon access") 33 | return parser.parse_args() 34 | 35 | 36 | if __name__ == "__main__": 37 | args = parse_cli_args() 38 | scaler_extensions = load_extension_file(args.extensions_file) 39 | labels_data = {"labels": {}} 40 | labels_data["labels"]["marathon_autoscaler_extensions"] = \ 41 | json.dumps(scaler_extensions).replace("\n", "").replace(" ", "") 42 | labels_data["labels"]["use_marathon_autoscaler"] = "True" 43 | 44 | mara = Marathon(args.marathon_uri, (args.marathon_user, args.marathon_pass)) 45 | print(mara.update_app(args.marathon_app, labels_data)) 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/application_definition.py: -------------------------------------------------------------------------------- 1 | from constants import TRUTHINESS, RE_VERSION_CHECK, __version__ as marathon_autoscaler_version 2 | import logging 3 | import re 4 | import settings 5 | 6 | 7 | class ApplicationDefinition(dict): 8 | """ 9 | A class that helps make interaction with the application definition 
document just a little nicer. 10 | """ 11 | def __init__(self, *args, **kwargs): 12 | self.logger = logging.getLogger(__name__) 13 | dict.__init__(self, *args, **kwargs) 14 | 15 | def __getattr__(self, item): 16 | return dict.get(self, item) 17 | 18 | @property 19 | def app_name(self): 20 | """ 21 | A helper property to return the application name 22 | :return: str 23 | """ 24 | result = None 25 | app_id = dict.get(self, "id") 26 | if app_id: 27 | result = app_id.lstrip("/") 28 | return result 29 | 30 | @property 31 | def is_app_participating(self): 32 | """ Determine if the application is ready for scale actions 33 | :return: application's participation in auto_scaling 34 | """ 35 | result = False 36 | if self.labels: 37 | use_label = next((node for label, node in self.labels.items() 38 | if "use_marathon_autoscaler" in label), {}) 39 | if settings.enforce_version_match: 40 | if use_label is not {} and \ 41 | re.match(RE_VERSION_CHECK, str(use_label)) is not None and \ 42 | str(use_label) == marathon_autoscaler_version: 43 | self.logger.debug("Version matching is enforced. 
Version: {0}".format(marathon_autoscaler_version)) 44 | self.logger.debug("{0}: participating".format(self.app_name)) 45 | result = True 46 | else: 47 | if use_label is not {} and \ 48 | (str(use_label).lower() in TRUTHINESS or 49 | (re.match(RE_VERSION_CHECK, str(use_label)) is not None and 50 | str(use_label) == marathon_autoscaler_version)): 51 | self.logger.debug("{0}: participating".format(self.app_name)) 52 | result = True 53 | 54 | return result 55 | -------------------------------------------------------------------------------- /scripts/render_template.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import jinja2 4 | import logging 5 | import os 6 | import sys 7 | import traceback 8 | 9 | LOG = logging.getLogger() 10 | LOG.setLevel(logging.DEBUG) 11 | ch = logging.StreamHandler(sys.stdout) 12 | ch.setLevel(logging.DEBUG) 13 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 14 | ch.setFormatter(formatter) 15 | LOG.addHandler(ch) 16 | 17 | 18 | def main(main_args): 19 | """ 20 | Read environment variables to envvars_dict 21 | Open file with file_path 22 | Read contents as template_contents 23 | Render template_contents as rendered_file_contents with envvars_dict 24 | Send rendered_file_contents to file or stdout 25 | """ 26 | envvars_dict = dict(os.environ) 27 | template_contents = read_file_contents(main_args.template_file) 28 | rendered_file_contents = render_template(template_contents, envvars_dict) 29 | if main_args.output_file: 30 | sys.stdout = open(main_args.output_file, "w") 31 | sys.stdout.write(rendered_file_contents) 32 | sys.stdout.close() 33 | 34 | 35 | def parse_cli_args(): 36 | p = argparse.ArgumentParser(description="Template Renderer") 37 | 38 | p.add_argument("template_file", 39 | type=str, 40 | help="Path to template file") 41 | p.add_argument("-o", "--output", 42 | dest="output_file", 43 | type=str, 44 | 
required=False, 45 | help="Path to output file") 46 | return p.parse_known_args() 47 | 48 | 49 | def read_file_contents(file_path): 50 | contents = None 51 | if os.path.isfile(file_path): 52 | with open(file_path, "r") as f: 53 | contents = f.read() 54 | return contents 55 | 56 | 57 | def render_template(template_contents, parameters_dict): 58 | template = jinja2.Template(template_contents) 59 | rendered_contents = template.render(**parameters_dict) 60 | return rendered_contents 61 | 62 | 63 | if __name__ == "__main__": 64 | try: 65 | args, args_other = parse_cli_args() 66 | main(args) 67 | except Exception as main_ex: 68 | LOG.error("An error occurred in running the application!") 69 | LOG.error(main_ex) 70 | LOG.error(traceback.print_tb(sys.exc_info()[2])) 71 | finally: 72 | sys.exit(0) 73 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/apiclientbase.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import requests 4 | 5 | 6 | class ApiClientBase(object): 7 | def __init__(self, uri, creds=None, logger=None): 8 | """ 9 | :param uri: 10 | :param creds: 11 | :return: 12 | """ 13 | self.auth = creds 14 | self.uri = uri.rstrip('/') 15 | self.logger = logger or logging.getLogger(__name__) 16 | 17 | @staticmethod 18 | def _build_path(path_suffix, **kwargs): 19 | """ 20 | :param path_suffix: 21 | :param kwargs: 22 | :return: 23 | """ 24 | new_path = path_suffix.format(**kwargs) 25 | return new_path 26 | 27 | def _call_endpoint(self, path, **kwargs): 28 | """ 29 | :param path: 30 | :param kwargs: 31 | :return: 32 | """ 33 | do_request_kwargs = {"params": None, "data": None} 34 | if "params" in kwargs.keys(): 35 | do_request_kwargs["params"] = kwargs.pop("params") 36 | if "data" in kwargs.keys(): 37 | do_request_kwargs["data"] = kwargs.pop("data") 38 | verb = self.paths[path]["verb"] 39 | if "verb" in kwargs.keys(): 40 | verb = 
kwargs.pop("verb") 41 | response = self._do_request(verb, self._build_path(path, **kwargs), **do_request_kwargs) 42 | result = None 43 | if response is not None: 44 | contents = response.content.decode('utf-8') 45 | try: 46 | result = json.loads(contents) 47 | except ValueError as _: 48 | result = response 49 | 50 | return result 51 | 52 | def _do_request(self, method, path, params=None, data=None): 53 | """ 54 | :param method: 55 | :param path: 56 | :param params: 57 | :param data: 58 | :return: 59 | """ 60 | headers = {'Content-Type': 'application/json', 'Accept': 'application/json'} 61 | response = None 62 | url = "".join([self.uri, path]) 63 | try: 64 | response = requests.request(method, url, params=params, 65 | data=data, 66 | headers=headers, 67 | auth=self.auth) 68 | except requests.exceptions.RequestException as e: 69 | self.logger.error(e) 70 | return response 71 | -------------------------------------------------------------------------------- /tests/simulation_data/app_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "acceptedResourceRoles": null, 3 | "args": null, 4 | "backoffFactor": 1.15, 5 | "backoffSeconds": 1, 6 | "cmd": null, 7 | "constraints": [], 8 | "container": { 9 | "docker": { 10 | "forcePullImage": false, 11 | "image": "registry/test-service:0.0.1", 12 | "network": "BRIDGE", 13 | "parameters": [], 14 | "portMappings": [ 15 | { 16 | "containerPort": 5991, 17 | "hostPort": 0, 18 | "protocol": "tcp", 19 | "servicePort": 10008 20 | } 21 | ], 22 | "privileged": false 23 | }, 24 | "type": "DOCKER", 25 | "volumes": [] 26 | }, 27 | "cpus": 0.5, 28 | "dependencies": [], 29 | "deployments": [], 30 | "disk": 0, 31 | "env": { 32 | "location": "test-environment" 33 | }, 34 | "executor": "", 35 | "fetch": [ 36 | { 37 | "cache": false, 38 | "executable": false, 39 | "extract": true, 40 | "uri": "file:///docker/.dockercfg" 41 | } 42 | ], 43 | "healthChecks": [ 44 | { 45 | "gracePeriodSeconds": 
120, 46 | "ignoreHttp1xx": false, 47 | "intervalSeconds": 20, 48 | "maxConsecutiveFailures": 3, 49 | "path": "/healthchk", 50 | "portIndex": 1, 51 | "protocol": "HTTP", 52 | "timeoutSeconds": 20 53 | } 54 | ], 55 | "id": "/test-service", 56 | "instances": 3, 57 | "ipAddress": null, 58 | "labels": { 59 | "mas_rule_fastscaleup_1": "cpu | >90 | PT2M | 3 | PT1M30S", 60 | "mas_rule_fastscaleup_2": "memory | >85 | PT2M | 3 | PT1M30S", 61 | "mas_rule_slowscaledown_1": "cpu | <=90 | PT1M | -1 | PT30S", 62 | "mas_rule_slowscaledown_2": "memory | <=85 | PT1M | -1 | PT30S", 63 | "min_instances": "2", 64 | "max_instances": "5", 65 | "use_marathon_autoscaler": "0.0.3" 66 | }, 67 | "maxLaunchDelaySeconds": 3600, 68 | "mem": 3000, 69 | "ports": [ 70 | 10008 71 | ], 72 | "requirePorts": false, 73 | "storeUrls": [], 74 | "tasksHealthy": 3, 75 | "tasksRunning": 3, 76 | "tasksStaged": 0, 77 | "tasksUnhealthy": 0, 78 | "upgradeStrategy": { 79 | "maximumOverCapacity": 1, 80 | "minimumHealthCapacity": 1 81 | }, 82 | "uris": [ 83 | "file:///docker/.dockercfg" 84 | ], 85 | "user": null, 86 | "version": "2000-01-01T00:00:01.000Z", 87 | "versionInfo": { 88 | "lastConfigChangeAt": "2000-01-01T00:00:01.000Z", 89 | "lastScalingAt": "2000-01-01T00:00:01.000Z" 90 | } 91 | } -------------------------------------------------------------------------------- /tests/test_autoscaler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | 5 | import pytest 6 | 7 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "lib", "marathon_autoscaler")) 8 | from scaler import AutoScaler 9 | import settings 10 | 11 | 12 | class fake_marathon_client(): 13 | def scale_app(app_id, scale_size): 14 | return 15 | 16 | 17 | @pytest.fixture(scope="session") 18 | def testsettings(): 19 | fake_settings = dict(sleep_interval=5, 20 | mesos_uri=None, 21 | agent_port=5051, 22 | marathon_uri=None, 23 | marathon_user=None, 
24 | marathon_pass=None, 25 | cpu_fan_out=None, 26 | datadog_api_key=None, 27 | datadog_app_key=None, 28 | datadog_env=None, 29 | log_config="/app/logging_config.json", 30 | enforce_version_match=False, 31 | rules_prefix="mas_rule" 32 | ) 33 | for name, value in fake_settings.items(): 34 | setattr(settings, name, value) 35 | 36 | 37 | @pytest.fixture(scope="session") 38 | def auto_scaler(): 39 | fake_client = fake_marathon_client() 40 | _autoscaler = AutoScaler(fake_client) 41 | assert (type(_autoscaler) is AutoScaler) 42 | return _autoscaler 43 | 44 | 45 | @pytest.fixture(scope="session") 46 | def metric_summaries(): 47 | with open( 48 | os.path.join( 49 | os.path.dirname(os.path.abspath(__file__)), 50 | "simulation_data/app_metric_summaries.json" 51 | ), "r") as f: 52 | fake_data = json.load(f) 53 | assert(fake_data.get("summaries") is not None) 54 | return fake_data.get("summaries") 55 | 56 | 57 | @pytest.fixture 58 | def app_def(): 59 | with open( 60 | os.path.join( 61 | os.path.dirname(os.path.abspath(__file__)), 62 | "simulation_data/app_definition.json" 63 | ), "r") as f: 64 | test_app_def = json.load(f) 65 | return test_app_def 66 | 67 | 68 | def test_autoscale_decisions(testsettings, auto_scaler, app_def, metric_summaries): 69 | assert(type(app_def) is dict) 70 | for summary in metric_summaries: 71 | summary["test-service"]["application_definition"] = app_def 72 | auto_scaler.decide(summary) 73 | assert(type(auto_scaler) is AutoScaler) 74 | -------------------------------------------------------------------------------- /tests/test_rules_engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | 5 | import pytest 6 | 7 | 8 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "lib", "marathon_autoscaler")) 9 | 10 | from application_definition import ApplicationDefinition 11 | from rules_manager import RulesManager 12 | import settings 13 | 14 | 15 | 
@pytest.fixture(scope="session") 16 | def testsettings(): 17 | fake_settings = dict(sleep_interval=5, 18 | mesos_uri=None, 19 | agent_port=5051, 20 | marathon_uri=None, 21 | marathon_user=None, 22 | marathon_pass=None, 23 | cpu_fan_out=None, 24 | datadog_api_key=None, 25 | datadog_app_key=None, 26 | datadog_env=None, 27 | log_config="/app/logging_config.json", 28 | enforce_version_match=False, 29 | rules_prefix="mas_rule" 30 | ) 31 | for name, value in fake_settings.items(): 32 | setattr(settings, name, value) 33 | 34 | 35 | @pytest.fixture(scope="session") 36 | def app_def(): 37 | with open( 38 | os.path.join( 39 | os.path.dirname(os.path.abspath(__file__)), 40 | "simulation_data/app_definition.json" 41 | ), "r") as f: 42 | test_app_def = json.load(f) 43 | return test_app_def 44 | 45 | @pytest.fixture(scope="session") 46 | def rules_mgr(testsettings, app_def): 47 | _rules_mgr = RulesManager(app_def=ApplicationDefinition(app_def)) 48 | assert (type(_rules_mgr) is RulesManager) 49 | return _rules_mgr 50 | 51 | 52 | def test_is_app_participating(rules_mgr): 53 | assert (rules_mgr.is_app_participating() is True) 54 | 55 | 56 | def test_is_app_ready(rules_mgr): 57 | assert (rules_mgr.is_app_ready() is True) 58 | 59 | 60 | def test_has_rules(rules_mgr): 61 | assert (rules_mgr.rules is not None) 62 | 63 | 64 | def test_is_app_within_min_or_max(rules_mgr): 65 | assert (rules_mgr.is_app_within_min_or_max() is True) 66 | 67 | 68 | def test_has_rules(rules_mgr): 69 | assert (rules_mgr.rules is not None) 70 | 71 | 72 | def test_triggering_scaledown_rules(rules_mgr): 73 | fake_metrics = dict(cpu=80, memory=80) 74 | triggered_rules = rules_mgr.trigger_rules(fake_metrics) 75 | for rule in triggered_rules: 76 | assert("slowscaledown" in rule.get("ruleInfo").get("ruleName")) 77 | 78 | 79 | def test_triggering_fastscaleup_rules(rules_mgr): 80 | fake_metrics = dict(cpu=97, memory=89) 81 | triggered_rules = rules_mgr.trigger_rules(fake_metrics) 82 | for rule in triggered_rules: 83 | 
assert ("fastscaleup" in rule.get("ruleInfo").get("ruleName")) 84 | -------------------------------------------------------------------------------- /tests/simulation_data/app_recommendations.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendationsList": [ 3 | { 4 | "test-service": { 5 | "vote": 1, 6 | "checksum": "2000-01-01T00:00:01.000Z", 7 | "timestamp": "current_date", 8 | "rule": { 9 | "ruleInfo": { 10 | "rulePart": "1", 11 | "ruleName": "fastcpuscaleup" 12 | }, 13 | "ruleValue": { 14 | "scale_factor": "3", 15 | "weight": 1.0, 16 | "threshold": { 17 | "val": "10", 18 | "op": ">" 19 | }, 20 | "metric": "cpu", 21 | "tolerance": "PT10S", 22 | "backoff": "PT30S" 23 | } 24 | } 25 | } 26 | }, 27 | { 28 | "test-service": { 29 | "vote": 1, 30 | "checksum": "2000-01-01T00:00:01.000Z", 31 | "timestamp": "current_date", 32 | "rule": { 33 | "ruleInfo": { 34 | "rulePart": "1", 35 | "ruleName": "fastcpuscaleup" 36 | }, 37 | "ruleValue": { 38 | "scale_factor": "3", 39 | "weight": 1.0, 40 | "threshold": { 41 | "val": "10", 42 | "op": ">" 43 | }, 44 | "metric": "cpu", 45 | "tolerance": "PT10S", 46 | "backoff": "PT30S" 47 | } 48 | } 49 | } 50 | }, 51 | { 52 | "test-service": { 53 | "vote": 1, 54 | "checksum": "2000-01-01T00:00:01.000Z", 55 | "timestamp": "current_date", 56 | "rule": { 57 | "ruleInfo": { 58 | "rulePart": "1", 59 | "ruleName": "fastcpuscaleup" 60 | }, 61 | "ruleValue": { 62 | "scale_factor": "3", 63 | "weight": 1.0, 64 | "threshold": { 65 | "val": "10", 66 | "op": ">" 67 | }, 68 | "metric": "cpu", 69 | "tolerance": "PT10S", 70 | "backoff": "PT30S" 71 | } 72 | } 73 | } 74 | }, 75 | { 76 | "test-service": { 77 | "vote": 1, 78 | "checksum": "2000-01-01T00:00:01.000Z", 79 | "timestamp": "current_date", 80 | "rule": { 81 | "ruleInfo": { 82 | "rulePart": "1", 83 | "ruleName": "fastcpuscaleup" 84 | }, 85 | "ruleValue": { 86 | "scale_factor": "3", 87 | "weight": 1.0, 88 | "threshold": { 89 | "val": "10", 90 | 
"op": ">" 91 | }, 92 | "metric": "cpu", 93 | "tolerance": "PT10S", 94 | "backoff": "PT30S" 95 | } 96 | } 97 | } 98 | } 99 | ] 100 | } -------------------------------------------------------------------------------- /scripts/deploy_to_marathon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | 4 | """ 5 | import requests 6 | from requests.auth import HTTPBasicAuth 7 | import argparse 8 | import json 9 | import logging 10 | import os 11 | import sys 12 | import traceback 13 | 14 | LOG = logging.getLogger() 15 | LOG.setLevel(logging.DEBUG) 16 | ch = logging.StreamHandler(sys.stdout) 17 | ch.setLevel(logging.DEBUG) 18 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 19 | ch.setFormatter(formatter) 20 | LOG.addHandler(ch) 21 | 22 | 23 | def main(main_args): 24 | """ 25 | Read environment variables to envvars_dict 26 | Open file with file_path 27 | Read contents as template_contents 28 | Render template_contents as rendered_file_contents with envvars_dict 29 | Send rendered_file_contents to file or stdout 30 | """ 31 | appdef_contents = read_file_contents(main_args.appdef_file) 32 | send_to_marathon(appdef_contents, main_args) 33 | 34 | 35 | def parse_cli_args(): 36 | p = argparse.ArgumentParser(description="Deploy To Marathon") 37 | p.add_argument("appdef_file", 38 | type=str, 39 | help="Path to application definition file") 40 | p.add_argument("--marathon-uri", dest="marathon_uri", type=str, 41 | required=True, help="The Marathon Endpoint") 42 | p.add_argument("--marathon-user", dest="marathon_user", type=str, 43 | required=False, help="Username for Marathon access") 44 | p.add_argument("--marathon-pass", dest="marathon_pass", type=str, 45 | required=False, help="Password for Marathon access") 46 | 47 | return p.parse_known_args() 48 | 49 | 50 | def read_file_contents(file_path): 51 | contents = None 52 | if os.path.isfile(file_path): 53 | with open(file_path, 
def send_to_marathon(app_def_contents, cli_args):
    """PUT the application definition to Marathon's /v2/apps/{id} endpoint.

    :param app_def_contents: raw JSON text of a Marathon app definition
    :param cli_args: parsed argparse namespace (marathon_uri/user/pass)
    :raises requests.HTTPError: when Marathon rejects the deployment
    """
    app_def_obj = json.loads(app_def_contents)
    url = "{}/v2/apps/{}".format(cli_args.marathon_uri, app_def_obj.get("id"))
    request_kwargs = dict(data=json.dumps(app_def_obj))

    if cli_args.marathon_user:
        request_kwargs.update(auth=HTTPBasicAuth(cli_args.marathon_user, cli_args.marathon_pass))

    response = requests.put(url, **request_kwargs)

    LOG.info(response.headers)
    LOG.info(response.status_code)
    LOG.info(response.content)
    # Surface HTTP-level deployment failures to the caller instead of only
    # logging them and reporting success.
    response.raise_for_status()


if __name__ == "__main__":
    exit_code = 0
    try:
        args, args_other = parse_cli_args()
        main(args)
    except Exception:
        # LOG.exception records the full traceback; the old code passed the
        # return value of traceback.print_tb() (always None) to LOG.error.
        LOG.exception("An error occurred in running the application!")
        # BUG FIX: the previous `finally: sys.exit(0)` always exited with
        # status 0, hiding deployment failures from CI and shell callers.
        exit_code = 1
    sys.exit(exit_code)
def data_file_path(filename):
    """Return the absolute path of *filename* inside this script's data/ folder.

    BUG FIX: the loaders previously joined against os.getcwd(), so the script
    broke unless launched from the scripts/ directory. Anchoring on the
    script's own location makes it runnable from any working directory.
    """
    return os.path.join(os.path.dirname(os.path.realpath(__file__)), "data", filename)


def _load_json(filename):
    """Load and return the parsed JSON document data/<filename>."""
    with open(data_file_path(filename), "r") as f:
        return json.load(f)


def load_app_definition():
    """Marathon application definition for the stress-tester container."""
    return _load_json("stress_tester_app.json")


def load_stress_parameters():
    """Parameters handed to the stress tool inside the test container."""
    return _load_json("stress-parameters.json")


def load_autoscaler_parameters():
    """Scaling parameters (min/max instances) used to label the test app."""
    return _load_json("autoscaler-parameters.json")


def parse_cli_args():
    """Parse the Marathon connection parameters from the command line."""
    parser = argparse.ArgumentParser(description="Stress Tester Deployer")
    parser.add_argument("--marathon-uri", dest="marathon_uri", type=str,
                        required=True, help="The Marathon Endpoint")
    parser.add_argument("--marathon-user", dest="marathon_user", type=str,
                        required=True, help="Username for Marathon access")
    parser.add_argument("--marathon-pass", dest="marathon_pass", type=str,
                        required=True, help="Password for Marathon access")
    return parser.parse_args()
| app_def["labels"]["min_instances"] = str(autoscaler_params["min_instances"]) 69 | app_def["labels"]["max_instances"] = str(autoscaler_params["max_instances"]) 70 | app_def["labels"]["mas_rule_scaleup_1"] = "cpu | >90 | PT2M | 1 | PT2M" 71 | app_def["labels"]["mas_rule_scaleup_2"] = "mem | >90 | PT2M | 1 | PT2M" 72 | app_def["labels"]["mas_rule_scaledown"] = "cpu | <90 | PT2M | -1 | PT2M" 73 | 74 | app_def["env"]["INSTRUCTIONS"] = json.dumps(stress_params).replace("\n", "").replace(" ", "") 75 | 76 | response = mara.create_app(app_def) 77 | print(response) 78 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/mesosagent.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from apiclientbase import ApiClientBase 5 | 6 | 7 | class MesosAgent(ApiClientBase): 8 | @staticmethod 9 | def load_paths(): 10 | """ 11 | :return: 12 | """ 13 | return """ 14 | { 15 | "/monitor/statistics": {"verb": "GET"}, 16 | "/metrics/snapshot": {"verb": "GET"}, 17 | "/api/v1/executor": {"verb": "GET"}, 18 | "/flags": {"verb": "GET"}, 19 | "/health": {"verb": "GET"}, 20 | "/state": {"verb": "GET"}, 21 | "/system/stats": {"verb": "GET"}, 22 | "/version": {"verb": "GET"}, 23 | "/{slave_id}/api/v1/executor": {"verb": "GET"}, 24 | "/{slave_id}/flags": {"verb": "GET"}, 25 | "/{slave_id}/health": {"verb": "GET"}, 26 | "/{slave_id}/state": {"verb": "GET"}, 27 | "/{slave_id}/state.json": {"verb": "GET"} 28 | } 29 | """ 30 | 31 | def get_statistics(self): 32 | """ 33 | :return: 34 | """ 35 | return self._call_endpoint("/monitor/statistics") 36 | 37 | def get_metrics(self): 38 | """ 39 | :return: 40 | """ 41 | return self._call_endpoint("/metrics/snapshot") 42 | 43 | def get_health(self): 44 | """ 45 | :return: 46 | """ 47 | return self._call_endpoint("/health") 48 | 49 | def get_state(self): 50 | """ 51 | :return: 52 | """ 53 | return self._call_endpoint("/state") 54 | 
55 | def get_executor(self): 56 | """ 57 | :return: 58 | """ 59 | return self._call_endpoint("/api/v1/executor") 60 | 61 | def get_flags(self): 62 | """ 63 | :return: 64 | """ 65 | return self._call_endpoint("/flags") 66 | 67 | def get_slave_health(self, slave_id): 68 | """ 69 | :param slave_id: 70 | :return: 71 | """ 72 | return self._call_endpoint("/{slave_id}/health", slave_id=slave_id) 73 | 74 | def get_slave_state(self, slave_id): 75 | """ 76 | :param slave_id: 77 | :return: 78 | """ 79 | return self._call_endpoint("/{slave_id}/state", slave_id=slave_id) 80 | 81 | def get_slave_executor(self, slave_id): 82 | """ 83 | :param slave_id: 84 | :return: 85 | """ 86 | return self._call_endpoint("/{slave_id}/api/v1/executor", slave_id=slave_id) 87 | 88 | def get_slave_flags(self, slave_id): 89 | """ 90 | :param slave_id: 91 | :return: 92 | """ 93 | return self._call_endpoint("/{slave_id}/flags", slave_id=slave_id) 94 | 95 | def get_system_stats(self): 96 | """ 97 | :return: 98 | """ 99 | return self._call_endpoint("/system/stats") 100 | 101 | def __init__(self, uri, creds=None, logger=None): 102 | """ 103 | :param uri: 104 | :param creds: 105 | :return: 106 | """ 107 | super(MesosAgent, self).__init__(uri, creds) 108 | self.paths = json.loads(self.load_paths()) 109 | self.logger = logger or logging.getLogger(__name__) 110 | -------------------------------------------------------------------------------- /tests/test_history_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import dateutil.parser 5 | from datetime import timedelta, datetime 6 | import pytest 7 | import logging 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | 11 | sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "lib", "marathon_autoscaler")) 12 | from history_manager import HistoryManager 13 | import settings 14 | 15 | rightnow = datetime.now() 16 | 17 | 
@pytest.fixture(scope="session") 18 | def testsettings(): 19 | fake_settings = dict(sleep_interval=5, 20 | mesos_uri=None, 21 | agent_port=5051, 22 | marathon_uri=None, 23 | marathon_user=None, 24 | marathon_pass=None, 25 | cpu_fan_out=None, 26 | datadog_api_key=None, 27 | datadog_app_key=None, 28 | datadog_env=None, 29 | log_config="/app/logging_config.json", 30 | enforce_version_match=False, 31 | rules_prefix="mas_rule" 32 | ) 33 | for name, value in fake_settings.items(): 34 | setattr(settings, name, value) 35 | 36 | 37 | @pytest.fixture(scope="session") 38 | def history_mgr(testsettings): 39 | _history_mgr = HistoryManager() 40 | assert (type(_history_mgr) is HistoryManager) 41 | return _history_mgr 42 | 43 | 44 | def datetime_parser(obj): 45 | for k, v in obj.items(): 46 | if "timestamp" in k: 47 | try: 48 | obj[k] = rightnow 49 | except ValueError as ve: 50 | pass 51 | return obj 52 | 53 | _counter = 1 54 | 55 | 56 | def _decrement_time(date_time, time_span): 57 | new_date_time = date_time - timedelta(seconds=time_span * _counter) 58 | global _counter 59 | _counter += 1 60 | return new_date_time 61 | 62 | 63 | @pytest.fixture(scope="session") 64 | def app_recommendations(): 65 | with open( 66 | os.path.join( 67 | os.path.dirname(os.path.abspath(__file__)), 68 | "simulation_data/app_recommendations.json" 69 | ), "r") as f: 70 | test_app_recommendations = json.load(f, object_hook=datetime_parser) 71 | return test_app_recommendations.get("recommendationsList") 72 | 73 | 74 | def test_add_to_perf_tail(history_mgr, app_recommendations): 75 | for recommendation in app_recommendations: 76 | history_mgr.add_to_perf_tail(recommendation) 77 | assert len(history_mgr.app_performance_tail) > 1 78 | 79 | 80 | def test_tolerance_reached(history_mgr): 81 | [event.update({"timestamp": _decrement_time(event.get("timestamp"), 6)}) 82 | for recommendation in history_mgr.app_performance_tail 83 | for app, event in recommendation.items()] 84 | 85 | assert 
history_mgr.tolerance_reached("test-service", "PT10S", 1) 86 | 87 | 88 | def test_within_backoff(history_mgr): 89 | assert not history_mgr.within_backoff("test-service", "PT2M", 1) 90 | 91 | 92 | def test_is_time_window_filled(history_mgr): 93 | rightnow = datetime.now() 94 | assert history_mgr.is_time_window_filled("test-service", rightnow - timedelta(seconds=10)) 95 | 96 | 97 | def test_get_timedelta(history_mgr): 98 | timespan_obj = history_mgr.get_timedelta("PT3M34S") 99 | assert timespan_obj.seconds == 214 100 | -------------------------------------------------------------------------------- /tests/stress_tester_app/README.md: -------------------------------------------------------------------------------- 1 | ## Stress Tester App Container 2 | 3 | ### Purpose 4 | 5 | This container is simply a container around the [stress application](http://people.seas.harvard.edu/~apw/stress/). 6 | When the container is started, it will call the application with parameters provided via environment variables. 7 | 8 | 9 | ### The INSTRUCTIONS Environment Variable 10 | 11 | The application reads the INSTRUCTIONS variable and attempts to convert it to JSON. 12 | The JSON document needs an instructions segment and play_mode segment. 
13 | 14 | #### Examples 15 | 16 | Single ordered run 17 | - `stress --cpu 1 --timeout 10s --quiet` 18 | - `sleep 30` 19 | 20 | ``` 21 | { 22 | "instructions": [ 23 | { 24 | "cmd": "stress", 25 | "switches": { 26 | "--cpu": 1, 27 | "--timeout": "10s" 28 | }, 29 | "flags": [ 30 | "--quiet" 31 | ], 32 | "args": [] 33 | }, 34 | { 35 | "cmd": "sleep", 36 | "args": [ 37 | "30" 38 | ] 39 | } 40 | ], 41 | "play_mode": "single" 42 | } 43 | ``` 44 | 45 | Repeated ordered run 46 | - `stress --cpu 4 --vm 3 --timeout 5m --verbose` 47 | - `sleep 10` 48 | 49 | ``` 50 | { 51 | "instructions": [ 52 | { 53 | "cmd": "stress", 54 | "switches": { 55 | "--cpu": "4", 56 | "--vm": "3" 57 | "--timeout": "5m" 58 | }, 59 | "flags": [ 60 | "--verbose" 61 | ], 62 | "args": [] 63 | }, 64 | { 65 | "cmd": "sleep", 66 | "args": [ 67 | "30" 68 | ] 69 | } 70 | ], 71 | "play_mode": "repeat" 72 | } 73 | ``` 74 | 75 | Single shuffle run 76 | - `stress --cpu 4 --vm 3 --timeout 5m --verbose` 77 | - `stress --cpu 1 --timeout 10s --quiet` 78 | - `sleep 30` 79 | - `sleep 10` 80 | 81 | ``` 82 | { 83 | "instructions": [ 84 | { 85 | "cmd": "stress", 86 | "switches": { 87 | "--cpu": "4", 88 | "--vm": "3" 89 | "--timeout": "5m" 90 | }, 91 | "flags": [ 92 | "--verbose" 93 | ], 94 | "args": [] 95 | }, 96 | { 97 | "cmd": "stress", 98 | "switches": { 99 | "--cpu": 1, 100 | "--timeout": "10s" 101 | }, 102 | "flags": [ 103 | "--quiet" 104 | ], 105 | "args": [] 106 | }, 107 | { 108 | "cmd": "sleep", 109 | "args": [ 110 | "30" 111 | ] 112 | }, 113 | { 114 | "cmd": "sleep", 115 | "args": [ 116 | "10" 117 | ] 118 | } 119 | ], 120 | "play_mode": "shuffle" 121 | } 122 | ``` 123 | 124 | Repeated shuffle run 125 | - `stress --cpu 4 --vm 3 --timeout 5m --verbose` 126 | - `stress --cpu 1 --timeout 10s --quiet` 127 | - `sleep 30` 128 | - `sleep 10` 129 | 130 | ``` 131 | { 132 | "instructions": [ 133 | { 134 | "cmd": "stress", 135 | "switches": { 136 | "--cpu": "4", 137 | "--vm": "3" 138 | "--timeout": "5m" 139 | }, 140 | 
"flags": [ 141 | "--verbose" 142 | ], 143 | "args": [] 144 | }, 145 | { 146 | "cmd": "stress", 147 | "switches": { 148 | "--cpu": 1, 149 | "--timeout": "10s" 150 | }, 151 | "flags": [ 152 | "--quiet" 153 | ], 154 | "args": [] 155 | }, 156 | { 157 | "cmd": "sleep", 158 | "args": [ 159 | "30" 160 | ] 161 | }, 162 | { 163 | "cmd": "sleep", 164 | "args": [ 165 | "10" 166 | ] 167 | } 168 | ], 169 | "play_mode": "repeat_shuffle" 170 | } 171 | ``` -------------------------------------------------------------------------------- /lib/marathon_autoscaler/mesosmaster.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from apiclientbase import ApiClientBase 5 | 6 | 7 | class MesosMaster(ApiClientBase): 8 | @staticmethod 9 | def load_paths(): 10 | """ 11 | :return: 12 | """ 13 | return """ 14 | { 15 | "/master/api/v1/scheduler": {"verb": "POST"}, 16 | "/master/create-volumes": {"verb": "POST"}, 17 | "/master/destroy-volumes": {"verb": "POST"}, 18 | "/master/flags": {"verb": "GET"}, 19 | "/master/frameworks": {"verb": "GET"}, 20 | "/master/health": {"verb": "GET"}, 21 | "/master/machine/down": {"verb": "POST"}, 22 | "/master/machine/up": {"verb": "POST"}, 23 | "/master/maintenance/schedule": {"verb": "GET"}, 24 | "/master/maintenance/status": {"verb": "GET"}, 25 | "/master/observe": {"verb": "POST"}, 26 | "/master/quota": {"verb": "GET"}, 27 | "/master/redirect": {"verb": "GET"}, 28 | "/master/reserve": {"verb": "POST"}, 29 | "/master/roles": {"verb": "GET"}, 30 | "/master/slaves": {"verb": "GET"}, 31 | "/master/state": {"verb": "GET"}, 32 | "/master/state-summary": {"verb": "GET"}, 33 | "/master/state": {"verb": "GET"}, 34 | "/master/tasks": {"verb": "GET"}, 35 | "/master/teardown": {"verb": "POST"}, 36 | "/master/unreserve": {"verb": "POST"}, 37 | "/monitor/statistics": {"verb": "GET"}, 38 | "/metrics/snapshot": {"verb": "GET"}, 39 | "/system/stats": {"verb": "GET"}, 40 | "/version": {"verb": "GET"} 
41 | } 42 | """ 43 | 44 | def __init__(self, uri, creds=None, logger=None): 45 | """ 46 | :param uri: 47 | :param creds: 48 | :return: 49 | """ 50 | super(MesosMaster, self).__init__(uri, creds) 51 | self.paths = json.loads(self.load_paths()) 52 | self.logger = logger or logging.getLogger(__name__) 53 | 54 | def find_master(self): 55 | self.uri = self._call_endpoint("/master/redirect").url 56 | 57 | def get_health(self): 58 | """ 59 | :return: 60 | """ 61 | self.find_master() 62 | return self._call_endpoint("/master/health") 63 | 64 | def get_slaves(self): 65 | """ 66 | :return: 67 | """ 68 | self.find_master() 69 | return self._call_endpoint("/master/slaves") 70 | 71 | def get_state(self): 72 | """ 73 | :return: 74 | """ 75 | self.find_master() 76 | return self._call_endpoint("/master/state") 77 | 78 | def get_tasks(self): 79 | """ 80 | :return: 81 | """ 82 | self.find_master() 83 | return self._call_endpoint("/master/tasks") 84 | 85 | def get_statistics(self): 86 | """ 87 | :return: 88 | """ 89 | self.find_master() 90 | return self._call_endpoint("/monitor/statistics") 91 | 92 | def get_system_stats(self): 93 | """ 94 | :return: 95 | """ 96 | self.find_master() 97 | return self._call_endpoint("/system/stats") 98 | 99 | def get_metrics(self): 100 | """ 101 | :return: 102 | """ 103 | self.find_master() 104 | return self._call_endpoint("/metrics/snapshot") 105 | 106 | def get_version(self): 107 | """ 108 | :return: 109 | """ 110 | self.find_master() 111 | return self._call_endpoint("/version") 112 | -------------------------------------------------------------------------------- /scripts/deploy_autoscaler_to_marathon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This script constructs an Marathon application definition for the Marathon Autoscaler container. 4 | 5 | Be sure to deploy the latest Marathon Autoscaler docker image to the registry before running this. 
6 | """ 7 | import argparse 8 | import json 9 | import os 10 | import sys 11 | 12 | BASE_PATH = os.path.dirname(os.path.realpath(__file__)) 13 | PROJECT_PATH = os.path.dirname(BASE_PATH) 14 | sys.path.append(os.path.join(PROJECT_PATH, 'lib/')) 15 | 16 | from marathon_autoscaler.marathon import Marathon 17 | 18 | 19 | def load_app_definition(): 20 | with open(os.path.join(os.getcwd(), "data", "marathon_autoscaler_app.json"), 'r') as f: 21 | test_app_definition = json.load(f) 22 | return test_app_definition 23 | 24 | 25 | def parse_cli_args(): 26 | p = argparse.ArgumentParser(description="Deploy Marathon Autoscaler") 27 | p.add_argument("--marathon-uri", dest="marathon_uri", type=str, 28 | required=True, help="The Marathon Endpoint") 29 | p.add_argument("--marathon-user", dest="marathon_user", type=str, 30 | required=False, help="Username for Marathon access") 31 | p.add_argument("--marathon-pass", dest="marathon_pass", type=str, 32 | required=False, help="Password for Marathon access") 33 | p.add_argument("--interval", dest="sleep_interval", type=str, 34 | required=True, help="The time duration in seconds between polling events") 35 | p.add_argument("--mesos-uri", dest="mesos_uri", type=str, 36 | required=True, help="The Mesos Endpoint") 37 | p.add_argument("--log-verbosity", dest="log_verbosity", type=str, 38 | required=True, choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], 39 | help="Logging verbosity") 40 | p.add_argument("--cpu-fan-out", dest="cpu_fan_out", type=str, 41 | required=True, 42 | help="The number of sub processes to fan out to when making parallel web calls") 43 | p.add_argument("--dd-api-key", dest="datadog_api_key", type=str, 44 | required=True, help="Datadog API key") 45 | p.add_argument("--dd-app-key", dest="datadog_app_key", type=str, 46 | required=True, help="Datadog APP key") 47 | p.add_argument("--dd-env", dest="datadog_env", type=str, 48 | required=True, help="Datadog ENV variable") 49 | p.add_argument("--enforce-version-match", 
dest="enforce_version_match", 50 | type=bool, default=False, required=False, 51 | help="If set, version matching will be required of applications to participate") 52 | p.add_argument("--rules-prefix", dest="rules_prefix", 53 | type=str, default="mas_rule", required=False, 54 | help="The prefix for rule label names") 55 | 56 | return p.parse_args() 57 | 58 | 59 | if __name__ == "__main__": 60 | args = parse_cli_args() 61 | 62 | app_def = load_app_definition() 63 | app_def["env"]["INTERVAL"] = args.sleep_interval 64 | app_def["env"]["MESOS_URI"] = args.mesos_uri 65 | app_def["env"]["MARATHON_URI"] = args.marathon_uri 66 | app_def["env"]["MARATHON_USER"] = args.marathon_user 67 | app_def["env"]["MARATHON_PASS"] = args.marathon_pass 68 | app_def["env"]["LOG_VERBOSITY"] = args.log_verbosity 69 | app_def["env"]["CPU_FAN_OUT"] = args.cpu_fan_out 70 | app_def["env"]["DATADOG_API_KEY"] = args.datadog_api_key 71 | app_def["env"]["DATADOG_APP_KEY"] = args.datadog_app_key 72 | app_def["env"]["DATADOG_ENV"] = args.datadog_env 73 | app_def["env"]["ENFORCE_VERSION_MATCH"] = args.enforce_version_match 74 | app_def["env"]["RULES_PREFIX"] = args.rules_prefix 75 | 76 | mara = Marathon(args.marathon_uri, (args.marathon_user, args.marathon_pass)) 77 | response = mara.create_app(app_def) 78 | print(response) 79 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import logging.config 5 | import os 6 | import pkg_resources 7 | import sys 8 | from poller import Poller 9 | import settings 10 | 11 | 12 | def setup_logging(cli_args): 13 | """ Setup logging configuration 14 | :param cli_args: Argparse object containing parameters from the command line 15 | :return: Logger 16 | """ 17 | logconfig_path = cli_args.log_config 18 | if not os.path.isabs(logconfig_path): 19 | resource_package = 
class EnvDefault(argparse.Action):
    """argparse action that lets an environment variable satisfy an argument.

    When the named environment variable is present, its value becomes the
    argument's default and the argument stops being required, so either the
    CLI flag or the environment can provide a "required" parameter.
    """

    def __init__(self, envvar, required=True, default=None, **kwargs):
        env_value = os.environ.get(envvar) if envvar else None
        if env_value is not None:
            default = env_value
        if required and default:
            # A usable default (from the environment or the declaration)
            # means the flag no longer has to appear on the command line.
            required = False
        super(EnvDefault, self).__init__(default=default, required=required, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # An explicit command-line value always wins over the env default.
        setattr(namespace, self.dest, values)
50 | """ 51 | p = argparse.ArgumentParser(description="Marathon Autoscaler") 52 | p.add_argument("-i", "--interval", dest="sleep_interval", action=EnvDefault, envvar="INTERVAL", type=int, 53 | default=5, help="The time duration in seconds between polling events") 54 | p.add_argument("--mesos-uri", dest="mesos_uri", action=EnvDefault, envvar="MESOS_URI", type=str, required=True, 55 | help="The Mesos Endpoint") 56 | p.add_argument("--agent-port", dest="agent_port", action=EnvDefault, envvar="AGENT_PORT", type=int, 57 | required=True, default=5051, help="Mesos Agent Port") 58 | p.add_argument("--marathon-uri", dest="marathon_uri", action=EnvDefault, envvar="MARATHON_URI", type=str, 59 | required=True, help="The Marathon Endpoint") 60 | p.add_argument("--marathon-user", dest="marathon_user", action=EnvDefault, envvar="MARATHON_USER", type=str, 61 | required=False, help="The Marathon Username", default=None) 62 | p.add_argument("--marathon-pass", dest="marathon_pass", action=EnvDefault, envvar="MARATHON_PASS", type=str, 63 | required=False, help="The Marathon Password", default=None) 64 | p.add_argument("--cpu-fan-out", dest="cpu_fan_out", action=EnvDefault, envvar="CPU_FAN_OUT", type=int, 65 | default=None, required=False, help="Number of subprocesses to use for gathering and sending stats to Datadog") 66 | p.add_argument("--dd-api-key", dest="datadog_api_key", action=EnvDefault, envvar="DATADOG_API_KEY", type=str, 67 | required=False, help="Datadog API key") 68 | p.add_argument("--dd-app-key", dest="datadog_app_key", action=EnvDefault, envvar="DATADOG_APP_KEY", type=str, 69 | required=False, help="Datadog APP key") 70 | p.add_argument("--dd-env", dest="datadog_env", action=EnvDefault, envvar="DATADOG_ENV", type=str, 71 | required=False, help="Datadog ENV variable") 72 | p.add_argument("--log-config", dest="log_config", action=EnvDefault, envvar="LOG_CONFIG", type=str, 73 | default="/app/logging_config.json", 74 | help="Path to logging configuration file") 75 | 
def add_args_to_settings(cli_args):
    """Mirror every parsed CLI/ENV argument onto the shared settings module.

    :param cli_args: argparse namespace produced by parse_cli_args()
    """
    # BUG FIX: dict.iteritems() exists only on Python 2; .items() works on
    # both 2 and 3, so this module no longer crashes on import under Python 3.
    for name, value in vars(cli_args).items():
        setattr(settings, name, value)
def shuffle_play(command_statements):
    """Execute the statements once, in a uniformly random order.

    :param command_statements: list of statement lists
    :return: None
    """
    statements = list(command_statements)  # copy: the caller's sequence is untouched
    while statements:
        # BUG FIX: the old code drew random.randint(1, len-1), which could
        # never select index 0 while other statements remained -- the first
        # statement always ran last, so the play was not actually shuffled.
        next_statement = statements.pop(random.randrange(len(statements)))
        execute_statement(next_statement)


def repeat_shuffle_play(command_statements):
    """Infinitely repeat the shuffle_play.

    :param command_statements: list of statement lists
    :return: None
    """
    # shuffle_play copies its input, so the tuple stays full and the loop
    # repeats forever (for a non-empty instruction set), as intended.
    statements = tuple(command_statements)
    while len(statements) > 0:
        shuffle_play(statements)
        print(statements)


def execute_statement(statement):
    """Pass the statement list to subprocess.call()

    :param statement: list of command tokens
    :return: None
    """
    print("Calling {0}".format(statement))
    call(statement)


def generate_command_statement(instruction):
    """Convert a json fragment to an argv-style list (cmd, [args] [switches] [flags]).

    :param instruction: json fragment with "cmd" and optional "args"/"switches"/"flags"
    :return: statement list of string tokens
    """
    cmd_statement = [instruction.get("cmd")]
    for arg in instruction.get("args", []):
        cmd_statement.append(str(arg))
    for switch, value in instruction.get("switches", {}).items():
        cmd_statement.append(switch)
        # BUG FIX: subprocess.call() requires string tokens, but the
        # documented examples pass integers (e.g. "--cpu": 1), which used
        # to raise TypeError. Coerce every value to str.
        cmd_statement.append(str(value))
    for flag in instruction.get("flags", []):
        cmd_statement.append(flag)
    return cmd_statement


def load_instruction_set():
    """Convert the INSTRUCTIONS environment variable to a json object.

    :return: parsed json object, or None when INSTRUCTIONS is unset/invalid
    """
    raw_data = os.getenv("INSTRUCTIONS")
    try:
        return json.loads(raw_data)
    except (TypeError, ValueError) as err:
        # TypeError covers an unset variable (json.loads(None)); ValueError
        # covers malformed JSON. Either way: log it and fall through to None.
        logger.info(raw_data)
        logger.exception(err)
101 | :param instruction_set: json object 102 | :return: play_mode value 103 | """ 104 | result = instruction_set.get("play_mode") 105 | if result is None: 106 | raise ValueError("Play mode not found in instruction_set data.") 107 | return result 108 | 109 | 110 | def get_command_statements(instruction_set): 111 | """ 112 | Get the instructions out of the json document. 113 | :param instruction_set: json document containing instructions 114 | :return: list of statement lists 115 | """ 116 | raw_instructions = instruction_set.get("instructions") 117 | if raw_instructions is None: 118 | raise ValueError("Instructions not found in instruction_set data.") 119 | instructions = [] 120 | for instruction in raw_instructions: 121 | instructions.append(generate_command_statement(instruction)) 122 | 123 | return instructions 124 | 125 | 126 | def establish_logger(): 127 | """ 128 | Enable logging 129 | :return: None 130 | """ 131 | logging.getLogger('').handlers = [] 132 | 133 | log_formatter = logging.Formatter("%(asctime)s %(process)d %(funcName)s:%(lineno)d %(name)s %(message)s") 134 | log_system = logging.getLogger(__name__) 135 | log_system.setLevel(logging.DEBUG) 136 | 137 | console_handler = logging.StreamHandler(sys.stdout) 138 | console_handler.setLevel(logging.INFO) 139 | console_handler.setFormatter(log_formatter) 140 | 141 | log_system.addHandler(console_handler) 142 | 143 | return log_system 144 | 145 | 146 | def start(): 147 | """ 148 | Begin play / execution of instructions. 
149 | :return: None 150 | """ 151 | instruction_set = load_instruction_set() 152 | play_mode = get_play_mode(instruction_set) 153 | command_statements = get_command_statements(instruction_set) 154 | play_modes[play_mode](command_statements) 155 | 156 | 157 | if __name__ == "__main__": 158 | play_modes = { 159 | "single": single_play, 160 | "repeat": repeat_play, 161 | "shuffle": shuffle_play, 162 | "repeat_shuffle": repeat_shuffle_play 163 | } 164 | logger = establish_logger() 165 | start() 166 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/marathon.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from apiclientbase import ApiClientBase 5 | 6 | 7 | class Marathon(ApiClientBase): 8 | @staticmethod 9 | def load_paths(): 10 | """ 11 | :return: 12 | """ 13 | return """ 14 | { 15 | "/v2/apps": {"verb": "GET"}, 16 | "/v2/apps/{appId}": {"verb": "GET"}, 17 | "/v2/apps/{appId}/versions": {"verb": "GET"}, 18 | "/v2/apps/{appId}/versions/{versionId}": {"verb": "GET"}, 19 | "/v2/apps/{appId}/tasks": {"verb": "GET"}, 20 | "/v2/groups": {"verb": "GET"}, 21 | "/v2/groups/{groupId}": {"verb": "GET"}, 22 | "/v2/tasks": {"verb": "GET"}, 23 | "/v2/deployments": {"verb": "GET"}, 24 | "/v2/events": {"verb": "GET"}, 25 | "/v2/eventSubscriptions": {"verb": "GET"}, 26 | "/v2/queue": {"verb": "GET"}, 27 | "/v2/info": {"verb": "GET"}, 28 | "/v2/leader": {"verb": "GET"}, 29 | "/ping": {"verb": "GET"}, 30 | "/logging": {"verb": "GET"}, 31 | "/help": {"verb": "GET"}, 32 | "/metrics": {"verb": "GET"} 33 | } 34 | """ 35 | 36 | def __init__(self, uri, creds=None, logger=None): 37 | """ 38 | :param uri: 39 | :param creds: 40 | :return: 41 | """ 42 | super(Marathon, self).__init__(uri, creds) 43 | self.paths = json.loads(self.load_paths()) 44 | self.logger = logger or logging.getLogger(__name__) 45 | 46 | def get_all_apps(self): 47 | """ 48 | :return: 49 | """ 
50 | return self._call_endpoint("/v2/apps") 51 | 52 | def get_all_app_names(self): 53 | """ 54 | :return: 55 | """ 56 | return [app.get("id").lstrip("/") for app in self.get_all_apps().get("apps")] 57 | 58 | def get_app_details(self, marathon_app): 59 | """ 60 | :param marathon_app: 61 | :return: 62 | """ 63 | return self._call_endpoint("/v2/apps/{appId}", appId=marathon_app) 64 | 65 | def get_app_tasks(self, marathon_app): 66 | """ 67 | :param marathon_app: 68 | :return: 69 | """ 70 | return self._call_endpoint("/v2/apps/{appId}/tasks", appId=marathon_app) 71 | 72 | def get_all_groups(self): 73 | """ 74 | :return: 75 | """ 76 | return self._call_endpoint("/v2/groups") 77 | 78 | def get_group(self, group): 79 | """ 80 | :param group: 81 | :return: 82 | """ 83 | return self._call_endpoint("/v2/groups/{groupId}", groupId=group) 84 | 85 | def get_info(self): 86 | """ 87 | :return: 88 | """ 89 | return self._call_endpoint("/v2/info") 90 | 91 | def get_tasks(self): 92 | """ 93 | :return: 94 | """ 95 | return self._call_endpoint("/v2/tasks") 96 | 97 | def get_deployments(self): 98 | """ 99 | :return: 100 | """ 101 | return self._call_endpoint("/v2/deployments") 102 | 103 | def get_events(self): 104 | """ 105 | :return: 106 | """ 107 | return self._call_endpoint("/v2/events") 108 | 109 | def get_event_subscriptions(self): 110 | """ 111 | :return: 112 | """ 113 | return self._call_endpoint("/v2/eventSubscriptions") 114 | 115 | def get_queue(self): 116 | """ 117 | :return: 118 | """ 119 | return self._call_endpoint("/v2/queue") 120 | 121 | def get_leader(self): 122 | """ 123 | :return: 124 | """ 125 | return self._call_endpoint("/v2/leader") 126 | 127 | def get_metrics(self): 128 | """ 129 | :return: 130 | """ 131 | return self._call_endpoint("/metrics") 132 | 133 | def update_app(self, marathon_app, data): 134 | """ 135 | :param marathon_app: name of the application 136 | :param data: json segment to update 137 | :return: 138 | """ 139 | json_data = json.dumps(data) 140 | 
response = self._call_endpoint("/v2/apps/{appId}", appId=marathon_app, verb="PUT", data=json_data) 141 | self.logger.debug(response) 142 | return response 143 | 144 | def scale_app(self, marathon_app, instances): 145 | """ 146 | :param marathon_app: name of the application 147 | :param instances: number of instances to scale to. 148 | :return: 149 | """ 150 | data = {"instances": instances} 151 | json_data = json.dumps(data) 152 | response = self._call_endpoint("/v2/apps/{appId}", appId=marathon_app, verb="PUT", data=json_data) 153 | self.logger.debug(response) 154 | return response 155 | 156 | def create_app(self, marathon_app_definition): 157 | """ 158 | :param marathon_app_definition: JSON representation of the application to be created 159 | :return: response 160 | """ 161 | json_data = json.dumps(marathon_app_definition) 162 | response = self._call_endpoint("/v2/apps", verb="POST", data=json_data) 163 | self.logger.debug(response) 164 | return response 165 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/scaler.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from constants import IDLE 3 | import logging 4 | from utils import clamp 5 | from rules_manager import RulesManager 6 | from history_manager import HistoryManager 7 | from application_definition import ApplicationDefinition 8 | 9 | 10 | class AutoScaler(object): 11 | """ 12 | The source of the scaling decision. 
13 | """ 14 | def __init__(self, marathon_client, logger=None, dd_client=None, cli_args=None): 15 | self.marathon_client = marathon_client 16 | self.logger = logger or logging.getLogger(__name__) 17 | self.dd_client = dd_client 18 | self.enforce_version_match = False 19 | self.hm = HistoryManager(dd_client=dd_client) 20 | if cli_args is not None: 21 | self.enforce_version_match = cli_args.enforce_version_match 22 | 23 | def scale(self, app_def, rule_manager): 24 | """ Take scale action 25 | :param app_def: dict of marathon application settings 26 | :param rule_manager: object of scaling properties. 27 | :return: marathon response 28 | """ 29 | if not app_def.is_app_participating: 30 | return 31 | 32 | scale_factor = int(rule_manager.last_triggered_criteria.get("scale_factor")) 33 | min_instances = int(rule_manager.min_instances) 34 | max_instances = int(rule_manager.max_instances) 35 | 36 | scale_to = app_def.instances + scale_factor 37 | scale_to_size = clamp(scale_to, min_instances, max_instances) 38 | 39 | if app_def.instances == scale_to_size: 40 | msg = "{app_name}: application already scaled to {size}" 41 | self.logger.info(msg.format(app_name=app_def.app_name, 42 | size=scale_to_size)) 43 | return 44 | 45 | self.marathon_client.scale_app(app_def.id, scale_to_size) 46 | msg = "{app_name}: scaled to {size}" 47 | self.logger.info(msg.format(app_name=app_def.app_name, 48 | size=scale_to_size)) 49 | 50 | def decide(self, app_metrics_summary): 51 | """ 52 | The decision-maker of the autoscaler. 
53 | :param app_metrics_summary: dict of app definitions and metrics 54 | :return: None 55 | """ 56 | self.logger.info("Decision process beginning.") 57 | 58 | app_scale_recommendations = {} 59 | for app, metrics_summary in app_metrics_summary.items(): 60 | app_def = ApplicationDefinition(metrics_summary.get("application_definition")) 61 | rm = RulesManager(app_def) 62 | if rm.is_app_participating(): 63 | vote = 0 64 | scale_factor = 0 65 | cpu = metrics_summary.get("cpu_avg_usage") 66 | mem = metrics_summary.get("memory_avg_usage") 67 | metrics = dict(cpu=cpu, 68 | mem=mem) 69 | 70 | rm.trigger_rules(metrics) 71 | 72 | if rm.last_triggered_criteria: 73 | scale_factor = int(rm.last_triggered_criteria.get("scale_factor")) 74 | vote = 1 if scale_factor > 0 else -1 75 | 76 | app_scale_recommendations[app] = dict(vote=vote, 77 | checksum=app_def.version, 78 | timestamp=datetime.now(), 79 | rule=rm.last_triggered_rule) 80 | info_msg = "{app_name}: vote: {vote} ; scale_factor requested: {scale_factor}" 81 | self.logger.info(info_msg.format(app_name=app_def.app_name, 82 | vote=vote, 83 | scale_factor=scale_factor)) 84 | # Check if app is participating 85 | # Check if app is ready 86 | # Check if app instances is greater than or equal to min and less than max 87 | 88 | if (rm.is_app_ready() and 89 | rm.is_app_within_min_or_max() and 90 | rm.last_triggered_criteria): 91 | tolerance_reached = self.hm.tolerance_reached(app, 92 | rm.last_triggered_criteria.get("tolerance"), 93 | vote) 94 | within_backoff = self.hm.within_backoff(app, 95 | rm.last_triggered_criteria.get("backoff"), 96 | vote) 97 | 98 | if vote is not IDLE and tolerance_reached and not within_backoff: 99 | self.logger.info("{app}: Decision made: Scale.".format(app=app_def.app_name)) 100 | app_scale_recommendations[app]["decision"] = vote 101 | self.scale(app_def, rm) 102 | elif vote == IDLE: 103 | app_scale_recommendations[app]["decision"] = IDLE 104 | self.logger.info("{app}: Decision made: No 
Change.".format(app=app_def.app_name)) 105 | 106 | self.hm.add_to_perf_tail(app_scale_recommendations) 107 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/datadog_metrics.py: -------------------------------------------------------------------------------- 1 | """Datadog functions for autoscaler""" 2 | import logging 3 | from datadog import initialize, api 4 | 5 | 6 | class DatadogClient: 7 | 8 | def __init__(self, cli_args, logger=None): 9 | if cli_args.datadog_api_key and cli_args.datadog_app_key: 10 | self.dd_auth = dict(api_key=cli_args.datadog_api_key, 11 | app_key=cli_args.datadog_app_key) 12 | self.dd_env = cli_args.datadog_env 13 | self.cpu_fan_out = cli_args.cpu_fan_out 14 | self.logger = logger or logging.getLogger(__name__) 15 | self.enabled = True 16 | initialize(**self.dd_auth) 17 | else: 18 | self.enabled = False 19 | 20 | def send_datadog_metrics(self, stats): 21 | """ Enumerates metrics from stats object to send to Datadog 22 | :param stats: a complex dictionary of marathon application metrics information 23 | :return: None 24 | """ 25 | try: 26 | if self.enabled: 27 | metrics = [] 28 | for app, items in stats.items(): 29 | tags = ["env:{}".format(self.dd_env), 30 | "app:{}".format(app)] 31 | 32 | # Avg CPU for entire app 33 | metrics.append(dict(metric='marathon.app.cpu_avg', 34 | points=items['cpu_avg_usage'], 35 | host='n/a', 36 | tags=tags)) 37 | 38 | # Avg mem for entire app 39 | metrics.append(dict(metric='marathon.app.mem_avg', 40 | points=items['memory_avg_usage'], 41 | host='n/a', 42 | tags=tags)) 43 | 44 | tags = ["env:{}".format(self.dd_env), 45 | "app:{}".format(app), 46 | "executor:{}".format(items['max_cpu'][1])] 47 | 48 | # Max CPU for entire app 49 | metrics.append(dict(metric='marathon.app.cpu_max', 50 | points=items['max_cpu'][0], 51 | host='n/a', 52 | tags=tags)) 53 | 54 | # Max mem for entire app 55 | tags = ["env:{}".format(self.dd_env), 56 | "app:{}".format(app), 57 | 
"executor:{}".format(items['max_memory'][1])] 58 | 59 | metrics.append(dict(metric='marathon.app.mem_max', 60 | points=items['max_memory'][0], 61 | host='n/a', 62 | tags=tags)) 63 | 64 | # Per-executor metrics 65 | for item in items['executor_metrics']: 66 | tags = ["env:{}".format(self.dd_env), 67 | "app:{}".format(app), 68 | "executor:{}".format(item['executor_id'])] 69 | 70 | metrics.append(dict(metric='marathon.executor.cpu', 71 | points=item['cpu_total_usage'], 72 | host=item['host'], 73 | tags=tags)) 74 | metrics.append(dict(metric='marathon.executor.mem', 75 | points=item['memory_total_usage'], 76 | host=item['host'], 77 | tags=tags)) 78 | 79 | api.Metric.send(metrics=metrics) 80 | except Exception as err: 81 | self.logger.error(err) 82 | 83 | def send_counter_event(self, app, metric, points=None, tags=None, **kwargs): 84 | """ 85 | marathon_autoscaler.counters.min_instances [tags- app:{app_name} env:{env}] 86 | marathon_autoscaler.counters.max_instances [tags- app:{app_name} env:{env}] 87 | marathon_autoscaler.counters.current_instances [tags- app:{app_name} env:{env}] 88 | :param app: the marathon application name 89 | :param metric: the metric name 90 | :param points: the metric value(s) 91 | :param tags: datadog tags for categorization 92 | :param kwargs: kwargs for additional future input 93 | :return: None 94 | """ 95 | if self.enabled: 96 | all_tags = ["env:{}".format(self.dd_env), "app:{}".format(app)] 97 | 98 | if tags: 99 | all_tags = tags + all_tags 100 | 101 | try: 102 | api.Metric.send(metric=metric, 103 | points=points if points else 1, 104 | tags=all_tags, 105 | type='counter') 106 | except Exception as err: 107 | self.logger.error(err) 108 | 109 | def send_scale_event(self, app, factor, direction, tags=None): 110 | """ 111 | marathon_autoscaler.events.scale_up [tags- app:{app_name} env:{env}] 112 | marathon_autoscaler.events.scale_down [tags- app:{app_name} env:{env}] 113 | :param app: the marathon application name 114 | :param factor: the 
scaling factor 115 | :param direction: the scaling direction 116 | :param tags: datadog tags for categorization 117 | :return: None 118 | """ 119 | if self.enabled: 120 | all_tags = ["env:{}".format(self.dd_env), "app:{}".format(app)] 121 | 122 | if tags: 123 | all_tags = tags + all_tags 124 | metrics = { 125 | 1: "marathon_autoscaler.events.scale_up", 126 | -1: "marathon_autoscaler.events.scale_down", 127 | 0: "marathon_autoscaler.events.idle" 128 | } 129 | try: 130 | api.Metric.send(metric=metrics[direction], 131 | points=factor, 132 | tags=all_tags, 133 | type='counter') 134 | except Exception as err: 135 | self.logger.error(err) 136 | -------------------------------------------------------------------------------- /tests/simulation_data/app_metric_summaries.json: -------------------------------------------------------------------------------- 1 | { 2 | "summaries": [ 3 | { 4 | "test-service": { 5 | "cpu_avg_usage": 0.43968765024682216, 6 | "max_cpu": [ 7 | 0.9436132136658013, 8 | "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b", 9 | "10.0.0.10" 10 | ], 11 | "executor_metrics": [ 12 | { 13 | "memory_total_usage": 18.38436759478673, 14 | "cpu_total_usage": 0.1272753639609533, 15 | "cpus_user_time_secs": 0.00999999999999801, 16 | "cpus_system_time_secs": 0.0, 17 | "host": "10.0.0.11", 18 | "timestamp": 7.856980085372925, 19 | "executor_id": "test-service.7396b026-961a-11e6-b4c9-0a0e831c5c6b" 20 | }, 21 | { 22 | "memory_total_usage": 18.812018661137444, 23 | "cpu_total_usage": 0.24817437311371182, 24 | "cpus_user_time_secs": 0.019999999999999574, 25 | "cpus_system_time_secs": 0.0, 26 | "host": "10.0.0.12", 27 | "timestamp": 8.058849811553955, 28 | "executor_id": "test-service.73968915-961a-11e6-b4c9-0a0e831c5c6b" 29 | }, 30 | { 31 | "memory_total_usage": 19.860164889415483, 32 | "cpu_total_usage": 0.9436132136658013, 33 | "cpus_user_time_secs": 0.07000000000000028, 34 | "cpus_system_time_secs": 0.010000000000000009, 35 | "host": "10.0.0.10", 36 | "timestamp": 
8.478049993515015, 37 | "executor_id": "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b" 38 | } 39 | ], 40 | "max_memory": [ 41 | 19.860164889415483, 42 | "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b", 43 | "10.0.0.10" 44 | ], 45 | "memory_avg_usage": 19.018850381779885 46 | } 47 | }, 48 | { 49 | "test-service": { 50 | "cpu_avg_usage": 0.43968765024682216, 51 | "max_cpu": [ 52 | 0.9436132136658013, 53 | "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b", 54 | "10.0.0.10" 55 | ], 56 | "executor_metrics": [ 57 | { 58 | "memory_total_usage": 18.38436759478673, 59 | "cpu_total_usage": 0.1272753639609533, 60 | "cpus_user_time_secs": 0.00999999999999801, 61 | "cpus_system_time_secs": 0.0, 62 | "host": "10.0.0.11", 63 | "timestamp": 7.856980085372925, 64 | "executor_id": "test-service.7396b026-961a-11e6-b4c9-0a0e831c5c6b" 65 | }, 66 | { 67 | "memory_total_usage": 18.812018661137444, 68 | "cpu_total_usage": 0.24817437311371182, 69 | "cpus_user_time_secs": 0.019999999999999574, 70 | "cpus_system_time_secs": 0.0, 71 | "host": "10.0.0.12", 72 | "timestamp": 8.058849811553955, 73 | "executor_id": "test-service.73968915-961a-11e6-b4c9-0a0e831c5c6b" 74 | }, 75 | { 76 | "memory_total_usage": 19.860164889415483, 77 | "cpu_total_usage": 0.9436132136658013, 78 | "cpus_user_time_secs": 0.07000000000000028, 79 | "cpus_system_time_secs": 0.010000000000000009, 80 | "host": "10.0.0.10", 81 | "timestamp": 8.478049993515015, 82 | "executor_id": "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b" 83 | } 84 | ], 85 | "max_memory": [ 86 | 19.860164889415483, 87 | "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b", 88 | "10.0.0.10" 89 | ], 90 | "memory_avg_usage": 19.018850381779885 91 | } 92 | }, 93 | { 94 | "test-service": { 95 | "cpu_avg_usage": 0.43968765024682216, 96 | "max_cpu": [ 97 | 0.9436132136658013, 98 | "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b", 99 | "10.0.0.10" 100 | ], 101 | "executor_metrics": [ 102 | { 103 | "memory_total_usage": 18.38436759478673, 104 | 
"cpu_total_usage": 0.1272753639609533, 105 | "cpus_user_time_secs": 0.00999999999999801, 106 | "cpus_system_time_secs": 0.0, 107 | "host": "10.0.0.11", 108 | "timestamp": 7.856980085372925, 109 | "executor_id": "test-service.7396b026-961a-11e6-b4c9-0a0e831c5c6b" 110 | }, 111 | { 112 | "memory_total_usage": 18.812018661137444, 113 | "cpu_total_usage": 0.24817437311371182, 114 | "cpus_user_time_secs": 0.019999999999999574, 115 | "cpus_system_time_secs": 0.0, 116 | "host": "10.0.0.12", 117 | "timestamp": 8.058849811553955, 118 | "executor_id": "test-service.73968915-961a-11e6-b4c9-0a0e831c5c6b" 119 | }, 120 | { 121 | "memory_total_usage": 19.860164889415483, 122 | "cpu_total_usage": 0.9436132136658013, 123 | "cpus_user_time_secs": 0.07000000000000028, 124 | "cpus_system_time_secs": 0.010000000000000009, 125 | "host": "10.0.0.10", 126 | "timestamp": 8.478049993515015, 127 | "executor_id": "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b" 128 | } 129 | ], 130 | "max_memory": [ 131 | 19.860164889415483, 132 | "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b", 133 | "10.0.0.10" 134 | ], 135 | "memory_avg_usage": 19.018850381779885 136 | } 137 | }, 138 | { 139 | "test-service": { 140 | "cpu_avg_usage": 0.43968765024682216, 141 | "max_cpu": [ 142 | 0.9436132136658013, 143 | "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b", 144 | "10.0.0.10" 145 | ], 146 | "executor_metrics": [ 147 | { 148 | "memory_total_usage": 18.38436759478673, 149 | "cpu_total_usage": 0.1272753639609533, 150 | "cpus_user_time_secs": 0.00999999999999801, 151 | "cpus_system_time_secs": 0.0, 152 | "host": "10.0.0.11", 153 | "timestamp": 7.856980085372925, 154 | "executor_id": "test-service.7396b026-961a-11e6-b4c9-0a0e831c5c6b" 155 | }, 156 | { 157 | "memory_total_usage": 18.812018661137444, 158 | "cpu_total_usage": 0.24817437311371182, 159 | "cpus_user_time_secs": 0.019999999999999574, 160 | "cpus_system_time_secs": 0.0, 161 | "host": "10.0.0.12", 162 | "timestamp": 8.058849811553955, 163 | 
"executor_id": "test-service.73968915-961a-11e6-b4c9-0a0e831c5c6b" 164 | }, 165 | { 166 | "memory_total_usage": 19.860164889415483, 167 | "cpu_total_usage": 0.9436132136658013, 168 | "cpus_user_time_secs": 0.07000000000000028, 169 | "cpus_system_time_secs": 0.010000000000000009, 170 | "host": "10.0.0.10", 171 | "timestamp": 8.478049993515015, 172 | "executor_id": "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b" 173 | } 174 | ], 175 | "max_memory": [ 176 | 19.860164889415483, 177 | "test-service.73966204-961a-11e6-b4c9-0a0e831c5c6b", 178 | "10.0.0.10" 179 | ], 180 | "memory_avg_usage": 19.018850381779885 181 | } 182 | } 183 | ] 184 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Marathon Autoscaler 2 | 3 | | Build Status | Docker Images | 4 | |:------------:|:-------------:| 5 | | [![TravisCI](https://travis-ci.org/tendrilinc/marathon-autoscaler.svg?branch=master)](https://travis-ci.org/tendrilinc/marathon-autoscaler) | [Docker Hub](https://hub.docker.com/r/tendril/marathon-autoscaler/) | 6 | 7 | ## Description 8 | 9 | The aim of this project is to allow Marathon applications to scale to meet load requirements, without user intervention. To accomplish this, it monitors Marathon's application metrics and scales applications based on user-defined thresholds. 
python -m zipapp lib/marathon_autoscaler -o build/target/marathon-autoscaler.pyz
The parameters needed are explained below: 57 | 58 | 59 | | cli switch | environment variable | description | 60 | |------------|----------------------|-------------| 61 | | --interval | INTERVAL | The time duration in seconds between polling events | 62 | | --mesos-uri | MESOS_URI | The Mesos HTTP endpoint | 63 | | --mesos-agent-port | AGENT_PORT | The port your Mesos Agent is listening on (defaults to 5051) | 64 | | --marathon-uri | MARATHON_URI | The Marathon HTTP endpoint | 65 | | --marathon-user | MARATHON_USER | The Marathon username for authentication on the `marathon-uri` | 66 | | --marathon-pass | MARATHON_PASS | The Marathon password for authentication on the `marathon-uri` | 67 | | --cpu-fan-out | CPU_FAN_OUT | Number of subprocesses to use for gathering and sending stats to Datadog | 68 | | --dd-api-key | DATADOG_API_KEY | Datadog API key | 69 | | --dd-app-key | DATADOG_APP_KEY | Datadog APP key | 70 | | --dd-env | DATADOG_ENV | Datadog ENV variable to separate metrics by environment | 71 | | --log-config | LOG_CONFIG | Path to logging configuration file. Defaults to logging_config.json | 72 | | --enforce-version-match | ENFORCE_VERSION_MATCH | If set, version matching will be required of applications to participate | 73 | | --rules-prefix | RULES_PREFIX | The prefix for rule names | 74 | 75 | Run the scripts/deploy_autoscaler_to_marathon.py script: 76 | ```bash 77 | cd scripts && python deploy_autoscaler_to_marathon.py {PARAMETERS} 78 | ``` 79 | 80 | 81 | ### Deploying a Marathon application to use the Autoscaler 82 | 83 | #### Participation 84 | 85 | The autoscaler is a standalone application that monitors Marathon for applications that use specific labels. To make your application participate in the autoscaler the `use_marathon_autoscaler` label needs to be set to something truthful or a version number. 
To enable version matching, the autoscaler needs to be deployed with the `--enforce-version-match` commandline switch or `ENFORCE_VERSION_MATCH` environment variable. 86 | 87 | The Autoscaler considers the following list of strings as true: 88 | 89 | ```python 90 | ["true", "t", "yes", "y", "1"] 91 | ``` 92 | 93 | #### Minimum and Maximum Instances 94 | 95 | Number of minimum and maximum number of application instances. 96 | 97 | ```json 98 | ... 99 | "labels": { 100 | "min_instances": 1, 101 | "max_instances": 10 102 | } 103 | ... 104 | ``` 105 | 106 | #### Scaling Rules 107 | 108 | Scaling rules are set in a Marathon application's labels in its application definition. To get you introduced to scaling rules, let's jump right into an example: 109 | ```json 110 | ... 111 | "labels": { 112 | "mas_rule_fastscaleup": "cpu | >90 | PT2M | 3 | PT1M30S" 113 | }, 114 | ... 115 | ``` 116 | Explanation: The above rule is called "fastscaleup" which states: if cpu is greater than\* 90 percent for 2 minutes, then scale up by 3 instances and backoff for 1 minute and 30 seconds\*\*. These values in the label value are the same as the original upper and lower thresholds, but you are no longer bound to stating both cpu and memory conditions. The idea of having exclusive conditions is now implied by having multiple rules with the same name. Here's an example of the above rule added to other conditions: 117 | 118 | ```json 119 | ... 120 | "labels": { 121 | "mas_rule_fastscaleup_1": "cpu | >90 | PT2M | 3 | PT1M30S", 122 | "mas_rule_fastscaleup_2": "mem | >85 | PT2M | 3 | PT1M30S" 123 | }, 124 | ... 125 | ``` 126 | Notice that the tolerance, scale factor and backoff values are repeated, this is for clarity, but when the autoscaler sees 2 or more rules with the same name, it will combine them into one rule and use the tolerance, scale factor, and backoff of the first rule it sees. 
In the example above, the suffix "_1" and "_2" are for Marathon's sake because Marathon does not support having repeat label names. If this suffix is numeric, the autoscaler will order them numerically and take the tolerance, scale factor and backoff from the mas_rule_fastscaleup_1 rule.
165 | 166 | 167 | \* Comparisons can use >, <, <=, >=, = or == 168 | 169 | \*\* [A Wikipedia Reference on ISO8601 time duration](https://en.wikipedia.org/wiki/ISO_8601#Durations) 170 | 171 | 172 | 173 | ## Testing the autoscaler with the Stress Tester app 174 | 175 | To see how the Autoscaler behaves with an application's scaling settings in a controlled environment, build and deploy the stress test application to an environment running the Autoscaler. 176 | 177 | ```bash 178 | cd tests/stress_tester_app && docker build -t autoscale_test_app . 179 | ``` 180 | 181 | Push the image to the registry: 182 | ```bash 183 | docker push autoscale_test_app:latest 184 | ``` 185 | 186 | Run the scripts/test_autoscaler.py script: 187 | ```bash 188 | cd scripts && python test_autoscaler.py --marathon-uri MARATHON_HTTP --marathon-user MARATHON_USER --marathon-pass MARATHON_PASS 189 | ``` 190 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/history_manager.py: -------------------------------------------------------------------------------- 1 | from aniso8601 import parse_duration 2 | from collections import defaultdict 3 | from constants import FLAP_SIGNATURES 4 | from datetime import datetime, timedelta 5 | import logging 6 | 7 | 8 | class HistoryManager(object): 9 | """ 10 | A keeper of the recent history of scaling decisions. 11 | """ 12 | def __init__(self, logger=None, dd_client=None): 13 | self.logger = logger or logging.getLogger(__name__) 14 | self.dd_client = dd_client 15 | self.app_performance_tail = [] 16 | 17 | def add_to_perf_tail(self, app_scale_recommendations): 18 | """ 19 | The app performance tail is the time series list of votes, decisions, timestamps and 20 | checksums (app version) for all participating applications. Upon adding to this list, 21 | it is truncated to hold the last 300 items. It is truncated to keep memory usage 22 | stable as this will be a long running application. 
23 | :param app_scale_recommendations: list of scaling recommendations per app 24 | :return: tail of app_scale_recommendations list 25 | """ 26 | self.app_performance_tail.append(app_scale_recommendations) 27 | self.check_for_flapping() 28 | del self.app_performance_tail[:(len(self.app_performance_tail) - 300)] 29 | return self.app_performance_tail[::-1] 30 | 31 | def check_for_flapping(self): 32 | """ Searches through the app_performance_tail for decision flapping patterns 33 | :return: None 34 | """ 35 | decisions = defaultdict(list) 36 | # gather all decisions by application 37 | _ = [decisions[app].append(event) 38 | for recommendations in self.app_performance_tail 39 | for app, event in recommendations.items() 40 | if "decision" in event.keys()] 41 | # clean up the list of values by sorting them by datetime 42 | decisions = {k: sorted(v, key=lambda x: x.get("timestamp")) for k, v in dict(decisions).items()} 43 | 44 | # loop over key, value to reduce value down to list of just decision values and pass to search_tail 45 | search_results = {k: self.__search_tail([d.get("decision") for d in v], FLAP_SIGNATURES) 46 | for k, v in decisions.items()} 47 | 48 | [self.dd_client.send_counter_event(k, 'marathon_autoscaler.events.flapping_detected') 49 | for k, v in search_results.items() if len(v) > 0] 50 | 51 | def get_performance_tail_slice(self, app_name, after_date_time): 52 | """ 53 | Gets a select portion of the application performance tail by application name 54 | after the specified date/time, after_date_time. 
55 | :param app_name: Application's name 56 | :param after_date_time: DateTime 57 | :return: A slice of the application's performance tail (votes, decisions, version checksums, timestamps) 58 | """ 59 | results = [event for recommendations in self.app_performance_tail 60 | for app, event in recommendations.items() 61 | if app_name in app and event["timestamp"] > after_date_time] 62 | return results 63 | 64 | def get_timedelta(self, iso8601_time_duration_string): 65 | """ A facade method for the iso8601.parse_duration method that reads a string, 66 | containing an iso8601 time duration value, and returns a datetime.timedelta object. 67 | :param iso8601_time_duration_string: a string containing an iso8601 time duration. 68 | :return: datetime.timedelta 69 | """ 70 | time_delta = None 71 | try: 72 | time_delta = parse_duration(iso8601_time_duration_string) 73 | except Exception as ex: 74 | self.logger.error("Time Duration Unparseable: {td}".format(td=iso8601_time_duration_string)) 75 | self.logger.error(ex) 76 | finally: 77 | return time_delta or timedelta(seconds=0) 78 | 79 | def is_time_window_filled(self, app_name, before_date_time): 80 | """ 81 | Does the application have events that precede the tolerance time window? This function 82 | is necessary to determine we have filled the entire window of tolerance. 83 | :param app_name: Application's name 84 | :param before_date_time: Date/Time to validate if the tolerance window is filled. 
    def is_time_window_filled(self, app_name, before_date_time):
        """
        Does the application have events that precede the tolerance time window? This function
        is necessary to determine we have filled the entire window of tolerance.
        :param app_name: Application's name
        :param before_date_time: Date/Time to validate if the tolerance window is filled.
        :return: (bool)
        """
        result = False
        # Any single event older than the window start proves the tail
        # already covers the whole window.  NOTE: "app_name in app" is a
        # substring match -- presumably intentional for app-id paths; confirm.
        past_events = [event for recommendations in self.app_performance_tail
                       for app, event in recommendations.items()
                       if app_name in app and event["timestamp"] < before_date_time]
        if len(past_events) > 0:
            result = True

        # The log templates are filled from local variable names via
        # format(**locals()); renaming any local here changes the output.
        msg = "{app_name}: tolerance window filled: {result} / {before_date_time:%H:%M:%S.%f}"
        self.logger.info(msg.format(**locals()))

        dmsg = "{app_name}: {past_events}"
        self.logger.debug(dmsg.format(**locals()))

        return result

    def tolerance_reached(self, app_name, tolerance, vote):
        """
        Has an application reached the point of needing to make a decision on a
        scaling event? It ensures 3 things:
        - Is tolerance window completely filled?
        - Do all time periods have the same application version (checksum)?
        - Are all votes in the tolerance window unanimous?
        :param app_name: Application's name
        :param tolerance: ISO8601 time duration
        :param vote: A vote on an upcoming decision
        :return: (bool)
        """
        result = False
        time_difference = self.get_timedelta(tolerance)
        # NOTE(review): naive local time; event timestamps are presumably
        # produced the same way -- confirm there is no mixed-timezone source.
        right_now = datetime.now()
        go_back_this_far = right_now - time_difference
        vote_list = []
        if self.is_time_window_filled(app_name, go_back_this_far):
            app_tolerated_tail = self.get_performance_tail_slice(app_name, go_back_this_far)

            if len(app_tolerated_tail) != 0:
                # One checksum == app unchanged; one vote equal to `vote`
                # == unanimous agreement over the whole tolerance window.
                checksums = set([item.get("checksum") for item in app_tolerated_tail])
                vote_list = [item.get("vote") for item in app_tolerated_tail]
                votes = set(vote_list)
                if len(checksums) == 1 and len(votes) == 1 and votes == {vote}:
                    result = True

        # format(**locals()) again: do not rename the locals referenced below.
        msg = "{app_name}: tolerance reached: {result} / {go_back_this_far:%H:%M:%S.%f} - " \
              "{right_now:%H:%M:%S.%f}"
        self.logger.info(msg.format(**locals()))
        dmsg = "{app_name}: vote_list: {vote_list}; tolerance: {tolerance}; right_now: " \
               "{right_now}; time_difference: {time_difference}; go_back_this_far: {go_back_this_far};"
        self.logger.debug(dmsg.format(**locals()))
        return result

    def within_backoff(self, app_name, backoff, decision):
        """
        Answers whether an application is still in its backoff period. Has a
        scaling event (of the same decision) occurred within the time duration?
        :param app_name: Application name
        :param backoff: ISO8601 time duration
        :param decision: A scaling decision
        :return: (bool)
        """
        result = False
        time_difference = self.get_timedelta(backoff)
        right_now = datetime.now()
        go_back_this_far = right_now - time_difference
        # Only events that carry the *same* decision inside the backoff
        # window keep the application in backoff.
        app_tail = self.get_performance_tail_slice(app_name, go_back_this_far)
        scale_events = [item for item in app_tail if item.get("decision") == decision]
        if len(scale_events) > 0:
            result = True
        msg = "{app_name}: within backoff window: {result} / {go_back_this_far:%H:%M:%S.%f} - " \
              "{right_now:%H:%M:%S.%f}"
        self.logger.info(msg.format(**locals()))
        dmsg = "{app_name}: scale events: {scale_events}"
        self.logger.debug(dmsg.format(**locals()))
        return result

    @staticmethod
    def __search_tail(corpus, *signatures):
        """ Searches the tail of corpus for matching signatures
        :param corpus: a list of elements
        :param signatures: 1 or more signatures to search for
        :return: a list of search hits
        """
        search_hits = []
        for signature in signatures:
            search_size = len(signature)
            # Compare the last len(signature) elements of corpus to the
            # signature itself; equal tail == hit.
            if corpus[search_size * -1:] == signature:
                search_hits.append(signature)
        return search_hits
def get_mesos_agent_statistics(agent_host):
    """Fetch the statistics (metrics) JSON for one Mesos agent.

    :param agent_host: A Mesos slave endpoint defined as an FQDN or IP Address
    :return: tuple of (agent_host, statistics JSON)
    """
    agent_url = "http://{0}:{1}/".format(agent_host, settings.agent_port)
    return agent_host, MesosAgent(agent_url).get_statistics()


class Poller:
    """
    A polling mechanism for gathering metrics data for the Autoscaler's decision engine.
    """

    def __init__(self, cli_args, logger=None):
        """Wire up the Mesos/Marathon clients and the autoscaler collaborators.

        :param cli_args: parsed command-line arguments carrying endpoints and credentials
        :param logger: optional logger; defaults to this module's logger
        """
        self.args = cli_args
        self.logger = logger or logging.getLogger(__name__)
        self.mesos = MesosMaster(cli_args.mesos_uri)
        # Only pass credentials when both user and password were supplied.
        credentials = None
        if cli_args.marathon_user and cli_args.marathon_pass:
            credentials = (cli_args.marathon_user, cli_args.marathon_pass)
        self.marathon = Marathon(cli_args.marathon_uri, credentials)
        self.auto_scaler = AutoScaler(self.marathon, cli_args=cli_args)
        self.cpu_fan_out = cli_args.cpu_fan_out
        self.datadog_client = DatadogClient(cli_args)
        # NOTE(review): stored but unused here; get_mesos_agent_statistics
        # reads settings.agent_port instead -- confirm which one is canonical.
        self.agent_port = cli_args.agent_port
48 | :param mesos_master: A Mesos Master Client 49 | :param marathon_client: A Marathon Client 50 | :param poll_time_span: Time (in seconds) between polling events 51 | :param cpu_fan_out: Max number of processes in MP Pools 52 | :return: A dictionary object containing all Application metric data for 2 consecutive polls 53 | """ 54 | try: 55 | marathon_apps = marathon_client.get_all_apps().get("apps") 56 | slaves = mesos_master.get_slaves() 57 | agent_hosts = [slave.get("hostname") for slave in slaves.get("slaves")] 58 | except Exception as ex: 59 | self.logger.error(ex) 60 | self.logger.fatal("Marathon data could not be retrieved!") 61 | return 62 | 63 | executor_stats = defaultdict(list) 64 | 65 | for _ in range(2): 66 | agent_hosts_stats = {} 67 | if cpu_fan_out: 68 | fan_out_procs = cpu_fan_out 69 | else: 70 | fan_out_procs = len(agent_hosts) 71 | 72 | maxtasks = int(len(agent_hosts) / fan_out_procs) if int(len(agent_hosts) / fan_out_procs) else 1 73 | pool = Pool(processes=fan_out_procs, maxtasksperchild=maxtasks) 74 | for agent_host, stats in pool.imap_unordered(get_mesos_agent_statistics, agent_hosts): 75 | self.logger.debug((agent_host, stats)) 76 | agent_hosts_stats[agent_host] = stats 77 | pool.close() 78 | 79 | for host in agent_hosts_stats: 80 | if agent_hosts_stats[host] is not None: 81 | for executor in agent_hosts_stats[host]: 82 | data = {"host": host, "stats": executor["statistics"]} 83 | executor_stats[executor["executor_id"]].append(data) 84 | sleep(poll_time_span) 85 | 86 | self.logger.info("Stats differentials collected.") 87 | all_diffs = {} 88 | 89 | for key, stat in executor_stats.items(): 90 | 91 | if len(stat) != 2: 92 | continue 93 | 94 | host = stat[0].get("host") 95 | first_stats = stat[0].get("stats") 96 | second_stats = stat[1].get("stats") 97 | 98 | if "timestamp" in second_stats and "timestamp" in first_stats: 99 | sys_cpu_delta = second_stats["cpus_system_time_secs"] - \ 100 | first_stats["cpus_system_time_secs"] 101 | 102 | 
user_cpu_delta = second_stats["cpus_user_time_secs"] - first_stats["cpus_user_time_secs"] 103 | timestamp_delta = second_stats["timestamp"] - first_stats["timestamp"] 104 | mem_total = first_stats["mem_limit_bytes"] 105 | mem_used = second_stats["mem_rss_bytes"] 106 | cpu_total_usage = ((sys_cpu_delta + user_cpu_delta) / timestamp_delta) * 100 107 | memory_total_usage = (float(mem_used) / mem_total) * 100 108 | 109 | diffs = dict(timestamp=timestamp_delta, 110 | cpus_system_time_secs=sys_cpu_delta, 111 | cpus_user_time_secs=user_cpu_delta, 112 | cpu_total_usage=cpu_total_usage, 113 | memory_total_usage=memory_total_usage, 114 | host=host, 115 | executor_id=key) 116 | all_diffs[key] = diffs 117 | else: 118 | self.logger.error("Timestamps were not found in stats from host: {0}".format(host)) 119 | 120 | app_metric_map = defaultdict(list) 121 | 122 | for key in all_diffs.keys(): 123 | app_metric_map[key.split(".")[0]].append(all_diffs[key]) 124 | 125 | app_metric_summation = {} 126 | 127 | for app, metrics in app_metric_map.items(): 128 | 129 | metric_sums = {} 130 | 131 | cpu_values = [metric["cpu_total_usage"] for metric in metrics] 132 | metric_sums["cpu_avg_usage"] = sum(cpu_values) / len(cpu_values) 133 | 134 | metric_sums["max_cpu"] = max([(metric["cpu_total_usage"], 135 | metric["executor_id"], 136 | metric["host"]) for metric in metrics]) 137 | 138 | memory_values = [metric["memory_total_usage"] for metric in metrics] 139 | metric_sums["memory_avg_usage"] = sum(memory_values) / len(memory_values) 140 | 141 | metric_sums["max_memory"] = max([(metric["memory_total_usage"], 142 | metric["executor_id"], 143 | metric["host"]) for metric in metrics]) 144 | 145 | metric_sums["executor_metrics"] = metrics 146 | metric_sums["application_definition"] = next((appdef for appdef in marathon_apps 147 | if app.replace("_", "/") == appdef.get("id")), {}) 148 | app_metric_summation[app] = metric_sums 149 | 150 | return app_metric_summation 151 | 152 | def start(self): 153 | 
""" 154 | This is the entry method for the Poller class. 155 | :return: None 156 | """ 157 | self.logger.info("Mesos and Marathon Connections Established.") 158 | while True: 159 | polled_stats = self.poll(self.mesos, self.marathon, cpu_fan_out=self.cpu_fan_out) 160 | 161 | if polled_stats is not None: 162 | self.datadog_client.send_datadog_metrics(polled_stats) 163 | self.auto_scaler.decide(polled_stats) 164 | self.update_autoscaler_metrics(polled_stats) 165 | self.logger.info("Decisions are completed.") 166 | else: 167 | self.logger.fatal("Poller unable to reach Marathon/Mesos!") 168 | 169 | sleep(self.args.sleep_interval) 170 | 171 | def update_autoscaler_metrics(self, stats): 172 | """ 173 | * Number of participating applications 174 | * Total min instances 175 | * Total max instances 176 | * Total number of currently running instances (only participating applications) 177 | * Number of currently running instances per application 178 | * Number of scale up events 179 | * Number of scale down events 180 | * Number of flap detection events 181 | :param stats: dict object containing all application metric information specific to this polling event 182 | :return: 183 | """ 184 | participating_applications = [{"app": app, 185 | "current_instances": int(items["application_definition"]["instances"]), 186 | "min_instances": int(items["application_definition"]["labels"]["min_instances"]), 187 | "max_instances": int(items["application_definition"]["labels"]["max_instances"])} 188 | for app, items in stats.items() 189 | if ApplicationDefinition(items["application_definition"]).is_app_participating] 190 | 191 | # total_participating_applications = len(participating_applications) 192 | # total_min_instances = sum([app["min_instances"] for app in participating_applications]) 193 | # total_max_instances = sum([app["max_instances"] for app in participating_applications]) 194 | # total_current_instances = sum([app["current_instances"] for app in participating_applications]) 195 
| 196 | [self.datadog_client.send_counter_event(app["app"], 197 | "marathon_autoscaler.counters.min_instances", 198 | points=app["min_instances"]) 199 | for app in participating_applications] 200 | [self.datadog_client.send_counter_event(app["app"], 201 | "marathon_autoscaler.counters.max_instances", 202 | points=app["max_instances"]) 203 | for app in participating_applications] 204 | [self.datadog_client.send_counter_event(app["app"], 205 | "marathon_autoscaler.counters.current_instances", 206 | points=app["current_instances"]) 207 | for app in participating_applications] 208 | -------------------------------------------------------------------------------- /lib/marathon_autoscaler/rules_manager.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict, Counter 2 | from constants import compare, RE_THRESHOLD, RE_DELIMITERS 3 | from utils import list_get 4 | import logging 5 | import re 6 | import settings 7 | 8 | 9 | class RulesManager(object): 10 | """ 11 | To manage all of the rules. A rule is a single description centered around a metric. A rule can have a part denoted 12 | by an underscore followed by a part value (it is useful to use a numerical part value). Multiple rule parts are 13 | automatically combined by having the same rule name. 
14 | A single rule is a dict object: 15 | { 16 | "{{ prefix__rule_name__rule_part }}": { 17 | "ruleInfo": { 18 | "ruleName": "{{rule_name}}", 19 | "rulePart": "{{rule_part}}" 20 | }, 21 | { 22 | "ruleValue": { 23 | "metric", 24 | "threshold": {op, val}, 25 | "backoff", 26 | "scale_factor", 27 | "tolerance", 28 | "weight" 29 | } 30 | } 31 | } 32 | """ 33 | def __init__(self, app_def, logger=None): 34 | self.logger = logger or logging.getLogger(__name__) 35 | self.app_def = app_def 36 | self.rules = self._find_autoscaler_rules() 37 | self._last_triggered_rule = None 38 | 39 | def trigger_rules(self, metrics): 40 | """ 41 | Consumes a dict of metrics and attempts to match an application's rules to its metric information. 42 | :param metrics: dict of metric values 43 | :return: The recently triggered rule 44 | """ 45 | 46 | triggered_rule = self._get_matched_rule(metrics) 47 | 48 | self._last_triggered_rule = triggered_rule 49 | info_msg = "{app_name}: metrics: {metrics}" 50 | self.logger.info(info_msg.format(app_name=self.app_def.app_name, 51 | metrics=metrics)) 52 | info_msg = "{app_name}: last_triggered_rule set to: {triggered_rule}" 53 | self.logger.info(info_msg.format(app_name=self.app_def.app_name, 54 | triggered_rule=triggered_rule)) 55 | return self._last_triggered_rule 56 | 57 | @property 58 | def last_triggered_rule(self): 59 | """ 60 | The last rule triggered by the RuleManager 61 | :return: 62 | """ 63 | triggered_rule = None 64 | if self._last_triggered_rule is not None: 65 | triggered_rule = self._last_triggered_rule 66 | return triggered_rule 67 | 68 | @property 69 | def last_triggered_criteria(self): 70 | """ 71 | A helper property to aim at providing the core criteria of the last triggered rule. 
72 | :return: A dict of rule criteria 73 | """ 74 | criteria = {} 75 | if self._last_triggered_rule is not None: 76 | rule_value = self._last_triggered_rule[0].get("ruleValue") 77 | criteria = dict(scale_factor=rule_value.get("scale_factor"), 78 | tolerance=rule_value.get("tolerance"), 79 | backoff=rule_value.get("backoff")) 80 | return criteria 81 | 82 | def _find_autoscaler_rules(self): 83 | rules_found = {} 84 | if self.app_def.labels: 85 | for k, v in self.app_def.labels.items(): 86 | rule_match = re.match(r"^{prefix}_(?P[A-Za-z0-9]+)_?(?P[A-Za-z0-9]+)*".format( 87 | prefix=settings.rules_prefix), k) 88 | if rule_match is not None: 89 | rule_values = re.split(RE_DELIMITERS, v) 90 | if len(rule_values) < 5: 91 | self.logger.warn("Scaling rule identified, but wrong number of arguments. Disregarding " 92 | "{rule_name} = {rule_values}".format(rule_name=k, 93 | rule_values=rule_values)) 94 | continue 95 | 96 | rule_values_dict = dict(metric=rule_values[0], 97 | threshold=self._parse_threshold(rule_values[1]), 98 | tolerance=rule_values[2], 99 | scale_factor=rule_values[3], 100 | backoff=rule_values[4], 101 | weight=list_get(rule_values, 5, 1.0)) 102 | rules_found[k] = dict(ruleValue=rule_values_dict, ruleInfo=rule_match.groupdict()) 103 | elif "max_instances" in k.lower(): 104 | self.max_instances = int(v) 105 | elif "min_instances" in k.lower(): 106 | self.min_instances = int(v) 107 | interpreted_rules = defaultdict(list) 108 | if rules_found: 109 | [interpreted_rules[v.get("ruleInfo").get("ruleName")].append(v) 110 | for k, v in rules_found.items()] 111 | 112 | return dict(interpreted_rules) 113 | 114 | def is_app_participating(self): 115 | """ Determine if the application is ready for scale actions 116 | :return: application's participation in auto_scaling 117 | """ 118 | return self.app_def.is_app_participating 119 | 120 | def is_app_within_min_or_max(self): 121 | """ Determine if the application is ready for scale actions. 
122 | :return: application's participation in auto_scaling 123 | """ 124 | msg = "{0}: instances: min:{1}, running:{2}, max:{3}" 125 | self.logger.info(msg.format(self.app_def.app_name, 126 | int(self.min_instances), 127 | int(self.app_def.tasksRunning), 128 | int(self.max_instances) 129 | )) 130 | return int(self.min_instances) <= \ 131 | int(self.app_def.tasksRunning) <= \ 132 | int(self.max_instances) 133 | 134 | def is_app_ready(self): 135 | """ Determine if the application is ready for scale actions 136 | :return: application's readiness for scale actions 137 | """ 138 | result = False 139 | if self.app_def.tasksRunning == self.app_def.instances: 140 | result = True 141 | 142 | self.logger.info("{0}: application ready: {1}".format( 143 | self.app_def.app_name, result)) 144 | return result 145 | 146 | def _get_matched_rule(self, metrics): 147 | matched_rule = None 148 | rules_found = [] 149 | for rule_name, rule in self.rules.items(): 150 | rule_criteria_count = len(rule) 151 | matched_rule_criteria = [rule_item 152 | for rule_item in rule 153 | for metric_name, metric_value in metrics.items() 154 | if rule_item.get("ruleValue").get("metric") in metric_name and 155 | self._beyond_threshold(metric_value, rule_item.get("ruleValue").get("threshold"))] 156 | if len(matched_rule_criteria) == rule_criteria_count: 157 | rules_found.append(rule) 158 | 159 | self.logger.debug("triggering rules by metrics: {rules_found}".format(rules_found=rules_found)) 160 | 161 | if len(rules_found) > 1: 162 | matched_rule = self._find_best_matched_rule_by_criteria(rules_found) 163 | elif len(rules_found) == 1: 164 | matched_rule = rules_found[0] 165 | 166 | return matched_rule 167 | 168 | def _find_best_matched_rule_by_criteria(self, rules): 169 | """ 170 | This is the Highlander method; there can only be one... 171 | Round 1 : Who has the most rule criteria matched? 
172 | :param rules: 173 | :return: 174 | """ 175 | rule_criteria_counter = Counter() 176 | for rule in rules: 177 | rule_criteria_counter[rule[0].get("ruleInfo").get("ruleName")] = len(rule) 178 | 179 | most_critical_rule_names = [k for k, v in rule_criteria_counter.items() 180 | if v == rule_criteria_counter.most_common()[0][1]] 181 | 182 | if len(most_critical_rule_names) > 1: 183 | critical_rules = {name: next(rule 184 | for rule_parts in rules 185 | for rule in rule_parts 186 | if name in rule.get("ruleInfo").get("ruleName")) 187 | for name in most_critical_rule_names} 188 | winning_rule = self._find_best_matched_rule_by_weight(critical_rules) 189 | else: 190 | winning_rule = [rule 191 | for rule_parts in rules 192 | for rule in rule_parts 193 | if most_critical_rule_names[0] in 194 | rule.get("ruleInfo").get("ruleName")] 195 | self.logger.debug("winning rule by criteria: {winning_rule}".format(winning_rule=winning_rule)) 196 | return winning_rule 197 | 198 | def _find_best_matched_rule_by_weight(self, rules): 199 | """ 200 | Round 2 : Which rule has the most weight? Weight is multiplied against scale_factor 201 | :param rules: A dict of rules 202 | :return: One rule with the maximum weight 203 | """ 204 | self.logger.debug(rules) 205 | rule_weights = {rule.get("ruleInfo").get("ruleName"): abs(rule.get("ruleValue").get("scale_factor") * 206 | rule.get("ruleValue").get("weight")) 207 | for rule_name, rule in rules.items()} 208 | self.logger.debug("winning rule by weight: {weighted_rule}".format(weighted_rule=rules.get(max(rule_weights)))) 209 | 210 | return rules.get(max(rule_weights)) 211 | 212 | def _parse_threshold(self, threshold): 213 | """ 214 | Parses the string representing the threshold. A threshold is a comparison operator and number. 
    def _parse_threshold(self, threshold):
        """
        Parses the string representing the threshold. A threshold is a comparison operator and number.
        :param threshold: String
        :return: dict with the operator under "op" and the numeric value under "val"
        """
        # NOTE(review): when the string does not match RE_THRESHOLD, re.search
        # returns None and groupdict() raises AttributeError -- confirm inputs
        # are pre-validated upstream.
        m = re.search(RE_THRESHOLD, threshold)
        return m.groupdict()

    def _beyond_threshold(self, metric, threshold):
        """
        This will answer whether the metric has met or exceeded the threshold. It uses
        the dictionary created above, compare, to look up the operation, th["op"] and
        passes the metric and threshold value, th["val"] to the lambda expression that
        corresponds with the operation.
        :param metric: The performance metric (cpu, memory, etc...)
        :param threshold: String representation of the threshold (>, <, =, ==, <=, >=)
        :return: (bool)
        """
        # `compare` maps operator strings to two-argument comparison callables
        # (imported from constants); both operands are coerced to float first.
        return compare[threshold["op"]](float(metric), float(threshold["val"]))
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Marathon Autoscaler 2 | 3 | Thank you for considering making a contribution to this project! We sincerely appreciate your time and energy invested in getting to know this project's code. We are also appreciative that you've taken time to read this guideline and cared enough to contribute. 4 | 5 | The following is a set of guidelines for contributing to the Marathon Autoscaler project. 6 | These are just guidelines, not rules; use your best judgment and feel free to propose changes to this document in a pull request. 
7 | 8 | #### Table Of Contents 9 | 10 | [What should I know before I get started?](#what-should-i-know-before-i-get-started) 11 | * [Code of Conduct](#code-of-conduct) 12 | 13 | 14 | [How Can I Contribute?](#how-can-i-contribute) 15 | * [Reporting Bugs](#reporting-bugs) 16 | * [Suggesting Enhancements](#suggesting-enhancements) 17 | * [Your First Code Contribution](#your-first-code-contribution) 18 | * [Pull Requests](#pull-requests) 19 | 20 | [Styleguides](#styleguides) 21 | * [Git Commit Messages](#git-commit-messages) 22 | * [Documentation Styleguide](#documentation-styleguide) 23 | 24 | [Additional Notes](#additional-notes) 25 | * [Issue and Pull Request Labels](#issue-and-pull-request-labels) 26 | 27 | ## What should I know before I get started? 28 | 29 | ## How Can I Contribute? 30 | 31 | ### Reporting Bugs 32 | 33 | This section guides you through submitting a bug report. Following these guidelines helps maintainers and the community understand your report, reproduce the behavior, and find related reports. 34 | 35 | Before creating bug reports, please **perform a [cursory search](https://github.com/tendrilinc/marathon-autoscaler/issues?q=is%3Aissue)** to see if the problem has already been reported. If it has, add a comment to the existing issue instead of opening a new one. When you are creating a bug report, please [include as many details as possible](#how-do-i-submit-a-good-bug-report). If you'd like, you can use [this template](#template-for-submitting-bug-reports) to structure the information. 36 | 37 | 38 | #### How Do I Submit A (Good) Bug Report? 39 | 40 | Bugs are tracked as [GitHub issues](https://guides.github.com/features/issues/). 41 | 42 | Explain the problem and include additional details to help maintainers reproduce the problem: 43 | 44 | * **Use a clear and descriptive title** for the issue to identify the problem. 45 | * **Describe the exact steps which reproduce the problem** in as many details as possible. 
When listing steps, **don't just say what you did, but explain how you did it**. 46 | * **Provide specific examples to demonstrate the steps**. Include links to files or GitHub projects, or copy/pasteable snippets, which you use in those examples. If you're providing snippets in the issue, use [Markdown code blocks](https://help.github.com/articles/markdown-basics/#multiple-lines). 47 | * **Describe the behavior you observed after following the steps** and point out what exactly is the problem with that behavior. 48 | * **Explain which behavior you expected to see instead and why.** 49 | * **Include the Autoscaler's logging output.** Copy the pertinent snippet of log information from the Autoscaler's standard out. **If the log output is too long, please open a public Gist** and include the link to this Gist with your report. 50 | Provide more context by answering these questions: 51 | 52 | * **Did the problem start happening recently** (e.g. after updating to a new version) or was this always a problem? 53 | * If the problem started happening recently, **can you reproduce the problem in an older version of Autoscaler?** What's the most recent version in which the problem doesn't happen? 54 | * **Can you reliably reproduce the issue?** If not, provide details about how often the problem happens and under which conditions it normally happens. 55 | 56 | 57 | Include details about your configuration and environment: 58 | 59 | * **Which versions of Marathon and Mesos are you using?** 60 | * **What's the name(s) and version(s) of the OS(es) you're running**? 61 | * **Are you running the Autoscaler in Docker Machine locally or on Mesos/Marathon**? 62 | * **Other configuration information that you believe to be relevant** 63 | 64 | 65 | ### Suggesting Enhancements 66 | 67 | This section guides you through submitting an enhancement suggestion for Autoscaler, including completely new features and minor improvements to existing functionality. 
Following these guidelines helps maintainers and the community understand your suggestion and find related suggestions. 68 | 69 | Before creating enhancement suggestions, please check [this list](#before-submitting-an-enhancement-suggestion) as you might find out that you don't need to create one. When you are creating an enhancement suggestion, please [include as many details as possible](#how-do-i-submit-a-good-enhancement-suggestion). If you'd like, you can use [this template](#template-for-submitting-enhancement-suggestions) to structure the information. 70 | 71 | #### Before Submitting An Enhancement Suggestion 72 | 73 | * **Perform a [cursory search](https://github.com/tendrilinc/marathon-autoscaler/issues?q=is%3Aissue)** to see if the enhancement has already been suggested. If it has, add a comment to the existing issue instead of opening a new one. 74 | 75 | #### How Do I Submit A (Good) Enhancement Suggestion? 76 | 77 | Enhancement suggestions are tracked as [GitHub issues](https://guides.github.com/features/issues/). Provide the following information: 78 | 79 | * **Use a clear and descriptive title** for the issue to identify the suggestion. 80 | * **Provide a step-by-step description of the suggested enhancement** in as many details as possible. 81 | * **Provide specific examples to demonstrate the steps**. Include copy/pasteable snippets which you use in those examples, as [Markdown code blocks](https://help.github.com/articles/markdown-basics/#multiple-lines). 82 | * **Describe the current behavior** and **explain which behavior you expected to see instead** and why. 83 | * **Explain why this enhancement would be useful**. 84 | 85 | #### Template For Submitting Enhancement Suggestions 86 | 87 | [Short description of suggestion] 88 | 89 | **Steps which explain the enhancement** 90 | 91 | 1. [First Step] 92 | 2. [Second Step] 93 | 3. [Other Steps...] 
94 | 95 | **Current and suggested behavior** 96 | 97 | [Describe current and suggested behavior here] 98 | 99 | **Why would the enhancement be useful to most users** 100 | 101 | [Explain why the enhancement would be useful to most users] 102 | 103 | ### Your First Code Contribution 104 | 105 | Unsure where to begin contributing to Autoscaler? You can start by looking through these `beginner` and `help-wanted` issues: 106 | 107 | * [Beginner issues][beginner] - issues which should only require a few lines of code, and a test or two. 108 | * [Help wanted issues][help-wanted] - issues which should be a bit more involved than `beginner` issues. 109 | 110 | Both issue lists are sorted by total number of comments. While not perfect, number of comments is a reasonable proxy for impact a given change will have. 111 | 112 | ### Pull Requests 113 | 114 | * Reference the Github Issue 115 | * Document new code based on the 116 | [Documentation Styleguide](#documentation-styleguide) 117 | * End files with a newline. 118 | 119 | ## Styleguides 120 | 121 | ### Git Commit Messages 122 | 123 | * Use the present tense ("Add feature" not "Added feature") 124 | * Use the imperative mood ("Move cursor to..." 
not "Moves cursor to...") 125 | * Limit the first line to 72 characters or less 126 | * Reference issues and pull requests liberally 127 | * When only changing documentation, include `[ci skip]` in the commit description 128 | * Consider starting the commit message with an applicable emoji: 129 | * :art: `:art:` when improving the format/structure of the code 130 | * :racehorse: `:racehorse:` when improving performance 131 | * :non-potable_water: `:non-potable_water:` when plugging memory leaks 132 | * :memo: `:memo:` when writing docs 133 | * :penguin: `:penguin:` when fixing something on Linux 134 | * :apple: `:apple:` when fixing something on Mac OS 135 | * :checkered_flag: `:checkered_flag:` when fixing something on Windows 136 | * :bug: `:bug:` when fixing a bug 137 | * :fire: `:fire:` when removing code or files 138 | * :green_heart: `:green_heart:` when fixing the CI build 139 | * :white_check_mark: `:white_check_mark:` when adding tests 140 | * :lock: `:lock:` when dealing with security 141 | * :arrow_up: `:arrow_up:` when upgrading dependencies 142 | * :arrow_down: `:arrow_down:` when downgrading dependencies 143 | * :shirt: `:shirt:` when removing linter warnings 144 | 145 | ### Documentation Styleguide 146 | 147 | * Use [PyLint](https://www.pylint.org). 148 | * Use [Markdown](https://daringfireball.net/projects/markdown). 149 | 150 | ## Additional Notes 151 | 152 | ### Issue and Pull Request Labels 153 | 154 | This section lists the labels we use to help us track and manage issues and pull requests. 155 | 156 | [GitHub search](https://help.github.com/articles/searching-issues/) makes it easy to use labels for finding groups of issues or pull requests you're interested in. To help you find issues and pull requests, each label is listed with search links for finding open items with that label. We encourage you to read about [other search filters](https://help.github.com/articles/searching-issues/) which will help you write more focused queries. 
157 | 158 | The labels are loosely grouped by their purpose, but it's not required that every issue have a label from every group or that an issue can't have more than one label from the same group. 159 | 160 | Please open an issue if you have suggestions for new labels, and if you notice some labels are missing on some repositories, then please open an issue on that repository. 161 | 162 | #### Type of Issue and Issue State 163 | 164 | | Label name | :mag_right: | Description | 165 | | --- | --- | --- | 166 | | `enhancement` | [search][search-label-enhancement] | Feature requests. | 167 | | `bug` | [search][search-label-bug] | Confirmed bugs or reports that are very likely to be bugs. | 168 | | `question` | [search][search-label-question] | Questions more than bug reports or feature requests (e.g. how do I do X). | 169 | | `feedback` | [search][search-label-feedback] | General feedback more than bug reports or feature requests. | 170 | | `help-wanted` | [search][search-label-help-wanted] | The core team would appreciate help from the community in resolving these issues. | 171 | | `beginner` | [search][search-label-beginner] | Less complex issues which would be good first issues to work on for users who want to contribute to Marathon Autoscaler. | 172 | | `more-information-needed` | [search][search-label-more-information-needed] | More information needs to be collected about these problems or feature requests (e.g. steps to reproduce). | 173 | | `needs-reproduction` | [search][search-label-needs-reproduction] | Likely bugs, but haven't been reliably reproduced. | 174 | | `blocked` | [search][search-label-blocked] | Issues blocked on other issues. | 175 | | `duplicate` | [search][search-label-duplicate] | Issues which are duplicates of other issues, i.e. they have been reported before. | 176 | | `wontfix` | [search][search-label-wontfix] | The core team has decided not to fix these issues for now, either because they're working as intended or for some other reason. 
| 177 | | `invalid` | [search][search-label-invalid] | Issues which aren't valid (e.g. user errors). | 178 | 179 | 180 | #### Topic Categories 181 | 182 | | Label name | :mag_right: | Description | 183 | | --- | --- | --- | 184 | | `documentation` | [search][search-label-documentation] | Related to any type of documentation. | 185 | | `performance` | [search][search-label-performance] | Related to performance. | 186 | | `security` | [search][search-label-security] | Related to security. | 187 | | `api` | [search][search-label-api] | Related to APIs. | 188 | | `uncaught-exception` | [search][search-label-uncaught-exception] | Issues about uncaught exceptions. | 189 | | `crash` | [search][search-label-crash] | Reports of crashing. | 190 | | `network` | [search][search-label-network] | Related to network problems or working with remote files (e.g. on network drives). | 191 | | `git` | [search][search-label-git] | Related to Git functionality (e.g. problems with gitignore files or with showing the correct file status). | 192 | | `build-error` | [search][search-label-build-error] | Related to problems with building from source. | 193 | 194 | 195 | #### Pull Request Labels 196 | 197 | | Label name | :mag_right: | Description | 198 | | --- | --- | --- | 199 | | `work-in-progress` | [search][search-label-work-in-progress] | Pull requests which are still being worked on, more changes will follow. | 200 | | `needs-review` | [search][search-label-needs-review] | Pull requests which need code review, and approval from maintainers or core team. | 201 | | `under-review` | [search][search-label-under-review] | Pull requests being reviewed by maintainers or core team. | 202 | | `requires-changes` | [search][search-label-requires-changes] | Pull requests which need to be updated based on review comments and then reviewed again. | 203 | | `needs-testing` | [search][search-label-needs-testing] | Pull requests which need manual testing. 
| 204 | 205 | [search-label-enhancement]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aenhancement 206 | [search-label-bug]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Abug 207 | [search-label-question]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aquestion 208 | [search-label-feedback]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Afeedback 209 | [search-label-help-wanted]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Ahelp-wanted 210 | [search-label-beginner]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Abeginner 211 | [search-label-more-information-needed]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Amore-information-needed 212 | [search-label-needs-reproduction]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aneeds-reproduction 213 | [search-label-triage-help-needed]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Atriage-help-needed 214 | [search-label-documentation]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Adocumentation 215 | [search-label-performance]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aperformance 216 | [search-label-security]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Asecurity 217 | [search-label-api]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aapi 218 | [search-label-crash]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Acrash 
219 | [search-label-network]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Anetwork 220 | [search-label-uncaught-exception]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Auncaught-exception 221 | [search-label-git]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Agit 222 | [search-label-blocked]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Ablocked 223 | [search-label-duplicate]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aduplicate 224 | [search-label-wontfix]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Awontfix 225 | [search-label-invalid]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Ainvalid 226 | [search-label-build-error]: https://github.com/issues?q=is%3Aopen+is%3Aissue+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Abuild-error 227 | [search-label-work-in-progress]: https://github.com/pulls?q=is%3Aopen+is%3Apr+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Awork-in-progress 228 | [search-label-needs-review]: https://github.com/pulls?q=is%3Aopen+is%3Apr+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aneeds-review 229 | [search-label-under-review]: https://github.com/pulls?q=is%3Aopen+is%3Apr+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aunder-review 230 | [search-label-requires-changes]: https://github.com/pulls?q=is%3Aopen+is%3Apr+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Arequires-changes 231 | [search-label-needs-testing]: https://github.com/pulls?q=is%3Aopen+is%3Apr+repo%3Atendrilinc%2Fmarathon-autoscaler+label%3Aneeds-testing 232 | 233 | [beginner]:https://github.com/issues?q=is%3Aopen+is%3Aissue+label%3Abeginner+label%3Ahelp-wanted+repo%3Atendrilinc%2Fmarathon-autoscaler+sort%3Acomments-desc 
234 | [help-wanted]:https://github.com/issues?q=is%3Aopen+is%3Aissue+label%3Ahelp-wanted+repo%3Atendrilinc%2Fmarathon-autoscaler+sort%3Acomments-desc 235 | --------------------------------------------------------------------------------