├── .activate.sh ├── .cman_debug_bashrc ├── .coveragerc ├── .coveragerc-yelp ├── .deactivate.sh ├── .dockerignore ├── .github ├── pull_request_template.md └── workflows │ └── ci.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── Dockerfile.external ├── LICENSE ├── Makefile ├── OWNERS ├── README.md ├── acceptance ├── Makefile ├── autoscaler_config.tmpl ├── bionic │ └── clusterman_signals_acceptance.tar.gz ├── clusterman.json ├── clusterman.sh ├── docker-compose-k8s.yaml ├── docker-compose.yaml ├── jammy │ └── clusterman_signals_acceptance.tar.gz ├── k8s-local-docker-registry.sh ├── mesos-agent-secret ├── mesos-secrets ├── moto │ └── Dockerfile ├── run_instance.py ├── secret ├── srv-configs │ ├── clog.yaml │ ├── clusterman-clusters │ │ └── local-dev │ │ │ ├── default.kubernetes │ │ │ └── default.mesos │ ├── clusterman-external.yaml │ ├── clusterman.yaml │ └── clusterman_metrics.yaml ├── utils.sh └── xenial │ └── clusterman_signals_acceptance.tar.gz ├── clusterman ├── __init__.py ├── args.py ├── autoscaler │ ├── __init__.py │ ├── autoscaler.py │ ├── config.py │ ├── offset.py │ ├── pool_manager.py │ └── toggle.py ├── aws │ ├── __init__.py │ ├── auto_scaling_resource_group.py │ ├── aws_resource_group.py │ ├── client.py │ ├── markets.py │ ├── response_types.py │ ├── spot_fleet_resource_group.py │ ├── spot_prices.py │ └── util.py ├── batch │ ├── __init__.py │ ├── autoscaler.py │ ├── autoscaler_bootstrap.py │ ├── clog.py │ ├── cluster_metrics_collector.py │ ├── drainer.py │ ├── node_migration.py │ ├── spot_price_collector.py │ └── util.py ├── cli │ ├── __init__.py │ ├── generate_data.py │ ├── info.py │ ├── manage.py │ ├── migrate.py │ ├── simulate.py │ ├── status.py │ ├── toggle.py │ └── util.py ├── common │ ├── __init__.py │ └── sfx.py ├── config.py ├── draining │ ├── __init__.py │ ├── kubernetes.py │ ├── mesos.py │ └── queue.py ├── exceptions.py ├── interfaces │ ├── __init__.py │ ├── cluster_connector.py │ ├── resource_group.py │ ├── signal.py │ └── types.py ├── kubernetes │ ├── __init__.py │ ├── kubernetes_cluster_connector.py │ └── util.py ├── math │ ├── __init__.py │ ├── piecewise.py │ └── piecewise_types.py ├── mesos │ ├── __init__.py │ ├── mesos_cluster_connector.py │ ├── metrics_generators.py │ └── util.py ├── migration │ ├── __init__.py │ ├── constants.py │ ├── event.py │ ├── event_enums.py │ ├── settings.py │ └── worker.py ├── monitoring_lib.py ├── reports │ ├── __init__.py │ ├── constants.py │ ├── data_transforms.py │ ├── plots.py │ ├── report_types.py │ └── reports.py ├── run.py ├── signals │ ├── __init__.py │ ├── external_signal.py │ └── pending_pods_signal.py ├── simulator │ ├── __init__.py │ ├── event.py │ ├── io.py │ ├── simulated_aws_cluster.py │ ├── simulated_cluster_connector.py │ ├── simulated_pool_manager.py │ ├── simulated_spot_fleet_resource_group.py │ ├── simulator.py │ └── util.py ├── supervisord │ ├── fetch_clusterman_signal │ ├── run_clusterman_signal │ └── supervisord.conf ├── tools │ ├── __init__.py │ ├── dynamodb_rename.py │ ├── rookout.py │ └── signalfx_scraper.py └── util.py ├── clusterman_logo.png ├── code-of-conduct.md ├── completions └── .gitignore ├── debian ├── .gitignore ├── changelog ├── clusterman.links ├── compat ├── control └── rules ├── docs ├── Makefile ├── examples │ ├── autoscaler_config.yaml │ ├── design.yaml │ └── metrics.json.gz └── source │ ├── _static │ └── .gitignore │ ├── api │ ├── AWSResourceGroup.rst │ ├── AutoScalingResourceGroup.rst │ ├── Autoscaler.rst │ ├── MesosPoolManager.rst │ ├── Signal.rst │ ├── 
SpotFleetResourceGroup.rst │ ├── aws_markets.rst │ └── clusterman_metrics.rst │ ├── autoscaler.rst │ ├── conf.py │ ├── configuration.rst │ ├── drainer.rst │ ├── index.rst │ ├── manage.rst │ ├── metrics.rst │ ├── node_migration.rst │ ├── overview.rst │ ├── resource_groups.rst │ ├── signals.rst │ ├── simulator.rst │ └── tools.rst ├── examples ├── __init__.py ├── batch │ ├── __init__.py │ ├── autoscaler.py │ ├── autoscaler_bootstrap.py │ ├── cluster_metrics_collector.py │ ├── spot_price_collector.py │ └── util.py ├── clusterman_metrics │ ├── .flake8 │ ├── clusterman_metrics │ │ ├── __init__.py │ │ ├── boto_client.py │ │ ├── simulation_client.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── aws.py │ │ │ ├── constants.py │ │ │ ├── costs.py │ │ │ ├── meteorite.py │ │ │ └── misc.py │ ├── setup.cfg │ └── setup.py ├── schemas │ ├── clusterman.json │ ├── definitions.json │ └── pool.json ├── supervisord.conf └── terraform │ ├── clusterman.tf │ └── variables.tf ├── extra-requirements-yelp-dev.txt ├── extra-requirements-yelp.txt ├── images └── architecture-diagram.png ├── itest_status.py ├── itests ├── autoscaler_scaling.feature ├── draining_queue.feature ├── environment.py ├── prune_excess_fulfilled_capacity.feature ├── resource_group_modification.feature ├── simulation_aws_price_computations.feature ├── simulation_join_delay.feature ├── simulation_spot_fleet_diversification.feature └── steps │ ├── autoscaler.py │ ├── draining.py │ ├── exceptions.py │ ├── log.py │ ├── pool_manager.py │ ├── prune_excess_fulfilled_capacity.py │ ├── simulated_spot_fleet.py │ └── simulation.py ├── jenkins.yaml ├── mypy.ini ├── package ├── .gitignore ├── Makefile ├── debian-itest-runner ├── dockerfiles │ ├── bionic │ │ └── Dockerfile │ ├── jammy │ │ └── Dockerfile │ └── xenial │ │ └── Dockerfile └── itest │ ├── metrics.json.gz │ ├── metrics.yaml │ └── ubuntu.sh ├── pyproject.toml ├── requirements-bootstrap.txt ├── requirements-dev-minimal.txt ├── requirements-dev.txt ├── requirements-docs.txt ├── requirements-minimal.txt ├── requirements.txt ├── service-itest-runner ├── setup.py ├── stubs ├── simplejson.pyi ├── sorteddict.pyi └── staticconf.pyi ├── tests ├── __init__.py ├── args_test.py ├── autoscaler │ ├── autoscaler_test.py │ ├── config_test.py │ ├── offset_test.py │ ├── pool_manager_test.py │ └── toggle_test.py ├── aws │ ├── __init__.py │ ├── auto_scaling_resource_group_test.py │ ├── aws_resource_group_test.py │ ├── client_test.py │ ├── conftest.py │ ├── spot_fleet_resource_group_test.py │ └── spot_prices_test.py ├── batch │ ├── __init__.py │ ├── autoscaler_test.py │ ├── cluster_metrics_collector_test.py │ ├── conftest.py │ ├── drainer_test.py │ ├── node_migration_test.py │ ├── spot_price_collector_test.py │ └── util_test.py ├── cli │ ├── manage_test.py │ ├── migrate_test.py │ ├── simulate_test.py │ └── toggle_cli_test.py ├── common │ └── sfx_test.py ├── config_test.py ├── conftest.py ├── draining │ └── queue_test.py ├── interfaces │ ├── __init__.py │ └── signal_test.py ├── kubernetes │ ├── kubernetes_cluster_connector_test.py │ └── util_test.py ├── math │ └── piecewise_test.py ├── migration │ ├── __init__.py │ ├── conftest.py │ ├── migration_event_enums_test.py │ ├── migration_event_test.py │ ├── migration_settings_test.py │ └── migration_worker_test.py ├── monitoring_lib_test.py ├── signals │ ├── __init__.py │ ├── external_signal_test.py │ └── pending_pods_signal_test.py ├── simulator │ ├── __init__.py │ ├── conftest.py │ ├── io_test.py │ ├── simulated_aws_cluster_test.py │ ├── simulated_cluster_connector_test.py │ ├── 
simulated_spot_fleet_resource_group_test.py │ └── simulator_test.py ├── tools │ └── signalfx_scraper_test.py └── util_test.py └── tox.ini /.activate.sh: -------------------------------------------------------------------------------- 1 | virtualenv_run/bin/activate -------------------------------------------------------------------------------- /.cman_debug_bashrc: -------------------------------------------------------------------------------- 1 | alias cman_debug='python -m clusterman.batch.autoscaler_bootstrap start --no-daemon' 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | clusterman 5 | 6 | [report] 7 | omit = 8 | clusterman/batch/* 9 | exclude_lines = 10 | # Have to re-enable the standard pragma 11 | \#\s*pragma: no cover 12 | 13 | # Don't complain if tests don't hit defensive assertion code: 14 | ^\s*raise AssertionError\b 15 | ^\s*raise NotImplementedError\b 16 | ^\s*return NotImplemented\b 17 | ^\s*raise$ 18 | 19 | # Don't complain if non-runnable code isn't run: 20 | ^if __name__ == ['"]__main__['"]:$ 21 | 22 | [html] 23 | directory = coverage-html 24 | 25 | # vim:ft=dosini 26 | -------------------------------------------------------------------------------- /.coveragerc-yelp: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | clusterman 5 | 6 | [report] 7 | exclude_lines = 8 | # Have to re-enable the standard pragma 9 | \#\s*pragma: no cover 10 | 11 | # Don't complain if tests don't hit defensive assertion code: 12 | ^\s*raise AssertionError\b 13 | ^\s*raise NotImplementedError\b 14 | ^\s*return NotImplemented\b 15 | ^\s*raise$ 16 | 17 | # Don't complain if non-runnable code isn't run: 18 | ^if __name__ == ['"]__main__['"]:$ 19 | 20 | [html] 21 | directory = coverage-html 22 | 23 | # vim:ft=dosini 24 | -------------------------------------------------------------------------------- /.deactivate.sh: -------------------------------------------------------------------------------- 1 | deactivate 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .tox 3 | build 4 | 5 | # It is possible a user has their own virtualenv here, 6 | # but we don't want it to pollute the docker context because 7 | # it will get built inside. 8 | virtualenv_run 9 | venv 10 | tests 11 | itests 12 | docs 13 | tools 14 | 15 | # y/ycp 16 | .ycp_playground 17 | playground 18 | docker-venv 19 | .activate.sh 20 | .deactivate.sh 21 | 22 | # we don't need to send all the Debian package builds to Docker for the paasta service 23 | yelp_package 24 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | Please fill out! 4 | 5 | ### Testing Done 6 | 7 | Please fill out! Generally speaking any new features should include 8 | additional unit or integration tests to ensure the behaviour is 9 | working correctly. 
10 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | release: 9 | 10 | jobs: 11 | tox: 12 | env: 13 | PIP_INDEX_URL: https://pypi.python.org/simple 14 | runs-on: ubuntu-22.04 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | make_target: 19 | - run-pre-commit 20 | - test-external 21 | - itest-external 22 | - itest_bionic-external 23 | - itest_jammy-external 24 | steps: 25 | - uses: actions/checkout@v2 26 | - uses: actions/setup-python@v2 27 | with: 28 | python-version: 3.7 29 | - uses: actions/setup-python@v3 30 | with: 31 | python-version: 3.8 32 | - uses: actions/setup-go@v2 33 | with: 34 | go-version: '1.17.3' 35 | - uses: azure/setup-kubectl@v1 36 | with: 37 | version: v1.22.0 38 | # GHA won't setup tox for us and we use tox-pip-extensions for venv-update 39 | - run: pip install tox==3.8.6 tox-pip-extensions==1.6.0 40 | - run: go install sigs.k8s.io/kind@v0.11.1 41 | - run: make ${{ matrix.make_target }} 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | *.so 3 | *.sw[nop] 4 | .#* 5 | .DS_Store 6 | ._* 7 | \#*\# 8 | build 9 | dist 10 | *~ 11 | *.log 12 | .coverage 13 | precomputed 14 | .pydevproject 15 | .project 16 | *.sublime-* 17 | virtualenv_run 18 | .tox 19 | *.egg-info/ 20 | __pycache__ 21 | version 22 | .ycp_playground 23 | playground 24 | docker-venv 25 | .cache 26 | .pytest_cache/ 27 | .mypy_cache/ 28 | acceptance/autoscaler_config.yaml 29 | package/itest/autoscaler_config.yaml 30 | package/itest/autoscaler_config.tmpl 31 | package/itest/run_instance.py 32 | package/itest/trusty/* 33 | package/itest/xenial/* 34 | package/itest/bionic/* 35 | package/itest/jammy/* 36 | /completions/[a-zA-Z]* 37 | acceptance/.local* 38 | sftp-config.json 39 | .idea 40 | .vscode 41 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v0.9.4 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | exclude: ^\.activate\.sh$ 8 | - id: check-yaml 9 | - id: debug-statements 10 | exclude: ^itests/environment.py$ 11 | - id: name-tests-test 12 | - id: check-added-large-files 13 | exclude: ^(\.activate\.sh|.*clusterman_signals_.*\.tar\.gz)$ 14 | - id: check-byte-order-marker 15 | - id: fix-encoding-pragma 16 | args: [--remove] 17 | - repo: https://github.com/asottile/reorder_python_imports 18 | rev: v0.3.5 19 | hooks: 20 | - id: reorder-python-imports 21 | args: [ 22 | --remove-import, from __future__ import absolute_import, 23 | --remove-import, from __future__ import print_function, 24 | --remove-import, from __future__ import unicode_literals 25 | ] 26 | - repo: https://github.com/asottile/pyupgrade 27 | rev: v1.2.0 28 | hooks: 29 | - id: pyupgrade 30 | args: [--py3-plus] 31 | - repo: https://github.com/psf/black 32 | rev: 22.3.0 33 | hooks: 34 | - id: black 35 | args: 36 | - --target-version 37 | - py38 38 | - repo: https://github.com/PyCQA/flake8 39 | rev: 4.0.1 40 | hooks: 41 | - id: flake8 42 | exclude: ^docs/.* 43 | args: [ 44 | 
'--ignore=E121,E123,E126,E133,E203,E226,E231,E241,E242,E704,W503,W504,W505,W605' 45 | ] 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # This is an example Dockerfile to run your service in PaaSTA! 2 | # It satisfies the PaaSTA contract. 3 | FROM docker-dev.yelpcorp.com/jammy_yelp:latest 4 | 5 | # python and uwsgi deps 6 | RUN apt-get update \ 7 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 8 | awscli \ 9 | git \ 10 | libatlas-base-dev \ 11 | libpython3.8 \ 12 | libxml2 \ 13 | libyaml-0-2 \ 14 | lsb-release \ 15 | make \ 16 | openssh-client \ 17 | python3.8 \ 18 | python3-distutils \ 19 | python3-pip \ 20 | python3-setuptools \ 21 | stdin2scribe \ 22 | tox \ 23 | virtualenv \ 24 | zk-flock \ 25 | && apt-get clean 26 | 27 | RUN /usr/bin/pip3 install supervisor 28 | COPY tox.ini requirements.txt requirements-bootstrap.txt extra-requirements-yelp.txt /code/ 29 | RUN cd code && tox -e virtualenv_run 30 | RUN cd code && virtualenv_run/bin/pip3 install -rextra-requirements-yelp.txt 31 | 32 | RUN mkdir /home/nobody \ 33 | && chown nobody /home/nobody 34 | ENV HOME /home/nobody 35 | 36 | # Code is COPY'ed here after the pip install above, so that code changes do not 37 | # break the preceding cache layer. 38 | COPY . /code 39 | RUN chown nobody /code 40 | 41 | 42 | # This is needed so that we can pass PaaSTA itests on Jenkins; for some reason (probably aufs-related?) 43 | # root can't modify the contents of /code on Jenkins, even though it works locally. Root needs to 44 | # modify these contents so that it can configure the Dockerized Mesos cluster that we run our itests on. 45 | # This shouldn't be a security risk because we drop privileges below and on overlay2, root can already 46 | # modify the contents of this directory. 47 | RUN chmod -R 775 /code/acceptance 48 | RUN ln -s /code/clusterman/supervisord/fetch_clusterman_signal /usr/bin/fetch_clusterman_signal 49 | RUN ln -s /code/clusterman/supervisord/run_clusterman_signal /usr/bin/run_clusterman_signal 50 | 51 | RUN install -d --owner=nobody /code/logs 52 | 53 | # Create /nail/run to store the batch PID file 54 | RUN mkdir -p /nail/run && chown -R nobody /nail/run 55 | 56 | # For sake of security, don't run your service as a privileged user 57 | USER nobody 58 | WORKDIR /code 59 | ENV BASEPATH=/code PATH=/code/virtualenv_run/bin:$PATH 60 | -------------------------------------------------------------------------------- /Dockerfile.external: -------------------------------------------------------------------------------- 1 | # This is an example Dockerfile to run your service in PaaSTA! 2 | # It satisfies the PaaSTA contract. 
3 | 4 | ARG DOCKER_REGISTRY 5 | ARG IMAGE_NAME 6 | FROM ${DOCKER_REGISTRY}/${IMAGE_NAME} 7 | 8 | # python and uwsgi deps 9 | RUN apt-get update && apt-get upgrade -y \ 10 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 11 | awscli \ 12 | g++ \ 13 | git \ 14 | libatlas-base-dev \ 15 | libpython3.8 \ 16 | libxml2 \ 17 | libyaml-0-2 \ 18 | lsb-release \ 19 | make \ 20 | openssh-client \ 21 | software-properties-common \ 22 | gpg \ 23 | gpg-agent \ 24 | && add-apt-repository ppa:deadsnakes/ppa \ 25 | && apt-cache policy python3.8 \ 26 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 27 | python3.8 \ 28 | libtiff-dev \ 29 | libfreetype-dev \ 30 | libfreetype6 \ 31 | libfreetype6-dev \ 32 | python3.8-dev \ 33 | python3.8-distutils \ 34 | python3-apt \ 35 | python3-pip \ 36 | python3-setuptools \ 37 | virtualenv \ 38 | && apt-get clean 39 | 40 | RUN /usr/bin/pip3 install setuptools supervisor tox==3.24.4 41 | COPY tox.ini requirements.txt requirements-bootstrap.txt /code/ 42 | 43 | RUN mkdir /home/nobody \ 44 | && chown nobody /home/nobody 45 | ENV HOME /home/nobody 46 | 47 | # Code is COPY'ed here after the pip install above, so that code changes do not 48 | # break the preceding cache layer. 49 | COPY . /code 50 | RUN chown nobody /code 51 | RUN cd code && tox -e virtualenv_run && virtualenv_run/bin/pip3 install -eexamples/clusterman_metrics 52 | 53 | RUN ln -s /code/clusterman/supervisord/fetch_clusterman_signal /usr/bin/fetch_clusterman_signal 54 | RUN ln -s /code/clusterman/supervisord/run_clusterman_signal /usr/bin/run_clusterman_signal 55 | 56 | RUN install -d --owner=nobody /code/logs 57 | 58 | # Create /nail/run to store the batch PID file 59 | RUN mkdir -p /nail/run && chown -R nobody /nail/run 60 | 61 | # For sake of security, don't run your service as a privileged user 62 | USER nobody 63 | WORKDIR /code 64 | ENV BASEPATH=/code PATH=/code/virtualenv_run/bin:$PATH 65 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | --- 2 | teams: 3 | - Compute Infrastructure Core 4 | -------------------------------------------------------------------------------- /acceptance/autoscaler_config.tmpl: -------------------------------------------------------------------------------- 1 | --- 2 | configs: 3 | - LaunchSpecifications: 4 | - WeightedCapacity: 35 5 | SubnetId: REPLACE 6 | InstanceType: m3.large 7 | SpotPrice: 4 8 | AllocationStrategy: diversified 9 | -------------------------------------------------------------------------------- /acceptance/bionic/clusterman_signals_acceptance.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/acceptance/bionic/clusterman_signals_acceptance.tar.gz -------------------------------------------------------------------------------- /acceptance/clusterman.json: -------------------------------------------------------------------------------- 1 | { 2 | "accessKeyId": "ACCESS_KEY", 3 | "secretAccessKey": "SECRET_ACCESS_KEY" 4 | } 5 | -------------------------------------------------------------------------------- /acceptance/clusterman.sh: -------------------------------------------------------------------------------- 1 | export AWS_ACCESS_KEY_ID=ACCESS_KEY 2 | export AWS_SECRET_ACCESS_KEY=SECRET_ACCESS_KEY 3 | 
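The acceptance environment never talks to real AWS: acceptance/clusterman.json and acceptance/clusterman.sh above carry placeholder credentials, and the docker-compose files that follow start moto containers standing in for EC2, S3, DynamoDB, and STS. As a rough illustration of how those pieces fit together, the sketch below points a boto3 client at the EC2 stub; it assumes it runs on the compose network where the service is reachable as moto-ec2 on port 5000 (the same endpoint pattern the srv-configs later in this tree use).

import boto3

# Placeholder credentials from acceptance/clusterman.json; moto accepts any value.
ec2 = boto3.client(
    "ec2",
    endpoint_url="http://moto-ec2:5000",
    region_name="us-west-2",
    aws_access_key_id="ACCESS_KEY",
    aws_secret_access_key="SECRET_ACCESS_KEY",
)
print(ec2.describe_instances()["Reservations"])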
-------------------------------------------------------------------------------- /acceptance/docker-compose-k8s.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | services: 4 | moto-ec2: 5 | build: ./moto/ 6 | ports: 7 | - 5000 8 | command: 'ec2' 9 | moto-s3: 10 | build: ./moto/ 11 | ports: 12 | - 5000 13 | command: 's3' 14 | moto-dynamodb: 15 | build: ./moto/ 16 | ports: 17 | - 5000 18 | command: 'dynamodb2' 19 | 20 | networks: 21 | default: 22 | external: 23 | name: kind 24 | -------------------------------------------------------------------------------- /acceptance/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | services: 4 | zookeeper: 5 | image: zookeeper 6 | environment: 7 | ZK_CONFIG: tickTime=2000,initLimit=10,syncLimit=5,maxClientCnxns=128,forceSync=no,clientPort=2181 8 | ZK_ID: 1 9 | mesosmaster: 10 | image: mesosphere/mesos:1.5.0 11 | ports: 12 | - 5050 13 | - 5054 14 | command: 'mesos-master --zk=zk://zookeeper:2181/mesos-testcluster --registry=in_memory --quorum=1 --authenticate --authenticate_agents --work_dir=/tmp/mesos --credentials=/etc/mesos-secrets' 15 | depends_on: 16 | - zookeeper 17 | volumes: 18 | - ./mesos-secrets:/etc/mesos-secrets 19 | mesosagent: 20 | image: mesosphere/mesos:1.5.0 21 | expose: 22 | - 5051 23 | volumes: 24 | - /var/run/docker.sock:/var/run/docker.sock 25 | - ./mesos-agent-secret:/etc/mesos-agent-secret 26 | environment: 27 | CLUSTER: testcluster 28 | MESOS_SYSTEMD_ENABLE_SUPPORT: "false" 29 | command: 'mesos-agent --master=zk://zookeeper:2181/mesos-testcluster --resources="cpus:20;mem:2048;disk:2000;ports:[31000-31100];cpus(taskproc):10;mem(taskproc):1024;disk(taskproc):1000;ports(taskproc):[31200-31500]" --credential=/etc/mesos-agent-secret --containerizers=docker --docker=/usr/bin/docker --work_dir=/tmp/mesos --attributes="region:fakeregion;pool:default" --no-docker_kill_orphans --log_dir=/var/log/mesos' 30 | depends_on: 31 | - mesosmaster 32 | - zookeeper 33 | moto-ec2: 34 | build: ./moto/ 35 | ports: 36 | - 5000 37 | command: 'ec2' 38 | moto-s3: 39 | build: ./moto/ 40 | ports: 41 | - 5000 42 | command: 's3' 43 | moto-dynamodb: 44 | build: ./moto/ 45 | ports: 46 | - 5000 47 | command: 'dynamodb2' 48 | moto-sts: 49 | build: ./moto/ 50 | ports: 51 | - 5000 52 | command: 'sts' 53 | -------------------------------------------------------------------------------- /acceptance/jammy/clusterman_signals_acceptance.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/acceptance/jammy/clusterman_signals_acceptance.tar.gz -------------------------------------------------------------------------------- /acceptance/k8s-local-docker-registry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -o errexit -x 3 | 4 | REG_NAME=$1 5 | REG_PORT=$2 6 | CLUSTER_NAME=$3 7 | 8 | # create registry container unless it already exists 9 | running="$(docker inspect -f '{{.State.Running}}' "${REG_NAME}" 2>/dev/null || true)" 10 | if [ "${running}" != 'true' ]; then 11 | docker run -d --restart=always -e REGISTRY_HTTP_ADDR=0.0.0.0:${REG_PORT} -p "${REG_PORT}:${REG_PORT}" --name "${REG_NAME}" registry:2 12 | fi 13 | 14 | # connect the registry to the cluster network 15 | docker network connect "kind" "${REG_NAME}" 16 | 17 | # tell https://tilt.dev to use the 
registry 18 | # https://docs.tilt.dev/choosing_clusters.html#discovering-the-registry 19 | for node in $(kind get nodes --name ${CLUSTER_NAME}); do 20 | kubectl annotate node "${node}" "kind.x-k8s.io/registry=localhost:${REG_PORT}"; 21 | done 22 | -------------------------------------------------------------------------------- /acceptance/mesos-agent-secret: -------------------------------------------------------------------------------- 1 | { 2 | "principal": "agent", 3 | "secret": "secretagent" 4 | } 5 | -------------------------------------------------------------------------------- /acceptance/mesos-secrets: -------------------------------------------------------------------------------- 1 | { 2 | "credentials": [ 3 | { 4 | "principal": "clusterman", 5 | "secret": "secret" 6 | }, 7 | { 8 | "principal": "agent", 9 | "secret": "secretagent" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /acceptance/moto/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG DOCKER_REGISTRY 2 | ARG IMAGE_NAME 3 | 4 | FROM ${DOCKER_REGISTRY}/${IMAGE_NAME} 5 | 6 | RUN apt-get -yq update && apt-get install -yq --no-install-recommends \ 7 | gcc \ 8 | python3-dev \ 9 | libffi-dev \ 10 | python3 \ 11 | libssl-dev \ 12 | python3-pip 13 | 14 | ADD . /moto/ 15 | ENV PYTHONUNBUFFERED 1 16 | 17 | WORKDIR /moto/ 18 | # Setuptools needs to be installed and up-to-date for install of the actual packages 19 | # 20 | # moto and botocore have mismatched upper-bound pins for python-dateutils 21 | # which breaks our build. botocore used to have <3.0.0, but shrunk that to 22 | # <2.8.1, and moto hasn't updated their pin to match yet. So until those 23 | # are fixed, here's the latest version of boto that has the <3.0.0 pin. 24 | # 25 | # We can unpin boto3 and botocore once botocore fixes its pin 26 | # (see https://github.com/boto/botocore/commit/e87e7a745fd972815b235a9ee685232745aa94f9) 27 | RUN pip3 install pip==21.3.1 setuptools==59.6.0 && \ 28 | pip3 install cryptography==3.2 botocore==1.14.11 boto3==1.11.11 "moto[server]" 29 | 30 | ENTRYPOINT ["python3", "-m", "moto.server", "-H", "0.0.0.0"] 31 | 32 | EXPOSE 5000 33 | -------------------------------------------------------------------------------- /acceptance/secret: -------------------------------------------------------------------------------- 1 | secret 2 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clog.yaml: -------------------------------------------------------------------------------- 1 | scribe_port: 1463 2 | scribe_retry_interval: 10 3 | 4 | monk_stream_prefix: '_clog.' 
5 | scribe_disable: false 6 | preferred_backend: 'monk' 7 | monk_disable: false 8 | monk_timeout_ms: 10000 9 | monk_host: 'monk-leaf' 10 | monk_port: 6000 11 | scribe_host: 'RandomHost' 12 | use_schematizer: true 13 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes: -------------------------------------------------------------------------------- 1 | --- 2 | resource_groups: 3 | - sfr: 4 | s3: 5 | bucket: clusterman-resource-groups 6 | prefix: acceptance 7 | 8 | scaling_limits: 9 | min_capacity: 10 10 | max_capacity: 60 11 | max_tasks_to_kill: 100 12 | max_weight_to_add: 10 13 | max_weight_to_remove: 10 14 | 15 | autoscale_signal: 16 | internal: true 17 | period_minutes: 1 18 | 19 | autoscaling: 20 | prevent_scale_down_after_capacity_loss: true 21 | instance_loss_threshold: 3 22 | 23 | alert_on_max_capacity: false 24 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos: -------------------------------------------------------------------------------- 1 | --- 2 | resource_groups: 3 | - sfr: 4 | s3: 5 | bucket: clusterman-resource-groups 6 | prefix: acceptance 7 | 8 | scaling_limits: 9 | min_capacity: 10 10 | max_capacity: 60 11 | max_tasks_to_kill: 100 12 | max_weight_to_add: 10 13 | max_weight_to_remove: 10 14 | 15 | autoscale_signal: 16 | namespace: clusterman 17 | name: MostRecentResources 18 | branch_or_tag: acceptance 19 | period_minutes: 10 20 | required_metrics: 21 | - name: cpus_allocated 22 | type: system_metrics 23 | minute_range: 10 24 | - name: mem_allocated 25 | type: system_metrics 26 | minute_range: 10 27 | - name: disk_allocated 28 | type: system_metrics 29 | minute_range: 10 30 | 31 | alert_on_max_capacity: false 32 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman-external.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | module_config: 3 | - namespace: clusterman_metrics 4 | file: /nail/srv/configs/clusterman_metrics.yaml 5 | 6 | # ###### 7 | # Mappings for the clusterman service that are the same for all habitats. 8 | # 9 | # NOTE: The clusterman service will map clusters.{cluster_name}.aws_region 10 | # to aws.region, if the --cluster argument is passed to the service. 
11 | clusters: 12 | local-dev: 13 | aws_account_number: 123456789012 14 | aws_region: us-west-2 15 | mesos_master_fqdn: mesosmaster 16 | kubeconfig_path: /var/lib/clusterman/clusterman.conf 17 | 18 | aws: 19 | endpoint_url: http://moto-{svc}:5000 20 | access_key_file: /etc/boto_cfg/clusterman.json 21 | signals_bucket: clusterman-signals 22 | 23 | batches: 24 | spot_prices: 25 | run_interval_seconds: 60 26 | dedupe_interval_seconds: 60 27 | cluster_metrics: 28 | run_interval_seconds: 60 29 | 30 | autoscaling: 31 | default_signal_role: 'clusterman' 32 | setpoint: 0.7 33 | target_capacity_margin: 0.05 34 | 35 | autoscale_signal: 36 | name: ConstantSignal 37 | branch_or_tag: acceptance 38 | period_minutes: 1 39 | 40 | sensu_config: 41 | - team: noop 42 | page: false 43 | 44 | module_env_config: [] 45 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | module_config: 3 | - namespace: clog 4 | initialize: clusterman.batch.clog.initialize 5 | config: 6 | log_stream_name: clusterman 7 | file: /nail/srv/configs/clog.yaml 8 | - namespace: clusterman_metrics 9 | file: /nail/srv/configs/clusterman_metrics.yaml 10 | - namespace: yelp_batch 11 | config: 12 | watchers: 13 | - aws_key_rotation: /etc/boto_cfg/clusterman.json 14 | - clusterman_yaml: /nail/srv/configs/clusterman.yaml 15 | logging: 16 | console_log_level: 'CRITICAL' 17 | 18 | # ###### 19 | # Mappings for the clusterman service that are the same for all habitats. 20 | # 21 | # NOTE: The clusterman service will map clusters.{cluster_name}.aws_region 22 | # to aws.region, if the --cluster argument is passed to the service. 23 | clusters: 24 | local-dev: 25 | aws_account_number: 123456789012 26 | aws_region: us-west-2 27 | mesos_master_fqdn: mesosmaster 28 | kubeconfig_path: /var/lib/clusterman/clusterman.conf 29 | 30 | aws: 31 | endpoint_url: http://moto-{svc}:5000 32 | access_key_file: /etc/boto_cfg/clusterman.json 33 | signals_bucket: clusterman-signals 34 | 35 | batches: 36 | spot_prices: 37 | run_interval_seconds: 60 38 | dedupe_interval_seconds: 60 39 | cluster_metrics: 40 | run_interval_seconds: 60 41 | 42 | autoscaling: 43 | default_signal_role: 'clusterman' 44 | setpoint: 0.7 45 | target_capacity_margin: 0.05 46 | 47 | autoscale_signal: 48 | name: ConstantSignal 49 | branch_or_tag: acceptance 50 | period_minutes: 1 51 | 52 | sensu_config: 53 | - team: noop 54 | page: false 55 | 56 | module_env_config: [] 57 | 58 | monitoring_client: LogMonitoringClient 59 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman_metrics.yaml: -------------------------------------------------------------------------------- 1 | dynamodb: 2 | ttl_days: 732 3 | 4 | access_key_file: '/etc/boto_cfg/clusterman_metrics.json' 5 | -------------------------------------------------------------------------------- /acceptance/utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | COMPOSE_CONTAINERS="zookeeper mesosmaster mesosagent moto-ec2 moto-s3 moto-dynamodb moto-sts" 4 | 5 | cleanup() { 6 | docker kill "${CONTAINER}" > /dev/null 7 | for compose_container in ${COMPOSE_CONTAINERS}; do 8 | docker network disconnect "clusterman_${DISTRIB_CODENAME}_acceptance" "clusterman_${DISTRIB_CODENAME}_${compose_container}_1" 9 | done 10 | docker network rm 
"clusterman_${DISTRIB_CODENAME}_acceptance" > /dev/null 11 | } 12 | 13 | setup_networks() { 14 | CIDR_BLOCK="10.1.0.0/24" 15 | docker network create --ip-range "${CIDR_BLOCK}" --subnet "${CIDR_BLOCK}" "clusterman_${DISTRIB_CODENAME}_acceptance" 16 | for compose_container in ${COMPOSE_CONTAINERS}; do 17 | docker network connect "clusterman_${DISTRIB_CODENAME}_acceptance" "clusterman_${DISTRIB_CODENAME}_${compose_container}_1" 18 | done 19 | } 20 | -------------------------------------------------------------------------------- /acceptance/xenial/clusterman_signals_acceptance.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/acceptance/xenial/clusterman_signals_acceptance.tar.gz -------------------------------------------------------------------------------- /clusterman/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | __version__ = "4.22.2" 15 | -------------------------------------------------------------------------------- /clusterman/autoscaler/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/autoscaler/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import List 15 | from typing import NamedTuple 16 | 17 | import colorlog 18 | import staticconf 19 | 20 | logger = colorlog.getLogger(__name__) 21 | 22 | 23 | class AutoscalingConfig(NamedTuple): 24 | excluded_resources: List[str] 25 | setpoint: float 26 | target_capacity_margin: float 27 | prevent_scale_down_after_capacity_loss: bool = False 28 | instance_loss_threshold: int = 0 29 | orphan_instance_uptime_threshold_seconds: int = 1800 30 | 31 | 32 | def get_autoscaling_config(config_namespace: str) -> AutoscalingConfig: 33 | """Load autoscaling configuration values from the provided config_namespace, falling back to the 34 | values stored in the default namespace if none are specified. 35 | 36 | :param config_namespace: namespace to read from before falling back to the default namespace 37 | :returns: AutoscalingConfig object with loaded config values 38 | """ 39 | default_excluded_resources = staticconf.read_list("autoscaling.excluded_resources", default=[]) 40 | default_setpoint = staticconf.read_float("autoscaling.setpoint") 41 | default_target_capacity_margin = staticconf.read_float("autoscaling.target_capacity_margin") 42 | 43 | reader = staticconf.NamespaceReaders(config_namespace) 44 | return AutoscalingConfig( 45 | excluded_resources=reader.read_list("autoscaling.excluded_resources", default=default_excluded_resources), 46 | setpoint=reader.read_float("autoscaling.setpoint", default=default_setpoint), 47 | target_capacity_margin=reader.read_float( 48 | "autoscaling.target_capacity_margin", 49 | default=default_target_capacity_margin, 50 | ), 51 | prevent_scale_down_after_capacity_loss=reader.read_bool( 52 | "autoscaling.prevent_scale_down_after_capacity_loss", default=False 53 | ), 54 | instance_loss_threshold=reader.read_int("autoscaling.instance_loss_threshold", default=0), 55 | orphan_instance_uptime_threshold_seconds=reader.read_int( 56 | "autoscaling.orphan_instance_uptime_threshold_seconds", default=1800 57 | ), 58 | ) 59 | -------------------------------------------------------------------------------- /clusterman/autoscaler/toggle.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
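# A minimal usage sketch for get_autoscaling_config above (illustrative only; the
# namespace name is hypothetical and is assumed to have been loaded into
# staticconf elsewhere, e.g. from one of the pool configs under srv-configs):
#
#     from clusterman.autoscaler.config import get_autoscaling_config
#
#     autoscaling_config = get_autoscaling_config("clusterman_default_pool_config")
#     print(autoscaling_config.setpoint, autoscaling_config.target_capacity_margin)
#
# Values missing from the pool namespace fall back to the default staticconf
# namespace, so autoscaling.setpoint and autoscaling.target_capacity_margin must
# exist at least there (as in the acceptance clusterman.yaml).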
14 | import time 15 | from typing import Union 16 | 17 | import arrow 18 | import staticconf 19 | 20 | from clusterman.aws.client import dynamodb 21 | from clusterman.util import CLUSTERMAN_STATE_TABLE 22 | from clusterman.util import parse_time_string 23 | 24 | 25 | AUTOSCALER_PAUSED = "autoscaler_paused" 26 | 27 | 28 | def disable_autoscaling(cluster: str, pool: str, scheduler: str, until: Union[str, int, float]): 29 | """Disable autoscaling for a pool 30 | 31 | :param str cluster: name of the cluster 32 | :param str pool: name of the pool 33 | :param str scheduler: cluster scheduler 34 | :param str until: how long should it remain disabled 35 | """ 36 | expiration = parse_time_string(until).timestamp if isinstance(until, str) else int(until) 37 | state = { 38 | "state": {"S": AUTOSCALER_PAUSED}, 39 | "entity": {"S": f"{cluster}.{pool}.{scheduler}"}, 40 | "timestamp": {"N": str(int(time.time()))}, 41 | "expiration_timestamp": {"N": str(expiration)}, 42 | } 43 | dynamodb.put_item( 44 | TableName=staticconf.read("aws.state_table", default=CLUSTERMAN_STATE_TABLE), 45 | Item=state, 46 | ) 47 | 48 | 49 | def enable_autoscaling(cluster: str, pool: str, scheduler: str): 50 | """Re-enable autoscaling for a pool 51 | 52 | :param str cluster: name of the cluster 53 | :param str pool: name of the pool 54 | :param str scheduler: cluster scheduler 55 | """ 56 | dynamodb.delete_item( 57 | TableName=staticconf.read("aws.state_table", default=CLUSTERMAN_STATE_TABLE), 58 | Key={ 59 | "state": {"S": AUTOSCALER_PAUSED}, 60 | "entity": {"S": f"{cluster}.{pool}.{scheduler}"}, 61 | }, 62 | ) 63 | 64 | 65 | def autoscaling_is_paused(cluster: str, pool: str, scheduler: str, timestamp: arrow.Arrow) -> bool: 66 | """Check if autoscaling is disabled 67 | 68 | :param str cluster: name of the cluster 69 | :param str pool: name of the pool 70 | :param str scheduler: cluster scheduler 71 | :param Arrow timestamp: threshold time 72 | :return: True if paused 73 | """ 74 | response = dynamodb.get_item( 75 | TableName=CLUSTERMAN_STATE_TABLE, 76 | Key={ 77 | "state": {"S": AUTOSCALER_PAUSED}, 78 | "entity": {"S": f"{cluster}.{pool}.{scheduler}"}, 79 | }, 80 | ConsistentRead=True, 81 | ) 82 | if "Item" not in response: 83 | return False 84 | 85 | if "expiration_timestamp" in response["Item"] and timestamp.timestamp > int( 86 | response["Item"]["expiration_timestamp"]["N"] 87 | ): 88 | return False 89 | 90 | return True 91 | -------------------------------------------------------------------------------- /clusterman/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/clusterman/aws/__init__.py -------------------------------------------------------------------------------- /clusterman/aws/response_types.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from mypy_extensions import TypedDict 4 | 5 | 6 | class AutoScalingInstanceConfig(TypedDict): 7 | InstanceId: str 8 | InstanceType: str 9 | WeightedCapacity: str 10 | 11 | 12 | class LaunchTemplateDataConfig(TypedDict): 13 | InstanceType: str 14 | 15 | 16 | class LaunchTemplateConfig(TypedDict): 17 | LaunchTemplateName: str 18 | LaunchTemplateData: LaunchTemplateDataConfig 19 | Version: str 20 | 21 | 22 | class InstanceOverrideConfig(TypedDict): 23 | InstanceType: str 24 | WeightedCapacity: str 25 | 26 | 27 | class MixedInstancesPolicyLaunchTemplateConfig(TypedDict): 28 | 
LaunchTemplateSpecification: LaunchTemplateConfig 29 | Overrides: List[InstanceOverrideConfig] 30 | 31 | 32 | class MixedInstancesPolicyConfig(TypedDict): 33 | LaunchTemplate: MixedInstancesPolicyLaunchTemplateConfig 34 | 35 | 36 | class AutoScalingGroupConfig(TypedDict): 37 | AvailabilityZones: List[str] 38 | DesiredCapacity: int 39 | Instances: List[AutoScalingInstanceConfig] 40 | LaunchTemplate: LaunchTemplateConfig 41 | MaxSize: int 42 | MinSize: int 43 | MixedInstancesPolicy: MixedInstancesPolicyConfig 44 | -------------------------------------------------------------------------------- /clusterman/aws/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import List 15 | from typing import Mapping 16 | from typing import Type 17 | 18 | from clusterman.aws.auto_scaling_resource_group import AutoScalingResourceGroup 19 | from clusterman.aws.aws_resource_group import AWSResourceGroup 20 | from clusterman.aws.spot_fleet_resource_group import SpotFleetResourceGroup 21 | 22 | 23 | _RESOURCE_GROUP_TYPES: List[Type[AWSResourceGroup]] = [AutoScalingResourceGroup, SpotFleetResourceGroup] 24 | RESOURCE_GROUPS: Mapping[str, Type[AWSResourceGroup]] = {t.FRIENDLY_NAME: t for t in _RESOURCE_GROUP_TYPES} 25 | RESOURCE_GROUPS_REV: Mapping[Type[AWSResourceGroup], str] = {v: k for k, v in RESOURCE_GROUPS.items()} 26 | -------------------------------------------------------------------------------- /clusterman/batch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/batch/clog.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
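# A small sketch of the RESOURCE_GROUPS lookup defined in clusterman/aws/util.py
# above: resource-group classes are keyed by their FRIENDLY_NAME, which the
# acceptance pool configs suggest is "sfr" for SpotFleetResourceGroup (treat the
# exact key as an assumption here):
#
#     from clusterman.aws.util import RESOURCE_GROUPS, RESOURCE_GROUPS_REV
#
#     group_cls = RESOURCE_GROUPS["sfr"]
#     print(group_cls.__name__)              # -> SpotFleetResourceGroup
#     print(RESOURCE_GROUPS_REV[group_cls])  # -> "sfr"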
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import logging 15 | import socket 16 | 17 | import clog.handlers 18 | import staticconf 19 | 20 | 21 | namespace = "clog" 22 | clog_namespace = staticconf.NamespaceGetters(namespace) # type: ignore 23 | DETAILED_FORMAT = "\t".join( 24 | [ 25 | "%(asctime)s", 26 | socket.gethostname(), 27 | "%(process)s", 28 | "%(name)s", 29 | "%(levelname)s", 30 | "%(message)s", 31 | ] 32 | ) 33 | 34 | 35 | log_stream_name = clog_namespace.get_string("log_stream_name") 36 | log_stream_format = clog_namespace.get_string("log_stream_format", default=DETAILED_FORMAT) 37 | log_stream_level = clog_namespace.get_string("log_stream_level", default="INFO") 38 | enable_uwsgi_mule_offload = clog_namespace.get_bool("enable_uwsgi_mule_offload", default=False) 39 | 40 | 41 | def initialize(): 42 | """Initialize clog from staticconf config.""" 43 | if enable_uwsgi_mule_offload and clog.uwsgi_plugin_enabled: 44 | clog.uwsgi_patch_global_state() 45 | 46 | add_clog_handler( 47 | name=log_stream_name.value, 48 | level=getattr(logging, log_stream_level.value), 49 | log_format=log_stream_format.value, 50 | ) 51 | 52 | 53 | def add_clog_handler(name, level=logging.INFO, log_format=DETAILED_FORMAT): 54 | """Add a CLog logging handler for the stream 'name'. 55 | 56 | :param name: the name of the log 57 | :type name: string 58 | :param level: the logging level of the handler 59 | :type level: int 60 | """ 61 | clog_handler = clog.handlers.CLogHandler(name) 62 | clog_handler.setLevel(level) 63 | formatter = logging.Formatter(log_format) 64 | clog_handler.setFormatter(formatter) 65 | logging.root.addHandler(clog_handler) 66 | -------------------------------------------------------------------------------- /clusterman/batch/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
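# A minimal sketch of wiring up the clog batch module above outside of the usual
# module_config loading, assuming the clog library is installed and using the
# acceptance clog.yaml plus the log_stream_name value that clusterman.yaml
# normally supplies:
#
#     import staticconf
#     from clusterman.batch import clog as clusterman_clog
#
#     staticconf.YamlConfiguration("acceptance/srv-configs/clog.yaml", namespace="clog")
#     staticconf.DictConfiguration({"log_stream_name": "clusterman"}, namespace="clog")
#     clusterman_clog.initialize()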
14 | import inspect 15 | import os 16 | from contextlib import contextmanager 17 | 18 | import botocore.exceptions 19 | import colorlog 20 | from yelp_batch.batch import batch_context 21 | 22 | from clusterman.monitoring_lib import get_monitoring_client 23 | 24 | RLE_COUNTER_NAME = "clusterman.request_limit_exceeded" 25 | logger = colorlog.getLogger(__name__) 26 | 27 | 28 | class BatchLoggingMixin: # pragma: no cover 29 | @batch_context 30 | def setup_watchers(self): 31 | self.logger.info( 32 | "Starting batch {name}; watching {watched_files} for changes".format( 33 | name=type(self).__name__, 34 | watched_files=[watcher.filename for watcher in self.version_checker.watchers], 35 | ) 36 | ) 37 | yield 38 | self.logger.info("Batch {name} complete".format(name=type(self).__name__)) 39 | 40 | 41 | class BatchRunningSentinelMixin: # pragma: no cover 42 | @batch_context 43 | def make_running_sentinel(self): 44 | batch_name, ext = os.path.splitext(os.path.basename(inspect.getfile(self.__class__))) 45 | sentinel_file = f"/tmp/{batch_name}.running" 46 | with open(sentinel_file, "w") as f: 47 | f.write(str(os.getpid())) 48 | yield 49 | 50 | 51 | @contextmanager 52 | def suppress_request_limit_exceeded(): 53 | try: 54 | yield 55 | except botocore.exceptions.ClientError as e: 56 | if e.response.get("Error", {}).get("Code") == "RequestLimitExceeded": 57 | logger.warning(e) 58 | rle_counter = get_monitoring_client().create_counter(RLE_COUNTER_NAME) 59 | rle_counter.count() 60 | else: 61 | raise 62 | -------------------------------------------------------------------------------- /clusterman/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/cli/info.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
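# A minimal sketch of the suppress_request_limit_exceeded helper above: AWS
# RequestLimitExceeded errors raised inside the block are logged and counted
# instead of crashing the batch, while any other ClientError still propagates.
# The ec2 client is assumed to be a module-level boto3 client in
# clusterman.aws.client (only dynamodb is shown explicitly in this dump):
#
#     from clusterman.aws.client import ec2
#     from clusterman.batch.util import suppress_request_limit_exceeded
#
#     with suppress_request_limit_exceeded():
#         ec2.describe_spot_fleet_requests()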
14 | import simplejson as json 15 | 16 | from clusterman.args import add_cluster_arg 17 | from clusterman.args import add_json_arg 18 | from clusterman.args import subparser 19 | from clusterman.util import get_cluster_name_list 20 | from clusterman.util import get_pool_name_list 21 | 22 | 23 | def list_clusters(args): # pragma: no cover 24 | if args.json: 25 | print(json.dumps(list(get_cluster_name_list()))) 26 | else: 27 | print("\n".join(get_cluster_name_list())) 28 | 29 | 30 | @subparser("list-clusters", "list available clusters", list_clusters) 31 | def add_list_clusters_parser(subparser, required_named_args, optional_named_args): # pragma: no cover 32 | add_json_arg(optional_named_args) 33 | 34 | 35 | def list_pools(args): # pragma: no cover 36 | if args.json: 37 | obj = {scheduler: list(get_pool_name_list(args.cluster, scheduler)) for scheduler in ["mesos", "kubernetes"]} 38 | print(json.dumps(obj)) 39 | else: 40 | for scheduler in ["mesos", "kubernetes"]: 41 | print(f"\n{scheduler.capitalize()} pools\n--------------------") 42 | print("\n".join(get_pool_name_list(args.cluster, scheduler))) 43 | 44 | 45 | @subparser("list-pools", "list available pools in a cluster", list_pools) 46 | def add_list_pools_parser(subparser, required_named_args, optional_named_args): # pragma: no cover 47 | add_cluster_arg(required_named_args, required=True) 48 | add_json_arg(optional_named_args) 49 | -------------------------------------------------------------------------------- /clusterman/cli/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | import socket 16 | from functools import partial 17 | 18 | import colorlog 19 | 20 | from clusterman.util import limit_function_runtime 21 | 22 | 23 | logger = colorlog.getLogger(__name__) 24 | TIMEOUT_TIME_SECONDS = 5 25 | 26 | 27 | def timeout_wrapper(main): 28 | def wrapper(args: argparse.Namespace): 29 | def timeout_handler(): 30 | warning_string = "This command is taking a long time to run; you're likely targetting a large pool/cluster." 31 | if "yelpcorp" in socket.getfqdn(): 32 | warning_string += "\nIf this command hasn't returned in several minutes, reach out to #clusterman" 33 | logger.warning(warning_string) 34 | 35 | limit_function_runtime(partial(main, args), TIMEOUT_TIME_SECONDS, timeout_handler) 36 | 37 | return wrapper 38 | -------------------------------------------------------------------------------- /clusterman/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/draining/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/draining/kubernetes.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import colorlog 4 | 5 | from clusterman.kubernetes.kubernetes_cluster_connector import KubernetesClusterConnector 6 | 7 | log = colorlog.getLogger(__name__) 8 | 9 | 10 | def drain(connector: Optional[KubernetesClusterConnector], node_name: str, disable_eviction: bool) -> bool: 11 | """Cordons and evicts/deletes all tasks from a given node. 12 | :param node_name: a single node name to drain (as would be passed to kubectl drain) 13 | :param connector: a kubernetes connector to connect kubernetes API 14 | :param disable_eviction: Force drain to use delete (ignoring PDBs) 15 | :returns: bool 16 | """ 17 | if connector: 18 | log.info(f"Preparing to drain {node_name}...") 19 | return connector.drain_node(node_name, disable_eviction) 20 | else: 21 | log.info(f"Unable to drain {node_name} (no Kubernetes connector configured)") 22 | return False 23 | 24 | 25 | def uncordon(connector: Optional[KubernetesClusterConnector], node_name: str) -> bool: 26 | """Cordons and safely evicts all tasks from a given node. 27 | :param node_name: a single node name to uncordon (as would be passed to kubectl uncordon) 28 | :param connector: a kubernetes connector to connect kubernetes API 29 | :returns: bool 30 | """ 31 | if connector: 32 | log.info(f"Preparing to uncordon {node_name}...") 33 | return connector.uncordon_node(node_name) 34 | else: 35 | log.info(f"Unable to uncordon {node_name} (no Kubernetes connector configured)") 36 | return False 37 | -------------------------------------------------------------------------------- /clusterman/exceptions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
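# A minimal sketch of the draining helpers defined in clusterman/draining/kubernetes.py
# above, assuming `connector` is an already-constructed KubernetesClusterConnector
# for the target cluster and pool (construction is not shown here) and the node
# name is hypothetical:
#
#     from clusterman.draining.kubernetes import drain, uncordon
#
#     node = "ip-10-0-0-1.us-west-2.compute.internal"
#     if not drain(connector, node, disable_eviction=False):
#         # drain failed (or no connector was configured); make the node schedulable again
#         uncordon(connector, node)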
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | class ClustermanException(Exception): 17 | pass 18 | 19 | 20 | class AllResourceGroupsAreStaleError(Exception): 21 | pass 22 | 23 | 24 | class AccountNumberMistmatchError(Exception): 25 | pass 26 | 27 | 28 | class AutoscalerError(ClustermanException): 29 | pass 30 | 31 | 32 | class ClustermanSignalError(ClustermanException): 33 | pass 34 | 35 | 36 | class MetricsError(ClustermanException): 37 | pass 38 | 39 | 40 | class NoLaunchTemplateConfiguredError(ClustermanException): 41 | pass 42 | 43 | 44 | class NoResourceGroupsFoundError(Exception): 45 | pass 46 | 47 | 48 | class NoSignalConfiguredException(ClustermanException): 49 | pass 50 | 51 | 52 | class ResourceGroupError(ClustermanException): 53 | pass 54 | 55 | 56 | class PoolManagerError(ClustermanException): 57 | pass 58 | 59 | 60 | class PoolConnectionError(PoolManagerError): 61 | """Raised when the pool master cannot be reached""" 62 | 63 | pass 64 | 65 | 66 | class ResourceRequestError(ClustermanException): 67 | pass 68 | 69 | 70 | class SignalValidationError(ClustermanSignalError): 71 | pass 72 | 73 | 74 | class SignalConnectionError(ClustermanSignalError): 75 | pass 76 | 77 | 78 | class SimulationError(ClustermanException): 79 | pass 80 | -------------------------------------------------------------------------------- /clusterman/interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
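# [Editor's note] The block below is an illustrative sketch, not part of the repository.
# It shows how the exception hierarchy defined in clusterman/exceptions.py above behaves in
# practice: most errors subclass ClustermanException, but a few (for example
# AllResourceGroupsAreStaleError and NoResourceGroupsFoundError) derive from the plain
# Exception class, so a blanket `except ClustermanException` handler will not catch them.
# The `scale_pool` callable is a hypothetical stand-in for real pool-management code.
from clusterman.exceptions import AllResourceGroupsAreStaleError
from clusterman.exceptions import ClustermanException
from clusterman.exceptions import NoResourceGroupsFoundError


def run_scaling_step(scale_pool) -> None:
    try:
        scale_pool()
    except (AllResourceGroupsAreStaleError, NoResourceGroupsFoundError) as e:
        # These do NOT inherit from ClustermanException and need their own handler.
        print(f"resource group problem, skipping this scaling step: {e}")
    except ClustermanException as e:
        # Covers AutoscalerError, PoolManagerError, SignalValidationError, etc.
        print(f"clusterman error: {e}")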
14 | -------------------------------------------------------------------------------- /clusterman/interfaces/types.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from typing import NamedTuple 3 | from typing import Optional 4 | 5 | import arrow 6 | 7 | from clusterman.aws.markets import InstanceMarket 8 | from clusterman.util import ClustermanResources 9 | 10 | 11 | class AgentState(enum.Enum): 12 | IDLE = "idle" 13 | ORPHANED = "orphaned" 14 | RUNNING = "running" 15 | UNKNOWN = "unknown" 16 | 17 | 18 | class AgentMetadata(NamedTuple): 19 | agent_id: str = "" 20 | allocated_resources: ClustermanResources = ClustermanResources() 21 | batch_task_count: int = 0 22 | is_safe_to_kill: bool = True 23 | is_draining: bool = False 24 | priority: float = 0.0 25 | state: AgentState = AgentState.UNKNOWN 26 | task_count: int = 0 27 | total_resources: ClustermanResources = ClustermanResources() 28 | kernel: str = "" 29 | lsbrelease: str = "" 30 | 31 | 32 | class InstanceMetadata(NamedTuple): 33 | market: InstanceMarket 34 | weight: float 35 | group_id: str = "" 36 | hostname: Optional[str] = None 37 | instance_id: str = "" 38 | is_cordoned: bool = False 39 | ip_address: Optional[str] = None 40 | is_stale: bool = False 41 | state: str = "" 42 | uptime: arrow.Arrow = 0 43 | 44 | 45 | class ClusterNodeMetadata(NamedTuple): 46 | agent: AgentMetadata # Agent metadata is information associated with the Mesos or Kubernetes agent 47 | instance: InstanceMetadata # Instance metadata is information associated with the EC2 instance 48 | -------------------------------------------------------------------------------- /clusterman/kubernetes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/math/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/math/piecewise_types.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Hashable 15 | from typing import TypeVar 16 | 17 | from typing_extensions import Protocol 18 | 19 | 20 | T = TypeVar("T") 21 | 22 | 23 | class XValueDiff(Protocol[T], Hashable): 24 | def __mul__(self, other: int) -> "XValueDiff[T]": 25 | ... 26 | 27 | def __truediv__(self, other: "XValueDiff[T]") -> float: 28 | ... 29 | 30 | 31 | class XValue(Protocol[T], Hashable): 32 | def __add__(self, other: XValueDiff[T]) -> "XValue[T]": 33 | ... 34 | 35 | def __sub__(self, other: "XValue[T]") -> XValueDiff[T]: 36 | ... 37 | 38 | def __floordiv__(self, other: "XValue[T]") -> float: 39 | ... 40 | 41 | def __lt__(self, other: "XValue[T]") -> bool: 42 | ... 43 | 44 | def __ge__(self, other: "XValue[T]") -> bool: 45 | ... 46 | 47 | def __mod__(self, other: "XValue[T]") -> int: 48 | ... 49 | -------------------------------------------------------------------------------- /clusterman/mesos/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/migration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/migration/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # K8s CRD 15 | MIGRATION_CRD_GROUP = "clusterman.yelp.com" 16 | MIGRATION_CRD_VERSION = "v1" 17 | MIGRATION_CRD_PLURAL = "nodemigrations" 18 | MIGRATION_CRD_KIND = "NodeMigration" 19 | MIGRATION_CRD_STATUS_LABEL = "clusterman.yelp.com/migration_status" 20 | MIGRATION_CRD_ATTEMPTS_LABEL = "clusterman.yelp.com/attempts" 21 | 22 | # Default settings 23 | DEFAULT_POOL_PRESCALING = 0 24 | DEFAULT_NODE_BOOT_WAIT = "3m" 25 | DEFAULT_NODE_BOOT_TIMEOUT = "10m" 26 | DEFAULT_WORKER_TIMEOUT = "2h" 27 | DEFAULT_HEALTH_CHECK_INTERVAL = "2m" 28 | DEFAULT_ALLOWED_FAILED_DRAINS = 3 29 | DEFAULT_ORPHAN_CAPACITY_TOLLERANCE = 0 30 | DEFAULT_MAX_UPTIME_WORKER_SKIPS = 6 31 | MAX_ORPHAN_CAPACITY_TOLLERANCE = 0.2 32 | 33 | # Worker parameters 34 | UPTIME_CHECK_INTERVAL_SECONDS = 60 * 60 # 1 hour 35 | INITIAL_POOL_HEALTH_TIMEOUT_SECONDS = 15 * 60 36 | SUPPORTED_POOL_SCHEDULER = "kubernetes" 37 | 38 | # SFX metrics keys 39 | SFX_NODE_DRAIN_COUNT = "clusterman.node_migration.drain_count" 40 | SFX_MIGRATION_JOB_DURATION = "clusterman.node_migration.duration" 41 | SFX_DRAINED_NODE_UPTIME = "clusterman.node_migration.drained_node_uptime" 42 | -------------------------------------------------------------------------------- /clusterman/migration/event_enums.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
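# [Editor's note] The block below is an illustrative sketch, not part of the repository.
# It shows one way the NodeMigration CRD constants from clusterman/migration/constants.py
# above could be used with the official `kubernetes` Python client to list migration
# resources labelled as pending. The "pending" label value mirrors MigrationStatus.PENDING
# defined in event_enums.py; the function name and kubeconfig loading are assumptions made
# for the example.
import kubernetes

from clusterman.migration.constants import MIGRATION_CRD_GROUP
from clusterman.migration.constants import MIGRATION_CRD_PLURAL
from clusterman.migration.constants import MIGRATION_CRD_STATUS_LABEL
from clusterman.migration.constants import MIGRATION_CRD_VERSION


def list_pending_node_migrations() -> list:
    kubernetes.config.load_kube_config()  # use load_incluster_config() when running in a pod
    crd_api = kubernetes.client.CustomObjectsApi()
    response = crd_api.list_cluster_custom_object(
        MIGRATION_CRD_GROUP,
        MIGRATION_CRD_VERSION,
        MIGRATION_CRD_PLURAL,
        label_selector=f"{MIGRATION_CRD_STATUS_LABEL}=pending",
    )
    return response.get("items", [])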
14 | import enum 15 | import operator 16 | from typing import Any 17 | from typing import Collection 18 | from typing import Union 19 | 20 | import packaging.version 21 | import semver 22 | 23 | from clusterman.interfaces.types import ClusterNodeMetadata 24 | 25 | 26 | ComparableVersion = Union[semver.VersionInfo, packaging.version.Version] 27 | ComparableConditionTarget = Union[str, int, ComparableVersion] 28 | 29 | 30 | class MigrationStatus(enum.Enum): 31 | PENDING = "pending" 32 | INPROGRESS = "inprogress" 33 | COMPLETED = "completed" 34 | SKIPPED = "skipped" 35 | STOP = "stop" 36 | FAILED = "failed" 37 | 38 | 39 | class ConditionTrait(enum.Enum): 40 | KERNEL = "kernel" 41 | LSBRELEASE = "lsbrelease" 42 | INSTANCE_TYPE = "instance_type" 43 | UPTIME = "uptime" 44 | 45 | def get_from(self, node: ClusterNodeMetadata) -> ComparableConditionTarget: 46 | """Get trait value from node metadata 47 | 48 | :param ClusterNodeMetadata node: node metadata 49 | :return: value 50 | """ 51 | return CONDITION_TRAIT_GETTERS[self](node) 52 | 53 | 54 | class ConditionOperator(enum.Enum): 55 | GT = "gt" 56 | GE = "ge" 57 | EQ = "eq" 58 | NE = "ne" 59 | LT = "lt" 60 | LE = "le" 61 | IN = "in" 62 | NOTIN = "notin" 63 | 64 | @classmethod 65 | def expecting_collection(cls) -> Collection["ConditionOperator"]: 66 | """Return operators expecting collection of object as right-operand""" 67 | return (cls.IN, cls.NOTIN) 68 | 69 | def apply(self, left: Any, right: Any) -> bool: 70 | """Apply operator 71 | 72 | :param Any left: left operand 73 | :param Any right: right operand 74 | :return: boolean result 75 | """ 76 | if self == ConditionOperator.IN: 77 | return left in right 78 | elif self == ConditionOperator.NOTIN: 79 | return left not in right 80 | return getattr(operator, self.value)(left, right) 81 | 82 | 83 | CONDITION_OPERATOR_SUPPORT_MATRIX = { 84 | ConditionTrait.KERNEL: set(ConditionOperator), 85 | ConditionTrait.LSBRELEASE: set(ConditionOperator), 86 | ConditionTrait.INSTANCE_TYPE: { 87 | ConditionOperator.EQ, 88 | ConditionOperator.NE, 89 | ConditionOperator.IN, 90 | ConditionOperator.NOTIN, 91 | }, 92 | ConditionTrait.UPTIME: {ConditionOperator.GT, ConditionOperator.GE, ConditionOperator.LT, ConditionOperator.LE}, 93 | } 94 | 95 | CONDITION_TRAIT_GETTERS = { 96 | ConditionTrait.KERNEL: lambda node: semver.VersionInfo.parse(node.agent.kernel), 97 | ConditionTrait.LSBRELEASE: lambda node: packaging.version.parse(node.agent.lsbrelease), 98 | ConditionTrait.INSTANCE_TYPE: lambda node: node.instance.market.instance, 99 | ConditionTrait.UPTIME: lambda node: node.instance.uptime.total_seconds(), 100 | } 101 | -------------------------------------------------------------------------------- /clusterman/reports/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
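# [Editor's note] The block below is an illustrative sketch, not part of the repository.
# It shows how the enums in clusterman/migration/event_enums.py above fit together: a
# ConditionTrait getter pulls a comparable value out of ClusterNodeMetadata, and a
# ConditionOperator is applied against a target value. The target kernel version and the
# helper name are invented for the example; `node` is any ClusterNodeMetadata instance.
import semver

from clusterman.interfaces.types import ClusterNodeMetadata
from clusterman.migration.event_enums import ConditionOperator
from clusterman.migration.event_enums import ConditionTrait


def node_kernel_is_older_than(node: ClusterNodeMetadata, target_kernel: str) -> bool:
    current = ConditionTrait.KERNEL.get_from(node)  # parsed as semver.VersionInfo
    target = semver.VersionInfo.parse(target_kernel)  # e.g. "5.4.0"
    return ConditionOperator.LT.apply(current, target)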
14 | -------------------------------------------------------------------------------- /clusterman/reports/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | AXIS_DIMENSION_INCHES = (8, 2.5) 15 | COLORMAP = "plasma" 16 | ERROR_COLOR = "C3" 17 | TREND_LINE_COLOR = "orange" 18 | TREND_RANGE_COLOR = "xkcd:light orange" 19 | TREND_RANGE_ALPHA = 0.5 20 | FIGURE_DPI = 300 21 | MAGNITUDE_STRINGS = [ 22 | None, 23 | "thousands", 24 | "millions", 25 | "billions", 26 | "trillions", 27 | ] 28 | SUBTITLE_SPACING = 64 29 | -------------------------------------------------------------------------------- /clusterman/run.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | 16 | from clusterman.args import parse_args 17 | from clusterman.config import setup_config 18 | from clusterman.util import setup_logging 19 | 20 | 21 | def main(argv=None): 22 | if argv is None: 23 | argv = sys.argv[1:] 24 | 25 | args = parse_args(argv, "Cluster scaling and management for Mesos and Kubernetes") 26 | 27 | setup_logging(args.log_level) 28 | setup_config(args) 29 | 30 | args.entrypoint(args) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /clusterman/signals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/clusterman/signals/__init__.py -------------------------------------------------------------------------------- /clusterman/simulator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/simulator/io.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import gzip 15 | 16 | import arrow 17 | import jsonpickle 18 | import simplejson as json 19 | from sortedcontainers import SortedDict 20 | 21 | 22 | def _python_encode(obj): 23 | return json.loads(jsonpickle.encode(obj)) 24 | 25 | 26 | def _python_decode(obj): 27 | return jsonpickle.decode(json.dumps(obj)) 28 | 29 | 30 | class ArrowSerializer(jsonpickle.handlers.BaseHandler): 31 | def flatten(self, obj, data): 32 | data["timestamp"] = obj.timestamp 33 | return data 34 | 35 | def restore(self, data): 36 | return arrow.get(data["timestamp"]) 37 | 38 | 39 | class SortedDictSerializer(jsonpickle.handlers.BaseHandler): 40 | def flatten(self, obj, data): 41 | data["items"] = [(_python_encode(k), _python_encode(v)) for k, v in obj.items()] 42 | return data 43 | 44 | def restore(self, data): 45 | return SortedDict((_python_decode(k), _python_decode(v)) for k, v in data["items"]) 46 | 47 | 48 | def _register_handlers(): 49 | # These operations are idempotent, it's safe to do more than once 50 | jsonpickle.handlers.register(arrow.Arrow, ArrowSerializer) 51 | jsonpickle.handlers.register(SortedDict, SortedDictSerializer) 52 | 53 | 54 | def read_object_from_compressed_json(filename, raw_timestamps=False): 55 | """Read a Python object from a gzipped JSON file""" 56 | _register_handlers() 57 | with gzip.open(filename) as f: 58 | if raw_timestamps: 59 | old_arrow = arrow.get 60 | arrow.get = int 61 | data = jsonpickle.decode(f.read().decode()) 62 | if raw_timestamps: 63 | arrow.get = old_arrow 64 | return data 65 | 66 | 67 | def write_object_to_compressed_json(obj, filename): 68 | """Write the Python object to a compressed (gzipped) JSON file 69 | 70 | :param obj: a Python object to serialize 71 | :param filename: the file to write to 72 | """ 73 | _register_handlers() 74 | with gzip.open(filename, "w") as f: 75 | f.write(jsonpickle.encode(obj).encode()) 76 | -------------------------------------------------------------------------------- /clusterman/simulator/simulated_cluster_connector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import uuid 15 | 16 | import staticconf 17 | 18 | from clusterman.interfaces.cluster_connector import ClusterConnector 19 | from clusterman.interfaces.types import AgentMetadata 20 | from clusterman.interfaces.types import AgentState 21 | from clusterman.simulator import simulator 22 | from clusterman.util import ClustermanResources 23 | 24 | 25 | class SimulatedClusterConnector(ClusterConnector): 26 | def __init__(self, cluster: str, pool: str, simulator: "simulator.Simulator") -> None: 27 | self.cluster = cluster 28 | self.pool = pool 29 | self.simulator = simulator 30 | 31 | def reload_state(self) -> None: 32 | pass 33 | 34 | def get_resource_allocation(self, resource_name: str) -> float: 35 | return 0 36 | 37 | def get_resource_total(self, resource_name: str) -> float: 38 | total = 0 39 | for c in self.simulator.aws_clusters: 40 | for i in c.instances.values(): 41 | if self.simulator.current_time < i.join_time: 42 | continue 43 | 44 | total += getattr(i.resources, resource_name) 45 | return total 46 | 47 | def _get_agent_metadata(self, instance_ip: str) -> AgentMetadata: 48 | for c in self.simulator.aws_clusters: 49 | for i in c.instances.values(): 50 | if instance_ip == i.ip_address: 51 | return AgentMetadata( 52 | agent_id=str(uuid.uuid4()), 53 | state=(AgentState.ORPHANED if self.simulator.current_time < i.join_time else AgentState.IDLE), 54 | total_resources=ClustermanResources( 55 | cpus=i.resources.cpus, 56 | mem=i.resources.mem * 1000, 57 | disk=(i.resources.disk or staticconf.read_int("ebs_volume_size", 0)) * 1000, 58 | gpus=(i.resources.gpus), 59 | ), 60 | ) 61 | 62 | # if we don't know the given IP then it's orphaned 63 | return AgentMetadata(state=AgentState.ORPHANED) 64 | -------------------------------------------------------------------------------- /clusterman/simulator/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import arrow 15 | from staticconf.testing import PatchConfiguration 16 | 17 | 18 | def patch_join_delay(mean=0, stdev=0): 19 | return PatchConfiguration( 20 | { 21 | "join_delay_mean_seconds": mean, 22 | "join_delay_stdev_seconds": stdev, 23 | } 24 | ) 25 | 26 | 27 | class SimulationMetadata: # pragma: no cover 28 | def __init__(self, name, cluster, pool, scheduler): 29 | self.name = name 30 | self.cluster = cluster 31 | self.pool = pool 32 | self.scheduler = scheduler 33 | self.sim_start = None 34 | self.sim_end = None 35 | 36 | def __enter__(self): 37 | self.sim_start = arrow.now() 38 | 39 | def __exit__(self, type, value, traceback): 40 | self.sim_end = arrow.now() 41 | 42 | def __str__(self): 43 | return f"({self.cluster}, {self.pool}, {self.sim_start}, {self.sim_end})" 44 | -------------------------------------------------------------------------------- /clusterman/supervisord/fetch_clusterman_signal: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /etc/boto_cfg/clusterman.sh 4 | 5 | # don't set -ex until after executing the above so we don't leak keys in the logs 6 | set -ex 7 | source /etc/lsb-release # gives us DISTRIB_CODENAME 8 | 9 | SIGNAL_DIR=${2:-.} 10 | VERSIONS=(${CMAN_VERSIONS_TO_FETCH}) 11 | version="clusterman_signals_${VERSIONS[$1]}" 12 | mkdir -p ${SIGNAL_DIR}/${version} 13 | cd ${SIGNAL_DIR}/${version} 14 | aws ${AWS_ENDPOINT_URL_ARGS} s3 cp "s3://${CMAN_SIGNALS_BUCKET}/${DISTRIB_CODENAME}/${version}.tar.gz" . 15 | tar -xzf "${version}.tar.gz" 16 | -------------------------------------------------------------------------------- /clusterman/supervisord/run_clusterman_signal: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | SIGNAL_DIR=${2:-.} 5 | VERSIONS=(${CMAN_SIGNAL_VERSIONS}) 6 | NAMESPACES=(${CMAN_SIGNAL_NAMESPACES}) 7 | NAMES=(${CMAN_SIGNAL_NAMES}) 8 | APPS=(${CMAN_SIGNAL_APPS}) 9 | version="clusterman_signals_${VERSIONS[$1]}" 10 | namespace="${NAMESPACES[$1]}" 11 | name="${NAMES[$1]}" 12 | app="${APPS[$1]}" 13 | 14 | cd ${SIGNAL_DIR}/${version} 15 | prodenv/bin/python -m clusterman_signals.run ${namespace} ${name} ${app} 16 | -------------------------------------------------------------------------------- /clusterman/supervisord/supervisord.conf: -------------------------------------------------------------------------------- 1 | [unix_http_server] 2 | file=/tmp/supervisor.sock ; the path to the socket file 3 | 4 | [inet_http_server] 5 | port=127.0.0.1:9001 6 | 7 | [supervisord] 8 | logfile=/tmp/supervisord.log ; main log file; default $CWD/supervisord.log 9 | logfile_maxbytes=50MB ; max main logfile bytes b4 rotation; default 50MB 10 | logfile_backups=5 ; # of main logfile backups; 0 means none, default 10 11 | loglevel=info ; log level; default info; others: debug,warn,trace 12 | pidfile=/tmp/supervisord.pid ; supervisord pidfile; default supervisord.pid 13 | nodaemon=true ; start in foreground if true; default false 14 | minfds=1024 ; min. avail startup file descriptors; default 1024 15 | minprocs=200 ; min. 
avail process descriptors;default 200 16 | 17 | [rpcinterface:supervisor] 18 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 19 | 20 | [supervisorctl] 21 | serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket 22 | 23 | [program:fetch_signals] 24 | process_name=fetch_signals_%(process_num)s 25 | numprocs=%(ENV_CMAN_NUM_VERSIONS)s 26 | command=/usr/bin/fetch_clusterman_signal %(process_num)s 27 | autostart=true 28 | autorestart=false 29 | startretries=0 30 | stdout_logfile=/dev/stdout 31 | stdout_logfile_maxbytes=0 32 | redirect_stderr=true 33 | startsecs=0 34 | 35 | [program:run_signals] 36 | process_name=run_signals_%(process_num)s 37 | numprocs=%(ENV_CMAN_NUM_SIGNALS)s 38 | command=/usr/bin/run_clusterman_signal %(process_num)s 39 | autostart=false 40 | startretries=0 41 | stopasgroup=true 42 | stdout_logfile=/dev/stdout 43 | stdout_logfile_maxbytes=0 44 | redirect_stderr=true 45 | 46 | [program:autoscaler] 47 | directory=/code 48 | environment=PATH=/code/virtualenv_run/bin:%(ENV_PATH)s 49 | command=python -m clusterman.batch.autoscaler %(ENV_CMAN_ARGS)s 50 | autostart=false 51 | autorestart=false 52 | 53 | ; The following is to make manual testing and debugging easier. If we redirect to stdout 54 | ; from the autoscaler batch, then we end up writing to scribe twice: once from the autoscaler 55 | ; batch and once from the autoscaler bootstrap (where supervisord's stdout is piped to stdin2scribe). 56 | ; By redirecting the autoscaler to stderr we ensure that we can still see output during manual 57 | ; debugging but don't write to scribe twice. 58 | stdout_logfile=/dev/stderr 59 | stdout_logfile_maxbytes=0 60 | stderr_logfile=/dev/stderr 61 | stderr_logfile_maxbytes=0 62 | -------------------------------------------------------------------------------- /clusterman/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/tools/rookout.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from os import getenv 15 | 16 | 17 | def enable_rookout() -> None: 18 | """Enable rookout if environment variables are set""" 19 | if getenv("ROOKOUT_ENABLE", "") != "1": 20 | return 21 | import rook 22 | 23 | rook.start(token=getenv("ROOKOUT_TOKEN")) 24 | -------------------------------------------------------------------------------- /clusterman_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/clusterman_logo.png -------------------------------------------------------------------------------- /code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at opensource@yelp.com. 
All complaints 59 | will be reviewed and investigated and will result in a response that is deemed 60 | necessary and appropriate to the circumstances. The project team is obligated 61 | to maintain confidentiality with regard to the reporter of an incident. Further 62 | details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /completions/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/completions/.gitignore -------------------------------------------------------------------------------- /debian/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !changelog 4 | !compat 5 | !control 6 | !copyright 7 | !rules 8 | !rules.external 9 | !clusterman.install 10 | !clusterman.links 11 | -------------------------------------------------------------------------------- /debian/clusterman.links: -------------------------------------------------------------------------------- 1 | opt/venvs/clusterman/bin/clusterman usr/bin/clusterman 2 | opt/venvs/clusterman/bin/fetch_clusterman_signal usr/bin/fetch_clusterman_signal 3 | opt/venvs/clusterman/bin/run_clusterman_signal usr/bin/run_clusterman_signal 4 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: clusterman 2 | Maintainer: Compute Infra 3 | Build-Depends: 4 | dh-virtualenv, 5 | 6 | Package: clusterman 7 | Depends: 8 | python3.8, 9 | # unfortunately needed for numpy to work 10 | libatlas3-base, 11 | # needed so that we can grab signals from s3 12 | # that said, we have an internal fork that conflicts with this 13 | # once that's gone, we should re-add this (aws-cli vs awscli) 14 | # instead of adding this with `jammyOrLater:Depends` 15 | # awscli, 16 | ${misc:Depends}, 17 | ${python:Depends}, 18 | ${shlibs:Depends}, 19 | ${bionicOrLater:Depends}, 20 | ${jammyOrLater:Depends}, 21 | Architecture: any 22 | Description: Cluster scaling and management - y/clusterman 23 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | ifneq ($(shell echo ${CI}),true) 4 | YELP_DH_VIRTUALENV = --preinstall=-rrequirements-bootstrap.txt --extra-pip-arg --only-binary=:all: 5 | YELP_DH_INSTALL = install -d completions/usr debian/package/. 
6 | endif 7 | 8 | # Use the dpkg version comparison algorithm (which is easily in reach) to compare the build 9 | # system version to the version from which python3-distutils is required (18.04 and higher), 10 | # and conditionally populate a substitution variable which is referenced in the control file. 11 | # On bionic, where Ubuntu's python3.8 is used, this is a virtual package for python3-distutils 12 | # (in bionic-updates). On jammy, this is a real package providing distutils for python3.8 from deadsnakes. 13 | extra_substvars = -VbionicOrLater:Depends="python3.8-distutils" 14 | 15 | # and then do the same thing for awscli - of which we used to have a patched version called aws-cli pre-jammy. 16 | # once jammy boxes are the oldest things we install clusterman on, we can get rid of this and just include this 17 | # directly in debian/control 18 | ifeq ($(shell (. /etc/os-release && dpkg --compare-versions $$VERSION_ID "ge" "22.04" && echo yes || echo no)),yes) 19 | extra_substvars = -VjammyOrLater:Depends="awscli" 20 | else 21 | # aws-cli only exists internally, so lets make sure that we only use it internally 22 | ifeq ($(shell echo ${PAASTA_ENV}), YELP) 23 | extra_substvars = -VjammyOrLater:Depends="aws-cli" 24 | else 25 | extra_substvars = -VjammyOrLater:Depends="awscli" 26 | endif 27 | endif 28 | 29 | 30 | %: 31 | dh $@ --with python-virtualenv 32 | 33 | override_dh_gencontrol: 34 | dh_gencontrol -- $(extra_substvars) 35 | 36 | override_dh_virtualenv: 37 | dh_virtualenv --python python3.8 $(YELP_DH_VIRTUALENV) 38 | 39 | override_dh_install: 40 | dh_install 41 | $(YELP_DH_INSTALL) 42 | 43 | override_dh_shlibdeps: 44 | dh_shlibdeps -X site-packages/Pillow.libs --exclude matplotlib --exclude numpy --dpkg-shlibdeps-params=--ignore-missing-info 45 | 46 | override_dh_strip: 47 | dh_strip -X site-packages/Pillow.libs 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = clusterman 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/examples/autoscaler_config.yaml: -------------------------------------------------------------------------------- 1 | sfrs: 2 | - sfr-5c87f608-8fa8-48d9-8a78-8887a16509e0 3 | - sfr-a83610ec-5589-4258-bbd2-dad14f2acaa1 4 | - sfr-e8807b39-7041-4997-8a31-4a292514d26b 5 | 6 | # configs: 7 | # - "LaunchSpecifications": 8 | # - "InstanceType": "c4.8xlarge" 9 | # "SubnetId": "subnet-b47bb7d1" 10 | # "SpotPrice": 10 11 | # "WeightedCapacity": 4 12 | # "AllocationStrategy": "diversified" 13 | # - "LaunchSpecifications": 14 | # - "InstanceType": "c3.8xlarge" 15 | # "SubnetId": "subnet-b47bb7d1" 16 | # "SpotPrice": 10 17 | # "WeightedCapacity": 4 18 | # "AllocationStrategy": "diversified" 19 | -------------------------------------------------------------------------------- /docs/examples/design.yaml: -------------------------------------------------------------------------------- 1 | metadata: 2 | spot_prices|aws_availability_zone=us-west-2a,aws_instance_type=c3.8xlarge: &spot_prices 3 | 4 | # If no timezone is specified, generator will use YST 5 | start_time: "2017-12-01T08:00:00Z" 6 | end_time: "2017-12-01T09:00:00Z" 7 | 8 | frequency: 9 | distribution: expovariate 10 | params: 11 | lambd: 0.0033333 # Assume prices change on average every five minutes 12 | 13 | values: 14 | distribution: uniform 15 | params: 16 | a: 0 17 | b: 1 18 | 19 | spot_prices|aws_availability_zone=us-west-2b,aws_instance_type=c3.8xlarge: *spot_prices 20 | spot_prices|aws_availability_zone=us-west-2c,aws_instance_type=c3.8xlarge: *spot_prices 21 | 22 | capacity|cluster=norcal-prod,role=seagull: 23 | start_time: "2017-12-01T08:00:00Z" 24 | end_time: "2017-12-01T09:00:00Z" 25 | 26 | dict_keys: 27 | - c3.8xlarge,us-west-2a 28 | - c3.8xlarge,us-west-2b 29 | - c3.8xlarge,us-west-2c 30 | 31 | frequency: 32 | distribution: expovariate 33 | params: 34 | lambd: 0.001666 # Assume capacity change on average every ten minutes 35 | 36 | values: 37 | distribution: randint 38 | params: 39 | a: 10 40 | b: 50 41 | 42 | app_metrics: 43 | seagull_runs: 44 | start_time: "2017-12-01T08:00:00Z" 45 | end_time: "2017-12-01T09:00:00Z" 46 | frequency: 47 | distribution: expovariate 48 | params: 49 | lambd: 0.0041666 # 15 seagull runs per hour 50 | values: 1 51 | 52 | 53 | system_metrics: 54 | cpu_allocation|cluster=everywhere-testopia,role=jolt: 55 | start_time: "2017-12-01T08:00:00Z" 56 | end_time: "2017-12-01T09:00:00Z" 57 | frequency: historical 58 | values: 59 | aws_region: "us-west-2" 60 | params: # calculate value by a*x + b 61 | a: 1.5 62 | b: 10 63 | -------------------------------------------------------------------------------- /docs/examples/metrics.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/docs/examples/metrics.json.gz -------------------------------------------------------------------------------- /docs/source/_static/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/docs/source/_static/.gitignore -------------------------------------------------------------------------------- /docs/source/api/AWSResourceGroup.rst: 
-------------------------------------------------------------------------------- 1 | AWSResourceGroup 2 | ================ 3 | 4 | .. autoclass:: clusterman.aws.aws_resource_group.AWSResourceGroup 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/AutoScalingResourceGroup.rst: -------------------------------------------------------------------------------- 1 | AutoScalingResourceGroup 2 | ======================== 3 | 4 | .. autoclass:: clusterman.aws.auto_scaling_resource_group.AutoScalingResourceGroup 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/Autoscaler.rst: -------------------------------------------------------------------------------- 1 | Autoscaler 2 | ========== 3 | 4 | .. autoclass:: clusterman.autoscaler.autoscaler.Autoscaler 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/MesosPoolManager.rst: -------------------------------------------------------------------------------- 1 | MesosPoolManager 2 | ================ 3 | 4 | .. autoclass:: clusterman.mesos.mesos_pool_manager.MesosPoolManager 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/Signal.rst: -------------------------------------------------------------------------------- 1 | Signal 2 | ====== 3 | 4 | .. autoclass:: clusterman.interfaces.signal.Signal 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/SpotFleetResourceGroup.rst: -------------------------------------------------------------------------------- 1 | SpotFleetResourceGroup 2 | ====================== 3 | 4 | .. autoclass:: clusterman.aws.spot_fleet_resource_group.SpotFleetResourceGroup 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/aws_markets.rst: -------------------------------------------------------------------------------- 1 | AWS Markets 2 | =========== 3 | 4 | .. autoclass:: clusterman.aws.markets.InstanceResources 5 | 6 | .. class:: clusterman.aws.markets.InstanceMarket(instance, availability_zone) 7 | -------------------------------------------------------------------------------- /docs/source/api/clusterman_metrics.rst: -------------------------------------------------------------------------------- 1 | clusterman_metrics 2 | ================== 3 | 4 | .. autoclass:: clusterman_metrics.ClustermanMetricsBotoClient 5 | :members: __init__, get_writer, get_metric_values 6 | 7 | .. automodule:: clusterman_metrics 8 | :members: generate_key_with_dimensions 9 | -------------------------------------------------------------------------------- /docs/source/drainer.rst: -------------------------------------------------------------------------------- 1 | Drainer 2 | ============== 3 | 4 | *Drainer* is the component to drain pods off the node before terminating. 5 | It may drain and terminate nodes for three reasons: 6 | 7 | * ``spot_interruption`` 8 | * ``node_migration`` 9 | * ``scaling_down`` 10 | 11 | **NOTE**: all settings are only compatible with Kubernetes clusters. 12 | 13 | 14 | Drainer Batch 15 | -------------------- 16 | 17 | The *Drainer batch* is the entrypoint of the draining logic. 18 | 19 | The batch code can be invoked from the ``clusterman.batch.drainer`` Python module. 20 | 21 | 22 | .. 
_drainer_configuration: 23 | 24 | Pool Configuration 25 | ------------------ 26 | 27 | The behaviour of the drainer logic for a pool is controlled by the ``draining`` section of the pool configuration. 28 | The allowed values for the drainer settings are as follows: 29 | 30 | * ``draining_time_threshold_seconds``: maximum time allowed for the draining process to complete (1800 by default) 31 | * ``redraining_delay_seconds``: how long to wait between draining attempts if a drain fails (15 by default). 32 | * ``force_terminate``: forcibly terminate the node once ``draining_time_threshold_seconds`` is reached (false by default). 33 | 34 | 35 | See :ref:`pool_configuration` for an example of what this configuration block looks like. 36 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. clusterman documentation master file, created by 2 | sphinx-quickstart on Thu Aug 3 09:34:59 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | clusterman 7 | ====================================== 8 | 9 | Clusterman autoscales Mesos clusters based on the values of user-defined signals 10 | of resource utilization. It also provides tools to manually manage those clusters, 11 | and simulate how changes to autoscaling logic will impact the cost and performance. 12 | 13 | 14 | .. toctree:: 15 | :titlesonly: 16 | 17 | overview 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | :caption: Autoscaling 22 | 23 | metrics 24 | signals 25 | autoscaler 26 | configuration 27 | resource_groups 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 2 32 | :caption: Tools 33 | 34 | manage 35 | simulator 36 | tools 37 | node_migration 38 | drainer 39 | 40 | 41 | .. toctree:: 42 | :maxdepth: 1 43 | :caption: API Reference 44 | 45 | api/AutoScalingResourceGroup 46 | api/Autoscaler 47 | api/AWSResourceGroup 48 | api/aws_markets 49 | api/clusterman_metrics 50 | api/MesosPoolManager 51 | api/Signal 52 | api/SpotFleetResourceGroup 53 | 54 | 55 | Indices and tables 56 | ================== 57 | 58 | * :ref:`genindex` 59 | * :ref:`modindex` 60 | * :ref:`search` 61 | -------------------------------------------------------------------------------- /docs/source/manage.rst: -------------------------------------------------------------------------------- 1 | Cluster Management 2 | ================== 3 | 4 | Clusterman comes with a number of command-line tools to help with cluster management. 5 | 6 | Discovery 7 | --------- 8 | 9 | The ``clusterman list-clusters`` and ``clusterman list-pools`` commands can aid in determining what clusters and pools 10 | Clusterman knows about: 11 | 12 | .. program-output:: python -m clusterman.run list-clusters --help 13 | :cwd: ../../ 14 | 15 | .. program-output:: python -m clusterman.run list-pools --help 16 | :cwd: ../../ 17 | 18 | Management 19 | ---------- 20 | 21 | The ``clusterman manage`` command can be used to directly change the state of the cluster: 22 | 23 | .. program-output:: python -m clusterman.run manage --help 24 | :cwd: ../../ 25 | 26 | The ``--target-capacity`` option allows users to directly change the size of the Mesos cluster specified by the 27 | ``--cluster`` and ``--pool`` arguments. 28 | 29 | Note that there can be up to a few minutes of "lag time" between when the manage command is issued and when 30 | changes are reflected in the cluster. This is due to potential delays
This is due to potential delays introduced into the pipeline while AWS finds and 31 | procures new instances for the cluster. Therefore, it is not recommended to run ``clusterman manage`` repeatedly in 32 | short succession, or immediately after the autoscaler batch has run. 33 | 34 | .. note:: Future versions of Clusterman may include a rate-limiter for the manage command 35 | 36 | .. note:: By providing the existing target capacity value as the argument to ``--target-capacity``, you can force 37 | Clusterman to attempt to prune any :attr:`fulfilled capacity ` that is above the 38 | desired :attr:`target capacity `. 39 | 40 | Status 41 | ------ 42 | 43 | The ``clusterman status`` command provides information on the current state of the cluster: 44 | 45 | .. program-output:: python -m clusterman.run status --help 46 | :cwd: ../../ 47 | 48 | As noted above, the state of the cluster may take a few minutes to equilibrate after a ``clusterman manage`` command or 49 | the autoscaler has run, so the output from ``clusterman status`` may not accurately reflect the desired status. 50 | -------------------------------------------------------------------------------- /docs/source/resource_groups.rst: -------------------------------------------------------------------------------- 1 | Resource Groups 2 | =============== 3 | 4 | Resource groups are wrappers around cloud provider APIs to enable scaling up and down groups of machines. A resource 5 | group implments the :py:class:`.ResourceGroup` interface, which provides the set of required methods for 6 | Clusterman to interact with the resource group. Currently, Clusterman supports the following types of resource groups: 7 | 8 | * :py:class:`.AutoScalingResourceGroup`: `AWS autoscaling groups 9 | `_ 10 | * :py:class:`.SpotFleetResourceGroup`: `AWS spot fleet requests 11 | `_ 12 | -------------------------------------------------------------------------------- /docs/source/simulator.rst: -------------------------------------------------------------------------------- 1 | Simulation 2 | ========== 3 | 4 | Running the Simulator 5 | --------------------- 6 | 7 | .. program-output:: python -m clusterman.run simulate --help 8 | :cwd: ../../ 9 | 10 | .. _input_data_fmt: 11 | 12 | Experimental Input Data 13 | ----------------------- 14 | 15 | The simulator can accept experimental input data for one or more metric timeseries using the ``--metrics-data-file`` 16 | argument to ``clusterman simulate``. The simulator expects this file to be stored as a compressed (gzipped) JSON file; 17 | the JSON schema is as follows:: 18 | 19 | { 20 | 'metric_name_1': [ 21 | [, value], 22 | [, value], 23 | ... 24 | ], 25 | 'metric_name_2': [ 26 | [, value], 27 | [, value], 28 | ... 29 | }, 30 | ... 31 | } 32 | 33 | .. _dict_data_fmt: 34 | 35 | Optional Multi-valued Timeseries Data 36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 37 | 38 | Some timeseries data needs to have multiple y-values per timestamp. The metrics data file can optionally accept 39 | timeseries in a dictionary with the dictionary keys corresponding to the names of the individual timeseries. 
For 40 | example:: 41 | 42 | { 43 | 'metric_a': [ 44 | [ 45 | <timestamp_1>, 46 | { 47 | 'key1': value, 48 | 'key2': value 49 | } 50 | ], 51 | [ 52 | <timestamp_2>, 53 | { 54 | 'key3': value 55 | } 56 | ], 57 | [ 58 | <timestamp_3>, 59 | { 60 | 'key1': value, 61 | 'key2': value, 62 | 'key3': value 63 | } 64 | ] 65 | ] 66 | } 67 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /examples/batch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /examples/batch/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
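# [Editor's note] The block below is an illustrative sketch, not part of the repository.
# It produces a --metrics-data-file in the shape described by docs/source/simulator.rst
# above: a gzipped JSON object mapping metric names to [timestamp, value] pairs, where a
# value may itself be a dict for multi-valued timeseries. The metric names, timestamps,
# and values are invented for the example.
import gzip
import json


def write_example_metrics_file(filename: str = "metrics.json.gz") -> None:
    start = 1512115200  # 2017-12-01T08:00:00Z, matching docs/examples/design.yaml
    data = {
        "cpus_allocated|cluster=local-dev,pool=default": [
            [start, 12.0],
            [start + 60, 18.5],
        ],
        "capacity|cluster=local-dev,pool=default": [
            # multi-valued timeseries: one dict of y-values per timestamp
            [start, {"c3.8xlarge,us-west-2a": 10}],
            [start + 60, {"c3.8xlarge,us-west-2a": 12, "c3.8xlarge,us-west-2b": 4}],
        ],
    }
    with gzip.open(filename, "wt") as f:
        json.dump(data, f)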
14 | import inspect 15 | import os 16 | from contextlib import contextmanager 17 | 18 | import botocore.exceptions 19 | import colorlog 20 | 21 | logger = colorlog.getLogger(__name__) 22 | 23 | 24 | class BatchRunningSentinelMixin: # pragma: no cover 25 | def make_running_sentinel(self): 26 | batch_name, ext = os.path.splitext(os.path.basename(inspect.getfile(self.__class__))) 27 | sentinel_file = f"/tmp/{batch_name}.running" 28 | with open(sentinel_file, "w") as f: 29 | f.write(str(os.getpid())) 30 | 31 | 32 | @contextmanager 33 | def suppress_request_limit_exceeded(): 34 | try: 35 | yield 36 | except botocore.exceptions.ClientError as e: 37 | if e.response.get("Error", {}).get("Code") == "RequestLimitExceeded": 38 | logger.warning(e) 39 | else: 40 | raise 41 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 121 3 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .boto_client import ClustermanMetricsBotoClient 15 | from .boto_client import MetricsValuesDict 16 | from .simulation_client import ClustermanMetricsSimulationClient 17 | from .util.constants import APP_METRICS 18 | from .util.constants import METADATA 19 | from .util.constants import METRIC_TYPES 20 | from .util.constants import SYSTEM_METRICS 21 | from .util.meteorite import generate_key_with_dimensions 22 | 23 | __all__ = [ 24 | "ClustermanMetricsBotoClient", 25 | "MetricsValuesDict", 26 | "ClustermanMetricsSimulationClient", 27 | "APP_METRICS", 28 | "METADATA", 29 | "METRIC_TYPES", 30 | "SYSTEM_METRICS", 31 | "generate_key_with_dimensions", 32 | ] 33 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/aws.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import json 15 | 16 | import boto3 17 | import staticconf 18 | from clusterman_metrics.util.constants import CONFIG_NAMESPACE 19 | 20 | 21 | _metrics_session = None 22 | 23 | 24 | def _setup_session(): 25 | with open(staticconf.read_string("access_key_file", namespace=CONFIG_NAMESPACE)) as boto_cfg_file: 26 | boto_cfg = json.load(boto_cfg_file) 27 | _session = boto3.session.Session( 28 | aws_access_key_id=boto_cfg["accessKeyId"], 29 | aws_secret_access_key=boto_cfg["secretAccessKey"], 30 | ) 31 | return _session 32 | 33 | 34 | def get_metrics_session(): 35 | global _metrics_session 36 | 37 | if not _metrics_session: 38 | _metrics_session = _setup_session() 39 | 40 | return _metrics_session 41 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | CONFIG_NAMESPACE = "clusterman_metrics" 17 | 18 | CLUSTERMAN_NAME = "clusterman" 19 | 20 | SYSTEM_METRICS = "system_metrics" #: metrics collected about the cluster state (e.g., CPU, memory allocation) 21 | APP_METRICS = "app_metrics" #: metrics collected from client applications (e.g., number of application runs) 22 | METADATA = "metadata" #: metrics collected about the cluster (e.g., current spot prices, instance types present) 23 | 24 | METRIC_TYPES = frozenset( 25 | [ 26 | SYSTEM_METRICS, 27 | APP_METRICS, 28 | METADATA, 29 | ] 30 | ) 31 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/costs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import staticconf 15 | from clusterman_metrics.util.constants import CONFIG_NAMESPACE 16 | 17 | 18 | config_reader = staticconf.NamespaceReaders(CONFIG_NAMESPACE) 19 | 20 | 21 | def estimate_cost_per_hour( 22 | cluster, 23 | pool, 24 | cpus=0, 25 | mem=0, 26 | ): 27 | cpu_cost = cpus * _get_resource_cost("cpus", cluster, pool) 28 | mem_cost = mem * _get_resource_cost("mem", cluster, pool) 29 | return max(cpu_cost, mem_cost) 30 | 31 | 32 | def _get_resource_cost(resource, cluster, pool): 33 | default_cost = config_reader.read_float( 34 | "cost_per_hour.defaults.{}".format(resource), 35 | default=0, 36 | ) 37 | return config_reader.read_float( 38 | "cost_per_hour.{}.{}.{}".format(cluster, pool, resource), 39 | default=default_cost, 40 | ) 41 | 42 | 43 | def should_warn(cost): 44 | threshold = config_reader.read_float( 45 | "cost_warning_threshold", 46 | default=100, 47 | ) 48 | return cost > threshold 49 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/meteorite.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Mapping 15 | from typing import Optional 16 | from typing import Tuple 17 | 18 | from clusterman_metrics.util.constants import APP_METRICS 19 | from clusterman_metrics.util.constants import CLUSTERMAN_NAME 20 | 21 | 22 | def _parse_dimensions(metric_name): 23 | """Parse out existing dimensions from the metric name""" 24 | try: 25 | metric_name, dims = metric_name.split("|", 1) 26 | except ValueError: 27 | dims = "" 28 | 29 | return ( 30 | metric_name, 31 | dict(dim_pair.split("=") for dim_pair in dims.split(",") if dim_pair), 32 | ) 33 | 34 | 35 | def generate_key_with_dimensions(metric_name: str, dimensions: Optional[Mapping[str, str]] = None) -> str: 36 | """Helper function to generate a key used to reference metric timeseries data in DynamoDB; this key will 37 | be parsed by ``get_meteorite_identifiers`` to store data in SignalFX. 
38 | 39 | :param metric_name: the name of the metric (can include some pre-existing dimensions) 40 | :param dimensions: dict of dimension names to values; dimensions in the metric name will be overwritten by 41 | values here 42 | :returns: string that can be passed to ``get_writer`` as the metric key 43 | """ 44 | if not dimensions: 45 | return metric_name 46 | 47 | # dimensions passed in override dimensions in the name 48 | metric_name, new_dimensions = _parse_dimensions(metric_name) 49 | new_dimensions.update(dimensions) 50 | 51 | dimension_parts = [] 52 | for key, value in sorted(new_dimensions.items()): 53 | dimension_parts.append("{key}={value}".format(key=key, value=value)) 54 | 55 | return "{metric_name}|{dim_string}".format( 56 | metric_name=metric_name, 57 | dim_string=",".join(dimension_parts), 58 | ) 59 | 60 | 61 | def get_meteorite_identifiers(metric_type: str, metric_key: str) -> Tuple[str, Optional[Mapping[str, str]]]: 62 | """ 63 | Given the primary key for a timeseries in the datastore and its Clusterman metric type, return the metric name and 64 | dimensions for that timeseries in meteorite. 65 | 66 | :param metric_type: string, one of METRIC_TYPES 67 | :param metric_key: string, the unique key for the timeseries in the datastore. 68 | :returns: (metric_name, dimensions_dict) tuple. Dimensions may be None. 69 | """ 70 | dimensions = None 71 | name_parts = [CLUSTERMAN_NAME, metric_type] 72 | 73 | metric_name, dimensions = _parse_dimensions(metric_key) 74 | 75 | if metric_type == APP_METRICS: 76 | # Namespace app metrics by the app identifier. 77 | name_parts.extend(metric_name.split(",", 1)) 78 | else: 79 | name_parts.append(metric_name) 80 | 81 | meteorite_name = ".".join(name_parts) 82 | return meteorite_name, dimensions 83 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from decimal import Decimal 15 | from decimal import getcontext 16 | from decimal import localcontext 17 | from decimal import ROUND_HALF_UP 18 | 19 | MAX_DECIMAL_PLACES = 20 20 | _PLACES_VALUE = Decimal(10) ** (-1 * MAX_DECIMAL_PLACES) 21 | 22 | 23 | def convert_decimal(numeric): 24 | full_decimal = Decimal(numeric) 25 | _, digits, exponent = full_decimal.as_tuple() 26 | # Round to MAX_DECIMAL_PLACES, if result has more places than that. 27 | if exponent < -MAX_DECIMAL_PLACES: 28 | # quantize can raise `decimal.InvalidOperation` if result is greater 29 | # than context precision, which is 28 by default. to get around this, 30 | # temporarily set a new precision up to the max number of sig figs of 31 | # `full_decimal`, which is also the max for the result of `quantize`. 32 | # this ensures that the result of `quantize` will be within the precision 33 | # limit, and not raise the error.
34 | with localcontext() as ctx: 35 | ctx.prec = max(len(digits), getcontext().prec) 36 | return full_decimal.quantize(_PLACES_VALUE, rounding=ROUND_HALF_UP) 37 | else: 38 | return full_decimal 39 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = True 3 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from setuptools import find_packages 15 | from setuptools import setup 16 | 17 | 18 | setup( 19 | name="clusterman-metrics", 20 | version="1.0.0", 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.8", 24 | ], 25 | package_data={str("clusterman_metrics"): [str("py.typed")]}, 26 | install_requires=[ 27 | "boto3", 28 | "PyStaticConfiguration", 29 | ], 30 | packages=find_packages(exclude=("tests*", "testing*")), 31 | zip_safe=False, 32 | ) 33 | -------------------------------------------------------------------------------- /examples/schemas/clusterman.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-06/schema", 3 | "type": "object", 4 | "required": [ 5 | "autoscaling", "autoscale_signal", "aws", "batches", "clusters", "mesos_clusters", "module_config", "module_env_config", 6 | "sensu_config" 7 | ], 8 | "definitions": { 9 | "cluster": { 10 | "type": "object", 11 | "required": ["aws_region"], 12 | "properties": { 13 | "aws_region": {"$ref": "definitions.json#awsRegion"}, 14 | "cluster_manager": {"type": "string", "enum": ["mesos", "kubernetes"]}, 15 | "drain_queue_url": {"type": "string", "format": "uri"}, 16 | "kubeconfig_path": {"type": "string"}, 17 | "fqdn": {"type": "string"}, 18 | "mesos_master_fqdn": {"type": "string"}, 19 | "sensu_config": {"$ref": "definitions.json#sensu_config"}, 20 | "termination_queue_url": {"type": "string", "format": "uri"}, 21 | "warning_queue_url": {"type": "string", "format": "uri"} 22 | }, 23 | "additionalProperties": false 24 | } 25 | }, 26 | "properties": { 27 | "autoscaling": { 28 | "type": "object", 29 | "properties": { 30 | "default_signal_role": {"type": "string"}, 31 | "excluded_resources": { 32 | "type": "array", 33 | "items": {"type": "string", "enum": ["cpus", "mem", "disk", "gpus"]} 34 | }, 35 | "setpoint": {"$ref": "definitions.json#percentage"}, 36 | "setpoint_margin": {"$ref": "definitions.json#percentage"}, 37 | "target_capacity_margin": {"$ref": "definitions.json#percentage"} 38 | }, 39 | "required": ["default_signal_role", "excluded_resources", "setpoint", "setpoint_margin", "target_capacity_margin"], 40 | "additionalProperties": false 41 | }, 42 | 
"autoscale_signal": {"$ref": "definitions.json#autoscale_signal"}, 43 | "aws": { 44 | "type": "object", 45 | "properties": { 46 | "access_key_file": {"type": "string"} 47 | }, 48 | "required": ["access_key_file"] 49 | }, 50 | "batches": { 51 | "type": "object", 52 | "additionalProperties": { 53 | "type": "object", 54 | "required": ["run_interval_seconds"], 55 | "properties": { 56 | "run_interval_seconds": {"$ref": "definitions.json#posint"} 57 | } 58 | } 59 | }, 60 | "drain_termination_timeout_seconds": { 61 | "type": "object", 62 | "properties": { 63 | "sfr": {"$ref": "definitions.json#posint"} 64 | } 65 | }, 66 | "clusters": { 67 | "type": "object", 68 | "additionalProperties": {"$ref": "#/definitions/cluster"} 69 | }, 70 | "mesos_clusters": { 71 | "type": "object", 72 | "additionalProperties": {"$ref": "#/definitions/cluster"} 73 | }, 74 | "module_config": { 75 | "type": "array" 76 | }, 77 | "module_env_config": { 78 | "type": "array" 79 | }, 80 | "sensu_config": {"$ref": "definitions.json#sensu_config"} 81 | }, 82 | "additionalProperties": false 83 | } 84 | -------------------------------------------------------------------------------- /examples/schemas/definitions.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema", 3 | "awsRegion": { 4 | "type": "string", 5 | "enum": ["us-east-1", "us-west-1", "us-west-2"] 6 | }, 7 | "infinity": { 8 | "type": "number", 9 | "minimum": Infinity 10 | }, 11 | "nonnegative_int": { 12 | "type": "integer", 13 | "minimum": 0 14 | }, 15 | "percentage": { 16 | "type": "number", 17 | "minimum": 0, 18 | "maximum": 1 19 | }, 20 | "posint": { 21 | "type": "integer", 22 | "minimum": 1 23 | }, 24 | "autoscale_signal": { 25 | "type": "object", 26 | "required": ["name", "branch_or_tag", "period_minutes"], 27 | "properties": { 28 | "name": {"type": "string"}, 29 | "repository": {"type": "string"}, 30 | "branch_or_tag": {"type": "string"}, 31 | "period_minutes": {"$ref": "shared.json#posint"}, 32 | "parameters": { 33 | "type": "array", 34 | "items": {"type": "object"} 35 | }, 36 | "required_metrics": { 37 | "type": "array", 38 | "items": { 39 | "type": "object", 40 | "required": ["name", "type", "minute_range"], 41 | "properties": { 42 | "name": {"type": "string"}, 43 | "type": { 44 | "type": "string", 45 | "enum": ["system_metrics", "app_metrics"] 46 | }, 47 | "minute_range": {"$ref": "shared.json#posint"}, 48 | "regex": {"type": "boolean"} 49 | }, 50 | "additionalProperties": false 51 | } 52 | } 53 | } 54 | }, 55 | "sensu_config": { 56 | "type": "array", 57 | "minItems": 1, 58 | "maxItems": 1, 59 | "items": { 60 | "type": "object", 61 | "required": ["team", "runbook"], 62 | "properties": { 63 | "team": {"type": "string"}, 64 | "runbook": {"type": "string"}, 65 | "page": {"type": "boolean"}, 66 | "notification_email": {"type": "string"}, 67 | "irc_channels": {"type": "array", "items": {"type": "string"}}, 68 | "slack_channels": {"type": "array", "items": {"type": "string"}}, 69 | "ticket": {"type": "boolean"}, 70 | "project": {"type": "string"}, 71 | "tags": {"type": "array", "items": {"type": "string"}} 72 | }, 73 | "additionalProperties": false 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /examples/schemas/pool.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema", 3 | "sfr_resource_group_definition": { 4 | "type": 
"object", 5 | "properties": { 6 | "s3": { 7 | "type": "object", 8 | "required": ["bucket", "prefix"], 9 | "properties": { 10 | "bucket": {"type": "string"}, 11 | "prefix": {"type": "string"} 12 | }, 13 | "additionalProperties": false 14 | }, 15 | "tag": {"type": "string"} 16 | }, 17 | "additionalProperties": false 18 | }, 19 | "resource_groups_definition": { 20 | "type": "array", 21 | "items": [ 22 | { 23 | "type": "object", 24 | "properties": { 25 | "sfr": { 26 | "$ref": "#/sfr_resource_group_definition" 27 | } 28 | } 29 | } 30 | ] 31 | }, 32 | "type": "object", 33 | "required": ["resource_groups", "scaling_limits"], 34 | "properties": { 35 | "resource_groups": {"$ref": "#/resource_groups_definition"}, 36 | "draining_enabled": {"type": "boolean"}, 37 | "scaling_limits": { 38 | "type": "object", 39 | "required": ["min_capacity", "max_capacity", "max_weight_to_add", "max_weight_to_remove"], 40 | "properties": { 41 | "min_capacity": {"$ref": "definitions.json#nonnegative_int"}, 42 | "max_capacity": {"$ref": "definitions.json#posint"}, 43 | "max_tasks_to_kill": {"anyOf": [ 44 | {"$ref": "definitions.json#nonnegative_int"}, 45 | {"$ref": "definitions.json#infinity"} 46 | ]}, 47 | "max_weight_to_add": {"$ref": "definitions.json#posint"}, 48 | "max_weight_to_remove": {"$ref": "definitions.json#posint"} 49 | }, 50 | "additionalProperties": false 51 | }, 52 | "autoscale_signal": {"$ref": "definitions.json#autoscale_signal"}, 53 | "autoscaling": { 54 | "type": "object", 55 | "properties": { 56 | "excluded_resources": { 57 | "type": "array", 58 | "items": {"type": "string", "enum": ["cpus", "mem", "disk", "gpus"]} 59 | }, 60 | "setpoint": {"$ref": "definitions.json#percentage"}, 61 | "setpoint_margin": {"$ref": "definitions.json#percentage"}, 62 | "target_capacity_margin": {"$ref": "definitions.json#percentage"} 63 | }, 64 | "additionalProperties": false 65 | }, 66 | "sensu_config": {"$ref": "definitions.json#sensu_config"}, 67 | "alert_on_max_capacity": {"type": "boolean"} 68 | }, 69 | "additionalProperties": false 70 | } 71 | -------------------------------------------------------------------------------- /examples/supervisord.conf: -------------------------------------------------------------------------------- 1 | [unix_http_server] 2 | file=/tmp/supervisor.sock ; the path to the socket file 3 | 4 | [inet_http_server] 5 | port=127.0.0.1:9001 6 | 7 | [supervisord] 8 | logfile=/tmp/supervisord.log ; main log file; default $CWD/supervisord.log 9 | logfile_maxbytes=50MB ; max main logfile bytes b4 rotation; default 50MB 10 | logfile_backups=5 ; # of main logfile backups; 0 means none, default 10 11 | loglevel=info ; log level; default info; others: debug,warn,trace 12 | pidfile=/tmp/supervisord.pid ; supervisord pidfile; default supervisord.pid 13 | nodaemon=true ; start in foreground if true; default false 14 | minfds=1024 ; min. avail startup file descriptors; default 1024 15 | minprocs=200 ; min. 
avail process descriptors;default 200 16 | 17 | [rpcinterface:supervisor] 18 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 19 | 20 | [supervisorctl] 21 | serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket 22 | 23 | [program:fetch_signals] 24 | process_name=fetch_signals_%(process_num)s 25 | numprocs=%(ENV_CMAN_NUM_VERSIONS)s 26 | command=/usr/bin/fetch_clusterman_signal %(process_num)s 27 | autostart=true 28 | autorestart=false 29 | startretries=0 30 | stdout_logfile=/dev/stdout 31 | stdout_logfile_maxbytes=0 32 | redirect_stderr=true 33 | startsecs=0 34 | 35 | [program:run_signals] 36 | process_name=run_signals_%(process_num)s 37 | numprocs=%(ENV_CMAN_NUM_SIGNALS)s 38 | command=/usr/bin/run_clusterman_signal %(process_num)s 39 | autostart=false 40 | startretries=0 41 | stopasgroup=true 42 | stdout_logfile=/dev/stdout 43 | stdout_logfile_maxbytes=0 44 | redirect_stderr=true 45 | 46 | [program:autoscaler] 47 | directory=/code 48 | environment=PATH=/code/virtualenv_run/bin 49 | command=python -m examples.batch.autoscaler %(ENV_CMAN_ARGS)s 50 | autostart=false 51 | autorestart=false 52 | 53 | stdout_logfile=/dev/stderr 54 | stdout_logfile_maxbytes=0 55 | stderr_logfile=/dev/stderr 56 | stderr_logfile_maxbytes=0 57 | -------------------------------------------------------------------------------- /examples/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "name_prefix" { 2 | default = "clusterman" 3 | } 4 | variable "metric_types" { 5 | type = "list" 6 | default = ["metadata", "app_metrics", "system_metrics"] 7 | } 8 | variable "read_capacity" { 9 | default = 5 10 | } 11 | variable "write_capacity" { 12 | default = 5 13 | } 14 | variable "read_autoscaling_enabled" { 15 | default = "false" 16 | } 17 | variable "write_autoscaling_enabled" { 18 | default = "false" 19 | } 20 | variable "max_read_capacity" { 21 | default = 100 22 | } 23 | variable "max_write_capacity" { 24 | default = 100 25 | } 26 | -------------------------------------------------------------------------------- /extra-requirements-yelp-dev.txt: -------------------------------------------------------------------------------- 1 | static-completion==0.1.7 2 | -------------------------------------------------------------------------------- /extra-requirements-yelp.txt: -------------------------------------------------------------------------------- 1 | clusterman-metrics==2.2.1 2 | monk==1.1.0 3 | pysensu-yelp==0.4.1 4 | yelp-batch==11.2.7 5 | yelp-clog==4.1.0 6 | yelp-lib==13.1.5 7 | yelp-meteorite==1.5.1 8 | -------------------------------------------------------------------------------- /images/architecture-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/images/architecture-diagram.png -------------------------------------------------------------------------------- /itest_status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | from subprocess import run 16 | 17 | 18 | def get_pid(batch_name): 19 | output = run( 20 | f'ps -ef | egrep "python -m {batch_name}(\s+|$)"', 21 | shell=True, 22 | capture_output=True, 23 | ) 24 | 25 | return output.stdout.split()[1].decode() 26 | 27 | 28 | def check_status(batch_name): # pragma: no cover 29 | # status written by BatchRunningSentinelMixin 30 | status_file = f'/tmp/{batch_name.split(".")[-1]}.running' 31 | 32 | try: 33 | with open(status_file) as f: 34 | status_pid = f.read() 35 | batch_pid = get_pid(batch_name) 36 | except FileNotFoundError: 37 | print(f"{batch_name} has not finished initialization") 38 | sys.exit(1) 39 | 40 | assert status_pid == batch_pid 41 | print(f"{batch_name} completed initialization and is running at PID {status_pid}") 42 | 43 | 44 | if __name__ == "__main__": 45 | check_status(sys.argv[1]) 46 | -------------------------------------------------------------------------------- /itests/draining_queue.feature: -------------------------------------------------------------------------------- 1 | Feature: make sure the drainer is working properly 2 | 3 | Scenario: process the draining queue 4 | Given a draining client 5 | And a message in the draining queue 6 | When the draining queue is processed 7 | Then the host should be submitted for termination 8 | And all queues are empty 9 | 10 | Scenario: process the termination queue 11 | Given a draining client 12 | And a message in the termination queue 13 | When the termination queue is processed 14 | Then the host should be terminated 15 | And all queues are empty 16 | 17 | Scenario: process the warning queue 18 | Given a draining client 19 | And a message in the warning queue 20 | When the warning queue is processed 21 | Then the host should be submitted for draining 22 | And all queues are empty 23 | -------------------------------------------------------------------------------- /itests/simulation_aws_price_computations.feature: -------------------------------------------------------------------------------- 1 | Feature: make sure we're computing spot prices correctly 2 | 3 | Scenario: one instance with constant price 4 | Given market A has 1 instance at time 0 5 | And market A costs $1/hour at time 0 6 | When the simulator runs for 2 hours 7 | Then the simulated cluster costs $2 total 8 | 9 | Scenario: one instance with price increase 10 | Given market A has 1 instance at time 0 11 | And market A costs $1/hour at time 0 12 | And market A costs $2/hour at time 1800 13 | When the simulator runs for 2 hours 14 | Then the simulated cluster costs $3 total 15 | 16 | Scenario: two instances in the same market are launched at the same time 17 | Given market A has 2 instances at time 0 18 | And market A costs $1/hour at time 0 19 | And market A costs $2/hour at time 1800 20 | When the simulator runs for 2 hours 21 | Then the simulated cluster costs $6 total 22 | 23 | Scenario: two instances in the same market are launched at different times 24 | Given market A has 1 instances at time 0 25 | And market A has 2 instances at time 1800 26 | And market A 
costs $1/hour at time 0 27 | And market A costs $2/hour at time 1200 28 | When the simulator runs for 2 hours 29 | Then the simulated cluster costs $6 total 30 | 31 | Scenario: two instances in different markets are launched at different times 32 | Given market A has 1 instance at time 0 33 | And market B has 1 instance at time 1800 34 | And market A costs $1/hour at time 0 35 | And market A costs $2/hour at time 1200 36 | And market B costs $0.50/hour at time 0 37 | And market B costs $0.75/hour at time 4500 38 | When the simulator runs for 2 hours 39 | Then the simulated cluster costs $3.875 total 40 | 41 | Scenario: (per-hour billing) two instances in different markets are launched at diff. times and one is terminated 42 | Given market A has 1 instance at time 0 43 | And market B has 1 instance at time 1920 44 | And market B has 0 instances at time 5400 45 | And market A costs $1/hour at time 0 46 | And market A costs $2/hour at time 1800 47 | And market B costs $0.50/hour at time 0 48 | And market B costs $0.75/hour at time 4500 49 | When the simulator runs for 2 hours 50 | Then the simulated cluster costs $3.5 total 51 | 52 | Scenario: (per-sec billing) two instances in different markets are launched at diff. times and one is terminated 53 | Given market A has 1 instance at time 0 54 | And market B has 1 instance at time 1920 55 | And market B has 0 instances at time 5400 56 | And market A costs $1/hour at time 0 57 | And market A costs $2/hour at time 1800 58 | And market B costs $0.50/hour at time 0 59 | And market B costs $0.75/hour at time 4500 60 | When the simulator runs for 2 hours and billing is per-second 61 | Then the simulated cluster costs $4.05 total 62 | -------------------------------------------------------------------------------- /itests/simulation_join_delay.feature: -------------------------------------------------------------------------------- 1 | Feature: make sure the simulator join-delay params work correctly 2 | 3 | Scenario Outline: instances should wait to join the cluster 4 | Given market A has 1 instance at time 0 5 | When the instance takes