├── .activate.sh ├── .cman_debug_bashrc ├── .coveragerc ├── .coveragerc-yelp ├── .deactivate.sh ├── .dockerignore ├── .github ├── pull_request_template.md └── workflows │ └── ci.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── Dockerfile.external ├── LICENSE ├── Makefile ├── OWNERS ├── README.md ├── acceptance ├── Makefile ├── autoscaler_config.tmpl ├── bionic │ └── clusterman_signals_acceptance.tar.gz ├── clusterman.json ├── clusterman.sh ├── docker-compose-k8s.yaml ├── docker-compose.yaml ├── jammy │ └── clusterman_signals_acceptance.tar.gz ├── k8s-local-docker-registry.sh ├── mesos-agent-secret ├── mesos-secrets ├── moto │ └── Dockerfile ├── run_instance.py ├── secret ├── srv-configs │ ├── clog.yaml │ ├── clusterman-clusters │ │ └── local-dev │ │ │ ├── default.kubernetes │ │ │ └── default.mesos │ ├── clusterman-external.yaml │ ├── clusterman.yaml │ └── clusterman_metrics.yaml ├── utils.sh └── xenial │ └── clusterman_signals_acceptance.tar.gz ├── clusterman ├── __init__.py ├── args.py ├── autoscaler │ ├── __init__.py │ ├── autoscaler.py │ ├── config.py │ ├── offset.py │ ├── pool_manager.py │ └── toggle.py ├── aws │ ├── __init__.py │ ├── auto_scaling_resource_group.py │ ├── aws_resource_group.py │ ├── client.py │ ├── markets.py │ ├── response_types.py │ ├── spot_fleet_resource_group.py │ ├── spot_prices.py │ └── util.py ├── batch │ ├── __init__.py │ ├── autoscaler.py │ ├── autoscaler_bootstrap.py │ ├── clog.py │ ├── cluster_metrics_collector.py │ ├── drainer.py │ ├── node_migration.py │ ├── spot_price_collector.py │ └── util.py ├── cli │ ├── __init__.py │ ├── generate_data.py │ ├── info.py │ ├── manage.py │ ├── migrate.py │ ├── simulate.py │ ├── status.py │ ├── toggle.py │ └── util.py ├── common │ ├── __init__.py │ └── sfx.py ├── config.py ├── draining │ ├── __init__.py │ ├── kubernetes.py │ ├── mesos.py │ └── queue.py ├── exceptions.py ├── interfaces │ ├── __init__.py │ ├── cluster_connector.py │ ├── resource_group.py │ ├── signal.py │ └── types.py ├── kubernetes │ ├── __init__.py │ ├── kubernetes_cluster_connector.py │ └── util.py ├── math │ ├── __init__.py │ ├── piecewise.py │ └── piecewise_types.py ├── mesos │ ├── __init__.py │ ├── mesos_cluster_connector.py │ ├── metrics_generators.py │ └── util.py ├── migration │ ├── __init__.py │ ├── constants.py │ ├── event.py │ ├── event_enums.py │ ├── settings.py │ └── worker.py ├── monitoring_lib.py ├── reports │ ├── __init__.py │ ├── constants.py │ ├── data_transforms.py │ ├── plots.py │ ├── report_types.py │ └── reports.py ├── run.py ├── signals │ ├── __init__.py │ ├── external_signal.py │ └── pending_pods_signal.py ├── simulator │ ├── __init__.py │ ├── event.py │ ├── io.py │ ├── simulated_aws_cluster.py │ ├── simulated_cluster_connector.py │ ├── simulated_pool_manager.py │ ├── simulated_spot_fleet_resource_group.py │ ├── simulator.py │ └── util.py ├── supervisord │ ├── fetch_clusterman_signal │ ├── run_clusterman_signal │ └── supervisord.conf ├── tools │ ├── __init__.py │ ├── dynamodb_rename.py │ ├── rookout.py │ └── signalfx_scraper.py └── util.py ├── clusterman_logo.png ├── code-of-conduct.md ├── completions └── .gitignore ├── debian ├── .gitignore ├── changelog ├── clusterman.links ├── compat ├── control └── rules ├── docs ├── Makefile ├── examples │ ├── autoscaler_config.yaml │ ├── design.yaml │ └── metrics.json.gz └── source │ ├── _static │ └── .gitignore │ ├── api │ ├── AWSResourceGroup.rst │ ├── AutoScalingResourceGroup.rst │ ├── Autoscaler.rst │ ├── MesosPoolManager.rst │ ├── Signal.rst │ ├── 
SpotFleetResourceGroup.rst │ ├── aws_markets.rst │ └── clusterman_metrics.rst │ ├── autoscaler.rst │ ├── conf.py │ ├── configuration.rst │ ├── drainer.rst │ ├── index.rst │ ├── manage.rst │ ├── metrics.rst │ ├── node_migration.rst │ ├── overview.rst │ ├── resource_groups.rst │ ├── signals.rst │ ├── simulator.rst │ └── tools.rst ├── examples ├── __init__.py ├── batch │ ├── __init__.py │ ├── autoscaler.py │ ├── autoscaler_bootstrap.py │ ├── cluster_metrics_collector.py │ ├── spot_price_collector.py │ └── util.py ├── clusterman_metrics │ ├── .flake8 │ ├── clusterman_metrics │ │ ├── __init__.py │ │ ├── boto_client.py │ │ ├── simulation_client.py │ │ └── util │ │ │ ├── __init__.py │ │ │ ├── aws.py │ │ │ ├── constants.py │ │ │ ├── costs.py │ │ │ ├── meteorite.py │ │ │ └── misc.py │ ├── setup.cfg │ └── setup.py ├── schemas │ ├── clusterman.json │ ├── definitions.json │ └── pool.json ├── supervisord.conf └── terraform │ ├── clusterman.tf │ └── variables.tf ├── extra-requirements-yelp-dev.txt ├── extra-requirements-yelp.txt ├── images └── architecture-diagram.png ├── itest_status.py ├── itests ├── autoscaler_scaling.feature ├── draining_queue.feature ├── environment.py ├── prune_excess_fulfilled_capacity.feature ├── resource_group_modification.feature ├── simulation_aws_price_computations.feature ├── simulation_join_delay.feature ├── simulation_spot_fleet_diversification.feature └── steps │ ├── autoscaler.py │ ├── draining.py │ ├── exceptions.py │ ├── log.py │ ├── pool_manager.py │ ├── prune_excess_fulfilled_capacity.py │ ├── simulated_spot_fleet.py │ └── simulation.py ├── jenkins.yaml ├── mypy.ini ├── package ├── .gitignore ├── Makefile ├── debian-itest-runner ├── dockerfiles │ ├── bionic │ │ └── Dockerfile │ ├── jammy │ │ └── Dockerfile │ └── xenial │ │ └── Dockerfile └── itest │ ├── metrics.json.gz │ ├── metrics.yaml │ └── ubuntu.sh ├── pyproject.toml ├── requirements-bootstrap.txt ├── requirements-dev-minimal.txt ├── requirements-dev.txt ├── requirements-docs.txt ├── requirements-minimal.txt ├── requirements.txt ├── service-itest-runner ├── setup.py ├── stubs ├── simplejson.pyi ├── sorteddict.pyi └── staticconf.pyi ├── tests ├── __init__.py ├── args_test.py ├── autoscaler │ ├── autoscaler_test.py │ ├── config_test.py │ ├── offset_test.py │ ├── pool_manager_test.py │ └── toggle_test.py ├── aws │ ├── __init__.py │ ├── auto_scaling_resource_group_test.py │ ├── aws_resource_group_test.py │ ├── client_test.py │ ├── conftest.py │ ├── spot_fleet_resource_group_test.py │ └── spot_prices_test.py ├── batch │ ├── __init__.py │ ├── autoscaler_test.py │ ├── cluster_metrics_collector_test.py │ ├── conftest.py │ ├── drainer_test.py │ ├── node_migration_test.py │ ├── spot_price_collector_test.py │ └── util_test.py ├── cli │ ├── manage_test.py │ ├── migrate_test.py │ ├── simulate_test.py │ └── toggle_cli_test.py ├── common │ └── sfx_test.py ├── config_test.py ├── conftest.py ├── draining │ └── queue_test.py ├── interfaces │ ├── __init__.py │ └── signal_test.py ├── kubernetes │ ├── kubernetes_cluster_connector_test.py │ └── util_test.py ├── math │ └── piecewise_test.py ├── migration │ ├── __init__.py │ ├── conftest.py │ ├── migration_event_enums_test.py │ ├── migration_event_test.py │ ├── migration_settings_test.py │ └── migration_worker_test.py ├── monitoring_lib_test.py ├── signals │ ├── __init__.py │ ├── external_signal_test.py │ └── pending_pods_signal_test.py ├── simulator │ ├── __init__.py │ ├── conftest.py │ ├── io_test.py │ ├── simulated_aws_cluster_test.py │ ├── simulated_cluster_connector_test.py │ ├── 
simulated_spot_fleet_resource_group_test.py │ └── simulator_test.py ├── tools │ └── signalfx_scraper_test.py └── util_test.py └── tox.ini /.activate.sh: -------------------------------------------------------------------------------- 1 | virtualenv_run/bin/activate -------------------------------------------------------------------------------- /.cman_debug_bashrc: -------------------------------------------------------------------------------- 1 | alias cman_debug='python -m clusterman.batch.autoscaler_bootstrap start --no-daemon' 2 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | clusterman 5 | 6 | [report] 7 | omit = 8 | clusterman/batch/* 9 | exclude_lines = 10 | # Have to re-enable the standard pragma 11 | \#\s*pragma: no cover 12 | 13 | # Don't complain if tests don't hit defensive assertion code: 14 | ^\s*raise AssertionError\b 15 | ^\s*raise NotImplementedError\b 16 | ^\s*return NotImplemented\b 17 | ^\s*raise$ 18 | 19 | # Don't complain if non-runnable code isn't run: 20 | ^if __name__ == ['"]__main__['"]:$ 21 | 22 | [html] 23 | directory = coverage-html 24 | 25 | # vim:ft=dosini 26 | -------------------------------------------------------------------------------- /.coveragerc-yelp: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = 4 | clusterman 5 | 6 | [report] 7 | exclude_lines = 8 | # Have to re-enable the standard pragma 9 | \#\s*pragma: no cover 10 | 11 | # Don't complain if tests don't hit defensive assertion code: 12 | ^\s*raise AssertionError\b 13 | ^\s*raise NotImplementedError\b 14 | ^\s*return NotImplemented\b 15 | ^\s*raise$ 16 | 17 | # Don't complain if non-runnable code isn't run: 18 | ^if __name__ == ['"]__main__['"]:$ 19 | 20 | [html] 21 | directory = coverage-html 22 | 23 | # vim:ft=dosini 24 | -------------------------------------------------------------------------------- /.deactivate.sh: -------------------------------------------------------------------------------- 1 | deactivate 2 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .tox 3 | build 4 | 5 | # It is possible a user has their own virtualenv here, 6 | # but we don't want it to pollute the docker context because 7 | # it will get built inside. 8 | virtualenv_run 9 | venv 10 | tests 11 | itests 12 | docs 13 | tools 14 | 15 | # y/ycp 16 | .ycp_playground 17 | playground 18 | docker-venv 19 | .activate.sh 20 | .deactivate.sh 21 | 22 | # we don't need to send all the Debian package builds to Docker for the paasta service 23 | yelp_package 24 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ### Description 2 | 3 | Please fill out! 4 | 5 | ### Testing Done 6 | 7 | Please fill out! Generally speaking any new features should include 8 | additional unit or integration tests to ensure the behaviour is 9 | working correctly. 
10 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | release: 9 | 10 | jobs: 11 | tox: 12 | env: 13 | PIP_INDEX_URL: https://pypi.python.org/simple 14 | runs-on: ubuntu-22.04 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | make_target: 19 | - run-pre-commit 20 | - test-external 21 | - itest-external 22 | - itest_bionic-external 23 | - itest_jammy-external 24 | steps: 25 | - uses: actions/checkout@v2 26 | - uses: actions/setup-python@v2 27 | with: 28 | python-version: 3.7 29 | - uses: actions/setup-python@v3 30 | with: 31 | python-version: 3.8 32 | - uses: actions/setup-go@v2 33 | with: 34 | go-version: '1.17.3' 35 | - uses: azure/setup-kubectl@v1 36 | with: 37 | version: v1.22.0 38 | # GHA won't setup tox for us and we use tox-pip-extensions for venv-update 39 | - run: pip install tox==3.8.6 tox-pip-extensions==1.6.0 40 | - run: go install sigs.k8s.io/kind@v0.11.1 41 | - run: make ${{ matrix.make_target }} 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | *.so 3 | *.sw[nop] 4 | .#* 5 | .DS_Store 6 | ._* 7 | \#*\# 8 | build 9 | dist 10 | *~ 11 | *.log 12 | .coverage 13 | precomputed 14 | .pydevproject 15 | .project 16 | *.sublime-* 17 | virtualenv_run 18 | .tox 19 | *.egg-info/ 20 | __pycache__ 21 | version 22 | .ycp_playground 23 | playground 24 | docker-venv 25 | .cache 26 | .pytest_cache/ 27 | .mypy_cache/ 28 | acceptance/autoscaler_config.yaml 29 | package/itest/autoscaler_config.yaml 30 | package/itest/autoscaler_config.tmpl 31 | package/itest/run_instance.py 32 | package/itest/trusty/* 33 | package/itest/xenial/* 34 | package/itest/bionic/* 35 | package/itest/jammy/* 36 | /completions/[a-zA-Z]* 37 | acceptance/.local* 38 | sftp-config.json 39 | .idea 40 | .vscode 41 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v0.9.4 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | exclude: ^\.activate\.sh$ 8 | - id: check-yaml 9 | - id: debug-statements 10 | exclude: ^itests/environment.py$ 11 | - id: name-tests-test 12 | - id: check-added-large-files 13 | exclude: ^(\.activate\.sh|.*clusterman_signals_.*\.tar\.gz)$ 14 | - id: check-byte-order-marker 15 | - id: fix-encoding-pragma 16 | args: [--remove] 17 | - repo: https://github.com/asottile/reorder_python_imports 18 | rev: v0.3.5 19 | hooks: 20 | - id: reorder-python-imports 21 | args: [ 22 | --remove-import, from __future__ import absolute_import, 23 | --remove-import, from __future__ import print_function, 24 | --remove-import, from __future__ import unicode_literals 25 | ] 26 | - repo: https://github.com/asottile/pyupgrade 27 | rev: v1.2.0 28 | hooks: 29 | - id: pyupgrade 30 | args: [--py3-plus] 31 | - repo: https://github.com/psf/black 32 | rev: 22.3.0 33 | hooks: 34 | - id: black 35 | args: 36 | - --target-version 37 | - py38 38 | - repo: https://github.com/PyCQA/flake8 39 | rev: 4.0.1 40 | hooks: 41 | - id: flake8 42 | exclude: ^docs/.* 43 | args: [ 44 | 
'--ignore=E121,E123,E126,E133,E203,E226,E231,E241,E242,E704,W503,W504,W505,W605' 45 | ] 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # This is an example Dockerfile to run your service in PaaSTA! 2 | # It satisfies the PaaSTA contract. 3 | FROM docker-dev.yelpcorp.com/jammy_yelp:latest 4 | 5 | # python and uwsgi deps 6 | RUN apt-get update \ 7 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 8 | awscli \ 9 | git \ 10 | libatlas-base-dev \ 11 | libpython3.8 \ 12 | libxml2 \ 13 | libyaml-0-2 \ 14 | lsb-release \ 15 | make \ 16 | openssh-client \ 17 | python3.8 \ 18 | python3-distutils \ 19 | python3-pip \ 20 | python3-setuptools \ 21 | stdin2scribe \ 22 | tox \ 23 | virtualenv \ 24 | zk-flock \ 25 | && apt-get clean 26 | 27 | RUN /usr/bin/pip3 install supervisor 28 | COPY tox.ini requirements.txt requirements-bootstrap.txt extra-requirements-yelp.txt /code/ 29 | RUN cd code && tox -e virtualenv_run 30 | RUN cd code && virtualenv_run/bin/pip3 install -rextra-requirements-yelp.txt 31 | 32 | RUN mkdir /home/nobody \ 33 | && chown nobody /home/nobody 34 | ENV HOME /home/nobody 35 | 36 | # Code is COPY'ed here after the pip install above, so that code changes do not 37 | # break the preceding cache layer. 38 | COPY . /code 39 | RUN chown nobody /code 40 | 41 | 42 | # This is needed so that we can pass PaaSTA itests on Jenkins; for some reason (probably aufs-related?) 43 | # root can't modify the contents of /code on Jenkins, even though it works locally. Root needs to 44 | # modify these contents so that it can configure the Dockerized Mesos cluster that we run our itests on. 45 | # This shouldn't be a security risk because we drop privileges below and on overlay2, root can already 46 | # modify the contents of this directory. 47 | RUN chmod -R 775 /code/acceptance 48 | RUN ln -s /code/clusterman/supervisord/fetch_clusterman_signal /usr/bin/fetch_clusterman_signal 49 | RUN ln -s /code/clusterman/supervisord/run_clusterman_signal /usr/bin/run_clusterman_signal 50 | 51 | RUN install -d --owner=nobody /code/logs 52 | 53 | # Create /nail/run to store the batch PID file 54 | RUN mkdir -p /nail/run && chown -R nobody /nail/run 55 | 56 | # For sake of security, don't run your service as a privileged user 57 | USER nobody 58 | WORKDIR /code 59 | ENV BASEPATH=/code PATH=/code/virtualenv_run/bin:$PATH 60 | -------------------------------------------------------------------------------- /Dockerfile.external: -------------------------------------------------------------------------------- 1 | # This is an example Dockerfile to run your service in PaaSTA! 2 | # It satisfies the PaaSTA contract. 
3 | 4 | ARG DOCKER_REGISTRY 5 | ARG IMAGE_NAME 6 | FROM ${DOCKER_REGISTRY}/${IMAGE_NAME} 7 | 8 | # python and uwsgi deps 9 | RUN apt-get update && apt-get upgrade -y \ 10 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 11 | awscli \ 12 | g++ \ 13 | git \ 14 | libatlas-base-dev \ 15 | libpython3.8 \ 16 | libxml2 \ 17 | libyaml-0-2 \ 18 | lsb-release \ 19 | make \ 20 | openssh-client \ 21 | software-properties-common \ 22 | gpg \ 23 | gpg-agent \ 24 | && add-apt-repository ppa:deadsnakes/ppa \ 25 | && apt-cache policy python3.8 \ 26 | && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ 27 | python3.8 \ 28 | libtiff-dev \ 29 | libfreetype-dev \ 30 | libfreetype6 \ 31 | libfreetype6-dev \ 32 | python3.8-dev \ 33 | python3.8-distutils \ 34 | python3-apt \ 35 | python3-pip \ 36 | python3-setuptools \ 37 | virtualenv \ 38 | && apt-get clean 39 | 40 | RUN /usr/bin/pip3 install setuptools supervisor tox==3.24.4 41 | COPY tox.ini requirements.txt requirements-bootstrap.txt /code/ 42 | 43 | RUN mkdir /home/nobody \ 44 | && chown nobody /home/nobody 45 | ENV HOME /home/nobody 46 | 47 | # Code is COPY'ed here after the pip install above, so that code changes do not 48 | # break the preceding cache layer. 49 | COPY . /code 50 | RUN chown nobody /code 51 | RUN cd code && tox -e virtualenv_run && virtualenv_run/bin/pip3 install -eexamples/clusterman_metrics 52 | 53 | RUN ln -s /code/clusterman/supervisord/fetch_clusterman_signal /usr/bin/fetch_clusterman_signal 54 | RUN ln -s /code/clusterman/supervisord/run_clusterman_signal /usr/bin/run_clusterman_signal 55 | 56 | RUN install -d --owner=nobody /code/logs 57 | 58 | # Create /nail/run to store the batch PID file 59 | RUN mkdir -p /nail/run && chown -R nobody /nail/run 60 | 61 | # For sake of security, don't run your service as a privileged user 62 | USER nobody 63 | WORKDIR /code 64 | ENV BASEPATH=/code PATH=/code/virtualenv_run/bin:$PATH 65 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | --- 2 | teams: 3 | - Compute Infrastructure Core 4 | -------------------------------------------------------------------------------- /acceptance/autoscaler_config.tmpl: -------------------------------------------------------------------------------- 1 | --- 2 | configs: 3 | - LaunchSpecifications: 4 | - WeightedCapacity: 35 5 | SubnetId: REPLACE 6 | InstanceType: m3.large 7 | SpotPrice: 4 8 | AllocationStrategy: diversified 9 | -------------------------------------------------------------------------------- /acceptance/bionic/clusterman_signals_acceptance.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/acceptance/bionic/clusterman_signals_acceptance.tar.gz -------------------------------------------------------------------------------- /acceptance/clusterman.json: -------------------------------------------------------------------------------- 1 | { 2 | "accessKeyId": "ACCESS_KEY", 3 | "secretAccessKey": "SECRET_ACCESS_KEY" 4 | } 5 | -------------------------------------------------------------------------------- /acceptance/clusterman.sh: -------------------------------------------------------------------------------- 1 | export AWS_ACCESS_KEY_ID=ACCESS_KEY 2 | export AWS_SECRET_ACCESS_KEY=SECRET_ACCESS_KEY 3 | 
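The acceptance environment never talks to real AWS: acceptance/clusterman.json and acceptance/clusterman.sh above carry placeholder credentials, and the docker-compose files that follow start moto containers standing in for EC2, S3, DynamoDB, and STS. As a rough illustration of how those pieces fit together, the sketch below points a boto3 client at the EC2 stub; it assumes it runs on the compose network where the service is reachable as moto-ec2 on port 5000 (the same endpoint pattern the srv-configs later in this tree use).

import boto3

# Placeholder credentials from acceptance/clusterman.json; moto accepts any value.
ec2 = boto3.client(
    "ec2",
    endpoint_url="http://moto-ec2:5000",
    region_name="us-west-2",
    aws_access_key_id="ACCESS_KEY",
    aws_secret_access_key="SECRET_ACCESS_KEY",
)
print(ec2.describe_instances()["Reservations"])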
-------------------------------------------------------------------------------- /acceptance/docker-compose-k8s.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | services: 4 | moto-ec2: 5 | build: ./moto/ 6 | ports: 7 | - 5000 8 | command: 'ec2' 9 | moto-s3: 10 | build: ./moto/ 11 | ports: 12 | - 5000 13 | command: 's3' 14 | moto-dynamodb: 15 | build: ./moto/ 16 | ports: 17 | - 5000 18 | command: 'dynamodb2' 19 | 20 | networks: 21 | default: 22 | external: 23 | name: kind 24 | -------------------------------------------------------------------------------- /acceptance/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "2" 2 | 3 | services: 4 | zookeeper: 5 | image: zookeeper 6 | environment: 7 | ZK_CONFIG: tickTime=2000,initLimit=10,syncLimit=5,maxClientCnxns=128,forceSync=no,clientPort=2181 8 | ZK_ID: 1 9 | mesosmaster: 10 | image: mesosphere/mesos:1.5.0 11 | ports: 12 | - 5050 13 | - 5054 14 | command: 'mesos-master --zk=zk://zookeeper:2181/mesos-testcluster --registry=in_memory --quorum=1 --authenticate --authenticate_agents --work_dir=/tmp/mesos --credentials=/etc/mesos-secrets' 15 | depends_on: 16 | - zookeeper 17 | volumes: 18 | - ./mesos-secrets:/etc/mesos-secrets 19 | mesosagent: 20 | image: mesosphere/mesos:1.5.0 21 | expose: 22 | - 5051 23 | volumes: 24 | - /var/run/docker.sock:/var/run/docker.sock 25 | - ./mesos-agent-secret:/etc/mesos-agent-secret 26 | environment: 27 | CLUSTER: testcluster 28 | MESOS_SYSTEMD_ENABLE_SUPPORT: "false" 29 | command: 'mesos-agent --master=zk://zookeeper:2181/mesos-testcluster --resources="cpus:20;mem:2048;disk:2000;ports:[31000-31100];cpus(taskproc):10;mem(taskproc):1024;disk(taskproc):1000;ports(taskproc):[31200-31500]" --credential=/etc/mesos-agent-secret --containerizers=docker --docker=/usr/bin/docker --work_dir=/tmp/mesos --attributes="region:fakeregion;pool:default" --no-docker_kill_orphans --log_dir=/var/log/mesos' 30 | depends_on: 31 | - mesosmaster 32 | - zookeeper 33 | moto-ec2: 34 | build: ./moto/ 35 | ports: 36 | - 5000 37 | command: 'ec2' 38 | moto-s3: 39 | build: ./moto/ 40 | ports: 41 | - 5000 42 | command: 's3' 43 | moto-dynamodb: 44 | build: ./moto/ 45 | ports: 46 | - 5000 47 | command: 'dynamodb2' 48 | moto-sts: 49 | build: ./moto/ 50 | ports: 51 | - 5000 52 | command: 'sts' 53 | -------------------------------------------------------------------------------- /acceptance/jammy/clusterman_signals_acceptance.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/acceptance/jammy/clusterman_signals_acceptance.tar.gz -------------------------------------------------------------------------------- /acceptance/k8s-local-docker-registry.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | set -o errexit -x 3 | 4 | REG_NAME=$1 5 | REG_PORT=$2 6 | CLUSTER_NAME=$3 7 | 8 | # create registry container unless it already exists 9 | running="$(docker inspect -f '{{.State.Running}}' "${REG_NAME}" 2>/dev/null || true)" 10 | if [ "${running}" != 'true' ]; then 11 | docker run -d --restart=always -e REGISTRY_HTTP_ADDR=0.0.0.0:${REG_PORT} -p "${REG_PORT}:${REG_PORT}" --name "${REG_NAME}" registry:2 12 | fi 13 | 14 | # connect the registry to the cluster network 15 | docker network connect "kind" "${REG_NAME}" 16 | 17 | # tell https://tilt.dev to use the 
registry 18 | # https://docs.tilt.dev/choosing_clusters.html#discovering-the-registry 19 | for node in $(kind get nodes --name ${CLUSTER_NAME}); do 20 | kubectl annotate node "${node}" "kind.x-k8s.io/registry=localhost:${REG_PORT}"; 21 | done 22 | -------------------------------------------------------------------------------- /acceptance/mesos-agent-secret: -------------------------------------------------------------------------------- 1 | { 2 | "principal": "agent", 3 | "secret": "secretagent" 4 | } 5 | -------------------------------------------------------------------------------- /acceptance/mesos-secrets: -------------------------------------------------------------------------------- 1 | { 2 | "credentials": [ 3 | { 4 | "principal": "clusterman", 5 | "secret": "secret" 6 | }, 7 | { 8 | "principal": "agent", 9 | "secret": "secretagent" 10 | } 11 | ] 12 | } 13 | -------------------------------------------------------------------------------- /acceptance/moto/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG DOCKER_REGISTRY 2 | ARG IMAGE_NAME 3 | 4 | FROM ${DOCKER_REGISTRY}/${IMAGE_NAME} 5 | 6 | RUN apt-get -yq update && apt-get install -yq --no-install-recommends \ 7 | gcc \ 8 | python3-dev \ 9 | libffi-dev \ 10 | python3 \ 11 | libssl-dev \ 12 | python3-pip 13 | 14 | ADD . /moto/ 15 | ENV PYTHONUNBUFFERED 1 16 | 17 | WORKDIR /moto/ 18 | # Setuptools needs to be installed and up-to-date for install of the actual packages 19 | # 20 | # moto and botocore have mismatched upper-bound pins for python-dateutils 21 | # which breaks our build. botocore used to have <3.0.0, but shrunk that to 22 | # <2.8.1, and moto hasn't updated their pin to match yet. So until those 23 | # are fixed, here's the latest version of boto that has the <3.0.0 pin. 24 | # 25 | # We can unpin boto3 and botocore once botocore fixes its pin 26 | # (see https://github.com/boto/botocore/commit/e87e7a745fd972815b235a9ee685232745aa94f9) 27 | RUN pip3 install pip==21.3.1 setuptools==59.6.0 && \ 28 | pip3 install cryptography==3.2 botocore==1.14.11 boto3==1.11.11 "moto[server]" 29 | 30 | ENTRYPOINT ["python3", "-m", "moto.server", "-H", "0.0.0.0"] 31 | 32 | EXPOSE 5000 33 | -------------------------------------------------------------------------------- /acceptance/secret: -------------------------------------------------------------------------------- 1 | secret 2 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clog.yaml: -------------------------------------------------------------------------------- 1 | scribe_port: 1463 2 | scribe_retry_interval: 10 3 | 4 | monk_stream_prefix: '_clog.' 
5 | scribe_disable: false 6 | preferred_backend: 'monk' 7 | monk_disable: false 8 | monk_timeout_ms: 10000 9 | monk_host: 'monk-leaf' 10 | monk_port: 6000 11 | scribe_host: 'RandomHost' 12 | use_schematizer: true 13 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman-clusters/local-dev/default.kubernetes: -------------------------------------------------------------------------------- 1 | --- 2 | resource_groups: 3 | - sfr: 4 | s3: 5 | bucket: clusterman-resource-groups 6 | prefix: acceptance 7 | 8 | scaling_limits: 9 | min_capacity: 10 10 | max_capacity: 60 11 | max_tasks_to_kill: 100 12 | max_weight_to_add: 10 13 | max_weight_to_remove: 10 14 | 15 | autoscale_signal: 16 | internal: true 17 | period_minutes: 1 18 | 19 | autoscaling: 20 | prevent_scale_down_after_capacity_loss: true 21 | instance_loss_threshold: 3 22 | 23 | alert_on_max_capacity: false 24 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman-clusters/local-dev/default.mesos: -------------------------------------------------------------------------------- 1 | --- 2 | resource_groups: 3 | - sfr: 4 | s3: 5 | bucket: clusterman-resource-groups 6 | prefix: acceptance 7 | 8 | scaling_limits: 9 | min_capacity: 10 10 | max_capacity: 60 11 | max_tasks_to_kill: 100 12 | max_weight_to_add: 10 13 | max_weight_to_remove: 10 14 | 15 | autoscale_signal: 16 | namespace: clusterman 17 | name: MostRecentResources 18 | branch_or_tag: acceptance 19 | period_minutes: 10 20 | required_metrics: 21 | - name: cpus_allocated 22 | type: system_metrics 23 | minute_range: 10 24 | - name: mem_allocated 25 | type: system_metrics 26 | minute_range: 10 27 | - name: disk_allocated 28 | type: system_metrics 29 | minute_range: 10 30 | 31 | alert_on_max_capacity: false 32 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman-external.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | module_config: 3 | - namespace: clusterman_metrics 4 | file: /nail/srv/configs/clusterman_metrics.yaml 5 | 6 | # ###### 7 | # Mappings for the clusterman service that are the same for all habitats. 8 | # 9 | # NOTE: The clusterman service will map clusters.{cluster_name}.aws_region 10 | # to aws.region, if the --cluster argument is passed to the service. 
11 | clusters: 12 | local-dev: 13 | aws_account_number: 123456789012 14 | aws_region: us-west-2 15 | mesos_master_fqdn: mesosmaster 16 | kubeconfig_path: /var/lib/clusterman/clusterman.conf 17 | 18 | aws: 19 | endpoint_url: http://moto-{svc}:5000 20 | access_key_file: /etc/boto_cfg/clusterman.json 21 | signals_bucket: clusterman-signals 22 | 23 | batches: 24 | spot_prices: 25 | run_interval_seconds: 60 26 | dedupe_interval_seconds: 60 27 | cluster_metrics: 28 | run_interval_seconds: 60 29 | 30 | autoscaling: 31 | default_signal_role: 'clusterman' 32 | setpoint: 0.7 33 | target_capacity_margin: 0.05 34 | 35 | autoscale_signal: 36 | name: ConstantSignal 37 | branch_or_tag: acceptance 38 | period_minutes: 1 39 | 40 | sensu_config: 41 | - team: noop 42 | page: false 43 | 44 | module_env_config: [] 45 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | module_config: 3 | - namespace: clog 4 | initialize: clusterman.batch.clog.initialize 5 | config: 6 | log_stream_name: clusterman 7 | file: /nail/srv/configs/clog.yaml 8 | - namespace: clusterman_metrics 9 | file: /nail/srv/configs/clusterman_metrics.yaml 10 | - namespace: yelp_batch 11 | config: 12 | watchers: 13 | - aws_key_rotation: /etc/boto_cfg/clusterman.json 14 | - clusterman_yaml: /nail/srv/configs/clusterman.yaml 15 | logging: 16 | console_log_level: 'CRITICAL' 17 | 18 | # ###### 19 | # Mappings for the clusterman service that are the same for all habitats. 20 | # 21 | # NOTE: The clusterman service will map clusters.{cluster_name}.aws_region 22 | # to aws.region, if the --cluster argument is passed to the service. 23 | clusters: 24 | local-dev: 25 | aws_account_number: 123456789012 26 | aws_region: us-west-2 27 | mesos_master_fqdn: mesosmaster 28 | kubeconfig_path: /var/lib/clusterman/clusterman.conf 29 | 30 | aws: 31 | endpoint_url: http://moto-{svc}:5000 32 | access_key_file: /etc/boto_cfg/clusterman.json 33 | signals_bucket: clusterman-signals 34 | 35 | batches: 36 | spot_prices: 37 | run_interval_seconds: 60 38 | dedupe_interval_seconds: 60 39 | cluster_metrics: 40 | run_interval_seconds: 60 41 | 42 | autoscaling: 43 | default_signal_role: 'clusterman' 44 | setpoint: 0.7 45 | target_capacity_margin: 0.05 46 | 47 | autoscale_signal: 48 | name: ConstantSignal 49 | branch_or_tag: acceptance 50 | period_minutes: 1 51 | 52 | sensu_config: 53 | - team: noop 54 | page: false 55 | 56 | module_env_config: [] 57 | 58 | monitoring_client: LogMonitoringClient 59 | -------------------------------------------------------------------------------- /acceptance/srv-configs/clusterman_metrics.yaml: -------------------------------------------------------------------------------- 1 | dynamodb: 2 | ttl_days: 732 3 | 4 | access_key_file: '/etc/boto_cfg/clusterman_metrics.json' 5 | -------------------------------------------------------------------------------- /acceptance/utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | COMPOSE_CONTAINERS="zookeeper mesosmaster mesosagent moto-ec2 moto-s3 moto-dynamodb moto-sts" 4 | 5 | cleanup() { 6 | docker kill "${CONTAINER}" > /dev/null 7 | for compose_container in ${COMPOSE_CONTAINERS}; do 8 | docker network disconnect "clusterman_${DISTRIB_CODENAME}_acceptance" "clusterman_${DISTRIB_CODENAME}_${compose_container}_1" 9 | done 10 | docker network rm 
"clusterman_${DISTRIB_CODENAME}_acceptance" > /dev/null 11 | } 12 | 13 | setup_networks() { 14 | CIDR_BLOCK="10.1.0.0/24" 15 | docker network create --ip-range "${CIDR_BLOCK}" --subnet "${CIDR_BLOCK}" "clusterman_${DISTRIB_CODENAME}_acceptance" 16 | for compose_container in ${COMPOSE_CONTAINERS}; do 17 | docker network connect "clusterman_${DISTRIB_CODENAME}_acceptance" "clusterman_${DISTRIB_CODENAME}_${compose_container}_1" 18 | done 19 | } 20 | -------------------------------------------------------------------------------- /acceptance/xenial/clusterman_signals_acceptance.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/acceptance/xenial/clusterman_signals_acceptance.tar.gz -------------------------------------------------------------------------------- /clusterman/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | __version__ = "4.22.2" 15 | -------------------------------------------------------------------------------- /clusterman/autoscaler/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/autoscaler/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import List 15 | from typing import NamedTuple 16 | 17 | import colorlog 18 | import staticconf 19 | 20 | logger = colorlog.getLogger(__name__) 21 | 22 | 23 | class AutoscalingConfig(NamedTuple): 24 | excluded_resources: List[str] 25 | setpoint: float 26 | target_capacity_margin: float 27 | prevent_scale_down_after_capacity_loss: bool = False 28 | instance_loss_threshold: int = 0 29 | orphan_instance_uptime_threshold_seconds: int = 1800 30 | 31 | 32 | def get_autoscaling_config(config_namespace: str) -> AutoscalingConfig: 33 | """Load autoscaling configuration values from the provided config_namespace, falling back to the 34 | values stored in the default namespace if none are specified. 35 | 36 | :param config_namespace: namespace to read from before falling back to the default namespace 37 | :returns: AutoscalingConfig object with loaded config values 38 | """ 39 | default_excluded_resources = staticconf.read_list("autoscaling.excluded_resources", default=[]) 40 | default_setpoint = staticconf.read_float("autoscaling.setpoint") 41 | default_target_capacity_margin = staticconf.read_float("autoscaling.target_capacity_margin") 42 | 43 | reader = staticconf.NamespaceReaders(config_namespace) 44 | return AutoscalingConfig( 45 | excluded_resources=reader.read_list("autoscaling.excluded_resources", default=default_excluded_resources), 46 | setpoint=reader.read_float("autoscaling.setpoint", default=default_setpoint), 47 | target_capacity_margin=reader.read_float( 48 | "autoscaling.target_capacity_margin", 49 | default=default_target_capacity_margin, 50 | ), 51 | prevent_scale_down_after_capacity_loss=reader.read_bool( 52 | "autoscaling.prevent_scale_down_after_capacity_loss", default=False 53 | ), 54 | instance_loss_threshold=reader.read_int("autoscaling.instance_loss_threshold", default=0), 55 | orphan_instance_uptime_threshold_seconds=reader.read_int( 56 | "autoscaling.orphan_instance_uptime_threshold_seconds", default=1800 57 | ), 58 | ) 59 | -------------------------------------------------------------------------------- /clusterman/autoscaler/toggle.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
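# A minimal usage sketch for get_autoscaling_config above (illustrative only; the
# namespace name is hypothetical and is assumed to have been loaded into
# staticconf elsewhere, e.g. from one of the pool configs under srv-configs):
#
#     from clusterman.autoscaler.config import get_autoscaling_config
#
#     autoscaling_config = get_autoscaling_config("clusterman_default_pool_config")
#     print(autoscaling_config.setpoint, autoscaling_config.target_capacity_margin)
#
# Values missing from the pool namespace fall back to the default staticconf
# namespace, so autoscaling.setpoint and autoscaling.target_capacity_margin must
# exist at least there (as in the acceptance clusterman.yaml).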
14 | import time 15 | from typing import Union 16 | 17 | import arrow 18 | import staticconf 19 | 20 | from clusterman.aws.client import dynamodb 21 | from clusterman.util import CLUSTERMAN_STATE_TABLE 22 | from clusterman.util import parse_time_string 23 | 24 | 25 | AUTOSCALER_PAUSED = "autoscaler_paused" 26 | 27 | 28 | def disable_autoscaling(cluster: str, pool: str, scheduler: str, until: Union[str, int, float]): 29 | """Disable autoscaling for a pool 30 | 31 | :param str cluster: name of the cluster 32 | :param str pool: name of the pool 33 | :param str scheduler: cluster scheduler 34 | :param str until: how long should it remain disabled 35 | """ 36 | expiration = parse_time_string(until).timestamp if isinstance(until, str) else int(until) 37 | state = { 38 | "state": {"S": AUTOSCALER_PAUSED}, 39 | "entity": {"S": f"{cluster}.{pool}.{scheduler}"}, 40 | "timestamp": {"N": str(int(time.time()))}, 41 | "expiration_timestamp": {"N": str(expiration)}, 42 | } 43 | dynamodb.put_item( 44 | TableName=staticconf.read("aws.state_table", default=CLUSTERMAN_STATE_TABLE), 45 | Item=state, 46 | ) 47 | 48 | 49 | def enable_autoscaling(cluster: str, pool: str, scheduler: str): 50 | """Re-enable autoscaling for a pool 51 | 52 | :param str cluster: name of the cluster 53 | :param str pool: name of the pool 54 | :param str scheduler: cluster scheduler 55 | """ 56 | dynamodb.delete_item( 57 | TableName=staticconf.read("aws.state_table", default=CLUSTERMAN_STATE_TABLE), 58 | Key={ 59 | "state": {"S": AUTOSCALER_PAUSED}, 60 | "entity": {"S": f"{cluster}.{pool}.{scheduler}"}, 61 | }, 62 | ) 63 | 64 | 65 | def autoscaling_is_paused(cluster: str, pool: str, scheduler: str, timestamp: arrow.Arrow) -> bool: 66 | """Check if autoscaling is disabled 67 | 68 | :param str cluster: name of the cluster 69 | :param str pool: name of the pool 70 | :param str scheduler: cluster scheduler 71 | :param Arrow timestamp: threshold time 72 | :return: True if paused 73 | """ 74 | response = dynamodb.get_item( 75 | TableName=CLUSTERMAN_STATE_TABLE, 76 | Key={ 77 | "state": {"S": AUTOSCALER_PAUSED}, 78 | "entity": {"S": f"{cluster}.{pool}.{scheduler}"}, 79 | }, 80 | ConsistentRead=True, 81 | ) 82 | if "Item" not in response: 83 | return False 84 | 85 | if "expiration_timestamp" in response["Item"] and timestamp.timestamp > int( 86 | response["Item"]["expiration_timestamp"]["N"] 87 | ): 88 | return False 89 | 90 | return True 91 | -------------------------------------------------------------------------------- /clusterman/aws/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/clusterman/aws/__init__.py -------------------------------------------------------------------------------- /clusterman/aws/response_types.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from mypy_extensions import TypedDict 4 | 5 | 6 | class AutoScalingInstanceConfig(TypedDict): 7 | InstanceId: str 8 | InstanceType: str 9 | WeightedCapacity: str 10 | 11 | 12 | class LaunchTemplateDataConfig(TypedDict): 13 | InstanceType: str 14 | 15 | 16 | class LaunchTemplateConfig(TypedDict): 17 | LaunchTemplateName: str 18 | LaunchTemplateData: LaunchTemplateDataConfig 19 | Version: str 20 | 21 | 22 | class InstanceOverrideConfig(TypedDict): 23 | InstanceType: str 24 | WeightedCapacity: str 25 | 26 | 27 | class MixedInstancesPolicyLaunchTemplateConfig(TypedDict): 28 | 
LaunchTemplateSpecification: LaunchTemplateConfig 29 | Overrides: List[InstanceOverrideConfig] 30 | 31 | 32 | class MixedInstancesPolicyConfig(TypedDict): 33 | LaunchTemplate: MixedInstancesPolicyLaunchTemplateConfig 34 | 35 | 36 | class AutoScalingGroupConfig(TypedDict): 37 | AvailabilityZones: List[str] 38 | DesiredCapacity: int 39 | Instances: List[AutoScalingInstanceConfig] 40 | LaunchTemplate: LaunchTemplateConfig 41 | MaxSize: int 42 | MinSize: int 43 | MixedInstancesPolicy: MixedInstancesPolicyConfig 44 | -------------------------------------------------------------------------------- /clusterman/aws/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import List 15 | from typing import Mapping 16 | from typing import Type 17 | 18 | from clusterman.aws.auto_scaling_resource_group import AutoScalingResourceGroup 19 | from clusterman.aws.aws_resource_group import AWSResourceGroup 20 | from clusterman.aws.spot_fleet_resource_group import SpotFleetResourceGroup 21 | 22 | 23 | _RESOURCE_GROUP_TYPES: List[Type[AWSResourceGroup]] = [AutoScalingResourceGroup, SpotFleetResourceGroup] 24 | RESOURCE_GROUPS: Mapping[str, Type[AWSResourceGroup]] = {t.FRIENDLY_NAME: t for t in _RESOURCE_GROUP_TYPES} 25 | RESOURCE_GROUPS_REV: Mapping[Type[AWSResourceGroup], str] = {v: k for k, v in RESOURCE_GROUPS.items()} 26 | -------------------------------------------------------------------------------- /clusterman/batch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/batch/clog.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
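# A small sketch of the RESOURCE_GROUPS lookup defined in clusterman/aws/util.py
# above: resource-group classes are keyed by their FRIENDLY_NAME, which the
# acceptance pool configs suggest is "sfr" for SpotFleetResourceGroup (treat the
# exact key as an assumption here):
#
#     from clusterman.aws.util import RESOURCE_GROUPS, RESOURCE_GROUPS_REV
#
#     group_cls = RESOURCE_GROUPS["sfr"]
#     print(group_cls.__name__)              # -> SpotFleetResourceGroup
#     print(RESOURCE_GROUPS_REV[group_cls])  # -> "sfr"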
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import logging 15 | import socket 16 | 17 | import clog.handlers 18 | import staticconf 19 | 20 | 21 | namespace = "clog" 22 | clog_namespace = staticconf.NamespaceGetters(namespace) # type: ignore 23 | DETAILED_FORMAT = "\t".join( 24 | [ 25 | "%(asctime)s", 26 | socket.gethostname(), 27 | "%(process)s", 28 | "%(name)s", 29 | "%(levelname)s", 30 | "%(message)s", 31 | ] 32 | ) 33 | 34 | 35 | log_stream_name = clog_namespace.get_string("log_stream_name") 36 | log_stream_format = clog_namespace.get_string("log_stream_format", default=DETAILED_FORMAT) 37 | log_stream_level = clog_namespace.get_string("log_stream_level", default="INFO") 38 | enable_uwsgi_mule_offload = clog_namespace.get_bool("enable_uwsgi_mule_offload", default=False) 39 | 40 | 41 | def initialize(): 42 | """Initialize clog from staticconf config.""" 43 | if enable_uwsgi_mule_offload and clog.uwsgi_plugin_enabled: 44 | clog.uwsgi_patch_global_state() 45 | 46 | add_clog_handler( 47 | name=log_stream_name.value, 48 | level=getattr(logging, log_stream_level.value), 49 | log_format=log_stream_format.value, 50 | ) 51 | 52 | 53 | def add_clog_handler(name, level=logging.INFO, log_format=DETAILED_FORMAT): 54 | """Add a CLog logging handler for the stream 'name'. 55 | 56 | :param name: the name of the log 57 | :type name: string 58 | :param level: the logging level of the handler 59 | :type level: int 60 | """ 61 | clog_handler = clog.handlers.CLogHandler(name) 62 | clog_handler.setLevel(level) 63 | formatter = logging.Formatter(log_format) 64 | clog_handler.setFormatter(formatter) 65 | logging.root.addHandler(clog_handler) 66 | -------------------------------------------------------------------------------- /clusterman/batch/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
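# A minimal sketch of wiring up the clog batch module above outside of the usual
# module_config loading, assuming the clog library is installed and using the
# acceptance clog.yaml plus the log_stream_name value that clusterman.yaml
# normally supplies:
#
#     import staticconf
#     from clusterman.batch import clog as clusterman_clog
#
#     staticconf.YamlConfiguration("acceptance/srv-configs/clog.yaml", namespace="clog")
#     staticconf.DictConfiguration({"log_stream_name": "clusterman"}, namespace="clog")
#     clusterman_clog.initialize()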
14 | import inspect 15 | import os 16 | from contextlib import contextmanager 17 | 18 | import botocore.exceptions 19 | import colorlog 20 | from yelp_batch.batch import batch_context 21 | 22 | from clusterman.monitoring_lib import get_monitoring_client 23 | 24 | RLE_COUNTER_NAME = "clusterman.request_limit_exceeded" 25 | logger = colorlog.getLogger(__name__) 26 | 27 | 28 | class BatchLoggingMixin: # pragma: no cover 29 | @batch_context 30 | def setup_watchers(self): 31 | self.logger.info( 32 | "Starting batch {name}; watching {watched_files} for changes".format( 33 | name=type(self).__name__, 34 | watched_files=[watcher.filename for watcher in self.version_checker.watchers], 35 | ) 36 | ) 37 | yield 38 | self.logger.info("Batch {name} complete".format(name=type(self).__name__)) 39 | 40 | 41 | class BatchRunningSentinelMixin: # pragma: no cover 42 | @batch_context 43 | def make_running_sentinel(self): 44 | batch_name, ext = os.path.splitext(os.path.basename(inspect.getfile(self.__class__))) 45 | sentinel_file = f"/tmp/{batch_name}.running" 46 | with open(sentinel_file, "w") as f: 47 | f.write(str(os.getpid())) 48 | yield 49 | 50 | 51 | @contextmanager 52 | def suppress_request_limit_exceeded(): 53 | try: 54 | yield 55 | except botocore.exceptions.ClientError as e: 56 | if e.response.get("Error", {}).get("Code") == "RequestLimitExceeded": 57 | logger.warning(e) 58 | rle_counter = get_monitoring_client().create_counter(RLE_COUNTER_NAME) 59 | rle_counter.count() 60 | else: 61 | raise 62 | -------------------------------------------------------------------------------- /clusterman/cli/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/cli/info.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
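# A minimal sketch of the suppress_request_limit_exceeded helper above: AWS
# RequestLimitExceeded errors raised inside the block are logged and counted
# instead of crashing the batch, while any other ClientError still propagates.
# The ec2 client is assumed to be a module-level boto3 client in
# clusterman.aws.client (only dynamodb is shown explicitly in this dump):
#
#     from clusterman.aws.client import ec2
#     from clusterman.batch.util import suppress_request_limit_exceeded
#
#     with suppress_request_limit_exceeded():
#         ec2.describe_spot_fleet_requests()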
14 | import simplejson as json 15 | 16 | from clusterman.args import add_cluster_arg 17 | from clusterman.args import add_json_arg 18 | from clusterman.args import subparser 19 | from clusterman.util import get_cluster_name_list 20 | from clusterman.util import get_pool_name_list 21 | 22 | 23 | def list_clusters(args): # pragma: no cover 24 | if args.json: 25 | print(json.dumps(list(get_cluster_name_list()))) 26 | else: 27 | print("\n".join(get_cluster_name_list())) 28 | 29 | 30 | @subparser("list-clusters", "list available clusters", list_clusters) 31 | def add_list_clusters_parser(subparser, required_named_args, optional_named_args): # pragma: no cover 32 | add_json_arg(optional_named_args) 33 | 34 | 35 | def list_pools(args): # pragma: no cover 36 | if args.json: 37 | obj = {scheduler: list(get_pool_name_list(args.cluster, scheduler)) for scheduler in ["mesos", "kubernetes"]} 38 | print(json.dumps(obj)) 39 | else: 40 | for scheduler in ["mesos", "kubernetes"]: 41 | print(f"\n{scheduler.capitalize()} pools\n--------------------") 42 | print("\n".join(get_pool_name_list(args.cluster, scheduler))) 43 | 44 | 45 | @subparser("list-pools", "list available pools in a cluster", list_pools) 46 | def add_list_pools_parser(subparser, required_named_args, optional_named_args): # pragma: no cover 47 | add_cluster_arg(required_named_args, required=True) 48 | add_json_arg(optional_named_args) 49 | -------------------------------------------------------------------------------- /clusterman/cli/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import argparse 15 | import socket 16 | from functools import partial 17 | 18 | import colorlog 19 | 20 | from clusterman.util import limit_function_runtime 21 | 22 | 23 | logger = colorlog.getLogger(__name__) 24 | TIMEOUT_TIME_SECONDS = 5 25 | 26 | 27 | def timeout_wrapper(main): 28 | def wrapper(args: argparse.Namespace): 29 | def timeout_handler(): 30 | warning_string = "This command is taking a long time to run; you're likely targetting a large pool/cluster." 31 | if "yelpcorp" in socket.getfqdn(): 32 | warning_string += "\nIf this command hasn't returned in several minutes, reach out to #clusterman" 33 | logger.warning(warning_string) 34 | 35 | limit_function_runtime(partial(main, args), TIMEOUT_TIME_SECONDS, timeout_handler) 36 | 37 | return wrapper 38 | -------------------------------------------------------------------------------- /clusterman/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/draining/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/draining/kubernetes.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import colorlog 4 | 5 | from clusterman.kubernetes.kubernetes_cluster_connector import KubernetesClusterConnector 6 | 7 | log = colorlog.getLogger(__name__) 8 | 9 | 10 | def drain(connector: Optional[KubernetesClusterConnector], node_name: str, disable_eviction: bool) -> bool: 11 | """Cordons and evicts/deletes all tasks from a given node. 12 | :param node_name: a single node name to drain (as would be passed to kubectl drain) 13 | :param connector: a kubernetes connector to connect kubernetes API 14 | :param disable_eviction: Force drain to use delete (ignoring PDBs) 15 | :returns: bool 16 | """ 17 | if connector: 18 | log.info(f"Preparing to drain {node_name}...") 19 | return connector.drain_node(node_name, disable_eviction) 20 | else: 21 | log.info(f"Unable to drain {node_name} (no Kubernetes connector configured)") 22 | return False 23 | 24 | 25 | def uncordon(connector: Optional[KubernetesClusterConnector], node_name: str) -> bool: 26 | """Cordons and safely evicts all tasks from a given node. 27 | :param node_name: a single node name to uncordon (as would be passed to kubectl uncordon) 28 | :param connector: a kubernetes connector to connect kubernetes API 29 | :returns: bool 30 | """ 31 | if connector: 32 | log.info(f"Preparing to uncordon {node_name}...") 33 | return connector.uncordon_node(node_name) 34 | else: 35 | log.info(f"Unable to uncordon {node_name} (no Kubernetes connector configured)") 36 | return False 37 | -------------------------------------------------------------------------------- /clusterman/exceptions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
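# A minimal sketch of the draining helpers defined in clusterman/draining/kubernetes.py
# above, assuming `connector` is an already-constructed KubernetesClusterConnector
# for the target cluster and pool (construction is not shown here) and the node
# name is hypothetical:
#
#     from clusterman.draining.kubernetes import drain, uncordon
#
#     node = "ip-10-0-0-1.us-west-2.compute.internal"
#     if not drain(connector, node, disable_eviction=False):
#         # drain failed (or no connector was configured); make the node schedulable again
#         uncordon(connector, node)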
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | class ClustermanException(Exception): 17 | pass 18 | 19 | 20 | class AllResourceGroupsAreStaleError(Exception): 21 | pass 22 | 23 | 24 | class AccountNumberMistmatchError(Exception): 25 | pass 26 | 27 | 28 | class AutoscalerError(ClustermanException): 29 | pass 30 | 31 | 32 | class ClustermanSignalError(ClustermanException): 33 | pass 34 | 35 | 36 | class MetricsError(ClustermanException): 37 | pass 38 | 39 | 40 | class NoLaunchTemplateConfiguredError(ClustermanException): 41 | pass 42 | 43 | 44 | class NoResourceGroupsFoundError(Exception): 45 | pass 46 | 47 | 48 | class NoSignalConfiguredException(ClustermanException): 49 | pass 50 | 51 | 52 | class ResourceGroupError(ClustermanException): 53 | pass 54 | 55 | 56 | class PoolManagerError(ClustermanException): 57 | pass 58 | 59 | 60 | class PoolConnectionError(PoolManagerError): 61 | """Raised when the pool master cannot be reached""" 62 | 63 | pass 64 | 65 | 66 | class ResourceRequestError(ClustermanException): 67 | pass 68 | 69 | 70 | class SignalValidationError(ClustermanSignalError): 71 | pass 72 | 73 | 74 | class SignalConnectionError(ClustermanSignalError): 75 | pass 76 | 77 | 78 | class SimulationError(ClustermanException): 79 | pass 80 | -------------------------------------------------------------------------------- /clusterman/interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
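# [Editor's note] The block below is an illustrative sketch, not part of the repository.
# It shows how the exception hierarchy defined in clusterman/exceptions.py above behaves in
# practice: most errors subclass ClustermanException, but a few (for example
# AllResourceGroupsAreStaleError and NoResourceGroupsFoundError) derive from the plain
# Exception class, so a blanket `except ClustermanException` handler will not catch them.
# The `scale_pool` callable is a hypothetical stand-in for real pool-management code.
from clusterman.exceptions import AllResourceGroupsAreStaleError
from clusterman.exceptions import ClustermanException
from clusterman.exceptions import NoResourceGroupsFoundError


def run_scaling_step(scale_pool) -> None:
    try:
        scale_pool()
    except (AllResourceGroupsAreStaleError, NoResourceGroupsFoundError) as e:
        # These do NOT inherit from ClustermanException and need their own handler.
        print(f"resource group problem, skipping this scaling step: {e}")
    except ClustermanException as e:
        # Covers AutoscalerError, PoolManagerError, SignalValidationError, etc.
        print(f"clusterman error: {e}")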
14 | -------------------------------------------------------------------------------- /clusterman/interfaces/types.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from typing import NamedTuple 3 | from typing import Optional 4 | 5 | import arrow 6 | 7 | from clusterman.aws.markets import InstanceMarket 8 | from clusterman.util import ClustermanResources 9 | 10 | 11 | class AgentState(enum.Enum): 12 | IDLE = "idle" 13 | ORPHANED = "orphaned" 14 | RUNNING = "running" 15 | UNKNOWN = "unknown" 16 | 17 | 18 | class AgentMetadata(NamedTuple): 19 | agent_id: str = "" 20 | allocated_resources: ClustermanResources = ClustermanResources() 21 | batch_task_count: int = 0 22 | is_safe_to_kill: bool = True 23 | is_draining: bool = False 24 | priority: float = 0.0 25 | state: AgentState = AgentState.UNKNOWN 26 | task_count: int = 0 27 | total_resources: ClustermanResources = ClustermanResources() 28 | kernel: str = "" 29 | lsbrelease: str = "" 30 | 31 | 32 | class InstanceMetadata(NamedTuple): 33 | market: InstanceMarket 34 | weight: float 35 | group_id: str = "" 36 | hostname: Optional[str] = None 37 | instance_id: str = "" 38 | is_cordoned: bool = False 39 | ip_address: Optional[str] = None 40 | is_stale: bool = False 41 | state: str = "" 42 | uptime: arrow.Arrow = 0 43 | 44 | 45 | class ClusterNodeMetadata(NamedTuple): 46 | agent: AgentMetadata # Agent metadata is information associated with the Mesos or Kubernetes agent 47 | instance: InstanceMetadata # Instance metadata is information associated with the EC2 instance 48 | -------------------------------------------------------------------------------- /clusterman/kubernetes/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/math/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/math/piecewise_types.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Hashable 15 | from typing import TypeVar 16 | 17 | from typing_extensions import Protocol 18 | 19 | 20 | T = TypeVar("T") 21 | 22 | 23 | class XValueDiff(Protocol[T], Hashable): 24 | def __mul__(self, other: int) -> "XValueDiff[T]": 25 | ... 26 | 27 | def __truediv__(self, other: "XValueDiff[T]") -> float: 28 | ... 29 | 30 | 31 | class XValue(Protocol[T], Hashable): 32 | def __add__(self, other: XValueDiff[T]) -> "XValue[T]": 33 | ... 34 | 35 | def __sub__(self, other: "XValue[T]") -> XValueDiff[T]: 36 | ... 37 | 38 | def __floordiv__(self, other: "XValue[T]") -> float: 39 | ... 40 | 41 | def __lt__(self, other: "XValue[T]") -> bool: 42 | ... 43 | 44 | def __ge__(self, other: "XValue[T]") -> bool: 45 | ... 46 | 47 | def __mod__(self, other: "XValue[T]") -> int: 48 | ... 49 | -------------------------------------------------------------------------------- /clusterman/mesos/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/migration/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/migration/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # K8s CRD 15 | MIGRATION_CRD_GROUP = "clusterman.yelp.com" 16 | MIGRATION_CRD_VERSION = "v1" 17 | MIGRATION_CRD_PLURAL = "nodemigrations" 18 | MIGRATION_CRD_KIND = "NodeMigration" 19 | MIGRATION_CRD_STATUS_LABEL = "clusterman.yelp.com/migration_status" 20 | MIGRATION_CRD_ATTEMPTS_LABEL = "clusterman.yelp.com/attempts" 21 | 22 | # Default settings 23 | DEFAULT_POOL_PRESCALING = 0 24 | DEFAULT_NODE_BOOT_WAIT = "3m" 25 | DEFAULT_NODE_BOOT_TIMEOUT = "10m" 26 | DEFAULT_WORKER_TIMEOUT = "2h" 27 | DEFAULT_HEALTH_CHECK_INTERVAL = "2m" 28 | DEFAULT_ALLOWED_FAILED_DRAINS = 3 29 | DEFAULT_ORPHAN_CAPACITY_TOLLERANCE = 0 30 | DEFAULT_MAX_UPTIME_WORKER_SKIPS = 6 31 | MAX_ORPHAN_CAPACITY_TOLLERANCE = 0.2 32 | 33 | # Worker parameters 34 | UPTIME_CHECK_INTERVAL_SECONDS = 60 * 60 # 1 hour 35 | INITIAL_POOL_HEALTH_TIMEOUT_SECONDS = 15 * 60 36 | SUPPORTED_POOL_SCHEDULER = "kubernetes" 37 | 38 | # SFX metrics keys 39 | SFX_NODE_DRAIN_COUNT = "clusterman.node_migration.drain_count" 40 | SFX_MIGRATION_JOB_DURATION = "clusterman.node_migration.duration" 41 | SFX_DRAINED_NODE_UPTIME = "clusterman.node_migration.drained_node_uptime" 42 | -------------------------------------------------------------------------------- /clusterman/migration/event_enums.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
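# [Editor's note] The block below is an illustrative sketch, not part of the repository.
# It shows one way the NodeMigration CRD constants from clusterman/migration/constants.py
# above could be used with the official `kubernetes` Python client to list migration
# resources labelled as pending. The "pending" label value mirrors MigrationStatus.PENDING
# defined in event_enums.py; the function name and kubeconfig loading are assumptions made
# for the example.
import kubernetes

from clusterman.migration.constants import MIGRATION_CRD_GROUP
from clusterman.migration.constants import MIGRATION_CRD_PLURAL
from clusterman.migration.constants import MIGRATION_CRD_STATUS_LABEL
from clusterman.migration.constants import MIGRATION_CRD_VERSION


def list_pending_node_migrations() -> list:
    kubernetes.config.load_kube_config()  # use load_incluster_config() when running in a pod
    crd_api = kubernetes.client.CustomObjectsApi()
    response = crd_api.list_cluster_custom_object(
        MIGRATION_CRD_GROUP,
        MIGRATION_CRD_VERSION,
        MIGRATION_CRD_PLURAL,
        label_selector=f"{MIGRATION_CRD_STATUS_LABEL}=pending",
    )
    return response.get("items", [])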
14 | import enum 15 | import operator 16 | from typing import Any 17 | from typing import Collection 18 | from typing import Union 19 | 20 | import packaging.version 21 | import semver 22 | 23 | from clusterman.interfaces.types import ClusterNodeMetadata 24 | 25 | 26 | ComparableVersion = Union[semver.VersionInfo, packaging.version.Version] 27 | ComparableConditionTarget = Union[str, int, ComparableVersion] 28 | 29 | 30 | class MigrationStatus(enum.Enum): 31 | PENDING = "pending" 32 | INPROGRESS = "inprogress" 33 | COMPLETED = "completed" 34 | SKIPPED = "skipped" 35 | STOP = "stop" 36 | FAILED = "failed" 37 | 38 | 39 | class ConditionTrait(enum.Enum): 40 | KERNEL = "kernel" 41 | LSBRELEASE = "lsbrelease" 42 | INSTANCE_TYPE = "instance_type" 43 | UPTIME = "uptime" 44 | 45 | def get_from(self, node: ClusterNodeMetadata) -> ComparableConditionTarget: 46 | """Get trait value from node metadata 47 | 48 | :param ClusterNodeMetadata node: node metadata 49 | :return: value 50 | """ 51 | return CONDITION_TRAIT_GETTERS[self](node) 52 | 53 | 54 | class ConditionOperator(enum.Enum): 55 | GT = "gt" 56 | GE = "ge" 57 | EQ = "eq" 58 | NE = "ne" 59 | LT = "lt" 60 | LE = "le" 61 | IN = "in" 62 | NOTIN = "notin" 63 | 64 | @classmethod 65 | def expecting_collection(cls) -> Collection["ConditionOperator"]: 66 | """Return operators expecting collection of object as right-operand""" 67 | return (cls.IN, cls.NOTIN) 68 | 69 | def apply(self, left: Any, right: Any) -> bool: 70 | """Apply operator 71 | 72 | :param Any left: left operand 73 | :param Any right: right operand 74 | :return: boolean result 75 | """ 76 | if self == ConditionOperator.IN: 77 | return left in right 78 | elif self == ConditionOperator.NOTIN: 79 | return left not in right 80 | return getattr(operator, self.value)(left, right) 81 | 82 | 83 | CONDITION_OPERATOR_SUPPORT_MATRIX = { 84 | ConditionTrait.KERNEL: set(ConditionOperator), 85 | ConditionTrait.LSBRELEASE: set(ConditionOperator), 86 | ConditionTrait.INSTANCE_TYPE: { 87 | ConditionOperator.EQ, 88 | ConditionOperator.NE, 89 | ConditionOperator.IN, 90 | ConditionOperator.NOTIN, 91 | }, 92 | ConditionTrait.UPTIME: {ConditionOperator.GT, ConditionOperator.GE, ConditionOperator.LT, ConditionOperator.LE}, 93 | } 94 | 95 | CONDITION_TRAIT_GETTERS = { 96 | ConditionTrait.KERNEL: lambda node: semver.VersionInfo.parse(node.agent.kernel), 97 | ConditionTrait.LSBRELEASE: lambda node: packaging.version.parse(node.agent.lsbrelease), 98 | ConditionTrait.INSTANCE_TYPE: lambda node: node.instance.market.instance, 99 | ConditionTrait.UPTIME: lambda node: node.instance.uptime.total_seconds(), 100 | } 101 | -------------------------------------------------------------------------------- /clusterman/reports/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
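# [Editor's note] The block below is an illustrative sketch, not part of the repository.
# It shows how the enums in clusterman/migration/event_enums.py above fit together: a
# ConditionTrait getter pulls a comparable value out of ClusterNodeMetadata, and a
# ConditionOperator is applied against a target value. The target kernel version and the
# helper name are invented for the example; `node` is any ClusterNodeMetadata instance.
import semver

from clusterman.interfaces.types import ClusterNodeMetadata
from clusterman.migration.event_enums import ConditionOperator
from clusterman.migration.event_enums import ConditionTrait


def node_kernel_is_older_than(node: ClusterNodeMetadata, target_kernel: str) -> bool:
    current = ConditionTrait.KERNEL.get_from(node)  # parsed as semver.VersionInfo
    target = semver.VersionInfo.parse(target_kernel)  # e.g. "5.4.0"
    return ConditionOperator.LT.apply(current, target)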
14 | -------------------------------------------------------------------------------- /clusterman/reports/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | AXIS_DIMENSION_INCHES = (8, 2.5) 15 | COLORMAP = "plasma" 16 | ERROR_COLOR = "C3" 17 | TREND_LINE_COLOR = "orange" 18 | TREND_RANGE_COLOR = "xkcd:light orange" 19 | TREND_RANGE_ALPHA = 0.5 20 | FIGURE_DPI = 300 21 | MAGNITUDE_STRINGS = [ 22 | None, 23 | "thousands", 24 | "millions", 25 | "billions", 26 | "trillions", 27 | ] 28 | SUBTITLE_SPACING = 64 29 | -------------------------------------------------------------------------------- /clusterman/run.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | 16 | from clusterman.args import parse_args 17 | from clusterman.config import setup_config 18 | from clusterman.util import setup_logging 19 | 20 | 21 | def main(argv=None): 22 | if argv is None: 23 | argv = sys.argv[1:] 24 | 25 | args = parse_args(argv, "Cluster scaling and management for Mesos and Kubernetes") 26 | 27 | setup_logging(args.log_level) 28 | setup_config(args) 29 | 30 | args.entrypoint(args) 31 | 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /clusterman/signals/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/clusterman/signals/__init__.py -------------------------------------------------------------------------------- /clusterman/simulator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/simulator/io.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import gzip 15 | 16 | import arrow 17 | import jsonpickle 18 | import simplejson as json 19 | from sortedcontainers import SortedDict 20 | 21 | 22 | def _python_encode(obj): 23 | return json.loads(jsonpickle.encode(obj)) 24 | 25 | 26 | def _python_decode(obj): 27 | return jsonpickle.decode(json.dumps(obj)) 28 | 29 | 30 | class ArrowSerializer(jsonpickle.handlers.BaseHandler): 31 | def flatten(self, obj, data): 32 | data["timestamp"] = obj.timestamp 33 | return data 34 | 35 | def restore(self, data): 36 | return arrow.get(data["timestamp"]) 37 | 38 | 39 | class SortedDictSerializer(jsonpickle.handlers.BaseHandler): 40 | def flatten(self, obj, data): 41 | data["items"] = [(_python_encode(k), _python_encode(v)) for k, v in obj.items()] 42 | return data 43 | 44 | def restore(self, data): 45 | return SortedDict((_python_decode(k), _python_decode(v)) for k, v in data["items"]) 46 | 47 | 48 | def _register_handlers(): 49 | # These operations are idempotent, it's safe to do more than once 50 | jsonpickle.handlers.register(arrow.Arrow, ArrowSerializer) 51 | jsonpickle.handlers.register(SortedDict, SortedDictSerializer) 52 | 53 | 54 | def read_object_from_compressed_json(filename, raw_timestamps=False): 55 | """Read a Python object from a gzipped JSON file""" 56 | _register_handlers() 57 | with gzip.open(filename) as f: 58 | if raw_timestamps: 59 | old_arrow = arrow.get 60 | arrow.get = int 61 | data = jsonpickle.decode(f.read().decode()) 62 | if raw_timestamps: 63 | arrow.get = old_arrow 64 | return data 65 | 66 | 67 | def write_object_to_compressed_json(obj, filename): 68 | """Write the Python object to a compressed (gzipped) JSON file 69 | 70 | :param obj: a Python object to serialize 71 | :param filename: the file to write to 72 | """ 73 | _register_handlers() 74 | with gzip.open(filename, "w") as f: 75 | f.write(jsonpickle.encode(obj).encode()) 76 | -------------------------------------------------------------------------------- /clusterman/simulator/simulated_cluster_connector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import uuid 15 | 16 | import staticconf 17 | 18 | from clusterman.interfaces.cluster_connector import ClusterConnector 19 | from clusterman.interfaces.types import AgentMetadata 20 | from clusterman.interfaces.types import AgentState 21 | from clusterman.simulator import simulator 22 | from clusterman.util import ClustermanResources 23 | 24 | 25 | class SimulatedClusterConnector(ClusterConnector): 26 | def __init__(self, cluster: str, pool: str, simulator: "simulator.Simulator") -> None: 27 | self.cluster = cluster 28 | self.pool = pool 29 | self.simulator = simulator 30 | 31 | def reload_state(self) -> None: 32 | pass 33 | 34 | def get_resource_allocation(self, resource_name: str) -> float: 35 | return 0 36 | 37 | def get_resource_total(self, resource_name: str) -> float: 38 | total = 0 39 | for c in self.simulator.aws_clusters: 40 | for i in c.instances.values(): 41 | if self.simulator.current_time < i.join_time: 42 | continue 43 | 44 | total += getattr(i.resources, resource_name) 45 | return total 46 | 47 | def _get_agent_metadata(self, instance_ip: str) -> AgentMetadata: 48 | for c in self.simulator.aws_clusters: 49 | for i in c.instances.values(): 50 | if instance_ip == i.ip_address: 51 | return AgentMetadata( 52 | agent_id=str(uuid.uuid4()), 53 | state=(AgentState.ORPHANED if self.simulator.current_time < i.join_time else AgentState.IDLE), 54 | total_resources=ClustermanResources( 55 | cpus=i.resources.cpus, 56 | mem=i.resources.mem * 1000, 57 | disk=(i.resources.disk or staticconf.read_int("ebs_volume_size", 0)) * 1000, 58 | gpus=(i.resources.gpus), 59 | ), 60 | ) 61 | 62 | # if we don't know the given IP then it's orphaned 63 | return AgentMetadata(state=AgentState.ORPHANED) 64 | -------------------------------------------------------------------------------- /clusterman/simulator/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import arrow 15 | from staticconf.testing import PatchConfiguration 16 | 17 | 18 | def patch_join_delay(mean=0, stdev=0): 19 | return PatchConfiguration( 20 | { 21 | "join_delay_mean_seconds": mean, 22 | "join_delay_stdev_seconds": stdev, 23 | } 24 | ) 25 | 26 | 27 | class SimulationMetadata: # pragma: no cover 28 | def __init__(self, name, cluster, pool, scheduler): 29 | self.name = name 30 | self.cluster = cluster 31 | self.pool = pool 32 | self.scheduler = scheduler 33 | self.sim_start = None 34 | self.sim_end = None 35 | 36 | def __enter__(self): 37 | self.sim_start = arrow.now() 38 | 39 | def __exit__(self, type, value, traceback): 40 | self.sim_end = arrow.now() 41 | 42 | def __str__(self): 43 | return f"({self.cluster}, {self.pool}, {self.sim_start}, {self.sim_end})" 44 | -------------------------------------------------------------------------------- /clusterman/supervisord/fetch_clusterman_signal: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source /etc/boto_cfg/clusterman.sh 4 | 5 | # don't set -ex until after executing the above so we don't leak keys in the logs 6 | set -ex 7 | source /etc/lsb-release # gives us DISTRIB_CODENAME 8 | 9 | SIGNAL_DIR=${2:-.} 10 | VERSIONS=(${CMAN_VERSIONS_TO_FETCH}) 11 | version="clusterman_signals_${VERSIONS[$1]}" 12 | mkdir -p ${SIGNAL_DIR}/${version} 13 | cd ${SIGNAL_DIR}/${version} 14 | aws ${AWS_ENDPOINT_URL_ARGS} s3 cp "s3://${CMAN_SIGNALS_BUCKET}/${DISTRIB_CODENAME}/${version}.tar.gz" . 15 | tar -xzf "${version}.tar.gz" 16 | -------------------------------------------------------------------------------- /clusterman/supervisord/run_clusterman_signal: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | SIGNAL_DIR=${2:-.} 5 | VERSIONS=(${CMAN_SIGNAL_VERSIONS}) 6 | NAMESPACES=(${CMAN_SIGNAL_NAMESPACES}) 7 | NAMES=(${CMAN_SIGNAL_NAMES}) 8 | APPS=(${CMAN_SIGNAL_APPS}) 9 | version="clusterman_signals_${VERSIONS[$1]}" 10 | namespace="${NAMESPACES[$1]}" 11 | name="${NAMES[$1]}" 12 | app="${APPS[$1]}" 13 | 14 | cd ${SIGNAL_DIR}/${version} 15 | prodenv/bin/python -m clusterman_signals.run ${namespace} ${name} ${app} 16 | -------------------------------------------------------------------------------- /clusterman/supervisord/supervisord.conf: -------------------------------------------------------------------------------- 1 | [unix_http_server] 2 | file=/tmp/supervisor.sock ; the path to the socket file 3 | 4 | [inet_http_server] 5 | port=127.0.0.1:9001 6 | 7 | [supervisord] 8 | logfile=/tmp/supervisord.log ; main log file; default $CWD/supervisord.log 9 | logfile_maxbytes=50MB ; max main logfile bytes b4 rotation; default 50MB 10 | logfile_backups=5 ; # of main logfile backups; 0 means none, default 10 11 | loglevel=info ; log level; default info; others: debug,warn,trace 12 | pidfile=/tmp/supervisord.pid ; supervisord pidfile; default supervisord.pid 13 | nodaemon=true ; start in foreground if true; default false 14 | minfds=1024 ; min. avail startup file descriptors; default 1024 15 | minprocs=200 ; min. 
avail process descriptors;default 200 16 | 17 | [rpcinterface:supervisor] 18 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 19 | 20 | [supervisorctl] 21 | serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket 22 | 23 | [program:fetch_signals] 24 | process_name=fetch_signals_%(process_num)s 25 | numprocs=%(ENV_CMAN_NUM_VERSIONS)s 26 | command=/usr/bin/fetch_clusterman_signal %(process_num)s 27 | autostart=true 28 | autorestart=false 29 | startretries=0 30 | stdout_logfile=/dev/stdout 31 | stdout_logfile_maxbytes=0 32 | redirect_stderr=true 33 | startsecs=0 34 | 35 | [program:run_signals] 36 | process_name=run_signals_%(process_num)s 37 | numprocs=%(ENV_CMAN_NUM_SIGNALS)s 38 | command=/usr/bin/run_clusterman_signal %(process_num)s 39 | autostart=false 40 | startretries=0 41 | stopasgroup=true 42 | stdout_logfile=/dev/stdout 43 | stdout_logfile_maxbytes=0 44 | redirect_stderr=true 45 | 46 | [program:autoscaler] 47 | directory=/code 48 | environment=PATH=/code/virtualenv_run/bin:%(ENV_PATH)s 49 | command=python -m clusterman.batch.autoscaler %(ENV_CMAN_ARGS)s 50 | autostart=false 51 | autorestart=false 52 | 53 | ; The following is to make manual testing and debugging easier. If we redirect to stdout 54 | ; from the autoscaler batch, then we end up writing to scribe twice: once from the autoscaler 55 | ; batch and once from the autoscaler bootstrap (where supervisord's stdout is piped to stdin2scribe). 56 | ; By redirecting the autoscaler to stderr we ensure that we can still see output during manual 57 | ; debugging but don't write to scribe twice. 58 | stdout_logfile=/dev/stderr 59 | stdout_logfile_maxbytes=0 60 | stderr_logfile=/dev/stderr 61 | stderr_logfile_maxbytes=0 62 | -------------------------------------------------------------------------------- /clusterman/tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /clusterman/tools/rookout.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from os import getenv 15 | 16 | 17 | def enable_rookout() -> None: 18 | """Enable rookout if environment variables are set""" 19 | if getenv("ROOKOUT_ENABLE", "") != "1": 20 | return 21 | import rook 22 | 23 | rook.start(token=getenv("ROOKOUT_TOKEN")) 24 | -------------------------------------------------------------------------------- /clusterman_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/clusterman_logo.png -------------------------------------------------------------------------------- /code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies within all project spaces, and it also applies when 49 | an individual is representing the project or its community in public spaces. 50 | Examples of representing a project or community include using an official 51 | project e-mail address, posting via an official social media account, or acting 52 | as an appointed representative at an online or offline event. Representation of 53 | a project may be further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at opensource@yelp.com. 
All complaints 59 | will be reviewed and investigated and will result in a response that is deemed 60 | necessary and appropriate to the circumstances. The project team is obligated 61 | to maintain confidentiality with regard to the reporter of an incident. Further 62 | details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /completions/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/completions/.gitignore -------------------------------------------------------------------------------- /debian/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | !changelog 4 | !compat 5 | !control 6 | !copyright 7 | !rules 8 | !rules.external 9 | !clusterman.install 10 | !clusterman.links 11 | -------------------------------------------------------------------------------- /debian/clusterman.links: -------------------------------------------------------------------------------- 1 | opt/venvs/clusterman/bin/clusterman usr/bin/clusterman 2 | opt/venvs/clusterman/bin/fetch_clusterman_signal usr/bin/fetch_clusterman_signal 3 | opt/venvs/clusterman/bin/run_clusterman_signal usr/bin/run_clusterman_signal 4 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: clusterman 2 | Maintainer: Compute Infra 3 | Build-Depends: 4 | dh-virtualenv, 5 | 6 | Package: clusterman 7 | Depends: 8 | python3.8, 9 | # unfortunately needed for numpy to work 10 | libatlas3-base, 11 | # needed so that we can grab signals from s3 12 | # that said, we have an internal fork that conflicts with this 13 | # once that's gone, we should re-add this (aws-cli vs awscli) 14 | # instead of adding this with `jammyOrLater:Depends` 15 | # awscli, 16 | ${misc:Depends}, 17 | ${python:Depends}, 18 | ${shlibs:Depends}, 19 | ${bionicOrLater:Depends}, 20 | ${jammyOrLater:Depends}, 21 | Architecture: any 22 | Description: Cluster scaling and management - y/clusterman 23 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | ifneq ($(shell echo ${CI}),true) 4 | YELP_DH_VIRTUALENV = --preinstall=-rrequirements-bootstrap.txt --extra-pip-arg --only-binary=:all: 5 | YELP_DH_INSTALL = install -d completions/usr debian/package/. 
6 | endif 7 | 8 | # Use the dpkg version comparison algorithm (which is easily in reach) to compare the build 9 | # system version to the version from which python3-distutils is required (18.04 and higher), 10 | # and conditionally populate a substitution variable which is referenced in the control file. 11 | # On bionic, where Ubuntu's python3.8 is used, this is a virtual package for python3-distutils 12 | # (in bionic-updates). On jammy, this is a real package providing distutils for python3.8 from deadsnakes. 13 | extra_substvars = -VbionicOrLater:Depends="python3.8-distutils" 14 | 15 | # and then do the same thing for awscli - of which we used to have a patched version called aws-cli pre-jammy. 16 | # once jammy boxes are the oldest things we install clusterman on, we can get rid of this and just include this 17 | # directly in debian/control 18 | ifeq ($(shell (. /etc/os-release && dpkg --compare-versions $$VERSION_ID "ge" "22.04" && echo yes || echo no)),yes) 19 | extra_substvars = -VjammyOrLater:Depends="awscli" 20 | else 21 | # aws-cli only exists internally, so lets make sure that we only use it internally 22 | ifeq ($(shell echo ${PAASTA_ENV}), YELP) 23 | extra_substvars = -VjammyOrLater:Depends="aws-cli" 24 | else 25 | extra_substvars = -VjammyOrLater:Depends="awscli" 26 | endif 27 | endif 28 | 29 | 30 | %: 31 | dh $@ --with python-virtualenv 32 | 33 | override_dh_gencontrol: 34 | dh_gencontrol -- $(extra_substvars) 35 | 36 | override_dh_virtualenv: 37 | dh_virtualenv --python python3.8 $(YELP_DH_VIRTUALENV) 38 | 39 | override_dh_install: 40 | dh_install 41 | $(YELP_DH_INSTALL) 42 | 43 | override_dh_shlibdeps: 44 | dh_shlibdeps -X site-packages/Pillow.libs --exclude matplotlib --exclude numpy --dpkg-shlibdeps-params=--ignore-missing-info 45 | 46 | override_dh_strip: 47 | dh_strip -X site-packages/Pillow.libs 48 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = clusterman 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/examples/autoscaler_config.yaml: -------------------------------------------------------------------------------- 1 | sfrs: 2 | - sfr-5c87f608-8fa8-48d9-8a78-8887a16509e0 3 | - sfr-a83610ec-5589-4258-bbd2-dad14f2acaa1 4 | - sfr-e8807b39-7041-4997-8a31-4a292514d26b 5 | 6 | # configs: 7 | # - "LaunchSpecifications": 8 | # - "InstanceType": "c4.8xlarge" 9 | # "SubnetId": "subnet-b47bb7d1" 10 | # "SpotPrice": 10 11 | # "WeightedCapacity": 4 12 | # "AllocationStrategy": "diversified" 13 | # - "LaunchSpecifications": 14 | # - "InstanceType": "c3.8xlarge" 15 | # "SubnetId": "subnet-b47bb7d1" 16 | # "SpotPrice": 10 17 | # "WeightedCapacity": 4 18 | # "AllocationStrategy": "diversified" 19 | -------------------------------------------------------------------------------- /docs/examples/design.yaml: -------------------------------------------------------------------------------- 1 | metadata: 2 | spot_prices|aws_availability_zone=us-west-2a,aws_instance_type=c3.8xlarge: &spot_prices 3 | 4 | # If no timezone is specified, generator will use YST 5 | start_time: "2017-12-01T08:00:00Z" 6 | end_time: "2017-12-01T09:00:00Z" 7 | 8 | frequency: 9 | distribution: expovariate 10 | params: 11 | lambd: 0.0033333 # Assume prices change on average every five minutes 12 | 13 | values: 14 | distribution: uniform 15 | params: 16 | a: 0 17 | b: 1 18 | 19 | spot_prices|aws_availability_zone=us-west-2b,aws_instance_type=c3.8xlarge: *spot_prices 20 | spot_prices|aws_availability_zone=us-west-2c,aws_instance_type=c3.8xlarge: *spot_prices 21 | 22 | capacity|cluster=norcal-prod,role=seagull: 23 | start_time: "2017-12-01T08:00:00Z" 24 | end_time: "2017-12-01T09:00:00Z" 25 | 26 | dict_keys: 27 | - c3.8xlarge,us-west-2a 28 | - c3.8xlarge,us-west-2b 29 | - c3.8xlarge,us-west-2c 30 | 31 | frequency: 32 | distribution: expovariate 33 | params: 34 | lambd: 0.001666 # Assume capacity change on average every ten minutes 35 | 36 | values: 37 | distribution: randint 38 | params: 39 | a: 10 40 | b: 50 41 | 42 | app_metrics: 43 | seagull_runs: 44 | start_time: "2017-12-01T08:00:00Z" 45 | end_time: "2017-12-01T09:00:00Z" 46 | frequency: 47 | distribution: expovariate 48 | params: 49 | lambd: 0.0041666 # 15 seagull runs per hour 50 | values: 1 51 | 52 | 53 | system_metrics: 54 | cpu_allocation|cluster=everywhere-testopia,role=jolt: 55 | start_time: "2017-12-01T08:00:00Z" 56 | end_time: "2017-12-01T09:00:00Z" 57 | frequency: historical 58 | values: 59 | aws_region: "us-west-2" 60 | params: # calculate value by a*x + b 61 | a: 1.5 62 | b: 10 63 | -------------------------------------------------------------------------------- /docs/examples/metrics.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/docs/examples/metrics.json.gz -------------------------------------------------------------------------------- /docs/source/_static/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/docs/source/_static/.gitignore -------------------------------------------------------------------------------- /docs/source/api/AWSResourceGroup.rst: 
-------------------------------------------------------------------------------- 1 | AWSResourceGroup 2 | ================ 3 | 4 | .. autoclass:: clusterman.aws.aws_resource_group.AWSResourceGroup 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/AutoScalingResourceGroup.rst: -------------------------------------------------------------------------------- 1 | AutoScalingResourceGroup 2 | ======================== 3 | 4 | .. autoclass:: clusterman.aws.auto_scaling_resource_group.AutoScalingResourceGroup 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/Autoscaler.rst: -------------------------------------------------------------------------------- 1 | Autoscaler 2 | ========== 3 | 4 | .. autoclass:: clusterman.autoscaler.autoscaler.Autoscaler 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/MesosPoolManager.rst: -------------------------------------------------------------------------------- 1 | MesosPoolManager 2 | ================ 3 | 4 | .. autoclass:: clusterman.mesos.mesos_pool_manager.MesosPoolManager 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/Signal.rst: -------------------------------------------------------------------------------- 1 | Signal 2 | ====== 3 | 4 | .. autoclass:: clusterman.interfaces.signal.Signal 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/SpotFleetResourceGroup.rst: -------------------------------------------------------------------------------- 1 | SpotFleetResourceGroup 2 | ====================== 3 | 4 | .. autoclass:: clusterman.aws.spot_fleet_resource_group.SpotFleetResourceGroup 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/api/aws_markets.rst: -------------------------------------------------------------------------------- 1 | AWS Markets 2 | =========== 3 | 4 | .. autoclass:: clusterman.aws.markets.InstanceResources 5 | 6 | .. class:: clusterman.aws.markets.InstanceMarket(instance, availability_zone) 7 | -------------------------------------------------------------------------------- /docs/source/api/clusterman_metrics.rst: -------------------------------------------------------------------------------- 1 | clusterman_metrics 2 | ================== 3 | 4 | .. autoclass:: clusterman_metrics.ClustermanMetricsBotoClient 5 | :members: __init__, get_writer, get_metric_values 6 | 7 | .. automodule:: clusterman_metrics 8 | :members: generate_key_with_dimensions 9 | -------------------------------------------------------------------------------- /docs/source/drainer.rst: -------------------------------------------------------------------------------- 1 | Drainer 2 | ============== 3 | 4 | *Drainer* is the component to drain pods off the node before terminating. 5 | It may drain and terminate nodes for three reasons: 6 | 7 | * ``spot_interruption`` 8 | * ``node_migration`` 9 | * ``scaling_down`` 10 | 11 | **NOTE**: all settings are only compatible with Kubernetes clusters. 12 | 13 | 14 | Drainer Batch 15 | -------------------- 16 | 17 | The *Drainer batch* is the entrypoint of the draining logic. 18 | 19 | The batch code can be invoked from the ``clusterman.batch.drainer`` Python module. 20 | 21 | 22 | .. 
_drainer_configuration: 23 | 24 | Pool Configuration 25 | ------------------ 26 | 27 | The behaviour of the drainer logic for a pool is controlled by the ``draining`` section of the pool configuration. 28 | The allowed values for the drainer settings are as follows: 29 | 30 | * ``draining_time_threshold_seconds``: maximum time allowed for the draining process to complete (1800 by default) 31 | * ``redraining_delay_seconds``: how long to wait between draining attempts if a drain fails (15 by default). 32 | * ``force_terminate``: forcibly terminate the node once ``draining_time_threshold_seconds`` is reached (false by default). 33 | 34 | 35 | See :ref:`pool_configuration` for an example of what this configuration block looks like. 36 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. clusterman documentation master file, created by 2 | sphinx-quickstart on Thu Aug 3 09:34:59 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | clusterman 7 | ====================================== 8 | 9 | Clusterman autoscales Mesos clusters based on the values of user-defined signals 10 | of resource utilization. It also provides tools to manually manage those clusters, 11 | and simulate how changes to autoscaling logic will impact the cost and performance. 12 | 13 | 14 | .. toctree:: 15 | :titlesonly: 16 | 17 | overview 18 | 19 | .. toctree:: 20 | :maxdepth: 2 21 | :caption: Autoscaling 22 | 23 | metrics 24 | signals 25 | autoscaler 26 | configuration 27 | resource_groups 28 | 29 | 30 | .. toctree:: 31 | :maxdepth: 2 32 | :caption: Tools 33 | 34 | manage 35 | simulator 36 | tools 37 | node_migration 38 | drainer 39 | 40 | 41 | .. toctree:: 42 | :maxdepth: 1 43 | :caption: API Reference 44 | 45 | api/AutoScalingResourceGroup 46 | api/Autoscaler 47 | api/AWSResourceGroup 48 | api/aws_markets 49 | api/clusterman_metrics 50 | api/MesosPoolManager 51 | api/Signal 52 | api/SpotFleetResourceGroup 53 | 54 | 55 | Indices and tables 56 | ================== 57 | 58 | * :ref:`genindex` 59 | * :ref:`modindex` 60 | * :ref:`search` 61 | -------------------------------------------------------------------------------- /docs/source/manage.rst: -------------------------------------------------------------------------------- 1 | Cluster Management 2 | ================== 3 | 4 | Clusterman comes with a number of command-line tools to help with cluster management. 5 | 6 | Discovery 7 | --------- 8 | 9 | The ``clusterman list-clusters`` and ``clusterman list-pools`` commands can aid in determining what clusters and pools 10 | Clusterman knows about: 11 | 12 | .. program-output:: python -m clusterman.run list-clusters --help 13 | :cwd: ../../ 14 | 15 | .. program-output:: python -m clusterman.run list-pools --help 16 | :cwd: ../../ 17 | 18 | Management 19 | ---------- 20 | 21 | The ``clusterman manage`` command can be used to directly change the state of the cluster: 22 | 23 | .. program-output:: python -m clusterman.run manage --help 24 | :cwd: ../../ 25 | 26 | The ``--target-capacity`` option allows users to directly change the size of the Mesos cluster specified by the 27 | ``--cluster`` and ``--pool`` arguments. 28 | 29 | Note that there can be up to a few minutes of "lag time" between when the manage command is issued and when 30 | changes are reflected in the cluster. This is due to potential delays
This is due to potential delays introduced into the pipeline while AWS finds and 31 | procures new instances for the cluster. Therefore, it is not recommended to run ``clusterman manage`` repeatedly in 32 | short succession, or immediately after the autoscaler batch has run. 33 | 34 | .. note:: Future versions of Clusterman may include a rate-limiter for the manage command 35 | 36 | .. note:: By providing the existing target capacity value as the argument to ``--target-capacity``, you can force 37 | Clusterman to attempt to prune any :attr:`fulfilled capacity ` that is above the 38 | desired :attr:`target capacity `. 39 | 40 | Status 41 | ------ 42 | 43 | The ``clusterman status`` command provides information on the current state of the cluster: 44 | 45 | .. program-output:: python -m clusterman.run status --help 46 | :cwd: ../../ 47 | 48 | As noted above, the state of the cluster may take a few minutes to equilibrate after a ``clusterman manage`` command or 49 | the autoscaler has run, so the output from ``clusterman status`` may not accurately reflect the desired status. 50 | -------------------------------------------------------------------------------- /docs/source/resource_groups.rst: -------------------------------------------------------------------------------- 1 | Resource Groups 2 | =============== 3 | 4 | Resource groups are wrappers around cloud provider APIs to enable scaling up and down groups of machines. A resource 5 | group implments the :py:class:`.ResourceGroup` interface, which provides the set of required methods for 6 | Clusterman to interact with the resource group. Currently, Clusterman supports the following types of resource groups: 7 | 8 | * :py:class:`.AutoScalingResourceGroup`: `AWS autoscaling groups 9 | `_ 10 | * :py:class:`.SpotFleetResourceGroup`: `AWS spot fleet requests 11 | `_ 12 | -------------------------------------------------------------------------------- /docs/source/simulator.rst: -------------------------------------------------------------------------------- 1 | Simulation 2 | ========== 3 | 4 | Running the Simulator 5 | --------------------- 6 | 7 | .. program-output:: python -m clusterman.run simulate --help 8 | :cwd: ../../ 9 | 10 | .. _input_data_fmt: 11 | 12 | Experimental Input Data 13 | ----------------------- 14 | 15 | The simulator can accept experimental input data for one or more metric timeseries using the ``--metrics-data-file`` 16 | argument to ``clusterman simulate``. The simulator expects this file to be stored as a compressed (gzipped) JSON file; 17 | the JSON schema is as follows:: 18 | 19 | { 20 | 'metric_name_1': [ 21 | [, value], 22 | [, value], 23 | ... 24 | ], 25 | 'metric_name_2': [ 26 | [, value], 27 | [, value], 28 | ... 29 | }, 30 | ... 31 | } 32 | 33 | .. _dict_data_fmt: 34 | 35 | Optional Multi-valued Timeseries Data 36 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 37 | 38 | Some timeseries data needs to have multiple y-values per timestamp. The metrics data file can optionally accept 39 | timeseries in a dictionary with the dictionary keys corresponding to the names of the individual timeseries. 
For 40 | example:: 41 | 42 | { 43 | 'metric_a': [ 44 | [ 45 | <timestamp_1>, 46 | { 47 | 'key1': value, 48 | 'key2': value 49 | } 50 | ], 51 | [ 52 | <timestamp_2>, 53 | { 54 | 'key3': value 55 | } 56 | ], 57 | [ 58 | <timestamp_3>, 59 | { 60 | 'key1': value, 61 | 'key2': value, 62 | 'key3': value 63 | } 64 | ] 65 | ] 66 | } 67 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /examples/batch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /examples/batch/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
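# [Editor's note] The block below is an illustrative sketch, not part of the repository.
# It produces a --metrics-data-file in the shape described by docs/source/simulator.rst
# above: a gzipped JSON object mapping metric names to [timestamp, value] pairs, where a
# value may itself be a dict for multi-valued timeseries. The metric names, timestamps,
# and values are invented for the example.
import gzip
import json


def write_example_metrics_file(filename: str = "metrics.json.gz") -> None:
    start = 1512115200  # 2017-12-01T08:00:00Z, matching docs/examples/design.yaml
    data = {
        "cpus_allocated|cluster=local-dev,pool=default": [
            [start, 12.0],
            [start + 60, 18.5],
        ],
        "capacity|cluster=local-dev,pool=default": [
            # multi-valued timeseries: one dict of y-values per timestamp
            [start, {"c3.8xlarge,us-west-2a": 10}],
            [start + 60, {"c3.8xlarge,us-west-2a": 12, "c3.8xlarge,us-west-2b": 4}],
        ],
    }
    with gzip.open(filename, "wt") as f:
        json.dump(data, f)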
14 | import inspect 15 | import os 16 | from contextlib import contextmanager 17 | 18 | import botocore.exceptions 19 | import colorlog 20 | 21 | logger = colorlog.getLogger(__name__) 22 | 23 | 24 | class BatchRunningSentinelMixin: # pragma: no cover 25 | def make_running_sentinel(self): 26 | batch_name, ext = os.path.splitext(os.path.basename(inspect.getfile(self.__class__))) 27 | sentinel_file = f"/tmp/{batch_name}.running" 28 | with open(sentinel_file, "w") as f: 29 | f.write(str(os.getpid())) 30 | 31 | 32 | @contextmanager 33 | def suppress_request_limit_exceeded(): 34 | try: 35 | yield 36 | except botocore.exceptions.ClientError as e: 37 | if e.response.get("Error", {}).get("Code") == "RequestLimitExceeded": 38 | logger.warning(e) 39 | else: 40 | raise 41 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 121 3 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .boto_client import ClustermanMetricsBotoClient 15 | from .boto_client import MetricsValuesDict 16 | from .simulation_client import ClustermanMetricsSimulationClient 17 | from .util.constants import APP_METRICS 18 | from .util.constants import METADATA 19 | from .util.constants import METRIC_TYPES 20 | from .util.constants import SYSTEM_METRICS 21 | from .util.meteorite import generate_key_with_dimensions 22 | 23 | __all__ = [ 24 | "ClustermanMetricsBotoClient", 25 | "MetricsValuesDict", 26 | "ClustermanMetricsSimulationClient", 27 | "APP_METRICS", 28 | "METADATA", 29 | "METRIC_TYPES", 30 | "SYSTEM_METRICS", 31 | "generate_key_with_dimensions", 32 | ] 33 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/aws.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import json 15 | 16 | import boto3 17 | import staticconf 18 | from clusterman_metrics.util.constants import CONFIG_NAMESPACE 19 | 20 | 21 | _metrics_session = None 22 | 23 | 24 | def _setup_session(): 25 | with open(staticconf.read_string("access_key_file", namespace=CONFIG_NAMESPACE)) as boto_cfg_file: 26 | boto_cfg = json.load(boto_cfg_file) 27 | _session = boto3.session.Session( 28 | aws_access_key_id=boto_cfg["accessKeyId"], 29 | aws_secret_access_key=boto_cfg["secretAccessKey"], 30 | ) 31 | return _session 32 | 33 | 34 | def get_metrics_session(): 35 | global _metrics_session 36 | 37 | if not _metrics_session: 38 | _metrics_session = _setup_session() 39 | 40 | return _metrics_session 41 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | CONFIG_NAMESPACE = "clusterman_metrics" 17 | 18 | CLUSTERMAN_NAME = "clusterman" 19 | 20 | SYSTEM_METRICS = "system_metrics" #: metrics collected about the cluster state (e.g., CPU, memory allocation) 21 | APP_METRICS = "app_metrics" #: metrics collected from client applications (e.g., number of application runs) 22 | METADATA = "metadata" #: metrics collected about the cluster (e.g., current spot prices, instance types present) 23 | 24 | METRIC_TYPES = frozenset( 25 | [ 26 | SYSTEM_METRICS, 27 | APP_METRICS, 28 | METADATA, 29 | ] 30 | ) 31 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/costs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import staticconf 15 | from clusterman_metrics.util.constants import CONFIG_NAMESPACE 16 | 17 | 18 | config_reader = staticconf.NamespaceReaders(CONFIG_NAMESPACE) 19 | 20 | 21 | def estimate_cost_per_hour( 22 | cluster, 23 | pool, 24 | cpus=0, 25 | mem=0, 26 | ): 27 | cpu_cost = cpus * _get_resource_cost("cpus", cluster, pool) 28 | mem_cost = mem * _get_resource_cost("mem", cluster, pool) 29 | return max(cpu_cost, mem_cost) 30 | 31 | 32 | def _get_resource_cost(resource, cluster, pool): 33 | default_cost = config_reader.read_float( 34 | "cost_per_hour.defaults.{}".format(resource), 35 | default=0, 36 | ) 37 | return config_reader.read_float( 38 | "cost_per_hour.{}.{}.{}".format(cluster, pool, resource), 39 | default=default_cost, 40 | ) 41 | 42 | 43 | def should_warn(cost): 44 | threshold = config_reader.read_float( 45 | "cost_warning_threshold", 46 | default=100, 47 | ) 48 | return cost > threshold 49 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/meteorite.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Mapping 15 | from typing import Optional 16 | from typing import Tuple 17 | 18 | from clusterman_metrics.util.constants import APP_METRICS 19 | from clusterman_metrics.util.constants import CLUSTERMAN_NAME 20 | 21 | 22 | def _parse_dimensions(metric_name): 23 | """Parse out existing dimensions from the metric name""" 24 | try: 25 | metric_name, dims = metric_name.split("|", 1) 26 | except ValueError: 27 | dims = "" 28 | 29 | return ( 30 | metric_name, 31 | dict(dim_pair.split("=") for dim_pair in dims.split(",") if dim_pair), 32 | ) 33 | 34 | 35 | def generate_key_with_dimensions(metric_name: str, dimensions: Optional[Mapping[str, str]] = None) -> str: 36 | """Helper function to generate a key used to reference metric timeseries data in DynamoDB; this key will 37 | be parsed by ``get_meteorite_identifiers`` to store data in SignalFX. 
38 | 39 | :param metric_name: the name of the metric (can include some pre-existing dimensions) 40 | :param dimensions: dict of dimension names to values; dimensions in the metric name will be overwritten by 41 | values here 42 | :returns: string that can be passed to ``get_writer`` as the metric key 43 | """ 44 | if not dimensions: 45 | return metric_name 46 | 47 | # dimensions passed in override dimensions in the name 48 | metric_name, new_dimensions = _parse_dimensions(metric_name) 49 | new_dimensions.update(dimensions) 50 | 51 | dimension_parts = [] 52 | for key, value in sorted(new_dimensions.items()): 53 | dimension_parts.append("{key}={value}".format(key=key, value=value)) 54 | 55 | return "{metric_name}|{dim_string}".format( 56 | metric_name=metric_name, 57 | dim_string=",".join(dimension_parts), 58 | ) 59 | 60 | 61 | def get_meteorite_identifiers(metric_type: str, metric_key: str) -> Tuple[str, Optional[Mapping[str, str]]]: 62 | """ 63 | Given the primary key for a timeseries in the datastore and its Clusterman metric type, return the metric name and 64 | dimensions for that timeseries in meteorite. 65 | 66 | :param metric_type: string, one of METRIC_TYPES 67 | :param metric_key: string, the unique key for the timeseries in the datastore. 68 | :returns: (metric_name, dimensions_dict) tuple. Dimensions may be None. 69 | """ 70 | dimensions = None 71 | name_parts = [CLUSTERMAN_NAME, metric_type] 72 | 73 | metric_name, dimensions = _parse_dimensions(metric_key) 74 | 75 | if metric_type == APP_METRICS: 76 | # Namespace app metrics by the app identifier. 77 | name_parts.extend(metric_name.split(",", 1)) 78 | else: 79 | name_parts.append(metric_name) 80 | 81 | meteorite_name = ".".join(name_parts) 82 | return meteorite_name, dimensions 83 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/clusterman_metrics/util/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from decimal import Decimal 15 | from decimal import getcontext 16 | from decimal import localcontext 17 | from decimal import ROUND_HALF_UP 18 | 19 | MAX_DECIMAL_PLACES = 20 20 | _PLACES_VALUE = Decimal(10) ** (-1 * MAX_DECIMAL_PLACES) 21 | 22 | 23 | def convert_decimal(numeric): 24 | full_decimal = Decimal(numeric) 25 | _, digits, exponent = full_decimal.as_tuple() 26 | # Round to MAX_DECIMAL_PLACES, if result has more places than that. 27 | if exponent < -MAX_DECIMAL_PLACES: 28 | # quantize can raise `decimal.InvalidOperation` if result is greater 29 | # than context precision, which is 28 by default. to get around this, 30 | # temporarily set a new precision up to the max number of sig figs of 31 | # `full_decimal`, which is also the max for the result of `quantize`. 32 | # this ensures that the result of `quantize` will be within the precision 33 | # limit, and not raise the error.
34 | with localcontext() as ctx: 35 | ctx.prec = max(len(digits), getcontext().prec) 36 | return full_decimal.quantize(_PLACES_VALUE, rounding=ROUND_HALF_UP) 37 | else: 38 | return full_decimal 39 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = True 3 | -------------------------------------------------------------------------------- /examples/clusterman_metrics/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from setuptools import find_packages 15 | from setuptools import setup 16 | 17 | 18 | setup( 19 | name="clusterman-metrics", 20 | version="1.0.0", 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.8", 24 | ], 25 | package_data={str("clusterman_metrics"): [str("py.typed")]}, 26 | install_requires=[ 27 | "boto3", 28 | "PyStaticConfiguration", 29 | ], 30 | packages=find_packages(exclude=("tests*", "testing*")), 31 | zip_safe=False, 32 | ) 33 | -------------------------------------------------------------------------------- /examples/schemas/clusterman.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-06/schema", 3 | "type": "object", 4 | "required": [ 5 | "autoscaling", "autoscale_signal", "aws", "batches", "clusters", "mesos_clusters", "module_config", "module_env_config", 6 | "sensu_config" 7 | ], 8 | "definitions": { 9 | "cluster": { 10 | "type": "object", 11 | "required": ["aws_region"], 12 | "properties": { 13 | "aws_region": {"$ref": "definitions.json#awsRegion"}, 14 | "cluster_manager": {"type": "string", "enum": ["mesos", "kubernetes"]}, 15 | "drain_queue_url": {"type": "string", "format": "uri"}, 16 | "kubeconfig_path": {"type": "string"}, 17 | "fqdn": {"type": "string"}, 18 | "mesos_master_fqdn": {"type": "string"}, 19 | "sensu_config": {"$ref": "definitions.json#sensu_config"}, 20 | "termination_queue_url": {"type": "string", "format": "uri"}, 21 | "warning_queue_url": {"type": "string", "format": "uri"} 22 | }, 23 | "additionalProperties": false 24 | } 25 | }, 26 | "properties": { 27 | "autoscaling": { 28 | "type": "object", 29 | "properties": { 30 | "default_signal_role": {"type": "string"}, 31 | "excluded_resources": { 32 | "type": "array", 33 | "items": {"type": "string", "enum": ["cpus", "mem", "disk", "gpus"]} 34 | }, 35 | "setpoint": {"$ref": "definitions.json#percentage"}, 36 | "setpoint_margin": {"$ref": "definitions.json#percentage"}, 37 | "target_capacity_margin": {"$ref": "definitions.json#percentage"} 38 | }, 39 | "required": ["default_signal_role", "excluded_resources", "setpoint", "setpoint_margin", "target_capacity_margin"], 40 | "additionalProperties": false 41 | }, 42 | 
"autoscale_signal": {"$ref": "definitions.json#autoscale_signal"}, 43 | "aws": { 44 | "type": "object", 45 | "properties": { 46 | "access_key_file": {"type": "string"} 47 | }, 48 | "required": ["access_key_file"] 49 | }, 50 | "batches": { 51 | "type": "object", 52 | "additionalProperties": { 53 | "type": "object", 54 | "required": ["run_interval_seconds"], 55 | "properties": { 56 | "run_interval_seconds": {"$ref": "definitions.json#posint"} 57 | } 58 | } 59 | }, 60 | "drain_termination_timeout_seconds": { 61 | "type": "object", 62 | "properties": { 63 | "sfr": {"$ref": "definitions.json#posint"} 64 | } 65 | }, 66 | "clusters": { 67 | "type": "object", 68 | "additionalProperties": {"$ref": "#/definitions/cluster"} 69 | }, 70 | "mesos_clusters": { 71 | "type": "object", 72 | "additionalProperties": {"$ref": "#/definitions/cluster"} 73 | }, 74 | "module_config": { 75 | "type": "array" 76 | }, 77 | "module_env_config": { 78 | "type": "array" 79 | }, 80 | "sensu_config": {"$ref": "definitions.json#sensu_config"} 81 | }, 82 | "additionalProperties": false 83 | } 84 | -------------------------------------------------------------------------------- /examples/schemas/definitions.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema", 3 | "awsRegion": { 4 | "type": "string", 5 | "enum": ["us-east-1", "us-west-1", "us-west-2"] 6 | }, 7 | "infinity": { 8 | "type": "number", 9 | "minimum": Infinity 10 | }, 11 | "nonnegative_int": { 12 | "type": "integer", 13 | "minimum": 0 14 | }, 15 | "percentage": { 16 | "type": "number", 17 | "minimum": 0, 18 | "maximum": 1 19 | }, 20 | "posint": { 21 | "type": "integer", 22 | "minimum": 1 23 | }, 24 | "autoscale_signal": { 25 | "type": "object", 26 | "required": ["name", "branch_or_tag", "period_minutes"], 27 | "properties": { 28 | "name": {"type": "string"}, 29 | "repository": {"type": "string"}, 30 | "branch_or_tag": {"type": "string"}, 31 | "period_minutes": {"$ref": "shared.json#posint"}, 32 | "parameters": { 33 | "type": "array", 34 | "items": {"type": "object"} 35 | }, 36 | "required_metrics": { 37 | "type": "array", 38 | "items": { 39 | "type": "object", 40 | "required": ["name", "type", "minute_range"], 41 | "properties": { 42 | "name": {"type": "string"}, 43 | "type": { 44 | "type": "string", 45 | "enum": ["system_metrics", "app_metrics"] 46 | }, 47 | "minute_range": {"$ref": "shared.json#posint"}, 48 | "regex": {"type": "boolean"} 49 | }, 50 | "additionalProperties": false 51 | } 52 | } 53 | } 54 | }, 55 | "sensu_config": { 56 | "type": "array", 57 | "minItems": 1, 58 | "maxItems": 1, 59 | "items": { 60 | "type": "object", 61 | "required": ["team", "runbook"], 62 | "properties": { 63 | "team": {"type": "string"}, 64 | "runbook": {"type": "string"}, 65 | "page": {"type": "boolean"}, 66 | "notification_email": {"type": "string"}, 67 | "irc_channels": {"type": "array", "items": {"type": "string"}}, 68 | "slack_channels": {"type": "array", "items": {"type": "string"}}, 69 | "ticket": {"type": "boolean"}, 70 | "project": {"type": "string"}, 71 | "tags": {"type": "array", "items": {"type": "string"}} 72 | }, 73 | "additionalProperties": false 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /examples/schemas/pool.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-04/schema", 3 | "sfr_resource_group_definition": { 4 | "type": 
"object", 5 | "properties": { 6 | "s3": { 7 | "type": "object", 8 | "required": ["bucket", "prefix"], 9 | "properties": { 10 | "bucket": {"type": "string"}, 11 | "prefix": {"type": "string"} 12 | }, 13 | "additionalProperties": false 14 | }, 15 | "tag": {"type": "string"} 16 | }, 17 | "additionalProperties": false 18 | }, 19 | "resource_groups_definition": { 20 | "type": "array", 21 | "items": [ 22 | { 23 | "type": "object", 24 | "properties": { 25 | "sfr": { 26 | "$ref": "#/sfr_resource_group_definition" 27 | } 28 | } 29 | } 30 | ] 31 | }, 32 | "type": "object", 33 | "required": ["resource_groups", "scaling_limits"], 34 | "properties": { 35 | "resource_groups": {"$ref": "#/resource_groups_definition"}, 36 | "draining_enabled": {"type": "boolean"}, 37 | "scaling_limits": { 38 | "type": "object", 39 | "required": ["min_capacity", "max_capacity", "max_weight_to_add", "max_weight_to_remove"], 40 | "properties": { 41 | "min_capacity": {"$ref": "definitions.json#nonnegative_int"}, 42 | "max_capacity": {"$ref": "definitions.json#posint"}, 43 | "max_tasks_to_kill": {"anyOf": [ 44 | {"$ref": "definitions.json#nonnegative_int"}, 45 | {"$ref": "definitions.json#infinity"} 46 | ]}, 47 | "max_weight_to_add": {"$ref": "definitions.json#posint"}, 48 | "max_weight_to_remove": {"$ref": "definitions.json#posint"} 49 | }, 50 | "additionalProperties": false 51 | }, 52 | "autoscale_signal": {"$ref": "definitions.json#autoscale_signal"}, 53 | "autoscaling": { 54 | "type": "object", 55 | "properties": { 56 | "excluded_resources": { 57 | "type": "array", 58 | "items": {"type": "string", "enum": ["cpus", "mem", "disk", "gpus"]} 59 | }, 60 | "setpoint": {"$ref": "definitions.json#percentage"}, 61 | "setpoint_margin": {"$ref": "definitions.json#percentage"}, 62 | "target_capacity_margin": {"$ref": "definitions.json#percentage"} 63 | }, 64 | "additionalProperties": false 65 | }, 66 | "sensu_config": {"$ref": "definitions.json#sensu_config"}, 67 | "alert_on_max_capacity": {"type": "boolean"} 68 | }, 69 | "additionalProperties": false 70 | } 71 | -------------------------------------------------------------------------------- /examples/supervisord.conf: -------------------------------------------------------------------------------- 1 | [unix_http_server] 2 | file=/tmp/supervisor.sock ; the path to the socket file 3 | 4 | [inet_http_server] 5 | port=127.0.0.1:9001 6 | 7 | [supervisord] 8 | logfile=/tmp/supervisord.log ; main log file; default $CWD/supervisord.log 9 | logfile_maxbytes=50MB ; max main logfile bytes b4 rotation; default 50MB 10 | logfile_backups=5 ; # of main logfile backups; 0 means none, default 10 11 | loglevel=info ; log level; default info; others: debug,warn,trace 12 | pidfile=/tmp/supervisord.pid ; supervisord pidfile; default supervisord.pid 13 | nodaemon=true ; start in foreground if true; default false 14 | minfds=1024 ; min. avail startup file descriptors; default 1024 15 | minprocs=200 ; min. 
avail process descriptors;default 200 16 | 17 | [rpcinterface:supervisor] 18 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface 19 | 20 | [supervisorctl] 21 | serverurl=unix:///tmp/supervisor.sock ; use a unix:// URL for a unix socket 22 | 23 | [program:fetch_signals] 24 | process_name=fetch_signals_%(process_num)s 25 | numprocs=%(ENV_CMAN_NUM_VERSIONS)s 26 | command=/usr/bin/fetch_clusterman_signal %(process_num)s 27 | autostart=true 28 | autorestart=false 29 | startretries=0 30 | stdout_logfile=/dev/stdout 31 | stdout_logfile_maxbytes=0 32 | redirect_stderr=true 33 | startsecs=0 34 | 35 | [program:run_signals] 36 | process_name=run_signals_%(process_num)s 37 | numprocs=%(ENV_CMAN_NUM_SIGNALS)s 38 | command=/usr/bin/run_clusterman_signal %(process_num)s 39 | autostart=false 40 | startretries=0 41 | stopasgroup=true 42 | stdout_logfile=/dev/stdout 43 | stdout_logfile_maxbytes=0 44 | redirect_stderr=true 45 | 46 | [program:autoscaler] 47 | directory=/code 48 | environment=PATH=/code/virtualenv_run/bin 49 | command=python -m examples.batch.autoscaler %(ENV_CMAN_ARGS)s 50 | autostart=false 51 | autorestart=false 52 | 53 | stdout_logfile=/dev/stderr 54 | stdout_logfile_maxbytes=0 55 | stderr_logfile=/dev/stderr 56 | stderr_logfile_maxbytes=0 57 | -------------------------------------------------------------------------------- /examples/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "name_prefix" { 2 | default = "clusterman" 3 | } 4 | variable "metric_types" { 5 | type = "list" 6 | default = ["metadata", "app_metrics", "system_metrics"] 7 | } 8 | variable "read_capacity" { 9 | default = 5 10 | } 11 | variable "write_capacity" { 12 | default = 5 13 | } 14 | variable "read_autoscaling_enabled" { 15 | default = "false" 16 | } 17 | variable "write_autoscaling_enabled" { 18 | default = "false" 19 | } 20 | variable "max_read_capacity" { 21 | default = 100 22 | } 23 | variable "max_write_capacity" { 24 | default = 100 25 | } 26 | -------------------------------------------------------------------------------- /extra-requirements-yelp-dev.txt: -------------------------------------------------------------------------------- 1 | static-completion==0.1.7 2 | -------------------------------------------------------------------------------- /extra-requirements-yelp.txt: -------------------------------------------------------------------------------- 1 | clusterman-metrics==2.2.1 2 | monk==1.1.0 3 | pysensu-yelp==0.4.1 4 | yelp-batch==11.2.7 5 | yelp-clog==4.1.0 6 | yelp-lib==13.1.5 7 | yelp-meteorite==1.5.1 8 | -------------------------------------------------------------------------------- /images/architecture-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/clusterman/55d0831d80d809b1b1d1a9120712fa6c076cac50/images/architecture-diagram.png -------------------------------------------------------------------------------- /itest_status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Yelp Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import sys 15 | from subprocess import run 16 | 17 | 18 | def get_pid(batch_name): 19 | output = run( 20 | f'ps -ef | egrep "python -m {batch_name}(\s+|$)"', 21 | shell=True, 22 | capture_output=True, 23 | ) 24 | 25 | return output.stdout.split()[1].decode() 26 | 27 | 28 | def check_status(batch_name): # pragma: no cover 29 | # status written by BatchRunningSentinelMixin 30 | status_file = f'/tmp/{batch_name.split(".")[-1]}.running' 31 | 32 | try: 33 | with open(status_file) as f: 34 | status_pid = f.read() 35 | batch_pid = get_pid(batch_name) 36 | except FileNotFoundError: 37 | print(f"{batch_name} has not finished initialization") 38 | sys.exit(1) 39 | 40 | assert status_pid == batch_pid 41 | print(f"{batch_name} completed initialization and is running at PID {status_pid}") 42 | 43 | 44 | if __name__ == "__main__": 45 | check_status(sys.argv[1]) 46 | -------------------------------------------------------------------------------- /itests/draining_queue.feature: -------------------------------------------------------------------------------- 1 | Feature: make sure the drainer is working properly 2 | 3 | Scenario: process the draining queue 4 | Given a draining client 5 | And a message in the draining queue 6 | When the draining queue is processed 7 | Then the host should be submitted for termination 8 | And all queues are empty 9 | 10 | Scenario: process the termination queue 11 | Given a draining client 12 | And a message in the termination queue 13 | When the termination queue is processed 14 | Then the host should be terminated 15 | And all queues are empty 16 | 17 | Scenario: process the warning queue 18 | Given a draining client 19 | And a message in the warning queue 20 | When the warning queue is processed 21 | Then the host should be submitted for draining 22 | And all queues are empty 23 | -------------------------------------------------------------------------------- /itests/simulation_aws_price_computations.feature: -------------------------------------------------------------------------------- 1 | Feature: make sure we're computing spot prices correctly 2 | 3 | Scenario: one instance with constant price 4 | Given market A has 1 instance at time 0 5 | And market A costs $1/hour at time 0 6 | When the simulator runs for 2 hours 7 | Then the simulated cluster costs $2 total 8 | 9 | Scenario: one instance with price increase 10 | Given market A has 1 instance at time 0 11 | And market A costs $1/hour at time 0 12 | And market A costs $2/hour at time 1800 13 | When the simulator runs for 2 hours 14 | Then the simulated cluster costs $3 total 15 | 16 | Scenario: two instances in the same market are launched at the same time 17 | Given market A has 2 instances at time 0 18 | And market A costs $1/hour at time 0 19 | And market A costs $2/hour at time 1800 20 | When the simulator runs for 2 hours 21 | Then the simulated cluster costs $6 total 22 | 23 | Scenario: two instances in the same market are launched at different times 24 | Given market A has 1 instances at time 0 25 | And market A has 2 instances at time 1800 26 | And market A 
costs $1/hour at time 0 27 | And market A costs $2/hour at time 1200 28 | When the simulator runs for 2 hours 29 | Then the simulated cluster costs $6 total 30 | 31 | Scenario: two instances in different markets are launched at different times 32 | Given market A has 1 instance at time 0 33 | And market B has 1 instance at time 1800 34 | And market A costs $1/hour at time 0 35 | And market A costs $2/hour at time 1200 36 | And market B costs $0.50/hour at time 0 37 | And market B costs $0.75/hour at time 4500 38 | When the simulator runs for 2 hours 39 | Then the simulated cluster costs $3.875 total 40 | 41 | Scenario: (per-hour billing) two instances in different markets are launched at diff. times and one is terminated 42 | Given market A has 1 instance at time 0 43 | And market B has 1 instance at time 1920 44 | And market B has 0 instances at time 5400 45 | And market A costs $1/hour at time 0 46 | And market A costs $2/hour at time 1800 47 | And market B costs $0.50/hour at time 0 48 | And market B costs $0.75/hour at time 4500 49 | When the simulator runs for 2 hours 50 | Then the simulated cluster costs $3.5 total 51 | 52 | Scenario: (per-sec billing) two instances in different markets are launched at diff. times and one is terminated 53 | Given market A has 1 instance at time 0 54 | And market B has 1 instance at time 1920 55 | And market B has 0 instances at time 5400 56 | And market A costs $1/hour at time 0 57 | And market A costs $2/hour at time 1800 58 | And market B costs $0.50/hour at time 0 59 | And market B costs $0.75/hour at time 4500 60 | When the simulator runs for 2 hours and billing is per-second 61 | Then the simulated cluster costs $4.05 total 62 | -------------------------------------------------------------------------------- /itests/simulation_join_delay.feature: -------------------------------------------------------------------------------- 1 | Feature: make sure the simulator join-delay params work correctly 2 | 3 | Scenario Outline: instances should wait to join the cluster 4 | Given market A has 1 instance at time 0 5 | When the instance takes