├── .gitignore ├── docs └── kubernetes-ec2-autoscaler-2.png ├── requirements.txt ├── autoscaler ├── __init__.py ├── config.py ├── aws_utils.py ├── capacity.py ├── utils.py ├── notification.py ├── azure.py ├── kube.py ├── azure_api.py ├── autoscaling_groups.py └── cluster.py ├── CONTRIBUTORS.md ├── autoscaler-secret.yaml ├── README.md ├── setup.py ├── production-requirements.txt ├── Dockerfile ├── .travis.yml ├── test ├── data │ ├── kube_config.yaml │ ├── busybox.yaml │ ├── node.yaml │ ├── ds-pod.yaml │ └── rc-pod.yaml ├── test_capacity.py ├── test_azure.py ├── test_azure_api.py └── test_cluster.py ├── LICENSE ├── scaling-controller.yaml ├── autoscaler-dep.yaml ├── data └── capacity.json └── main.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | .mypy_cache 4 | -------------------------------------------------------------------------------- /docs/kubernetes-ec2-autoscaler-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/kubernetes-ec2-autoscaler/HEAD/docs/kubernetes-ec2-autoscaler-2.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r production-requirements.txt 2 | 3 | # for tests 4 | nose>=1.3.7 5 | PyYAML>=3.11 6 | moto>=0.4.25,<1.0.0 7 | mock>=2.0.0 8 | mypy>=0.501 9 | -------------------------------------------------------------------------------- /autoscaler/__init__.py: -------------------------------------------------------------------------------- 1 | from autoscaler.kube import KubePodStatus 2 | from autoscaler.kube import KubePod 3 | from autoscaler.kube import KubeNode 4 | from autoscaler.kube import KubeResource 5 | 6 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: 
import os


class Config(object):
    """Process-wide configuration, resolved from environment variables at import time."""

    # Path to the JSON file mapping instance types to their capacity specs.
    CAPACITY_DATA = os.getenv('CAPACITY_DATA', 'data/capacity.json')

    # CPU (cores) held back from every instance type's advertised capacity.
    CAPACITY_CPU_RESERVE = float(os.getenv('CAPACITY_CPU_RESERVE', 0.0))

    # Kubernetes namespace the autoscaler watches/operates in.
    NAMESPACE = os.getenv('NAMESPACE', 'system')
9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='autoscaler', 4 | version='0.0.1', 5 | packages=find_packages(), 6 | install_requires=[ 7 | 'pykube', 8 | 'requests', 9 | 'ipdb', 10 | 'boto', 11 | 'boto3', 12 | 'botocore', 13 | 'click', 14 | ] 15 | ) 16 | 17 | -------------------------------------------------------------------------------- /production-requirements.txt: -------------------------------------------------------------------------------- 1 | requests[security]>=2.12.0 2 | boto==2.39.0 3 | boto3>=1.3.1 4 | botocore>=1.3.26 5 | click>=6.2 6 | python-dateutil>=2.5.3 7 | cachetools>=2.0.0 8 | JSON-log-formatter>=0.1.0 9 | pytz>=2016.10 10 | 11 | pykube>=0.14.0 12 | azure-mgmt-compute>=0.33.1rc1,<1.0.0 13 | azure-mgmt-resource>=0.31.0,<1.0.0 14 | azure-monitor>=0.3.0,<1.0.0 15 | 16 | # for instrumentation 17 | datadog>=0.14.0 18 | # for error tracking 19 | raven>=5.32.0 20 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-alpine 2 | 3 | RUN apk --update add --virtual build-dependencies \ 4 | python-dev libffi-dev openssl-dev build-base && \ 5 | pip install --upgrade pip cffi cryptography && \ 6 | apk del build-dependencies && \ 7 | apk add --no-cache bash git && \ 8 | rm -rf /var/cache/apk/* 9 | 10 | COPY production-requirements.txt /app/requirements.txt 11 | RUN pip install -r /app/requirements.txt 12 | COPY . 
def fetch_all(aws_paged_func, kwargs, list_field, next_token=None):
    """Exhaustively page through an AWS list API and return every item.

    Args:
        aws_paged_func: boto3-style callable that accepts a ``NextToken``
            keyword and returns a dict containing ``list_field`` and,
            while more pages remain, a non-empty ``'NextToken'``.
        kwargs: base keyword arguments passed on every call (never mutated).
        list_field: key under which each page stores its items.
        next_token: token to resume from. ``None`` means "first page";
            ``''`` is the historical sentinel for "no more pages" and
            yields ``[]`` immediately.

    Returns:
        The concatenation of ``page[list_field]`` across all pages, in order.
    """
    # Preserve the original sentinel contract: '' short-circuits to [].
    if next_token == '':
        return []

    # Iterate instead of recursing: the original recursed once per page, so
    # a listing with thousands of pages could exhaust the recursion limit.
    items = []
    while True:
        our_kwargs = dict(kwargs)
        if next_token is not None:
            our_kwargs['NextToken'] = next_token
        page_data = aws_paged_func(**our_kwargs)
        items.extend(page_data[list_field])
        # Absent NextToken means this was the last page.
        next_token = page_data.get('NextToken', '')
        if next_token == '':
            return items
/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2016 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
class TestCapacity(unittest.TestCase):
    """Tests for autoscaler.capacity using fixture pod/node manifests."""

    def setUp(self):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        # safe_load: fixtures are plain data, and yaml.load without an
        # explicit Loader is deprecated (and unsafe) in PyYAML >= 5.1.
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            dummy_pod = yaml.safe_load(f.read())
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.safe_load(f.read())
        # KubeNode expects these to be strings
        for condition in self.dummy_node['status']['conditions']:
            condition['lastHeartbeatTime'] = str(condition['lastHeartbeatTime'])

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config'))

        self.dummy_pod = dummy_pod

    def test_can_fit(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        node = KubeNode(pykube.Node(self.api, self.dummy_node))
        assert node.can_fit(pod.resources)

    def test_possible(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        assert capacity.is_possible(pod)

    def test_impossible(self):
        # t2.micro is too small for the busybox pod's requests, so no
        # matching instance type can fit it.
        self.dummy_pod['spec']['nodeSelector'] = {
            'aws/type': 't2.micro'
        }

        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        assert not capacity.is_possible(pod)
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | creationTimestamp: "2016-07-14T06:46:14Z" 5 | name: busybox 6 | namespace: default 7 | resourceVersion: "229323584" 8 | selfLink: /api/v1/namespaces/default/pods/busybox 9 | uid: a85c73b6-498e-11e6-ab0a-0af8d945d5d3 10 | spec: 11 | containers: 12 | - command: 13 | - sleep 14 | - "3600" 15 | image: busybox 16 | imagePullPolicy: Always 17 | name: busybox 18 | resources: 19 | limits: 20 | cpu: 1800m 21 | memory: 1500Mi 22 | requests: 23 | cpu: 1500m 24 | memory: 2500Mi 25 | terminationMessagePath: /dev/termination-log 26 | volumeMounts: 27 | - mountPath: /var/run/secrets/kubernetes.io/serviceaccount 28 | name: default-token-1h0fa 29 | readOnly: true 30 | dnsPolicy: ClusterFirst 31 | imagePullSecrets: 32 | - name: quay-login-secret 33 | nodeName: 10.0.0.228 34 | restartPolicy: Always 35 | securityContext: {} 36 | serviceAccount: default 37 | serviceAccountName: default 38 | terminationGracePeriodSeconds: 30 39 | volumes: 40 | - name: default-token-1h0fa 41 | secret: 42 | secretName: default-token-1h0fa 43 | status: 44 | conditions: 45 | - lastProbeTime: null 46 | lastTransitionTime: 2016-08-23T21:22:43Z 47 | status: "True" 48 | type: Ready 49 | containerStatuses: 50 | - containerID: docker://4643e395f45d65015a98c13a509b4429bd357477d0dfb6719a835ce8135f1c06 51 | image: busybox 52 | imageID: docker://sha256:2b8fd9751c4c0f5dd266fcae00707e67a2545ef34f9a29354585f93dac906749 53 | lastState: 54 | terminated: 55 | containerID: docker://76e9c5151cc98f4d3aa1f73704112a4befcc451104a493671fc89a401e45f907 56 | exitCode: 0 57 | finishedAt: 2016-08-23T21:22:40Z 58 | reason: Completed 59 | startedAt: 2016-08-23T20:22:40Z 60 | name: busybox 61 | ready: true 62 | restartCount: 974 63 | state: 64 | running: 65 | startedAt: 2016-08-23T21:22:42Z 66 | hostIP: 10.0.0.228 67 | phase: Running 68 | podIP: 10.240.112.166 69 | startTime: "2016-07-14T06:46:25Z" 70 | 
-------------------------------------------------------------------------------- /test/data/node.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Node 3 | metadata: 4 | annotations: 5 | aws/sgs: kubernetes-worker 6 | creationTimestamp: "2016-08-25T05:13:16Z" 7 | labels: 8 | aws/ami-id: ami-deadbeef 9 | aws/az: us-west-2c 10 | aws/class: t 11 | aws/id: i-aaaaaaaa 12 | aws/region: us-west-2 13 | aws/type: t2.medium 14 | kubernetes.io/hostname: 10.0.0.228 15 | name: 10.0.0.228 16 | resourceVersion: "234864209" 17 | selfLink: /api/v1/nodes/10.0.0.228 18 | uid: a0f990ae-6a82-11e6-b203-0a0bdd34364d 19 | spec: 20 | externalID: 10.0.0.228 21 | status: 22 | addresses: 23 | - address: 10.0.0.228 24 | type: LegacyHostIP 25 | - address: 10.0.0.228 26 | type: InternalIP 27 | allocatable: 28 | cpu: "2" 29 | memory: 3952Mi 30 | pods: "30" 31 | capacity: 32 | cpu: "2" 33 | memory: 3952Mi 34 | pods: "30" 35 | conditions: 36 | - lastHeartbeatTime: 2016-08-25T21:50:36Z 37 | lastTransitionTime: 2016-08-25T05:13:07Z 38 | message: kubelet has sufficient disk space available 39 | reason: KubeletHasSufficientDisk 40 | status: "False" 41 | type: OutOfDisk 42 | - lastHeartbeatTime: 2016-08-25T21:50:36Z 43 | lastTransitionTime: 2016-08-25T05:13:07Z 44 | message: kubelet is posting ready status 45 | reason: KubeletReady 46 | status: "True" 47 | type: Ready 48 | daemonEndpoints: 49 | kubeletEndpoint: 50 | Port: 10250 51 | images: 52 | - names: 53 | - datadog/docker-dd-agent:kubernetes 54 | sizeBytes: 301845578 55 | - names: 56 | - nvidia/cuda:7.5-cudnn4-devel-ubuntu14.04 57 | sizeBytes: 1353527489 58 | - names: 59 | - newrelic/nrsysmond:latest 60 | sizeBytes: 192693160 61 | - names: 62 | - gcr.io/google_containers/pause:2.0 63 | sizeBytes: 350164 64 | nodeInfo: 65 | bootID: dd1f6c71-caa5-45b8-8751-5dd981bfba76 66 | containerRuntimeVersion: docker://1.11.2 67 | kernelVersion: 3.13.0-88-generic 68 | kubeProxyVersion: v1.2.1 69 | 
kubeletVersion: v1.2.1 70 | machineID: 1f2629f74f7c79e2a4583d1a5674725a 71 | osImage: Ubuntu 14.04.3 LTS 72 | systemUUID: EC228B31-FEB9-F4EA-A8CC-1E437E7C7893 73 | -------------------------------------------------------------------------------- /scaling-controller.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: autoscaler 5 | namespace: system 6 | spec: 7 | replicas: 1 8 | template: 9 | metadata: 10 | labels: 11 | app: autoscaler 12 | openai/do-not-drain: "true" 13 | spec: 14 | containers: 15 | - name: autoscaler 16 | image: quay.io/openai/kubernetes-ec2-autoscaler:azure 17 | env: 18 | - name: AWS_ACCESS_KEY_ID 19 | valueFrom: 20 | secretKeyRef: 21 | name: autoscaler 22 | key: aws-access-key-id 23 | - name: AWS_SECRET_ACCESS_KEY 24 | valueFrom: 25 | secretKeyRef: 26 | name: autoscaler 27 | key: aws-secret-access-key 28 | - name: SLACK_HOOK 29 | valueFrom: 30 | secretKeyRef: 31 | name: autoscaler 32 | key: slack-hook 33 | - name: SLACK_BOT_TOKEN 34 | valueFrom: 35 | secretKeyRef: 36 | name: autoscaler 37 | key: slack-bot-token 38 | - name: DATADOG_API_KEY 39 | valueFrom: 40 | secretKeyRef: 41 | name: autoscaler 42 | key: datadog-api-key 43 | - name: SENTRY_DSN 44 | valueFrom: 45 | secretKeyRef: 46 | name: autoscaler 47 | key: sentry-dsn 48 | - name: PYKUBE_KUBERNETES_SERVICE_HOST 49 | value: 10.100.0.1 50 | # value: kubernetes.default 51 | - name: DATADOG_TAGS 52 | value: env:sci 53 | - name: NAMESPACE 54 | value: system.svc.sci.openai.org 55 | command: 56 | - python 57 | - main.py 58 | - --azure-regions 59 | - us-east,us-south-central 60 | - --aws-regions 61 | - us-west-2,us-east-1,us-west-1 62 | - --cluster-name 63 | - openai-kubernetes 64 | - -vvv 65 | - --type-idle-threshold 66 | - "0" 67 | - --over-provision 68 | - "1" 69 | - --sleep 70 | - "30" 71 | imagePullPolicy: Always 72 | restartPolicy: Always 73 | dnsPolicy: Default # Don't use cluster 
DNS. 74 | nodeSelector: 75 | aws/region: us-west-2 76 | -------------------------------------------------------------------------------- /test/data/ds-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | creationTimestamp: "2016-08-25T05:11:56Z" 5 | generateName: container-monitor- 6 | labels: 7 | app: container-monitor 8 | name: container-monitor-y6w5d 9 | namespace: system 10 | ownerReferences: 11 | - apiVersion: apps/v1 12 | blockOwnerDeletion: true 13 | controller: true 14 | kind: DaemonSet 15 | name: container-monitor 16 | uid: 826c7436-4558-11e6-afb0-0af8d945d5d3 17 | resourceVersion: "232653318" 18 | selfLink: /api/v1/namespaces/system/pods/container-monitor-y6w5d 19 | uid: 718ec371-6a82-11e6-b203-0a0bdd34364d 20 | spec: 21 | containers: 22 | - env: 23 | - name: SLACK_HOOK 24 | valueFrom: 25 | secretKeyRef: 26 | key: slack-hook 27 | name: container-monitor 28 | - name: SLACK_TOKEN 29 | valueFrom: 30 | secretKeyRef: 31 | key: slack-token 32 | name: container-monitor 33 | image: container-monitor 34 | imagePullPolicy: Always 35 | name: container-monitor 36 | resources: {} 37 | terminationMessagePath: /dev/termination-log 38 | volumeMounts: 39 | - mountPath: /var/run/docker.sock 40 | name: dockersocket 41 | - mountPath: /var/run/secrets/kubernetes.io/serviceaccount 42 | name: default-token-lbbq5 43 | readOnly: true 44 | dnsPolicy: ClusterFirst 45 | nodeName: 10.0.0.228 46 | restartPolicy: Always 47 | securityContext: {} 48 | serviceAccount: default 49 | serviceAccountName: default 50 | terminationGracePeriodSeconds: 30 51 | volumes: 52 | - hostPath: 53 | path: /var/run/docker.sock 54 | name: dockersocket 55 | - name: default-token-lbbq5 56 | secret: 57 | secretName: default-token-lbbq5 58 | status: 59 | conditions: 60 | - lastProbeTime: null 61 | lastTransitionTime: 2016-08-25T05:12:21Z 62 | status: "True" 63 | type: Ready 64 | containerStatuses: 65 | - containerID: 
docker://6acb85b56d578202a04125d30db7cb33b180559ecedd612ce61c6f77a45c8f2a 66 | image: container-monitor 67 | imageID: docker://sha256:33985d876f3c4ea686af447dd67f14d2efe035eac8dab4132107d8d75f4ce7d1 68 | lastState: {} 69 | name: container-monitor 70 | ready: true 71 | restartCount: 0 72 | state: 73 | running: 74 | startedAt: 2016-08-25T05:12:20Z 75 | hostIP: 10.0.0.228 76 | phase: Running 77 | podIP: 10.240.112.167 78 | startTime: "2016-08-25T05:11:53Z" 79 | -------------------------------------------------------------------------------- /autoscaler-dep.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: autoscaler 5 | namespace: kube-system 6 | labels: 7 | k8s-app: autoscaler 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | k8s-app: autoscaler 13 | template: 14 | metadata: 15 | labels: 16 | k8s-app: autoscaler 17 | openai/do-not-drain: "true" 18 | # annotations: 19 | # scheduler.alpha.kubernetes.io/tolerations: "[{key: dedicated, value: master}]" 20 | spec: 21 | containers: 22 | - name: autoscaler 23 | image: quay.io/openai/kubernetes-ec2-autoscaler 24 | resources: 25 | limits: 26 | cpu: 500m 27 | memory: 300Mi 28 | requests: 29 | cpu: 100m 30 | memory: 100Mi 31 | env: 32 | - name: CAPACITY_CPU_RESERVE 33 | value: "0.1" 34 | - name: AWS_ACCESS_KEY_ID 35 | valueFrom: 36 | secretKeyRef: 37 | name: autoscaler 38 | key: aws-access-key-id 39 | - name: AWS_SECRET_ACCESS_KEY 40 | valueFrom: 41 | secretKeyRef: 42 | name: autoscaler 43 | key: aws-secret-access-key 44 | - name: SLACK_HOOK 45 | valueFrom: 46 | secretKeyRef: 47 | name: autoscaler 48 | key: slack-hook 49 | - name: SLACK_BOT_TOKEN 50 | valueFrom: 51 | secretKeyRef: 52 | name: autoscaler 53 | key: slack-bot-token 54 | - name: DATADOG_API_KEY 55 | valueFrom: 56 | secretKeyRef: 57 | name: autoscaler 58 | key: datadog-api-key 59 | - name: PYKUBE_KUBERNETES_SERVICE_HOST 60 | value: 
kubernetes.default 61 | #value: 10.100.0.1 62 | - name: DATADOG_TAGS 63 | value: env:sci 64 | command: 65 | - python 66 | - main.py 67 | - --regions 68 | - us-east-1 69 | - --cluster-name 70 | - pipeline.kubernetes.dev.aws.away.black 71 | - -vvv 72 | - --type-idle-threshold 73 | - "0" 74 | - --over-provision 75 | - "1" 76 | - --sleep 77 | - "30" 78 | imagePullPolicy: Always 79 | restartPolicy: Always 80 | # dnsPolicy: Default # Don't use cluster DNS. 81 | nodeSelector: 82 | kubernetes.io/role: master 83 | -------------------------------------------------------------------------------- /test/data/rc-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | creationTimestamp: "2016-08-25T20:36:43Z" 5 | generateName: autoscaler- 6 | labels: 7 | app: autoscaler 8 | name: autoscaler-opnax 9 | namespace: system 10 | ownerReferences: 11 | - apiVersion: apps/v1 12 | blockOwnerDeletion: true 13 | controller: true 14 | kind: ReplicationController 15 | name: autoscaler 16 | uid: bfa94a99-47e6-11e6-8fc1-06a19a97a573 17 | resourceVersion: "234714730" 18 | selfLink: /api/v1/namespaces/system/pods/autoscaler-opnax 19 | uid: a2acf237-6b03-11e6-b203-0a0bdd34364d 20 | spec: 21 | containers: 22 | - command: 23 | - python 24 | - main.py 25 | - --regions 26 | - us-west-2,us-east-1,us-west-1 27 | - --cluster-name 28 | - openai-kubernetes 29 | - -vvv 30 | env: 31 | - name: AWS_ACCESS_KEY_ID 32 | valueFrom: 33 | secretKeyRef: 34 | key: aws-access-key-id 35 | name: autoscaler 36 | - name: AWS_SECRET_ACCESS_KEY 37 | valueFrom: 38 | secretKeyRef: 39 | key: aws-secret-access-key 40 | name: autoscaler 41 | - name: SLACK_HOOK 42 | valueFrom: 43 | secretKeyRef: 44 | key: slack-hook 45 | name: autoscaler 46 | - name: PYKUBE_KUBERNETES_SERVICE_HOST 47 | value: kubernetes.default 48 | image: autoscaler 49 | imagePullPolicy: Always 50 | name: autoscaler 51 | resources: {} 52 | terminationMessagePath: 
"""
module to handle capacity of resources
"""
import json

from autoscaler.config import Config
from autoscaler.kube import KubeResource

# RESOURCE_SPEC denotes the amount of resources that are available
# to workload pods on a new, clean node, i.e. resources used by system pods
# have to be accounted for in the capacity data file.
# NOTE: this file read happens at import time; a missing/invalid
# Config.CAPACITY_DATA file makes the module unimportable.
with open(Config.CAPACITY_DATA, 'r') as f:
    data = json.loads(f.read())
    RESOURCE_SPEC = {}
    for instance_type, resource_spec in data.items():
        # Hold back the configured CPU reserve from every instance type.
        resource_spec['cpu'] -= Config.CAPACITY_CPU_RESERVE
        resource = KubeResource(**resource_spec)
        RESOURCE_SPEC[instance_type] = resource

# Node-selector keys that pin a pod to an exact instance type / class.
DEFAULT_TYPE_SELECTOR_KEYS = ('aws/type', 'azure/type')
DEFAULT_CLASS_SELECTOR_KEYS = ('aws/class', 'azure/class')
COMPUTING_SELECTOR_KEY = 'openai/computing'


def is_possible(pod):
    """
    returns whether the pod is possible under the maximum allowable capacity

    I.e. whether some known instance type matching the pod's node selectors
    can provide every resource the pod requests.
    """
    max_pod_capacity = max_capacity_for_selectors(pod.selectors, pod.resources)
    if not max_pod_capacity:
        return False
    return (max_pod_capacity - pod.resources).possible


def max_capacity_for_selectors(selectors, resource_requests):
    """
    returns the maximum capacity that is possible for the given selectors

    Returns a KubeResource from RESOURCE_SPEC, or None when no instance
    type matches the selectors.
    """
    # Exact instance type (e.g. 'm4.large'), first matching selector key wins.
    selector = ''
    for key in DEFAULT_TYPE_SELECTOR_KEYS:
        if key in selectors:
            selector = selectors[key]
            break
    # Instance class prefix (e.g. 'm4' or Azure 'D3_v2'), same precedence.
    class_ = ''
    for key in DEFAULT_CLASS_SELECTOR_KEYS:
        if key in selectors:
            class_ = selectors[key]
            break

    unit_caps = RESOURCE_SPEC

    # HACK: we modify our types with -modifier for special groups
    # e.g. c4.8xlarge-public
    # our selectors don't have dashes otherwise, so remove the modifier
    selector, _, _ = selector.partition('-')
    class_, _, _ = class_.partition('-')
    # Azure classes in RESOURCE_SPEC are prefixed 'Standard_'.
    azure_class = 'Standard_{}'.format(class_)

    # if an instance type was specified
    if selector in unit_caps:
        return unit_caps[selector]

    max_capacity = None
    for type_, resource in unit_caps.items():
        if (not class_ or type_.startswith(class_) or
                type_.startswith(azure_class)):
            # NOTE(review): this keeps `resource` when it exceeds the current
            # candidate in some dimension OR when it merely covers the
            # requests, so the result is "a sufficiently large matching
            # capacity" rather than a strict maximum -- confirm intent
            # against KubeResource.__sub__/.possible before changing.
            if not max_capacity or (resource - max_capacity).possible or (resource - resource_requests).possible:
                max_capacity = resource

    return max_capacity


def get_unit_capacity(group):
    """
    returns the KubeResource provided by one unit in the
    AutoScalingGroup or KubeNode

    Raises KeyError if group.instance_type is not in the capacity data.
    """
    return RESOURCE_SPEC[group.instance_type]
| "c4.large": {"cpu": 2, "memory": 3947372544, "pods": 30}, 17 | "p2.xlarge": {"cpu": 2, "memory": 65498251264, "pods": 30}, 18 | "p2.8xlarge": {"cpu": 16, "memory": 523986010112, "pods": 30}, 19 | "p2.16xlarge": {"cpu": 32, "memory": 785979015168, "pods": 30}, 20 | "Standard_DS1": {"cpu": 1, "memory": 3758096384, "pods": 30}, 21 | "Standard_D1_v2": {"cpu": 1, "memory": 3758096384, "pods": 30}, 22 | "Standard_D2_v2": {"cpu": 2, "memory": 7516192768, "pods": 30}, 23 | "Standard_D3_v2": {"cpu": 4, "memory": 15032385536, "pods": 30}, 24 | "Standard_D4_v2": {"cpu": 8, "memory": 30064771072, "pods": 30}, 25 | "Standard_D5_v2": {"cpu": 16, "memory": 60129542144, "pods": 30}, 26 | "Standard_D11_v2": {"cpu": 2, "memory": 15032385536, "pods": 30}, 27 | "Standard_D12_v2": {"cpu": 4, "memory": 30064771072, "pods": 30}, 28 | "Standard_D13_v2": {"cpu": 8, "memory": 60129542144, "pods": 30}, 29 | "Standard_D14_v2": {"cpu": 16, "memory": 120259084288, "pods": 30}, 30 | "Standard_D15_v2": {"cpu": 20, "memory": 150323855360, "pods": 30}, 31 | "Standard_DS15_v2": {"cpu": 20, "memory": 150323855360, "pods": 30}, 32 | "Standard_NC6": {"cpu": 6, "memory": 60129542144, "alpha.kubernetes.io/nvidia-gpu": 1, "pods": 30}, 33 | "Standard_NC12": {"cpu": 12, "memory": 120259084288, "alpha.kubernetes.io/nvidia-gpu": 2, "pods": 30}, 34 | "Standard_NC24": {"cpu": 24, "memory": 240518168576, "alpha.kubernetes.io/nvidia-gpu": 4, "pods": 30}, 35 | "Standard_ND6s": {"cpu": 6, "memory": 120259084288, "alpha.kubernetes.io/nvidia-gpu": 1, "pods": 30}, 36 | "Standard_ND12s": {"cpu": 12, "memory": 240518168576, "alpha.kubernetes.io/nvidia-gpu": 2, "pods": 30}, 37 | "Standard_ND24s": {"cpu": 24, "memory": 481036337152, "alpha.kubernetes.io/nvidia-gpu": 4, "pods": 30}, 38 | "Standard_H8": {"cpu": 8, "memory": 57982058496, "pods": 30}, 39 | "Standard_H16": {"cpu": 16, "memory": 120259084288, "pods": 30}, 40 | "Standard_H8m": {"cpu": 8, "memory": 120259084288, "pods": 30}, 41 | "Standard_H16m": {"cpu": 16, 
import json
import re
from abc import ABC

from threading import Lock


class CountDownCallback:
    """A callback that triggers its delegate exactly once, on the Nth call.

    Thread-safe: the countdown is guarded by a lock so concurrent callers
    cannot both observe the zero crossing.
    """

    def __init__(self, count, delegate):
        self._count = count
        self._delegate = delegate
        self._lock = Lock()

    def __call__(self, *args, **kwargs):
        # `with` releases the lock even if the delegate raises; the original
        # bare acquire()/release() pair leaked the lock on exception, which
        # would deadlock every subsequent invocation.
        with self._lock:
            self._count -= 1
            if self._count == 0:
                self._delegate(*args, **kwargs)


class Future(ABC):
    """Minimal future interface: a blocking result and completion callbacks."""

    def result(self):
        pass

    def add_done_callback(self, fn):
        pass


class CompletedFuture(Future):
    """A future that is already resolved with a fixed value."""

    def __init__(self, value):
        self._value = value

    def result(self):
        return self._value

    def add_done_callback(self, fn):
        # Already complete: invoke the callback synchronously.
        fn(self)


class TransformingFuture(Future):
    """Reports a pre-computed value while deferring completion to a delegate.

    result() still waits on (and propagates exceptions from) the delegate.
    """

    def __init__(self, value, delegate):
        self._value = value
        self._delegate = delegate

    def result(self):
        # Block on the delegate for completion/errors, then substitute value.
        self._delegate.result()
        return self._value

    def add_done_callback(self, fn):
        self._delegate.add_done_callback(lambda _: fn(self))


class AllCompletedFuture(Future):
    """Aggregates several futures; completes when every one of them has."""

    def __init__(self, futures):
        self._futures = futures

    def result(self):
        return [future.result() for future in self._futures]

    def add_done_callback(self, fn):
        # Fire fn once, after the last constituent future completes.
        # NOTE: with an empty futures list the callback never fires
        # (CountDownCallback only triggers on an exact zero crossing).
        callback = CountDownCallback(len(self._futures), lambda _: fn(self))
        for future in self._futures:
            future.add_done_callback(callback)


def selectors_to_hash(selectors):
    """Return a canonical (key-sorted) JSON string for a selector dict."""
    return json.dumps(selectors, sort_keys=True)


def get_groups_for_hash(asgs, selectors_hash):
    """
    returns a list of groups from asgs that match the selectors

    ``selectors_hash`` is a string produced by ``selectors_to_hash``.
    """
    selectors = json.loads(selectors_hash)
    return [asg for asg in asgs if asg.is_match_for_selectors(selectors)]


def get_group_for_node(asgs, node):
    """Return the first group that contains ``node``, or None."""
    for asg in asgs:
        if asg.contains(node):
            return asg
    return None


# SI (and Kubernetes power-of-two) suffix multipliers for quantity strings.
SI_suffix = {
    'y': 1e-24,  # yocto
    'z': 1e-21,  # zepto
    'a': 1e-18,  # atto
    'f': 1e-15,  # femto
    'p': 1e-12,  # pico
    'n': 1e-9,  # nano
    'u': 1e-6,  # micro
    'm': 1e-3,  # milli
    'c': 1e-2,  # centi
    'd': 1e-1,  # deci
    'k': 1e3,  # kilo
    'M': 1e6,  # mega
    'G': 1e9,  # giga
    'T': 1e12,  # tera
    'P': 1e15,  # peta
    'E': 1e18,  # exa
    'Z': 1e21,  # zetta
    'Y': 1e24,  # yotta
    # Kube also uses the power of 2 equivalent
    'Ki': 2**10,
    'Mi': 2**20,
    'Gi': 2**30,
    'Ti': 2**40,
    'Pi': 2**50,
    'Ei': 2**60,
}
SI_regex = re.compile(r"([0-9.]+)(%s)?$" % "|".join(SI_suffix.keys()))


def parse_SI(s):
    """Parse a quantity string like '1500m' or '2Gi' into a float.

    Raises ValueError when ``s`` is not a number with an optional
    recognized suffix.
    """
    m = SI_regex.match(s)
    if m is None:
        raise ValueError("Unknown SI quantity: %s" % s)
    num_s, unit = m.groups()
    multiplier = SI_suffix[unit] if unit else 1.  # unitless
    return float(num_s) * multiplier


def parse_resource(resource):
    """Parse a Kubernetes resource quantity (plain number or SI string)."""
    try:
        return float(resource)
    except ValueError:
        return parse_SI(resource)


def parse_bool_label(value):
    """Interpret a label value as a boolean ('1'/'true', case-insensitive)."""
    return str(value).lower() in ('1', 'true')


def get_relevant_selectors(node_selectors):
    """Keep only the 'aws/'- and 'openai/'-prefixed node selectors.

    NOTE(review): 'azure/' selectors are not retained here -- confirm
    whether that is intentional.
    """
    return {k: v for k, v in node_selectors.items()
            if k.startswith('aws/') or k.startswith('openai/')}
@click.command()
@click.option("--cluster-name")
@click.option("--aws-regions", default="us-west-1")
@click.option("--sleep", default=60)
@click.option("--kubeconfig", default=None,
              help='Full path to kubeconfig file. If not provided, '
                   'we assume that we\'re running on kubernetes.')
@click.option("--pod-namespace", default=None,
              help='The namespace to look for out-of-resource pods in. By '
                   'default, this will look in all namespaces.')
@click.option("--idle-threshold", default=3300)
@click.option("--type-idle-threshold", default=3600*24*7)
@click.option("--over-provision", default=5)
@click.option("--max-scale-in-fraction", default=0.1)
@click.option("--drain-utilization", default=0.0)
@click.option("--azure-slow-scale-classes", default="")
@click.option("--azure-resource-groups")
@click.option("--azure-client-id", default=None, envvar='AZURE_CLIENT_ID')
@click.option("--azure-client-secret", default=None, envvar='AZURE_CLIENT_SECRET')
@click.option("--azure-subscription-id", default=None, envvar='AZURE_SUBSCRIPTION_ID')
@click.option("--azure-tenant-id", default=None, envvar='AZURE_TENANT_ID')
@click.option("--aws-access-key", default=None, envvar='AWS_ACCESS_KEY_ID')
@click.option("--aws-secret-key", default=None, envvar='AWS_SECRET_ACCESS_KEY')
@click.option("--use-aws-iam-role", is_flag=True)
@click.option("--datadog-api-key", default=None, envvar='DATADOG_API_KEY')
@click.option("--instance-init-time", default=25 * 60)
@click.option("--no-scale", is_flag=True)
@click.option("--no-maintenance", is_flag=True)
@click.option("--slack-hook", default=None, envvar='SLACK_HOOK',
              help='Slack webhook URL. If provided, post scaling messages '
                   'to Slack.')
@click.option("--slack-bot-token", default=None, envvar='SLACK_BOT_TOKEN',
              help='Slack bot token. If provided, post scaling messages '
                   'to Slack users directly.')
@click.option("--dry-run", is_flag=True)
@click.option('--verbose', '-v',
              help="Sets the debug noise level, specify multiple times "
                   "for more verbosity.",
              type=click.IntRange(0, 3, clamp=True),
              count=True)
def main(cluster_name, aws_regions, azure_resource_groups, azure_slow_scale_classes, sleep, kubeconfig,
         azure_client_id, azure_client_secret, azure_subscription_id, azure_tenant_id,
         aws_access_key, aws_secret_key, use_aws_iam_role, pod_namespace, datadog_api_key,
         idle_threshold, type_idle_threshold, max_scale_in_fraction, drain_utilization,
         over_provision, instance_init_time, no_scale, no_maintenance,
         slack_hook, slack_bot_token, dry_run, verbose):
    """Autoscaler entry point: configure logging, validate credentials,
    build the Cluster, then run the scale loop forever with exponential
    back-off while no scaling activity occurs."""
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%dT%H:%M:%S%z'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    aws_regions_list = aws_regions.split(',') if aws_regions else []
    # AWS regions are configured by default, so require either static keys
    # or an IAM role before constructing any AWS-backed cluster state.
    if not ((aws_secret_key and aws_access_key) or use_aws_iam_role) and aws_regions_list:
        logger.error("Missing AWS credentials. Please provide aws-access-key and aws-secret-key.")
        sys.exit(1)

    notifier = Notifier(slack_hook, slack_bot_token)
    cluster = Cluster(aws_access_key=aws_access_key,
                      aws_secret_key=aws_secret_key,
                      use_aws_iam_role=use_aws_iam_role,
                      aws_regions=aws_regions_list,
                      azure_client_id=azure_client_id,
                      azure_client_secret=azure_client_secret,
                      azure_subscription_id=azure_subscription_id,
                      azure_tenant_id=azure_tenant_id,
                      azure_resource_group_names=azure_resource_groups.split(',') if azure_resource_groups else [],
                      azure_slow_scale_classes=azure_slow_scale_classes.split(',') if azure_slow_scale_classes else [],
                      kubeconfig=kubeconfig,
                      pod_namespace=pod_namespace,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      type_idle_threshold=type_idle_threshold,
                      cluster_name=cluster_name,
                      max_scale_in_fraction=max_scale_in_fraction,
                      drain_utilization_below=drain_utilization,
                      scale_up=not no_scale,
                      maintainance=not no_maintenance,  # sic: kwarg name matches Cluster's interface
                      over_provision=over_provision,
                      datadog_api_key=datadog_api_key,
                      notifier=notifier,
                      dry_run=dry_run,
                      )
    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            # NOTE(review): back-off doubles without an upper bound;
            # presumably acceptable for this daemon — confirm.
            # logger.warn is a deprecated alias; use warning().
            logger.warning("backoff: %s" % backoff)
            backoff *= 2
            time.sleep(backoff)


if __name__ == "__main__":
    main()
# Dedicated structured (JSON) logger for machine-readable scaling events;
# it does not propagate so regular log output is unaffected.
formatter = json_log_formatter.JSONFormatter()
handler = logging.StreamHandler()
handler.setFormatter(formatter)
struct_logger.addHandler(handler)
struct_logger.setLevel(logging.DEBUG)
struct_logger.propagate = False


def _cache_key(notifier, owner, message, pods):
    """Build a dedup cache key for direct-messaging: identical (owner,
    message, pod-set) triples are only delivered once per TTL window."""
    md5 = hashlib.md5()
    md5.update(owner.encode('utf-8'))
    md5.update(message.encode('utf-8'))

    # Sort so the same set of pods hashes identically regardless of order.
    for pod in sorted(pods, key=lambda p: p.uid):
        md5.update(pod.uid.encode('utf-8'))

    return 'v0.md5.{}'.format(md5.hexdigest())


def _generate_pod_string(pods):
    """Render a human-readable list of pods, eliding after the first four."""
    if len(pods) > 5:
        pods_string = '{}, and {} others'.format(
            ', '.join('{}/{}'.format(pod.namespace, pod.name) for pod in pods[:4]),
            len(pods) - 4)
    else:
        pods_string = ', '.join('{}/{}'.format(pod.namespace, pod.name) for pod in pods)
    return pods_string


def struct_log(message, pods, extra=None):
    """Emit one structured log record per affected pod."""
    for pod in pods:
        log_extra = {
            'pod_name': '{}/{}'.format(pod.namespace, pod.name),
            'pod_id': pod.uid,
            '_log_streaming_target_mapping': 'kubernetes-ec2-autoscaler'
        }
        if extra:
            log_extra.update(extra)
        struct_logger.debug(message, extra=log_extra)


class Notifier(object):
    """Delivers scaling notifications to Slack (channel webhook and/or
    direct messages to pod owners) and to the structured log stream."""

    MESSAGE_URL = 'https://slack.com/api/chat.postMessage'

    def __init__(self, hook=None, bot_token=None):
        # hook: incoming-webhook URL for channel posts (optional)
        # bot_token: Slack bot token for direct messages (optional)
        self.hook = hook
        self.bot_token = bot_token

        # De-dup cache for message_owner (30 minute TTL).
        self.cache = TTLCache(maxsize=128, ttl=60*30)

    def _post_hook(self, message):
        """POST `message` to the configured webhook, logging (never
        raising) on failure.

        Originally each caller inlined this block and caught only
        ConnectionError; RequestException also covers timeouts and other
        transport errors so a flaky Slack endpoint cannot crash the
        scaling loop.
        """
        try:
            resp = requests.post(self.hook, json={
                "text": message,
                "username": "kubernetes-ec2-autoscaler",
                "icon_emoji": ":rabbit:",
            })
            logger.debug('SLACK: %s', resp.text)
        except requests.exceptions.RequestException as e:
            logger.critical('Failed to SLACK: %s', e)

    def notify_scale(self, asg, units_requested, pods):
        """Announce a scale-up of `asg` triggered by `pods`."""
        struct_log('scale', pods,
                   extra={'asg': str(asg), 'units_requested': units_requested})

        if not self.hook:
            logger.debug('SLACK_HOOK not configured.')
            return

        pods_string = _generate_pod_string(pods)

        message = 'ASG {}[{}] scaling up by {} to new capacity {}'.format(
            asg.name, asg.region, units_requested, asg.desired_capacity)
        message += '\n'
        message += 'Change triggered by {}'.format(pods_string)

        self._post_hook(message)

        self.message_owners(
            'ASG {}[{}] scaling up'.format(asg.name, asg.region), pods)

    def notify_failed_to_scale(self, selectors_hash, pods):
        """Announce that scaling for `selectors_hash` did not satisfy demand."""
        struct_log('failed to scale', pods,
                   extra={'selectors_hash': selectors_hash})

        if not self.hook:
            logger.debug('SLACK_HOOK not configured.')
            return

        pods_string = _generate_pod_string(pods)

        main_message = 'Failed to scale {} sufficiently. Backing off...'.format(
            json.dumps(selectors_hash))
        message = main_message + '\n'
        message += 'Pods affected: {}'.format(pods_string)

        self._post_hook(message)

        self.message_owners(main_message, pods)

    def notify_invalid_pod_capacity(self, pod, recommended_capacity):
        """Warn that `pod` requests more than any matching node type offers."""
        struct_log('invalid pod capacity', [pod],
                   extra={'recommended_capacity': str(recommended_capacity)})

        if not self.hook:
            logger.debug('SLACK_HOOK not configured.')
            return

        message = ("Pending pod {}/{} cannot fit {}. "
                   "Please check that requested resource amount is "
                   "consistent with node selectors (recommended max: {}). "
                   "Scheduling skipped.".format(pod.namespace, pod.name, json.dumps(pod.selectors), recommended_capacity))

        self._post_hook(message)

        self.message_owners(message, [pod])

    def notify_drained_node(self, node, pods):
        """Announce that `node` was drained, listing the evicted pods."""
        struct_log('drain', pods, extra={'node': str(node)})

        if not self.hook:
            logger.debug('SLACK_HOOK not configured.')
            return

        pods_string = _generate_pod_string(pods)

        message = 'Node {} drained.'.format(node)
        message += '\n'
        message += 'Pod affected: {}'.format(pods_string)

        self._post_hook(message)

    def message_owners(self, message, pods):
        """Group `pods` by owner and DM each owner once."""
        if not self.bot_token:
            logger.debug('SLACK_BOT_TOKEN not configured.')
            return

        pods_by_owner = {}
        for pod in pods:
            if pod.owner:
                pods_by_owner.setdefault(pod.owner, []).append(pod)

        # Renamed loop variable: the original rebound the `pods` parameter.
        for owner, owner_pods in pods_by_owner.items():
            self.message_owner(owner, message, owner_pods)

    @cachedmethod(operator.attrgetter('cache'), key=_cache_key)
    def message_owner(self, owner, message, pods):
        """DM `owner` about `pods`; deduplicated via the TTL cache."""
        attachments = [{
            'pretext': 'Relevant pods',
            'text': ', '.join('{}/{}'.format(pod.namespace, pod.name) for pod in pods)
        }]

        try:
            resp = requests.post(self.MESSAGE_URL, data={
                "text": message,
                "attachments": json.dumps(attachments),
                "token": self.bot_token,
                "channel": "@{}".format(owner),
                "username": "kubernetes-ec2-autoscaler",
                "icon_emoji": ":rabbit:",
            })
            logger.debug('SLACK: %s', resp.text)
        except requests.exceptions.RequestException as e:
            logger.critical('Failed to SLACK: %s', e)
def _default_mock_clients(region, instances=None, quotas=None):
    """Build (compute, monitor, resource) mock Azure clients for tests.

    Mutable containers are created per call (the original used mutable
    default arguments, which are shared across calls).
    """
    if instances is None:
        instances = []
    if quotas is None:
        quotas = {'Dv2': 100, 'NC': 100}
    sizes = [
        VirtualMachineSize(name="Standard_D1_v2", number_of_cores=1),
        VirtualMachineSize(name="Standard_NC24", number_of_cores=24)
    ]
    mock_client = mock.Mock()
    mock_client.virtual_machine_scale_set_vms = mock.Mock()
    mock_client.virtual_machine_scale_set_vms.list = mock.Mock(return_value=instances)
    mock_client.virtual_machine_scale_sets = mock.Mock()
    mock_client.virtual_machine_scale_sets.create_or_update = mock.Mock()
    mock_client.virtual_machine_scale_sets.delete_instances = mock.Mock()
    mock_client.virtual_machine_sizes = mock.Mock()
    mock_client.virtual_machine_sizes.list = mock.Mock(return_value=sizes)
    mock_client.usage = mock.Mock()
    usage_limits = []
    for k, v in quotas.items():
        usage_limits.append(Usage(name=UsageName(value="standard" + k + "Family"), limit=v, current_value=0))
    mock_client.usage.list = mock.Mock(return_value=usage_limits)

    monitor_client = mock.Mock()
    monitor_client.activity_logs = mock.Mock()
    monitor_client.activity_logs.list = mock.Mock(return_value=[])

    azure_resource_group = ResourceGroup(location=region)
    resource_client = mock.Mock()
    resource_client.resource_groups = mock.Mock()
    # FIX: the original stubbed resource_client.activity_logs.get, which
    # looks like a typo — the resource client exposes resource groups, and
    # the monitor client is the one with activity_logs. Stubbing the
    # intended attribute makes the ResourceGroup actually reachable.
    resource_client.resource_groups.get = mock.Mock(return_value=azure_resource_group)

    return (mock_client, monitor_client, resource_client)


class TestCluster(unittest.TestCase):
    """Unit tests for AzureVirtualScaleSet scale-out/scale-in behavior."""

    def test_failed_scale_up(self):
        # A scale set in 'Failed' provisioning state must be skipped; all
        # capacity goes to the healthy set.
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        failed_scale_set = AzureScaleSet(region, resource_group, 'test-scale-set1', instance_type, 0, 'Failed')
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set2', instance_type, 0, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [failed_scale_set, scale_set], [])

        virtual_scale_set.scale(5)

        mock_client.virtual_machine_scale_sets.create_or_update.assert_called_once()
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args[1]['parameters'].sku.capacity, 5)

    def test_scale_up(self):
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [scale_set], [])

        virtual_scale_set.scale(5)

        mock_client.virtual_machine_scale_sets.create_or_update.assert_called_once()
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args[1]['parameters'].sku.capacity, 5)

    def test_priority(self):
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded', priority=-1)
        # Name sorts lexicographically before previous scale set, but priority is after it
        scale_set2 = AzureScaleSet(region, resource_group, 'a-test-scale-set', instance_type, 0, 'Succeeded', priority=1)

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, True, [scale_set, scale_set2], [])

        virtual_scale_set.scale(1)

        self.assertEqual(virtual_scale_set.global_priority, -1)

        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_count, 1)
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args_list[0][0][1], 'test-scale-set')

    def test_slow_scale_up(self):
        # slow_scale=True grows each scale set by at most one instance.
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded')
        scale_set2 = AzureScaleSet(region, resource_group, 'test-scale-set2', instance_type, 0, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, True, [scale_set, scale_set2], [])

        virtual_scale_set.scale(2)

        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_count, 2)
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args_list[0][1]['parameters'].sku.capacity, 1)
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args_list[1][1]['parameters'].sku.capacity, 1)

    def test_tainted_scale_set(self):
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_NC24'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded', no_schedule_taints={'gpu': 'yes'})

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, True, [scale_set], [])

        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            # safe_load: bare yaml.load without a Loader is deprecated and
            # unsafe; this fixture is plain data.
            dummy_pod = yaml.safe_load(f.read())
        pod = KubePod(pykube.Pod(None, dummy_pod))

        self.assertFalse(virtual_scale_set.is_taints_tolerated(pod))

        dummy_pod['spec']['tolerations'] = [{'key': 'gpu', 'operator': 'Exists'}]
        pod = KubePod(pykube.Pod(None, dummy_pod))
        self.assertTrue(virtual_scale_set.is_taints_tolerated(pod))

    def test_out_of_quota(self):
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded',
                                  timeout_until=datetime.now(pytz.utc) + timedelta(minutes=10), timeout_reason="fake reason")
        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [scale_set], [])
        self.assertTrue(virtual_scale_set.is_timed_out())

    def test_near_quota_limit(self):
        # Requested 10, but subscription quota only allows 5.
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region, quotas={'Dv2': 5})

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [scale_set], [])

        virtual_scale_set.scale(10)

        mock_client.virtual_machine_scale_sets.create_or_update.assert_called_once()
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args[1]['parameters'].sku.capacity, 5)

    def test_scale_in(self):
        region = 'test'
        resource_group = 'test-resource-group'

        instance = VirtualMachineScaleSetVM(location=region)
        instance.vm_id = 'test-vm-id'
        instance.instance_id = 0
        instance.instance_view = VirtualMachineInstanceView()
        instance.instance_view.statuses = []

        mock_client, monitor_client, resource_client = _default_mock_clients(region, instances=[instance])

        TestNode = collections.namedtuple('TestNode', ['instance_id', 'unschedulable'])
        test_node = TestNode(instance_id=instance.vm_id, unschedulable=False)

        instance_type = 'Standard_D1_v2'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 1, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [scale_set], [test_node])

        self.assertEqual(virtual_scale_set.instance_ids, {instance.vm_id})
        self.assertEqual(virtual_scale_set.nodes, [test_node])

        virtual_scale_set.scale_nodes_in([test_node])
        mock_client.virtual_machine_scale_sets.delete_instances.assert_called_once_with(resource_group, scale_set.name, [instance.instance_id])
class AzureBoundedRetry(Retry):
    """
    XXX: Azure sometimes sends us a Retry-After: 1200, even when we still have quota, causing our client to appear to hang.
    Ignore them and just retry after 30secs
    """

    @staticmethod
    def from_retry(retry):
        """Copy the relevant settings off an existing urllib3 Retry."""
        new_retry = AzureBoundedRetry()
        new_retry.total = retry.total
        new_retry.connect = retry.connect
        new_retry.read = retry.read
        new_retry.backoff_factor = retry.backoff_factor
        new_retry.BACKOFF_MAX = retry.BACKOFF_MAX
        new_retry.status_forcelist = retry.status_forcelist
        new_retry.method_whitelist = retry.method_whitelist

        return new_retry

    def get_retry_after(self, response):
        """Clamp Azure's 429 Retry-After to _RETRY_TIME_LIMIT seconds,
        logging the rate-limit headers for diagnosis."""
        retry_after = super().get_retry_after(response)
        # Only clamp genuine throttle responses with an excessive delay.
        if response.status != http.HTTPStatus.TOO_MANY_REQUESTS or retry_after <= _RETRY_TIME_LIMIT:
            return retry_after

        headers = {}
        for header in ['Retry-After',
                       'x-ms-ratelimit-remaining-subscription-reads',
                       'x-ms-ratelimit-remaining-subscription-writes',
                       'x-ms-ratelimit-remaining-tenant-reads',
                       'x-ms-ratelimit-remaining-tenant-writes',
                       'x-ms-ratelimit-remaining-subscription-resource-requests',
                       'x-ms-ratelimit-remaining-subscription-resource-entities-read',
                       'x-ms-ratelimit-remaining-tenant-resource-requests',
                       'x-ms-ratelimit-remaining-tenant-resource-entities-read']:
            value = response.getheader(header)
            if value is not None:
                headers[header] = value

        # logger.warn is a deprecated alias for warning().
        logger.warning("Azure request throttled: {}".format(headers))
        return _RETRY_TIME_LIMIT


class AzureGroups(object):
    """Discovers Azure scale sets and groups them into one virtual scale
    set per (location, instance type) pair."""

    def __init__(self, resource_groups, slow_scale_classes, client: AzureApi):
        self.resource_groups = resource_groups
        self.slow_scale_classes = slow_scale_classes
        self.client = client

    def get_all_groups(self, kube_nodes):
        """Return AzureVirtualScaleSet groups covering all scale sets in
        the configured resource groups, attaching matching kube nodes."""
        groups = []
        if self.client:
            for resource_group in self.resource_groups:
                scale_sets_by_type = {}
                for scale_set in self.client.list_scale_sets(resource_group.name):
                    scale_sets_by_type.setdefault((scale_set.location, scale_set.instance_type), []).append(scale_set)
                for key, scale_sets in scale_sets_by_type.items():
                    location, instance_type = key
                    slow_scale = _get_azure_class(instance_type) in self.slow_scale_classes
                    groups.append(AzureVirtualScaleSet(location, resource_group.name, self.client, instance_type, slow_scale, scale_sets, kube_nodes))

        return groups


# Extracts the size class from an Azure VM type, e.g. "Standard_D1_v2" -> "D".
_CLASS_PAT = re.compile(r'\w+_(?P<class>[A-Z]+).+')


def _get_azure_class(type_):
    m = _CLASS_PAT.match(type_)
    return m.group('class')


_SCALE_SET_SIZE_LIMIT = 300


# Appears as an unbounded scale set. Currently, Azure Scale Sets have a limit of 300 hosts.
class AzureVirtualScaleSet(AutoScalingGroup):
    provider = 'azure'

    def __init__(self, region, resource_group, client: AzureApi, instance_type, slow_scale: bool, scale_sets: List[AzureScaleSet], kube_nodes):
        self.client = client
        self.instance_type = instance_type
        self.tags = {}
        self.name = 'virtual_scale_set_' + instance_type + '_' + region + '_' + resource_group
        self.scale_sets = dict((scale_set.name, scale_set) for scale_set in scale_sets)
        self.desired_capacity = sum(scale_set.capacity for scale_set in scale_sets)

        self.region = region
        self.resource_group = resource_group

        self.selectors = dict(self.tags)
        # HACK: for matching node selectors
        self.selectors['azure/type'] = self.instance_type
        self.selectors['azure/class'] = _get_azure_class(self.instance_type)
        self.slow_scale = slow_scale

        self.min_size = 0
        self.max_size = 10000
        self.is_spot = False

        self.vm_id_to_instance: MutableMapping[str, Tuple[str, AzureScaleSetInstance]] = {}
        self.instances = {}
        self.timeout_until = None
        self.timeout_reason = None
        self._global_priority = None
        self.no_schedule_taints = {}
        for scale_set in scale_sets:
            # Aggregate the latest timeout, lowest priority, and first
            # non-empty taint set across the underlying scale sets.
            if scale_set.timeout_until is not None:
                if self.timeout_until is None or self.timeout_until < scale_set.timeout_until:
                    self.timeout_until = scale_set.timeout_until
                    self.timeout_reason = scale_set.name + ": " + scale_set.timeout_reason
            if scale_set.priority is not None:
                if self._global_priority is None:
                    self._global_priority = scale_set.priority
                else:
                    self._global_priority = min(scale_set.priority, self._global_priority)
            if not self.no_schedule_taints:
                self.no_schedule_taints = scale_set.no_schedule_taints

            if scale_set.capacity == 0:
                continue
            for instance in self.client.list_scale_set_instances(scale_set):
                self.vm_id_to_instance[instance.vm_id] = (scale_set.name, instance)
                self.instances[instance.vm_id] = AzureInstance(instance.vm_id, self.instance_type, instance.launch_time, self.tags)

        self.nodes = [node for node in kube_nodes if node.instance_id in self.vm_id_to_instance]
        self.unschedulable_nodes = [n for n in self.nodes if n.unschedulable]

        self._id = (self.region, self.name)

    def is_timed_out(self):
        """True while a scale set is backing off from a provisioning failure."""
        if self.timeout_until and datetime.now(self.timeout_until.tzinfo) < self.timeout_until:
            logger.warning("{} is timed out until {} because {}".format(self._id, self.timeout_until, self.timeout_reason))
            return True
        return False

    @property
    def global_priority(self):
        if self._global_priority is None:
            return super().global_priority
        return self._global_priority

    def get_azure_instances(self):
        return self.instances.values()

    @property
    def instance_ids(self):
        return self.vm_id_to_instance.keys()

    def set_desired_capacity(self, new_desired_capacity):
        """
        sets the desired capacity of the underlying ASG directly.
        note that this is for internal control.
        for scaling purposes, please use scale() instead.
        """
        scale_out = new_desired_capacity - self.desired_capacity
        assert scale_out >= 0
        if scale_out == 0:
            return CompletedFuture(False)

        remaining_instances = self.client.get_remaining_instances(self.resource_group, self.instance_type)

        futures = []
        # NOTE(review): if some scale sets have priority=None and others
        # an int, this sort key would raise TypeError — presumably
        # priorities are uniformly set or unset per group; confirm.
        for scale_set in sorted(self.scale_sets.values(), key=lambda x: (x.priority, x.name)):
            if scale_set.capacity < _SCALE_SET_SIZE_LIMIT:
                if self.slow_scale:
                    # Slow-scale classes grow one instance per set at a time.
                    new_group_capacity = scale_set.capacity + 1
                else:
                    new_group_capacity = min(_SCALE_SET_SIZE_LIMIT, scale_set.capacity + scale_out, scale_set.capacity + remaining_instances)
                if scale_set.provisioning_state == 'Updating':
                    logger.warning("Update of {} already in progress".format(scale_set.name))
                    continue
                if scale_set.provisioning_state == 'Failed':
                    logger.error("{} failed provisioning. Skipping it for scaling.".format(scale_set.name))
                    continue
                scale_out -= (new_group_capacity - scale_set.capacity)
                remaining_instances -= (new_group_capacity - scale_set.capacity)
                # Update our cached version
                self.scale_sets[scale_set.name].capacity = new_group_capacity
                futures.append(self.client.update_scale_set(scale_set, new_group_capacity))
                logger.info("Scaling Azure Scale Set {} to {}".format(scale_set.name, new_group_capacity))
            if scale_out == 0 or remaining_instances == 0:
                break

        if remaining_instances == 0:
            logger.warning("Out of quota for {}!".format(self.instance_type))

        if scale_out > 0:
            logger.error("Not enough scale sets to reach desired capacity {} for {}".format(new_desired_capacity, self))

        self.desired_capacity = new_desired_capacity - scale_out
        logger.info("ASG: {} new_desired_capacity: {}".format(self, new_desired_capacity))

        return TransformingFuture(True, AllCompletedFuture(futures))

    def terminate_instances(self, vm_ids):
        """Terminate the given VMs, batching the deletes per scale set."""
        vm_ids = list(vm_ids)
        instances = {}
        for vm_id in vm_ids:
            scale_set_name, instance = self.vm_id_to_instance[vm_id]
            # Update our cached copy of the Scale Set
            self.scale_sets[scale_set_name].capacity -= 1
            instances.setdefault(scale_set_name, []).append(instance)
        logger.info('Terminated instances %s', vm_ids)

        futures = []
        for scale_set_name, scale_set_instances in instances.items():
            futures.append(self.client.terminate_scale_set_instances(self.scale_sets[scale_set_name], scale_set_instances))
        return AllCompletedFuture(futures)

    def scale_nodes_in(self, nodes):
        """
        scale down asg by terminating the given node.
        returns a future indicating when the request completes.
        """
        for node in nodes:
            self.nodes.remove(node)
        return self.terminate_instances(node.instance_id for node in nodes)

    def __str__(self):
        return 'AzureVirtualScaleSet({name}, {selectors_hash})'.format(name=self.name, selectors_hash=utils.selectors_to_hash(self.selectors))

    def __repr__(self):
        return str(self)


class AzureInstance(object):
    """Lightweight record describing one VM in a scale set."""

    provider = 'azure'

    def __init__(self, instance_id, instance_type, launch_time, tags):
        self.id = instance_id
        self.instance_type = instance_type
        self.launch_time = launch_time
        self.tags = tags

    def __str__(self):
        return 'AzureInstance({}, {})'.format(self.id, self.instance_type)

    def __repr__(self):
        return str(self)
class TestingFuture:
    """Minimal future double: completion is triggered explicitly by the test."""

    def __init__(self):
        self.callbacks = []

    def add_done_callback(self, fn):
        self.callbacks.append(fn)

    def complete(self):
        """Fire every registered callback, as a real future would on completion."""
        for callback in self.callbacks:
            callback(self)


class TestWriteThroughCache(unittest.TestCase):
    """Exercises caching, invalidation and write-through behavior of AzureWriteThroughCachedApi."""

    @staticmethod
    def _delegate(scale_sets, instances):
        """Build a mock AzureApi whose list methods return the given results."""
        api = mock.Mock(AzureApi)
        api.list_scale_sets = mock.Mock(return_value=scale_sets)
        api.list_scale_set_instances = mock.Mock(return_value=instances)
        return api

    def test_caching(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [instance])
        cache = AzureWriteThroughCachedApi(delegate)

        # Two identical reads of each kind must hit the delegate only once.
        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])

        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])

        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)

    def test_copied(self):
        """Mutating an object returned by the cache must not corrupt the cached copy."""
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        cache = AzureWriteThroughCachedApi(self._delegate([scale_set], [instance]))

        returned_scale_set = cache.list_scale_sets('test_rg')[0]
        self.assertEqual(returned_scale_set.capacity, 1)
        returned_scale_set.capacity = 0
        self.assertEqual(cache.list_scale_sets('test_rg')[0].capacity, 1)

        returned_instance = cache.list_scale_set_instances(scale_set)[0]
        self.assertEqual(returned_instance.vm_id, 'fake_vm')
        returned_instance.vm_id = 'modified'
        self.assertEqual(cache.list_scale_set_instances(scale_set)[0].vm_id, 'fake_vm')

    def test_refresh(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        scale_set2 = AzureScaleSet('eastus', 'test_rg', 'test2', 'Standard_H16', 0, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [instance])
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)

        # force_refresh must bypass the cache and drop stale instance entries.
        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set, scale_set2])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(set(cache.list_scale_sets('test_rg', force_refresh=True)), {updated_scale_set, scale_set2})
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

    def test_update(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [instance])
        delegate.update_scale_set = mock.Mock(return_value=CompletedFuture(None))
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        cache.update_scale_set(scale_set, 0).result()
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)
        delegate.update_scale_set.assert_called_once_with(scale_set, 0)

        # The write invalidated the cache, so the next reads hit the delegate again.
        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(cache.list_scale_sets('test_rg'), [updated_scale_set])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

    def test_inconsistent_delegate(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [])
        delegate.update_scale_set = mock.Mock(return_value=CompletedFuture(None))
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)
        cache.update_scale_set(scale_set, 1).result()
        delegate.update_scale_set.assert_called_once_with(scale_set, 1)

        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(cache.list_scale_sets('test_rg'), [updated_scale_set])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

        # Test that even if there is inconsistency between the list_scale_sets and
        # list_scale_set_instances, the cache doesn't end up with bad data
        delegate.list_scale_set_instances = mock.Mock(return_value=[instance])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [instance])
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

    def test_terminate(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [instance])
        delegate.terminate_scale_set_instances = mock.Mock(return_value=CompletedFuture(None))
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        cache.terminate_scale_set_instances(scale_set, [instance]).result()
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)
        delegate.terminate_scale_set_instances.assert_called_once_with(scale_set, [instance])

        # Termination invalidates the cache: subsequent reads hit the delegate.
        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(cache.list_scale_sets('test_rg'), [updated_scale_set])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

    def test_terminate_with_concurrent_read(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())
        pending = TestingFuture()

        delegate = self._delegate([scale_set], [instance])
        delegate.terminate_scale_set_instances = mock.Mock(return_value=pending)
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        cache.terminate_scale_set_instances(scale_set, [instance])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)
        delegate.terminate_scale_set_instances.assert_called_once_with(scale_set, [instance])

        # Call list again concurrently with the delete, and make sure it's still
        # served from the cache
        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)

        # Completing the termination invalidates the cache.
        pending.complete()
        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(cache.list_scale_sets('test_rg'), [updated_scale_set])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)
'test_rg' 231 | expected = AzureScaleSet(scale_set.location, resource_group, scale_set.name, scale_set.sku.name, scale_set.sku.capacity, 232 | scale_set.provisioning_state, priority=1, no_schedule_taints={'gpu': 'yes'}) 233 | self.assertEqual([expected], api.list_scale_sets(resource_group)) 234 | 235 | compute_client.virtual_machine_scale_sets.list.assert_called_once_with(resource_group) 236 | monitor_client.activity_logs.list.assert_called_once() 237 | 238 | def test_out_of_quota(self): 239 | scale_set = VirtualMachineScaleSet('eastus', {}, sku=Sku('Standard_H16', capacity=1)) 240 | scale_set.name = 'test' 241 | scale_set.provisioning_state = 'Succeeded' 242 | scale_set.id = 'fake_id' 243 | 244 | compute_client = mock.Mock() 245 | compute_client.virtual_machine_scale_sets = mock.Mock() 246 | compute_client.virtual_machine_scale_sets.list = mock.Mock(return_value=[scale_set]) 247 | 248 | reason = "Operation results in exceeding quota limits of Core. Maximum allowed: 800, Current in use: 784, Additional requested: 320." 
249 | message = "{\"error\":{\"code\":\"OperationNotAllowed\",\"message\":\"" + reason + "\"}}" 250 | monitor_client = mock.Mock() 251 | monitor_client.activity_logs = mock.Mock() 252 | now = datetime.now(pytz.utc) 253 | monitor_client.activity_logs.list = mock.Mock(return_value=[EventData('Error', 254 | now, 255 | now, 256 | resource_id=scale_set.id, 257 | status=LocalizableString('Failed'), 258 | properties={'statusCode': 'Conflict', 259 | 'statusMessage': message})]) 260 | 261 | api = AzureWrapper(compute_client, monitor_client, None) 262 | resource_group = 'test_rg' 263 | expected = AzureScaleSet(scale_set.location, resource_group, scale_set.name, scale_set.sku.name, scale_set.sku.capacity, 264 | scale_set.provisioning_state, now + TIMEOUT_PERIOD, reason) 265 | acutal = api.list_scale_sets(resource_group) 266 | self.assertEqual([expected], acutal) 267 | 268 | compute_client.virtual_machine_scale_sets.list.assert_called_once_with(resource_group) 269 | monitor_client.activity_logs.list.assert_called_once() 270 | -------------------------------------------------------------------------------- /autoscaler/kube.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import logging 4 | 5 | from typing import Iterable, Mapping 6 | 7 | from dateutil.parser import parse as dateutil_parse 8 | import pykube.exceptions 9 | 10 | import autoscaler.utils as utils 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class KubePodStatus(object): 16 | RUNNING = 'Running' 17 | PENDING = 'Pending' 18 | CONTAINER_CREATING = 'ContainerCreating' 19 | SUCCEEDED = 'Succeeded' 20 | FAILED = 'Failed' 21 | 22 | _CORDON_LABEL = 'openai/cordoned-by-autoscaler' 23 | 24 | 25 | class KubePod(object): 26 | _DRAIN_GRACE_PERIOD = datetime.timedelta(seconds=60*60) 27 | 28 | def __init__(self, pod): 29 | self.original = pod 30 | 31 | metadata = pod.obj['metadata'] 32 | self.name = metadata['name'] 33 | self.namespace = 
metadata['namespace'] 34 | self.node_name = pod.obj['spec'].get('nodeName') 35 | self.status = pod.obj['status']['phase'] 36 | self.uid = metadata['uid'] 37 | self.selectors = pod.obj['spec'].get('nodeSelector', {}) 38 | self.labels = metadata.get('labels', {}) 39 | self.annotations = metadata.get('annotations', {}) 40 | self.owner_references = metadata.get('ownerReferences', []) 41 | self.owner = self.labels.get('owner', None) 42 | self.creation_time = dateutil_parse(metadata['creationTimestamp']) 43 | self.start_time = dateutil_parse(pod.obj['status']['startTime']) if 'startTime' in pod.obj['status'] else None 44 | self.scheduled_time = None 45 | 46 | for condition in pod.obj['status'].get('conditions', []): 47 | if condition['type'] == 'PodScheduled' and condition['status'] == 'True': 48 | self.scheduled_time = dateutil_parse(condition['lastTransitionTime']) 49 | 50 | # TODO: refactor 51 | requests = [c.get('resources', {}).get('requests', {}) for c in pod.obj['spec']['containers']] 52 | resource_requests = {} 53 | for d in requests: 54 | for k, v in d.items(): 55 | unitless_v = utils.parse_SI(v) 56 | resource_requests[k] = resource_requests.get(k, 0.0) + unitless_v 57 | self.resources = KubeResource(pods=1, **resource_requests) 58 | self.no_schedule_wildcard_toleration = False 59 | self.no_execute_wildcard_toleration = False 60 | self.no_schedule_existential_tolerations = set() 61 | self.no_execute_existential_tolerations = set() 62 | for toleration in pod.obj['spec'].get('tolerations', []): 63 | if toleration.get('operator', 'Equal') == 'Exists': 64 | effect = toleration.get('effect') 65 | if effect is None or effect == 'NoSchedule': 66 | if 'key' not in toleration: 67 | self.no_schedule_wildcard_toleration = True 68 | else: 69 | self.no_schedule_existential_tolerations.add(toleration['key']) 70 | if effect is None or effect == 'NoExecute': 71 | if 'key' not in toleration: 72 | self.no_execute_wildcard_toleration = True 73 | else: 74 | 
self.no_execute_existential_tolerations.add(toleration['key']) 75 | else: 76 | logger.warn("Equality tolerations not implemented. Pod {} has an equality toleration".format(pod)) 77 | 78 | self.required_pod_anti_affinity_expressions = [] 79 | anti_affinity_spec = pod.obj['spec'].get('affinity', {}).get('podAntiAffinity', {}) 80 | required_anti_affinity_expressions = anti_affinity_spec.get('requiredDuringSchedulingIgnoredDuringExecution', []) +\ 81 | anti_affinity_spec.get('requiredDuringSchedulingRequiredDuringExecution', []) 82 | for expression in required_anti_affinity_expressions: 83 | if expression.get('topologyKey') != 'kubernetes.io/hostname': 84 | logger.debug("Pod {} has non-hostname anti-affinity topology. Ignoring".format(pod)) 85 | continue 86 | self.required_pod_anti_affinity_expressions.append(expression['labelSelector']['matchExpressions']) 87 | 88 | def is_mirrored(self): 89 | is_daemonset = False 90 | for reference in self.owner_references: 91 | if reference.get('kind') == 'DaemonSet': 92 | is_daemonset = True 93 | break 94 | return is_daemonset or self.annotations.get('kubernetes.io/config.mirror') 95 | 96 | def is_replicated(self): 97 | return True if len(self.owner_references) > 0 else False 98 | 99 | def is_critical(self): 100 | return utils.parse_bool_label(self.labels.get('openai/do-not-drain')) 101 | 102 | def is_in_drain_grace_period(self): 103 | """ 104 | determines whether the pod is in a grace period for draining 105 | this prevents us from draining pods that are too new 106 | """ 107 | return (self.scheduled_time and 108 | (datetime.datetime.now(self.scheduled_time.tzinfo) - self.scheduled_time) < self._DRAIN_GRACE_PERIOD) 109 | 110 | def is_drainable(self): 111 | """ 112 | a pod is considered drainable if: 113 | - it's a daemon 114 | - it's a non-critical replicated pod that has exceeded grace period 115 | """ 116 | return (self.is_mirrored() or 117 | (self.is_replicated() and not self.is_critical() and not 
def reverse_bytes(value):
    """Reverse a hex string two characters (one byte) at a time,
    e.g. 'aabbcc' -> 'ccbbaa'. Used to convert Azure's mixed-endian VM ids."""
    assert len(value) % 2 == 0
    pairs = [value[i:i + 2] for i in range(0, len(value), 2)]
    return "".join(reversed(pairs))


# Returns True iff all expressions in and_expression match labels on pod
def match_anti_affinity_expression(and_expression: Iterable[Mapping], pod: KubePod):
    for expression in and_expression:
        label_value = pod.labels.get(expression['key'])
        if expression['operator'] == 'In' and label_value not in expression['values']:
            return False
        elif expression['operator'] == 'NotIn' and label_value in expression['values']:
            return False
        elif expression['operator'] == 'Exists' and label_value is None:
            return False
        elif expression['operator'] == 'DoesNotExist' and label_value is not None:
            return False
    return True


class KubeNode(object):
    """Wrapper around a pykube node object with scheduling-related helpers."""

    # Nodes whose Ready condition has not been updated for this long count as dead.
    _HEARTBEAT_GRACE_PERIOD = datetime.timedelta(seconds=60*60)

    def __init__(self, node):
        self.original = node
        self.pykube_node = node

        metadata = node.obj['metadata']
        self.name = metadata['name']
        self.instance_id, self.region, self.instance_type, self.provider = self._get_instance_data()
        self.pods = []

        self.capacity = KubeResource(**node.obj['status']['allocatable'])
        self.used_capacity = KubeResource()
        self.creation_time = dateutil_parse(metadata['creationTimestamp'])
        # Fall back to creation time when no Ready condition is present.
        last_heartbeat_time = self.creation_time
        for condition in node.obj['status'].get('conditions', []):
            if condition.get('type') == 'Ready':
                last_heartbeat_time = dateutil_parse(condition['lastHeartbeatTime'])
        self.last_heartbeat_time = last_heartbeat_time
        self.no_schedule_taints = {}
        self.no_execute_taints = {}
        for taint in node.obj['spec'].get('taints', []):
            # Kubernetes taints may omit 'value'; default to "". Previously the
            # NoSchedule branch used a bare `except:` for this case and the
            # NoExecute branch raised KeyError on value-less taints.
            value = taint.get('value', "")
            if taint['effect'] == 'NoSchedule':
                self.no_schedule_taints[taint['key']] = value
            if taint['effect'] == 'NoExecute':
                self.no_execute_taints[taint['key']] = value

    def _get_instance_data(self):
        """
        returns a tuple (instance id, region, instance type, provider)
        derived from the node's providerID and labels.
        """
        labels = self.original.obj['metadata'].get('labels', {})
        instance_type = labels.get('aws/type', labels.get('beta.kubernetes.io/instance-type'))

        provider = self.original.obj['spec'].get('providerID', '')
        if provider.startswith('aws://'):
            az, instance_id = tuple(provider.split('/')[-2:])
            if az and instance_id:
                # Region is the availability zone minus its trailing letter.
                return (instance_id, az[:-1], instance_type, 'aws')

        if labels.get('aws/id') and labels.get('aws/az'):
            instance_id = labels['aws/id']
            region = labels['aws/az'][:-1]
            return (instance_id, region, instance_type, 'aws')

        assert provider.startswith('azure:////'), provider
        # Id is in wrong order: https://azure.microsoft.com/en-us/blog/accessing-and-using-azure-vm-unique-id/
        big_endian_vm_id = provider.replace('azure:////', '')
        parts = big_endian_vm_id.split('-')
        instance_id = '-'.join([reverse_bytes(parts[0]),
                                reverse_bytes(parts[1]),
                                reverse_bytes(parts[2]),
                                parts[3],
                                parts[4]]).lower()
        instance_type = labels['azure/type']
        return (instance_id, 'placeholder', instance_type, 'azure')

    @property
    def selectors(self):
        return self.original.obj['metadata'].get('labels', {})

    @property
    def unschedulable(self):
        return self.original.obj['spec'].get('unschedulable', False)

    @property
    def can_uncordon(self):
        return utils.parse_bool_label(self.selectors.get(_CORDON_LABEL))

    def drain(self, pods, notifier=None):
        """Delete all drainable, non-mirrored pods on this node."""
        for pod in pods:
            if pod.is_drainable() and not pod.is_mirrored():
                pod.delete()

        logger.info("drained %s", self)
        if notifier:
            notifier.notify_drained_node(self, pods)

    def uncordon(self):
        """Mark the node schedulable again, but only if the autoscaler cordoned it."""
        if not utils.parse_bool_label(self.selectors.get(_CORDON_LABEL)):
            logger.debug('uncordon %s ignored', self)
            return False

        try:
            self.original.reload()
            self.original.obj['spec']['unschedulable'] = False
            self.original.update()
            logger.info("uncordoned %s", self)
            return True
        except pykube.exceptions.HTTPError as ex:
            logger.info("uncordon failed %s %s", self, ex)
            return False

    def cordon(self):
        """Mark the node unschedulable and label it as cordoned by the autoscaler."""
        try:
            self.original.reload()
            self.original.obj['spec']['unschedulable'] = True
            self.original.obj['metadata'].setdefault('labels', {})[_CORDON_LABEL] = 'true'
            self.original.update()
            logger.info("cordoned %s", self)
            return True
        except pykube.exceptions.HTTPError as ex:
            logger.info("cordon failed %s %s", self, ex)
            return False

    def delete(self):
        """Delete the node object from the API server. Returns True on success."""
        try:
            self.original.delete()
            logger.info("deleted %s", self)
            return True
        except pykube.exceptions.HTTPError as ex:
            logger.info("delete failed %s %s", self, ex)
            return False

    def count_pod(self, pod):
        """Account for a pod scheduled on this node."""
        assert isinstance(pod, KubePod)
        self.used_capacity += pod.resources
        self.pods.append(pod)

    def can_fit(self, resources):
        """True when the node has spare allocatable capacity for `resources`."""
        assert isinstance(resources, KubeResource)
        left = self.capacity - (self.used_capacity + resources)
        return left.possible

    def is_match(self, pod: KubePod):
        """
        whether this node matches all the selectors on the pod,
        tolerates the node's taints, and does not violate the pod's
        required anti-affinity against pods already on the node
        """
        for label, value in pod.selectors.items():
            if self.selectors.get(label) != value:
                return False
        for key in self.no_schedule_taints:
            if not (pod.no_schedule_wildcard_toleration or key in pod.no_schedule_existential_tolerations):
                return False
        for key in self.no_execute_taints:
            if not (pod.no_execute_wildcard_toleration or key in pod.no_execute_existential_tolerations):
                return False
        for expression in pod.required_pod_anti_affinity_expressions:
            # Renamed loop variable: the original shadowed the `pod` parameter.
            for existing_pod in self.pods:
                if match_anti_affinity_expression(expression, existing_pod):
                    return False

        return True

    def is_managed(self):
        """
        an instance is managed if we know its instance ID in ec2.
        """
        return self.instance_id is not None

    def is_detached(self):
        return utils.parse_bool_label(self.selectors.get('openai/detached'))

    def is_dead(self):
        return datetime.datetime.now(self.last_heartbeat_time.tzinfo) - self.last_heartbeat_time > self._HEARTBEAT_GRACE_PERIOD

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other):
        return self.name == other.name

    def __str__(self):
        return "{}: {} ({})".format(self.name, self.instance_id,
                                    utils.selectors_to_hash(self.selectors))


class KubeResource(object):
    """Bag of named resource quantities (cpu, memory, pods, ...) with arithmetic."""

    def __init__(self, **kwargs):
        self.raw = dict((k, utils.parse_resource(v))
                        for (k, v) in kwargs.items())

    def __add__(self, other):
        keys = set(self.raw.keys()) | set(other.raw.keys())
        raw_sum = dict((k, self.raw.get(k, 0) + other.raw.get(k, 0))
                       for k in keys)
        return KubeResource(**raw_sum)

    def __sub__(self, other):
        keys = set(self.raw.keys()) | set(other.raw.keys())
        raw_diff = dict((k, self.raw.get(k, 0) - other.raw.get(k, 0))
                        for k in keys)
        return KubeResource(**raw_diff)

    def __mul__(self, multiplier):
        new_raw = dict((k, v * multiplier) for k, v in self.raw.items())
        return KubeResource(**new_raw)

    def __rmul__(self, multiplier):
        return self.__mul__(multiplier)

    def __cmp__(self, other):
        """
        should return a negative integer if self < other,
        zero if self == other, a positive integer if self > other.

        we consider self to be greater than other if it exceeds
        the resource amount in other in more resource types.
        e.g. if self = {cpu: 4, memory: 1K, gpu: 1},
        other = {cpu: 2, memory: 2K}, then self exceeds the resource
        amount in other in both cpu and gpu, while other exceeds
        the resource amount in self in only memory, so self > other.

        NOTE(review): Python 3 does not invoke __cmp__ for comparison
        operators; callers must invoke it explicitly.
        """
        resource_diff = (self - other).raw
        num_resource_types = len(resource_diff)
        num_eq = sum(1 for v in resource_diff.values() if v == 0)
        num_less = sum(1 for v in resource_diff.values() if v < 0)
        num_more = num_resource_types - num_eq - num_less
        return num_more - num_less

    def __str__(self):
        return str(self.raw)

    def get(self, key, default=None):
        return self.raw.get(key, default)

    @property
    def possible(self):
        """True when no resource quantity is negative."""
        return all([x >= 0 for x in self.raw.values()])
abc import ABC 14 | from azure.mgmt.compute import ComputeManagementClient 15 | from azure.mgmt.compute.models import VirtualMachineScaleSet, Sku 16 | from azure.mgmt.resource import ResourceManagementClient 17 | 18 | from autoscaler.utils import Future 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | PRIORITY_TAG = 'priority' 24 | # Value should be a json map of NoSchedule taint key-values 25 | NO_SCHEDULE_TAINTS_TAG = 'no_schedule_taints' 26 | 27 | 28 | class AzureScaleSet: 29 | def __init__(self, location: str, resource_group: str, name: str, instance_type: str, capacity: int, 30 | provisioning_state: str, timeout_until: datetime = None, timeout_reason: str = None, priority: int = None, 31 | no_schedule_taints: Mapping[str, str] = {}) -> None: 32 | self.name = name 33 | self.instance_type = instance_type 34 | self.capacity = capacity 35 | self.provisioning_state = provisioning_state 36 | self.resource_group = resource_group 37 | self.location = location 38 | self.timeout_until = timeout_until 39 | self.timeout_reason = timeout_reason 40 | self.priority = priority 41 | self.no_schedule_taints = no_schedule_taints 42 | 43 | def __str__(self): 44 | return 'AzureScaleSet({}, {}, {}, {})'.format(self.name, self.instance_type, self.capacity, self.provisioning_state) 45 | 46 | def __repr__(self): 47 | return str(self) 48 | 49 | def _key(self): 50 | return (self.name, self.instance_type, self.capacity, self.provisioning_state, self.resource_group, self.location, 51 | self.timeout_until, self.timeout_reason, self.priority, tuple(self.no_schedule_taints.items())) 52 | 53 | def __eq__(self, other: object) -> bool: 54 | if not isinstance(other, AzureScaleSet): 55 | return False 56 | return self._key() == other._key() 57 | 58 | def __hash__(self) -> int: 59 | return hash(self._key()) 60 | 61 | 62 | class AzureScaleSetInstance: 63 | def __init__(self, instance_id: str, vm_id: str, launch_time: datetime) -> None: 64 | self.instance_id = instance_id 65 | self.vm_id 
= vm_id 66 | self.launch_time = launch_time 67 | 68 | def __str__(self): 69 | return 'AzureScaleSetInstance({}, {}, {})'.format(self.instance_id, self.vm_id, self.launch_time) 70 | 71 | def __repr__(self): 72 | return str(self) 73 | 74 | def _key(self): 75 | return (self.instance_id, self.vm_id, self.launch_time) 76 | 77 | def __eq__(self, other: object) -> bool: 78 | if not isinstance(other, AzureScaleSetInstance): 79 | return False 80 | return self._key() == other._key() 81 | 82 | def __hash__(self) -> int: 83 | return hash(self._key()) 84 | 85 | 86 | class AzureApi(ABC): 87 | def list_scale_sets(self, resource_group_name: str) -> List[AzureScaleSet]: 88 | pass 89 | 90 | def list_scale_set_instances(self, scale_set: AzureScaleSet) -> List[AzureScaleSetInstance]: 91 | pass 92 | 93 | def update_scale_set(self, scale_set: AzureScaleSet, new_capacity: int) -> Future: 94 | pass 95 | 96 | def terminate_scale_set_instances(self, scale_set: AzureScaleSet, instances: List[AzureScaleSetInstance]) -> Future: 97 | pass 98 | 99 | def get_remaining_instances(self, resource_group_name: str, sku: str) -> int: 100 | pass 101 | 102 | 103 | TIMEOUT_PERIOD = timedelta(minutes=15) 104 | 105 | 106 | # Mangles a SKU name into the family name used for quotas 107 | def _azure_sku_family(name: str) -> str: 108 | match = re.match('Standard_(?P[A-Z]{1,2})[0-9]{1,2}_?(?Pv[0-9])?', name) 109 | if match is None: 110 | raise ValueError("SKU not from a recognized family: " + name) 111 | family = match.group('family') 112 | result = "standard" + family 113 | # Special case for one of Azure's new SKUs :( 114 | if family == 'ND': 115 | result += 'S' 116 | if match.group('version') is not None: 117 | result += match.group('version') 118 | result += 'Family' 119 | return result 120 | 121 | 122 | class AzureWrapper(AzureApi): 123 | def __init__(self, compute_client: ComputeManagementClient, monitor_client: MonitorClient, resource_client: ResourceManagementClient) -> None: 124 | self._compute_client = 
class AzureWrapper(AzureApi):
    """AzureApi implementation backed by the live Azure management SDK clients."""

    def __init__(self, compute_client: ComputeManagementClient, monitor_client: MonitorClient, resource_client: ResourceManagementClient) -> None:
        self._compute_client = compute_client
        self._monitor_client = monitor_client
        self._resource_client = resource_client

    def list_scale_sets(self, resource_group_name: str) -> List[AzureScaleSet]:
        """List scale sets in the resource group, annotated with timeout info.

        Scans the last TIMEOUT_PERIOD of activity-log failures so scale sets
        that recently failed to scale can be temporarily avoided.
        """
        fifteen_minutes_ago = datetime.now(pytz.utc) - TIMEOUT_PERIOD
        filter_clause = "eventTimestamp ge '{}' and resourceGroupName eq '{}'".format(fifteen_minutes_ago, resource_group_name)
        select_clause = "authorization,status,subStatus,properties,resourceId,eventTimestamp"

        failures_by_scale_set: MutableMapping[str, List[EventData]] = {}
        for log in self._monitor_client.activity_logs.list(filter=filter_clause, select=select_clause):
            if (log.status and log.status.value == 'Failed') or (log.properties and log.properties.get('statusCode') == 'Conflict'):
                # Failed deletes should not count against scaling up.
                if log.authorization and log.authorization.action and 'delete' in log.authorization.action:
                    continue
                failures_by_scale_set.setdefault(log.resource_id, []).append(log)

        result = []
        for scale_set in self._compute_client.virtual_machine_scale_sets.list(resource_group_name):
            # Newest failure first, so the timeout reflects the latest error.
            failures = sorted(failures_by_scale_set.get(scale_set.id, []), key=lambda x: x.event_timestamp, reverse=True)
            timeout_until = None
            timeout_reason = None
            for failure in failures:
                status_message = json.loads(failure.properties.get('statusMessage', "{}")) if failure.properties else {}
                error_details = status_message.get('error', {})
                if 'message' in error_details:
                    timeout_until = failure.event_timestamp + TIMEOUT_PERIOD
                    timeout_reason = error_details['message']
                    # Stop if we found a message with details
                    break
                if timeout_until is None:
                    timeout_until = failure.event_timestamp + TIMEOUT_PERIOD
                    timeout_reason = failure.sub_status.localized_value

            # NOTE(review): tags can presumably be None on a scale set with no
            # tags at all; guard so an untagged scale set doesn't crash the
            # listing -- confirm against the SDK's model.
            tags = scale_set.tags or {}
            priority = int(tags[PRIORITY_TAG]) if PRIORITY_TAG in tags else None
            no_schedule_taints = json.loads(tags.get(NO_SCHEDULE_TAINTS_TAG, '{}'))

            result.append(AzureScaleSet(scale_set.location, resource_group_name, scale_set.name, scale_set.sku.name,
                                        scale_set.sku.capacity, scale_set.provisioning_state, timeout_until=timeout_until,
                                        timeout_reason=timeout_reason, priority=priority, no_schedule_taints=no_schedule_taints))
        return result

    def list_scale_set_instances(self, scale_set: AzureScaleSet) -> List[AzureScaleSetInstance]:
        """Return one AzureScaleSetInstance per VM in the scale set."""
        result = []
        for instance in self._compute_client.virtual_machine_scale_set_vms.list(scale_set.resource_group, scale_set.name, expand="instanceView"):
            # Fall back to "now" for instances that have not finished provisioning.
            launch_time = datetime.now(pytz.utc)
            for status in instance.instance_view.statuses:
                if status.code == 'ProvisioningState/succeeded':
                    launch_time = status.time
                    break
            result.append(AzureScaleSetInstance(instance.instance_id, instance.vm_id, launch_time))
        return result

    def update_scale_set(self, scale_set: AzureScaleSet, new_capacity: int) -> Future:
        """Asynchronously set the scale set's capacity; returns a Future."""
        parameters = VirtualMachineScaleSet(scale_set.location, sku=Sku(name=scale_set.instance_type, capacity=new_capacity))
        azure_op = self._compute_client.virtual_machine_scale_sets.create_or_update(scale_set.resource_group, scale_set.name,
                                                                                   parameters=parameters)
        return AzureOperationPollerFutureAdapter(azure_op)

    def terminate_scale_set_instances(self, scale_set: AzureScaleSet, instances: List[AzureScaleSetInstance]) -> Future:
        """Asynchronously delete the given instances; returns a Future."""
        future = self._compute_client.virtual_machine_scale_sets.delete_instances(scale_set.resource_group, scale_set.name, [instance.instance_id for instance in instances])
        return AzureOperationPollerFutureAdapter(future)

    def get_remaining_instances(self, resource_group_name: str, sku: str) -> int:
        """Return how many more instances of `sku` the family core quota allows."""
        resource_group = self._resource_client.resource_groups.get(resource_group_name)
        cores_per_instance = None
        for vm_size in self._compute_client.virtual_machine_sizes.list(location=resource_group.location):
            if vm_size.name == sku:
                cores_per_instance = vm_size.number_of_cores
                break

        if cores_per_instance is None:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning("No metadata found for sku: " + sku)
            return 0

        for usage in self._compute_client.usage.list(location=resource_group.location):
            if usage.name.value == _azure_sku_family(sku):
                return (usage.limit - usage.current_value) // cores_per_instance

        logger.warning("No quota found matching: " + sku)
        return 0
class AzureWriteThroughCachedApi(AzureApi):
    """Caching decorator around another AzureApi.

    Reads are served from in-memory caches; writes go straight to the
    delegate and invalidate the affected cache entries once they complete.
    All cache access happens under a single re-entrant lock.
    """

    def __init__(self, delegate: AzureApi) -> None:
        self._delegate = delegate
        self._lock = RLock()
        # (resource_group, scale_set_name) -> cached instance list
        self._instance_cache: MutableMapping[Tuple[str, str], List[AzureScaleSetInstance]] = {}
        # resource_group -> cached scale-set list
        self._scale_set_cache: MutableMapping[str, List[AzureScaleSet]] = {}
        # resource_group -> {sku -> remaining quota}
        self._remaining_instances_cache: MutableMapping[str, MutableMapping[str, int]] = {}

    def invalidate_quota_cache(self, resource_group_name: str) -> None:
        """Drop any cached quota numbers for the resource group."""
        with self._lock:
            self._remaining_instances_cache.pop(resource_group_name, None)

    def list_scale_sets(self, resource_group_name: str, force_refresh=False) -> List[AzureScaleSet]:
        """Return scale sets for the group, from cache unless force_refresh."""
        if not force_refresh:
            with self._lock:
                cached = self._scale_set_cache.get(resource_group_name)
                if cached is not None:
                    return deepcopy(cached)

        fresh = self._delegate.list_scale_sets(resource_group_name)
        with self._lock:
            previous = {entry.name: entry for entry in self._scale_set_cache.get(resource_group_name, [])}
            for scale_set in fresh:
                stale = previous.get(scale_set.name)
                if stale is None:
                    continue

                # A capacity change we did not make means the Scale Set was
                # modified externally, so its instance list is no longer valid.
                if stale.capacity != scale_set.capacity:
                    self._instance_cache.pop((resource_group_name, scale_set.name), None)

            self._scale_set_cache[resource_group_name] = fresh
        return deepcopy(fresh)

    def list_scale_set_instances(self, scale_set: AzureScaleSet) -> List[AzureScaleSetInstance]:
        """Return the scale set's instances, from cache when possible."""
        key = (scale_set.resource_group, scale_set.name)
        with self._lock:
            cached = self._instance_cache.get(key)
            if cached is not None:
                return deepcopy(cached)

        instances = self._delegate.list_scale_set_instances(scale_set)
        # Make sure we don't poison the cache, if our delegate is eventually consistent
        if len(instances) == scale_set.capacity:
            with self._lock:
                self._instance_cache[key] = instances
        return deepcopy(instances)

    def update_scale_set(self, scale_set: AzureScaleSet, new_capacity: int) -> Future:
        """Write through to the delegate; invalidate caches when it completes."""
        future = self._delegate.update_scale_set(scale_set, new_capacity)
        future.add_done_callback(lambda _: self._invalidate(scale_set.resource_group, scale_set.name))
        return future

    def terminate_scale_set_instances(self, scale_set: AzureScaleSet, instances: List[AzureScaleSetInstance]) -> Future:
        """Write through to the delegate; invalidate caches when it completes."""
        future = self._delegate.terminate_scale_set_instances(scale_set, instances)
        future.add_done_callback(lambda _: self._invalidate(scale_set.resource_group, scale_set.name))
        return future

    def get_remaining_instances(self, resource_group_name: str, sku: str) -> int:
        """Return (and memoize) the remaining quota for the sku."""
        with self._lock:
            per_sku = self._remaining_instances_cache.get(resource_group_name)
            if per_sku is not None and sku in per_sku:
                return per_sku[sku]
        remaining = self._delegate.get_remaining_instances(resource_group_name, sku)
        with self._lock:
            self._remaining_instances_cache.setdefault(resource_group_name, {})[sku] = remaining
        return remaining

    def _invalidate(self, resource_group_name: str, scale_set_name: str) -> None:
        """Drop every cache entry a write to the scale set may have stalled."""
        with self._lock:
            self._instance_cache.pop((resource_group_name, scale_set_name), None)
            self._scale_set_cache.pop(resource_group_name, None)
            self._remaining_instances_cache.pop(resource_group_name, None)
-> None: 275 | with self._lock: 276 | if (resource_group_name, scale_set_name) in self._instance_cache: 277 | del self._instance_cache[(resource_group_name, scale_set_name)] 278 | 279 | if resource_group_name in self._scale_set_cache: 280 | del self._scale_set_cache[resource_group_name] 281 | 282 | if resource_group_name in self._remaining_instances_cache: 283 | del self._remaining_instances_cache[resource_group_name] 284 | 285 | 286 | _AZURE_API_MAX_WAIT = 10*60 287 | 288 | 289 | # Adapts an Azure async operation to behave like a Future 290 | class AzureOperationPollerFutureAdapter(Future): 291 | def __init__(self, azure_operation): 292 | self._done = False 293 | self._result = None 294 | self._exception = None 295 | # NOTE: All this complexity with a Condition is here because AzureOperationPoller is not reentrant, 296 | # so a callback added with add_done_callback() could not call result(), if we delegated everything 297 | self._condition = Condition() 298 | self._callbacks = [] 299 | self.azure_operation = azure_operation 300 | azure_operation.add_done_callback(self._handle_completion) 301 | 302 | def _handle_completion(self, result): 303 | with self._condition: 304 | self._done = True 305 | if self.azure_operation._exception is None: 306 | self._result = result 307 | else: 308 | self._exception = self.azure_operation._exception 309 | self._condition.notifyAll() 310 | callbacks = self._callbacks 311 | self._callbacks.clear() 312 | 313 | for callback in callbacks: 314 | callback(self) 315 | 316 | def result(self): 317 | callbacks = [] 318 | try: 319 | with self._condition: 320 | if not self._done: 321 | self._condition.wait(_AZURE_API_MAX_WAIT) 322 | if not self._done: 323 | # We reached the timeout 324 | self._exception = TimeoutError() 325 | self._done = True 326 | callbacks = self._callbacks 327 | self._callbacks.clear() 328 | if self._exception: 329 | raise self._exception 330 | return self._result 331 | finally: 332 | for callback in callbacks: 333 | 
callback(self) 334 | 335 | def add_done_callback(self, fn): 336 | with self._condition: 337 | if self._done: 338 | fn(self) 339 | else: 340 | self._callbacks.append(fn) 341 | -------------------------------------------------------------------------------- /test/test_cluster.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os.path 4 | import unittest 5 | import copy 6 | from datetime import datetime, timedelta 7 | 8 | import boto3 9 | import pykube 10 | import mock 11 | import moto 12 | import yaml 13 | import pytz 14 | 15 | from autoscaler.cluster import Cluster, ClusterNodeState 16 | from autoscaler.kube import KubePod, KubeNode 17 | import autoscaler.utils as utils 18 | 19 | 20 | class TestCluster(unittest.TestCase): 21 | def setUp(self): 22 | # load dummy kube specs 23 | dir_path = os.path.dirname(os.path.realpath(__file__)) 24 | with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: 25 | self.dummy_pod = yaml.load(f.read()) 26 | with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f: 27 | self.dummy_ds_pod = yaml.load(f.read()) 28 | with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f: 29 | self.dummy_rc_pod = yaml.load(f.read()) 30 | with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f: 31 | self.dummy_node = yaml.load(f.read()) 32 | for condition in self.dummy_node['status']['conditions']: 33 | if condition['type'] == 'Ready' and condition['status'] == 'True': 34 | condition['lastHeartbeatTime'] = datetime.now(condition['lastHeartbeatTime'].tzinfo) 35 | # Convert timestamps to strings to match PyKube 36 | for condition in self.dummy_node['status']['conditions']: 37 | condition['lastHeartbeatTime'] = datetime.isoformat(condition['lastHeartbeatTime']) 38 | condition['lastTransitionTime'] = datetime.isoformat(condition['lastTransitionTime']) 39 | 40 | 41 | # this isn't actually used here 42 | # only needed to create the KubePod object... 
43 | self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config')) 44 | 45 | # start creating our mock ec2 environment 46 | self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()] 47 | for moto_mock in self.mocks: 48 | moto_mock.start() 49 | 50 | client = boto3.client('autoscaling', region_name='us-west-2') 51 | self.asg_client = client 52 | 53 | client.create_launch_configuration( 54 | LaunchConfigurationName='dummy-lc', 55 | ImageId='ami-deadbeef', 56 | KeyName='dummy-key', 57 | SecurityGroups=[ 58 | 'sg-cafebeef', 59 | ], 60 | InstanceType='t2.medium' 61 | ) 62 | 63 | client.create_auto_scaling_group( 64 | AutoScalingGroupName='dummy-asg', 65 | LaunchConfigurationName='dummy-lc', 66 | MinSize=0, 67 | MaxSize=10, 68 | VPCZoneIdentifier='subnet-beefbeef', 69 | Tags=[ 70 | { 71 | 'Key': 'KubernetesCluster', 72 | 'Value': 'dummy-cluster', 73 | 'PropagateAtLaunch': True 74 | }, 75 | { 76 | 'Key': 'KubernetesRole', 77 | 'Value': 'worker', 78 | 'PropagateAtLaunch': True 79 | } 80 | ] 81 | ) 82 | 83 | # finally our cluster 84 | self.cluster = Cluster( 85 | aws_access_key='fake', 86 | aws_secret_key='fake', 87 | aws_regions=['us-west-2', 'us-east-1', 'us-west-1'], 88 | azure_client_id='', 89 | azure_client_secret='', 90 | azure_subscription_id='', 91 | azure_tenant_id='', 92 | azure_resource_group_names=[], 93 | azure_slow_scale_classes=[], 94 | kubeconfig='~/.kube/config', 95 | pod_namespace=None, 96 | drain_utilization_below=0.3, 97 | idle_threshold=60, 98 | instance_init_time=60, 99 | type_idle_threshold=60, 100 | cluster_name='dummy-cluster', 101 | notifier=mock.Mock(), 102 | dry_run=False, 103 | use_aws_iam_role=False 104 | ) 105 | 106 | def tearDown(self): 107 | for moto_mock in self.mocks: 108 | moto_mock.stop() 109 | 110 | def _spin_up_node(self, launch_time=None): 111 | return self._spin_up_nodes(1, launch_time=launch_time)[0] 112 | 113 | def _spin_up_nodes(self, count, launch_time=None): 114 | assert count <= 256 115 | # spin up dummy ec2 node 116 
| self.asg_client.set_desired_capacity(AutoScalingGroupName='dummy-asg', 117 | DesiredCapacity=count) 118 | response = self.asg_client.describe_auto_scaling_groups() 119 | nodes = [] 120 | for i, instance in enumerate(response['AutoScalingGroups'][0]['Instances']): 121 | instance_id = instance['InstanceId'] 122 | 123 | dummy_node = copy.deepcopy(self.dummy_node) 124 | dummy_node['metadata']['labels']['aws/id'] = instance_id 125 | dummy_node['metadata']['name'] = '10.0.' + str(i) + '.228' 126 | node = KubeNode(pykube.Node(self.api, dummy_node)) 127 | node.cordon = mock.Mock(return_value="mocked stuff") 128 | node.drain = mock.Mock(return_value="mocked stuff") 129 | node.uncordon = mock.Mock(return_value="mocked stuff") 130 | node.delete = mock.Mock(return_value="mocked stuff") 131 | nodes.append(node) 132 | return nodes 133 | 134 | def test_reap_dead_node(self): 135 | node = copy.deepcopy(self.dummy_node) 136 | TestInstance = collections.namedtuple('TestInstance', ['launch_time']) 137 | instance = TestInstance(datetime.now(pytz.utc)) 138 | 139 | ready_condition = None 140 | for condition in node['status']['conditions']: 141 | if condition['type'] == 'Ready': 142 | ready_condition = condition 143 | break 144 | ready_condition['status'] = 'Unknown' 145 | 146 | ready_condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(minutes=30)) 147 | kube_node = KubeNode(pykube.Node(self.api, node)) 148 | kube_node.delete = mock.Mock(return_value="mocked stuff") 149 | self.cluster.maintain([kube_node], {kube_node.instance_id: instance}, {}, [], []) 150 | kube_node.delete.assert_not_called() 151 | 152 | ready_condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(hours=2)) 153 | kube_node = KubeNode(pykube.Node(self.api, node)) 154 | kube_node.delete = mock.Mock(return_value="mocked stuff") 155 | self.cluster.maintain([kube_node], {kube_node.instance_id: instance}, {}, [], []) 156 | 
kube_node.delete.assert_called_once_with() 157 | 158 | def test_max_scale_in(self): 159 | node1 = copy.deepcopy(self.dummy_node) 160 | node2 = copy.deepcopy(self.dummy_node) 161 | TestInstance = collections.namedtuple('TestInstance', ['launch_time']) 162 | instance1 = TestInstance(datetime.now(pytz.utc)) 163 | instance2 = TestInstance(datetime.now(pytz.utc)) 164 | 165 | for node in [node1, node2]: 166 | for condition in node['status']['conditions']: 167 | if condition['type'] == 'Ready': 168 | condition['status'] = 'Unknown' 169 | condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(hours=2)) 170 | break 171 | 172 | kube_node1 = KubeNode(pykube.Node(self.api, node1)) 173 | kube_node1.delete = mock.Mock(return_value="mocked stuff") 174 | kube_node2 = KubeNode(pykube.Node(self.api, node2)) 175 | kube_node2.delete = mock.Mock(return_value="mocked stuff") 176 | self.cluster.maintain([kube_node1, kube_node2], {kube_node1.instance_id: instance1, kube_node2.instance_id: instance2}, {}, [], []) 177 | kube_node1.delete.assert_not_called() 178 | kube_node2.delete.assert_not_called() 179 | 180 | def test_scale_up_selector(self): 181 | self.dummy_pod['spec']['nodeSelector'] = { 182 | 'aws/type': 'm4.large' 183 | } 184 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 185 | selectors_hash = utils.selectors_to_hash(pod.selectors) 186 | asgs = self.cluster.autoscaling_groups.get_all_groups([]) 187 | self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) 188 | 189 | response = self.asg_client.describe_auto_scaling_groups() 190 | self.assertEqual(len(response['AutoScalingGroups']), 1) 191 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) 192 | 193 | def test_scale_up(self): 194 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 195 | selectors_hash = utils.selectors_to_hash(pod.selectors) 196 | asgs = self.cluster.autoscaling_groups.get_all_groups([]) 197 | self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) 
198 | 199 | response = self.asg_client.describe_auto_scaling_groups() 200 | self.assertEqual(len(response['AutoScalingGroups']), 1) 201 | self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) 202 | 203 | def test_scale_up_notification(self): 204 | big_pod_spec = copy.deepcopy(self.dummy_pod) 205 | for container in big_pod_spec['spec']['containers']: 206 | container['resources']['requests']['cpu'] = '100' 207 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 208 | big_pod = KubePod(pykube.Pod(self.api, big_pod_spec)) 209 | selectors_hash = utils.selectors_to_hash(pod.selectors) 210 | asgs = self.cluster.autoscaling_groups.get_all_groups([]) 211 | self.cluster.fulfill_pending(asgs, selectors_hash, [pod, big_pod]) 212 | self.cluster.notifier.notify_scale.assert_called_with(mock.ANY, mock.ANY, [pod]) 213 | 214 | def test_timed_out_group(self): 215 | with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.is_timed_out') as is_timed_out: 216 | with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.scale') as scale: 217 | is_timed_out.return_value = True 218 | scale.return_value = utils.CompletedFuture(None) 219 | 220 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 221 | selectors_hash = utils.selectors_to_hash(pod.selectors) 222 | asgs = self.cluster.autoscaling_groups.get_all_groups([]) 223 | self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) 224 | 225 | scale.assert_not_called() 226 | 227 | response = self.asg_client.describe_auto_scaling_groups() 228 | self.assertEqual(len(response['AutoScalingGroups']), 1) 229 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) 230 | 231 | def test_scale_down(self): 232 | """ 233 | kube node with daemonset and no pod --> cordon 234 | """ 235 | node = self._spin_up_node() 236 | 237 | all_nodes = [node] 238 | managed_nodes = [n for n in all_nodes if node.is_managed()] 239 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 240 | 
pods_to_schedule = {} 241 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 242 | 243 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 244 | running_or_pending_assigned_pods = [ds_pod] 245 | 246 | self.cluster.idle_threshold = -1 247 | self.cluster.type_idle_threshold = -1 248 | self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 249 | self.cluster.maintain( 250 | managed_nodes, running_insts_map, 251 | pods_to_schedule, running_or_pending_assigned_pods, asgs) 252 | 253 | response = self.asg_client.describe_auto_scaling_groups() 254 | self.assertEqual(len(response['AutoScalingGroups']), 1) 255 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 256 | node.cordon.assert_called_once_with() 257 | 258 | def test_scale_down_launch_grace_period(self): 259 | """ 260 | kube node with daemonset and no pod + launch grace period --> noop 261 | """ 262 | node = self._spin_up_node() 263 | all_nodes = [node] 264 | managed_nodes = [n for n in all_nodes if node.is_managed()] 265 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 266 | pods_to_schedule = {} 267 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 268 | 269 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 270 | running_or_pending_assigned_pods = [ds_pod] 271 | 272 | self.cluster.idle_threshold = -1 273 | self.cluster.type_idle_threshold = -1 274 | self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = 60*30 275 | self.cluster.maintain( 276 | managed_nodes, running_insts_map, 277 | pods_to_schedule, running_or_pending_assigned_pods, asgs) 278 | 279 | response = self.asg_client.describe_auto_scaling_groups() 280 | self.assertEqual(len(response['AutoScalingGroups']), 1) 281 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 282 | node.cordon.assert_not_called() 283 | 284 | def test_scale_down_grace_period(self): 285 | """ 286 | kube node with daemonset and no pod + grace period --> noop 287 | """ 288 | node 
= self._spin_up_node() 289 | all_nodes = [node] 290 | managed_nodes = [n for n in all_nodes if node.is_managed()] 291 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 292 | pods_to_schedule = {} 293 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 294 | 295 | # kube node with daemonset and no pod --> cordon 296 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 297 | running_or_pending_assigned_pods = [ds_pod] 298 | 299 | self.cluster.maintain( 300 | managed_nodes, running_insts_map, 301 | pods_to_schedule, running_or_pending_assigned_pods, asgs) 302 | 303 | response = self.asg_client.describe_auto_scaling_groups() 304 | self.assertEqual(len(response['AutoScalingGroups']), 1) 305 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 306 | node.cordon.assert_not_called() 307 | 308 | def test_scale_down_busy(self): 309 | """ 310 | kube node with daemonset and pod/rc-pod --> noop 311 | """ 312 | node = self._spin_up_node() 313 | all_nodes = [node] 314 | managed_nodes = [n for n in all_nodes if node.is_managed()] 315 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 316 | pods_to_schedule = {} 317 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 318 | 319 | # kube node with daemonset and pod --> noop 320 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 321 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 322 | rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) 323 | 324 | pod_scenarios = [ 325 | # kube node with daemonset and pod --> noop 326 | [ds_pod, pod], 327 | # kube node with daemonset and rc pod --> noop 328 | [ds_pod, rc_pod] 329 | ] 330 | 331 | # make sure we're not on grace period 332 | self.cluster.idle_threshold = -1 333 | self.cluster.type_idle_threshold = -1 334 | 335 | for pods in pod_scenarios: 336 | state = self.cluster.get_node_state( 337 | node, asgs[0], pods, pods_to_schedule, 338 | running_insts_map, 
collections.Counter()) 339 | self.assertEqual(state, ClusterNodeState.BUSY) 340 | 341 | self.cluster.maintain( 342 | managed_nodes, running_insts_map, 343 | pods_to_schedule, pods, asgs) 344 | 345 | response = self.asg_client.describe_auto_scaling_groups() 346 | self.assertEqual(len(response['AutoScalingGroups']), 1) 347 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 348 | node.cordon.assert_not_called() 349 | 350 | def test_scale_down_under_utilized_undrainable(self): 351 | """ 352 | kube node with daemonset and pod/rc-pod --> noop 353 | """ 354 | node = self._spin_up_node() 355 | all_nodes = [node] 356 | managed_nodes = [n for n in all_nodes if node.is_managed()] 357 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 358 | pods_to_schedule = {} 359 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 360 | 361 | # create some undrainable pods 362 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 363 | for container in self.dummy_pod['spec']['containers']: 364 | container.pop('resources', None) 365 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 366 | self.dummy_rc_pod['metadata']['labels']['openai/do-not-drain'] = 'true' 367 | for container in self.dummy_rc_pod['spec']['containers']: 368 | container.pop('resources', None) 369 | rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) 370 | 371 | pod_scenarios = [ 372 | # kube node with daemonset and pod with no resource ask --> noop 373 | [ds_pod, pod], 374 | # kube node with daemonset and critical rc pod --> noop 375 | [ds_pod, rc_pod] 376 | ] 377 | 378 | # make sure we're not on grace period 379 | self.cluster.idle_threshold = -1 380 | self.cluster.type_idle_threshold = -1 381 | self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 382 | 383 | for pods in pod_scenarios: 384 | state = self.cluster.get_node_state( 385 | node, asgs[0], pods, pods_to_schedule, 386 | running_insts_map, collections.Counter()) 387 | 
self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE) 388 | 389 | self.cluster.maintain( 390 | managed_nodes, running_insts_map, 391 | pods_to_schedule, pods, asgs) 392 | 393 | response = self.asg_client.describe_auto_scaling_groups() 394 | self.assertEqual(len(response['AutoScalingGroups']), 1) 395 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 396 | node.cordon.assert_not_called() 397 | 398 | def test_scale_down_under_utilized_drainable(self): 399 | """ 400 | kube node with daemonset and rc-pod --> cordon+drain 401 | """ 402 | node = self._spin_up_node() 403 | all_nodes = [node] 404 | managed_nodes = [n for n in all_nodes if node.is_managed()] 405 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 406 | pods_to_schedule = {} 407 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 408 | 409 | # create some undrainable pods 410 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 411 | for container in self.dummy_rc_pod['spec']['containers']: 412 | container.pop('resources', None) 413 | rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) 414 | pods = [ds_pod, rc_pod] 415 | 416 | # make sure we're not on grace period 417 | self.cluster.idle_threshold = -1 418 | self.cluster.type_idle_threshold = -1 419 | self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 420 | 421 | state = self.cluster.get_node_state( 422 | node, asgs[0], pods, pods_to_schedule, 423 | running_insts_map, collections.Counter()) 424 | self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_DRAINABLE) 425 | 426 | self.cluster.maintain( 427 | managed_nodes, running_insts_map, 428 | pods_to_schedule, pods, asgs) 429 | 430 | response = self.asg_client.describe_auto_scaling_groups() 431 | self.assertEqual(len(response['AutoScalingGroups']), 1) 432 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 433 | node.cordon.assert_called_once_with() 434 | node.drain.assert_called_once_with(pods, 
notifier=mock.ANY) 435 | 436 | def test_prioritization(self): 437 | TestingGroup = collections.namedtuple('TestingGroup', ['region', 'name', 'selectors', 'global_priority', 'is_spot']) 438 | high_pri = TestingGroup('test', 'test', {}, -1, False) 439 | low_pri = TestingGroup('test', 'test', {}, 0, False) 440 | 441 | self.assertEqual([high_pri, low_pri], list(self.cluster._prioritize_groups([low_pri, high_pri]))) 442 | -------------------------------------------------------------------------------- /autoscaler/autoscaling_groups.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import re 4 | from concurrent.futures import ThreadPoolExecutor 5 | 6 | import botocore 7 | import pytz 8 | 9 | import autoscaler.aws_utils as aws_utils 10 | import autoscaler.utils as utils 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class AutoScalingGroups(object): 16 | _BOTO_CLIENT_TYPE = 'autoscaling' 17 | 18 | _CLUSTER_KEY = 'KubernetesCluster' 19 | _ROLE_KEYS = ('KubernetesRole', 'Role') 20 | _WORKER_ROLE_VALUES = ('worker', 'kubernetes-minion') 21 | 22 | def __init__(self, session, regions, cluster_name=None): 23 | """ 24 | cluster_name - if set, filter ASGs by cluster_name in tag field 25 | _CLUSTER_KEY 26 | """ 27 | self.session = session 28 | self.regions = regions 29 | self.cluster_name = cluster_name 30 | 31 | @staticmethod 32 | def get_all_raw_groups_and_launch_configs(client): 33 | raw_groups = aws_utils.fetch_all( 34 | client.describe_auto_scaling_groups, {'MaxRecords': 100}, 'AutoScalingGroups') 35 | all_launch_configs = {} 36 | batch_size = 50 37 | for launch_config_idx in range(0, len(raw_groups), batch_size): 38 | groups = raw_groups[launch_config_idx*batch_size:(launch_config_idx+1)*batch_size] 39 | kwargs = { 40 | 'LaunchConfigurationNames': [g['LaunchConfigurationName'] for g in groups] 41 | } 42 | launch_configs = aws_utils.fetch_all( 43 | 
client.describe_launch_configurations, 44 | kwargs, 'LaunchConfigurations') 45 | all_launch_configs.update((lc['LaunchConfigurationName'], lc) 46 | for lc in launch_configs) 47 | return raw_groups, all_launch_configs 48 | 49 | def get_all_groups(self, kube_nodes): 50 | groups = [] 51 | with ThreadPoolExecutor(max_workers=max(1, len(self.regions))) as executor: 52 | raw_groups_and_launch_configs = {} 53 | for region in self.regions: 54 | client = self.session.client(self._BOTO_CLIENT_TYPE, 55 | region_name=region) 56 | raw_groups_and_launch_configs[region] = executor.submit( 57 | AutoScalingGroups.get_all_raw_groups_and_launch_configs, client) 58 | 59 | for region in self.regions: 60 | raw_groups, launch_configs = raw_groups_and_launch_configs[region].result() 61 | 62 | client = self.session.client(self._BOTO_CLIENT_TYPE, 63 | region_name=region) 64 | for raw_group in sorted(raw_groups, key=lambda g: g['AutoScalingGroupName']): 65 | if self.cluster_name: 66 | cluster_name = None 67 | role = None 68 | for tag in raw_group['Tags']: 69 | if tag['Key'] == self._CLUSTER_KEY: 70 | cluster_name = tag['Value'] 71 | elif tag['Key'] in self._ROLE_KEYS: 72 | role = tag['Value'] 73 | if cluster_name != self.cluster_name or role not in self._WORKER_ROLE_VALUES: 74 | continue 75 | 76 | groups.append(AutoScalingGroup( 77 | client, region, kube_nodes, raw_group, 78 | launch_configs[raw_group['LaunchConfigurationName']])) 79 | 80 | return groups 81 | 82 | 83 | class AutoScalingTimeouts(object): 84 | _TIMEOUT = 3600 # 1 hour 85 | _SPOT_REQUEST_TIMEOUT = 300 # 5 minutes 86 | _MAX_OUTBIDS_IN_INTERVAL = 60*20 # 20 minutes 87 | _SPOT_HISTORY_PERIOD = 60*60*5 # 5 hours 88 | 89 | def __init__(self, session): 90 | """ 91 | """ 92 | self.session = session 93 | 94 | # ASGs to avoid because of recent launch failures 95 | # e.g. 
a region running out of capacity 96 | # try to favor other regions 97 | self._timeouts = {} 98 | self._last_activities = {} 99 | 100 | # ASGs to avoid because of spot pricing history 101 | self._spot_timeouts = {} 102 | self._spot_price_history = {} 103 | 104 | def refresh_timeouts(self, asgs, dry_run=False): 105 | """ 106 | refresh timeouts on ASGs using new data from aws 107 | """ 108 | self.time_out_spot_asgs(asgs) 109 | 110 | asgs_by_region = {} 111 | for asg in asgs: 112 | asgs_by_region.setdefault(asg.region, []).append(asg) 113 | 114 | for region, regional_asgs in asgs_by_region.items(): 115 | client = self.session.client('autoscaling', region_name=region) 116 | start_time_cutoff = None 117 | newest_completed_activity = None 118 | activities = {} 119 | for activity in self.iter_activities(client): 120 | if newest_completed_activity is None and activity['Progress'] == 100: 121 | newest_completed_activity = activity 122 | if activity['ActivityId'] == self._last_activities.get(region, None): 123 | break 124 | if start_time_cutoff is None: 125 | start_time_cutoff = ( 126 | datetime.datetime.now(activity['StartTime'].tzinfo) - 127 | datetime.timedelta(seconds=self._TIMEOUT)) 128 | if activity['StartTime'] < start_time_cutoff: 129 | # skip events that are too old to cut down the time 130 | # it takes the first time to go through events 131 | break 132 | activities.setdefault(activity['AutoScalingGroupName'], []).append(activity) 133 | 134 | self._last_activities[region] = newest_completed_activity['ActivityId'] 135 | for asg in regional_asgs: 136 | self.reconcile_limits(asg, activities.get(asg.name, []), dry_run=dry_run) 137 | 138 | def iter_activities(self, client): 139 | next_token = None 140 | while True: 141 | kwargs = {} 142 | if next_token: 143 | kwargs['NextToken'] = next_token 144 | data = client.describe_scaling_activities(**kwargs) 145 | for item in data['Activities']: 146 | yield item 147 | next_token = data.get('NextToken') 148 | if not next_token: 149 
| break 150 | 151 | def revert_capacity(self, asg, entry, dry_run): 152 | """ 153 | try to decrease desired capacity to the original 154 | capacity before the capacity increase that caused 155 | the ASG activity entry. 156 | """ 157 | cause_m = AutoScalingCauseMessages.LAUNCH_INSTANCE.search(entry.get('Cause', '')) 158 | if cause_m: 159 | original_capacity = int(cause_m.group('original_capacity')) 160 | if asg.desired_capacity > original_capacity: 161 | # we tried to go over capacity and failed 162 | # now set the desired capacity back to a normal range 163 | if not dry_run: 164 | asg.set_desired_capacity(original_capacity) 165 | else: 166 | logger.info('[Dry run] Would have set desired capacity to %s', original_capacity) 167 | return True 168 | return False 169 | 170 | def time_out_asg(self, asg, entry): 171 | self._timeouts[asg._id] = ( 172 | entry['StartTime'] + datetime.timedelta(seconds=self._TIMEOUT)) 173 | logger.info('%s is timed out until %s', 174 | asg.name, self._timeouts[asg._id]) 175 | 176 | def reconcile_limits(self, asg, activities, dry_run=False): 177 | """ 178 | makes sure the ASG has valid capacity by processing errors 179 | in its recent scaling activities. 180 | marks an ASG as timed out if it recently had a capacity 181 | failure. 
182 | """ 183 | for entry in activities: 184 | status_msg = entry.get('StatusMessage', '') 185 | if entry['StatusCode'] in ('Failed', 'Cancelled'): 186 | logger.warn('%s scaling failure: %s', asg, entry) 187 | 188 | m = AutoScalingErrorMessages.INSTANCE_LIMIT.match(status_msg) 189 | if m: 190 | max_desired_capacity = int(m.group('requested')) - 1 191 | if asg.desired_capacity > max_desired_capacity: 192 | self.time_out_asg(asg, entry) 193 | 194 | # we tried to go over capacity and failed 195 | # now set the desired capacity back to a normal range 196 | if not dry_run: 197 | asg.set_desired_capacity(max_desired_capacity) 198 | else: 199 | logger.info('[Dry run] Would have set desired capacity to %s', max_desired_capacity) 200 | return 201 | 202 | m = AutoScalingErrorMessages.VOLUME_LIMIT.match(status_msg) 203 | if m: 204 | # TODO: decrease desired capacity 205 | self.time_out_asg(asg, entry) 206 | return 207 | 208 | m = AutoScalingErrorMessages.CAPACITY_LIMIT.match(status_msg) 209 | if m: 210 | reverted = self.revert_capacity(asg, entry, dry_run) 211 | if reverted: 212 | self.time_out_asg(asg, entry) 213 | return 214 | 215 | m = AutoScalingErrorMessages.AZ_LIMIT.search(status_msg) 216 | if m and 'only-az' in asg.name: 217 | reverted = self.revert_capacity(asg, entry, dry_run) 218 | if reverted: 219 | self.time_out_asg(asg, entry) 220 | return 221 | 222 | m = AutoScalingErrorMessages.SPOT_REQUEST_CANCELLED.search(status_msg) 223 | if m: 224 | # we cancelled a spot request 225 | # don't carry on to reset timeout 226 | continue 227 | 228 | m = AutoScalingErrorMessages.SPOT_LIMIT.match(status_msg) 229 | if m: 230 | self.time_out_asg(asg, entry) 231 | 232 | if not dry_run: 233 | asg.set_desired_capacity(asg.actual_capacity) 234 | else: 235 | logger.info('[Dry run] Would have set desired capacity to %s', asg.actual_capacity) 236 | return 237 | elif entry['StatusCode'] == 'WaitingForSpotInstanceId': 238 | logger.warn('%s waiting for spot: %s', asg, entry) 239 | 240 | 
balance_cause_m = AutoScalingCauseMessages.AZ_BALANCE.search(entry.get('Cause', '')) 241 | if balance_cause_m: 242 | # sometimes ASGs will launch instances in other az's to 243 | # balance out the group 244 | # ignore these events 245 | # even if we cancel it, the ASG will just attempt to 246 | # launch again 247 | logger.info('ignoring AZ balance launch event') 248 | continue 249 | 250 | now = datetime.datetime.now(entry['StartTime'].tzinfo) 251 | if (now - entry['StartTime']) > datetime.timedelta(seconds=self._SPOT_REQUEST_TIMEOUT): 252 | self.time_out_asg(asg, entry) 253 | 254 | # try to cancel spot request and scale down ASG 255 | spot_request_m = AutoScalingErrorMessages.SPOT_REQUEST_WAITING.search(status_msg) 256 | if spot_request_m: 257 | spot_request_id = spot_request_m.group('request_id') 258 | if not dry_run: 259 | cancelled = self.cancel_spot_request(asg.region, spot_request_id) 260 | if cancelled: 261 | asg.set_desired_capacity(asg.desired_capacity - 1) 262 | else: 263 | logger.info('[Dry run] Would have cancelled spot request %s and decremented desired capacity.', 264 | spot_request_id) 265 | # don't return here so that we can cancel more spot requests 266 | 267 | self._timeouts[asg._id] = None 268 | logger.debug('%s has no timeout', asg.name) 269 | 270 | def is_timed_out(self, asg): 271 | timeout = self._timeouts.get(asg._id) 272 | spot_timeout = self._spot_timeouts.get(asg._id) 273 | 274 | if timeout and datetime.datetime.now(timeout.tzinfo) < timeout: 275 | return True 276 | 277 | if spot_timeout and datetime.datetime.now(pytz.utc) < spot_timeout: 278 | return True 279 | 280 | return False 281 | 282 | def cancel_spot_request(self, region, request_id): 283 | client = self.session.client('ec2', 284 | region_name=region) 285 | response = client.describe_spot_instance_requests( 286 | SpotInstanceRequestIds=[request_id] 287 | ) 288 | if len(response['SpotInstanceRequests']) == 0: 289 | return False 290 | 291 | spot_instance_req = 
response['SpotInstanceRequests'][0] 292 | if spot_instance_req['State'] in ('open', 'active'): 293 | response = client.cancel_spot_instance_requests( 294 | SpotInstanceRequestIds=[request_id] 295 | ) 296 | logger.info('Spot instance request %s cancelled.', request_id) 297 | return True 298 | 299 | return False 300 | 301 | def time_out_spot_asgs(self, asgs): 302 | """ 303 | Using recent spot pricing data from AWS, time out spot instance 304 | ASGs that would be outbid for more than _MAX_OUTBIDS_IN_INTERVAL seconds 305 | """ 306 | region_instance_asg_map = {} 307 | for asg in asgs: 308 | if not asg.is_spot: 309 | continue 310 | 311 | instance_asg_map = region_instance_asg_map.setdefault(asg.region, {}) 312 | instance_type = asg.launch_config['InstanceType'] 313 | instance_asg_map.setdefault(instance_type, []).append(asg) 314 | 315 | now = datetime.datetime.now(pytz.utc) 316 | since = now - datetime.timedelta(seconds=self._SPOT_HISTORY_PERIOD) 317 | 318 | for region, instance_asg_map in region_instance_asg_map.items(): 319 | # Expire old history 320 | history = [item for item in self._spot_price_history.get(region, []) if item['Timestamp'] > since] 321 | if history: 322 | newest_spot_price = max(item['Timestamp'] for item in history) 323 | else: 324 | newest_spot_price = since 325 | client = self.session.client('ec2', region_name=region) 326 | kwargs = { 327 | 'StartTime': newest_spot_price, 328 | 'InstanceTypes': list(instance_asg_map.keys()), 329 | 'ProductDescriptions': ['Linux/UNIX'] 330 | } 331 | history.extend(aws_utils.fetch_all( 332 | client.describe_spot_price_history, kwargs, 'SpotPriceHistory')) 333 | self._spot_price_history[region] = history 334 | for instance_type, asgs in instance_asg_map.items(): 335 | for asg in asgs: 336 | last_az_bid = {} 337 | outbid_time = {} 338 | bid_price = float(asg.launch_config['SpotPrice']) 339 | for item in history: 340 | if item['InstanceType'] != instance_type: 341 | continue 342 | 343 | if float(item['SpotPrice']) > 
    def time_out_spot_asgs(self, asgs):
        """
        Using recent spot pricing data from AWS, time out spot instance
        ASGs that would be outbid for more than _MAX_OUTBIDS_IN_INTERVAL seconds
        """
        # group the spot ASGs by region, then by instance type, so one
        # price-history request per region covers all of them
        region_instance_asg_map = {}
        for asg in asgs:
            if not asg.is_spot:
                continue

            instance_asg_map = region_instance_asg_map.setdefault(asg.region, {})
            instance_type = asg.launch_config['InstanceType']
            instance_asg_map.setdefault(instance_type, []).append(asg)

        now = datetime.datetime.now(pytz.utc)
        since = now - datetime.timedelta(seconds=self._SPOT_HISTORY_PERIOD)

        for region, instance_asg_map in region_instance_asg_map.items():
            # Expire old history
            history = [item for item in self._spot_price_history.get(region, []) if item['Timestamp'] > since]
            if history:
                # only fetch prices newer than what we already cached
                newest_spot_price = max(item['Timestamp'] for item in history)
            else:
                newest_spot_price = since
            client = self.session.client('ec2', region_name=region)
            kwargs = {
                'StartTime': newest_spot_price,
                'InstanceTypes': list(instance_asg_map.keys()),
                'ProductDescriptions': ['Linux/UNIX']
            }
            history.extend(aws_utils.fetch_all(
                client.describe_spot_price_history, kwargs, 'SpotPriceHistory'))
            self._spot_price_history[region] = history
            for instance_type, asgs in instance_asg_map.items():
                for asg in asgs:
                    # per-AZ accounting: accumulate how long each AZ's spot
                    # price stayed above this group's bid
                    last_az_bid = {}
                    outbid_time = {}
                    bid_price = float(asg.launch_config['SpotPrice'])
                    for item in history:
                        if item['InstanceType'] != instance_type:
                            continue

                        if float(item['SpotPrice']) > bid_price:
                            # we would've been outbid!
                            # NOTE(review): the delta is previous-seen minus
                            # current timestamp — assumes history iterates
                            # newest-first (describe_spot_price_history
                            # order); verify against AWS docs
                            if item['AvailabilityZone'] in last_az_bid:
                                time_diff = (last_az_bid[item['AvailabilityZone']] - item['Timestamp'])
                            else:
                                time_diff = datetime.timedelta(seconds=0)
                            outbid_time[item['AvailabilityZone']] = (
                                outbid_time.get(item['AvailabilityZone'], datetime.timedelta(seconds=0)) +
                                time_diff)
                            last_az_bid[item['AvailabilityZone']] = item['Timestamp']

                    if outbid_time:
                        # average outbid duration across the AZs that were outbid
                        avg_outbid_time = sum(t.total_seconds() for t in outbid_time.values()) / len(outbid_time)
                    else:
                        avg_outbid_time = 0.0
                    if avg_outbid_time > self._MAX_OUTBIDS_IN_INTERVAL:
                        self._spot_timeouts[asg._id] = now + datetime.timedelta(seconds=self._TIMEOUT)
                        logger.info('%s (%s) is spot timed out until %s (would have been outbid for %ss on average)',
                                    asg.name, asg.region, self._spot_timeouts[asg._id], avg_outbid_time)
                    else:
                        self._spot_timeouts[asg._id] = None
raw_group['Instances'] 390 | if inst.get('InstanceId')) 391 | self.nodes = [node for node in kube_nodes 392 | if node.instance_id in self.instance_ids] 393 | self.unschedulable_nodes = [n for n in self.nodes if n.unschedulable] 394 | self.no_schedule_taints = {} 395 | 396 | self._id = (self.region, self.name) 397 | 398 | def _extract_selectors(self, region, launch_config, tags_data): 399 | selectors = { 400 | 'aws/type': launch_config['InstanceType'], 401 | 'aws/class': launch_config['InstanceType'][0], 402 | 'aws/ami-id': launch_config['ImageId'], 403 | 'aws/region': region 404 | } 405 | for tag_data in tags_data: 406 | if tag_data['Key'].startswith('kube/'): 407 | selectors[tag_data['Key'][5:]] = tag_data['Value'] 408 | 409 | # adding kube label counterparts 410 | selectors['beta.kubernetes.io/instance-type'] = selectors['aws/type'] 411 | selectors['failure-domain.beta.kubernetes.io/region'] = selectors['aws/region'] 412 | 413 | return selectors 414 | 415 | def is_timed_out(self): 416 | return False 417 | 418 | @property 419 | def global_priority(self): 420 | return 0 421 | 422 | @property 423 | def actual_capacity(self): 424 | return len(self.nodes) 425 | 426 | def set_desired_capacity(self, new_desired_capacity): 427 | """ 428 | sets the desired capacity of the underlying ASG directly. 429 | note that this is for internal control. 430 | for scaling purposes, please use scale() instead. 431 | """ 432 | logger.info("ASG: {} new_desired_capacity: {}".format( 433 | self, new_desired_capacity)) 434 | self.client.set_desired_capacity(AutoScalingGroupName=self.name, 435 | DesiredCapacity=new_desired_capacity, 436 | HonorCooldown=False) 437 | self.desired_capacity = new_desired_capacity 438 | return utils.CompletedFuture(True) 439 | 440 | def scale(self, new_desired_capacity): 441 | """ 442 | scales the ASG to the new desired capacity. 443 | returns a future with the result True if desired capacity has been increased. 
444 | """ 445 | desired_capacity = min(self.max_size, new_desired_capacity) 446 | num_unschedulable = len(self.unschedulable_nodes) 447 | num_schedulable = self.actual_capacity - num_unschedulable 448 | 449 | logger.info("Desired {}, currently at {}".format( 450 | desired_capacity, self.desired_capacity)) 451 | logger.info("Kube node: {} schedulable, {} unschedulable".format( 452 | num_schedulable, num_unschedulable)) 453 | 454 | # Try to get the number of schedulable nodes up if we don't have enough, regardless of whether 455 | # group's capacity is already at the same as the desired. 456 | if num_schedulable < desired_capacity: 457 | for node in self.unschedulable_nodes: 458 | if node.uncordon(): 459 | num_schedulable += 1 460 | # Uncordon only what we need 461 | if num_schedulable == desired_capacity: 462 | break 463 | 464 | if self.desired_capacity != desired_capacity: 465 | if self.desired_capacity == self.max_size: 466 | logger.info("Desired same as max, desired: {}, schedulable: {}".format( 467 | self.desired_capacity, num_schedulable)) 468 | return utils.CompletedFuture(False) 469 | 470 | scale_up = self.desired_capacity < desired_capacity 471 | # This should be a rare event 472 | # note: this micro-optimization is not worth doing as the race condition here is 473 | # tricky. when ec2 initializes some nodes in the meantime, asg will shutdown 474 | # nodes by its own policy 475 | # scale_down = self.desired_capacity > desired_capacity >= self.actual_capacity 476 | if scale_up: 477 | # should have gotten our num_schedulable to highest value possible 478 | # actually need to grow. 479 | return self.set_desired_capacity(desired_capacity) 480 | 481 | logger.info("Doing nothing: desired_capacity correctly set: {}, schedulable: {}".format( 482 | self.name, num_schedulable)) 483 | return utils.CompletedFuture(False) 484 | 485 | def scale_nodes_in(self, nodes): 486 | """ 487 | scale down asg by terminating the given node. 
    def scale_nodes_in(self, nodes):
        """
        scale down asg by terminating the given node.
        returns a future indicating when the request completes.

        Min-size violations from AWS are logged and swallowed so the
        remaining nodes are still processed; any other ClientError is
        re-raised.
        """
        for node in nodes:
            try:
                # if we somehow end up in a situation where we have
                # more capacity than desired capacity, and the desired
                # capacity is at asg min size, then when we try to
                # terminate the instance while decrementing the desired
                # capacity, the aws api call will fail
                decrement_capacity = self.desired_capacity > self.min_size
                self.client.terminate_instance_in_auto_scaling_group(
                    InstanceId=node.instance_id,
                    ShouldDecrementDesiredCapacity=decrement_capacity)
                self.nodes.remove(node)
                logger.info('Scaled node %s in', node)
            except botocore.exceptions.ClientError as e:
                # only tolerate the specific min-size constraint error
                if str(e).find("Terminating instance without replacement will "
                               "violate group's min size constraint.") == -1:
                    raise e
                logger.error("Failed to terminate instance: %s", e)

        return utils.CompletedFuture(None)
class AutoScalingErrorMessages(object):
    """Compiled patterns for AWS Auto Scaling activity StatusMessage text.

    The named groups ('requested', 'limit', 'request_id') are read via
    m.group(...) in AutoScalingTimeouts.reconcile_limits; the text
    extraction had dropped the '<name>' part of every '(?P<name>...)'
    group, which made the patterns invalid — restored here.
    """
    INSTANCE_LIMIT = re.compile(r'You have requested more instances \((?P<requested>\d+)\) than your current instance limit of (?P<limit>\d+) allows for the specified instance type. Please visit http://aws.amazon.com/contact-us/ec2-request to request an adjustment to this limit. Launching EC2 instance failed.')
    VOLUME_LIMIT = re.compile(r'Instance became unhealthy while waiting for instance to be in InService state. Termination Reason: Client.VolumeLimitExceeded: Volume limit exceeded')
    CAPACITY_LIMIT = re.compile(r'Insufficient capacity\. Launching EC2 instance failed\.')
    SPOT_REQUEST_WAITING = re.compile(r'Placed Spot instance request: (?P<request_id>.+). Waiting for instance\(s\)')
    SPOT_REQUEST_CANCELLED = re.compile(r'Spot instance request: (?P<request_id>.+) has been cancelled\.')
    SPOT_LIMIT = re.compile(r'Max spot instance count exceeded\. Placing Spot instance request failed\.')
    AZ_LIMIT = re.compile(r'We currently do not have sufficient .+ capacity in the Availability Zone you requested (.+)\.')


class AutoScalingCauseMessages(object):
    """Compiled patterns for AWS Auto Scaling activity Cause text.

    'original_capacity' / 'new_capacity' are read by
    AutoScalingTimeouts.revert_capacity; named groups restored as above.
    """
    LAUNCH_INSTANCE = re.compile(r'At \d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ an instance was started in response to a difference between desired and actual capacity, increasing the capacity from (?P<original_capacity>\d+) to (?P<new_capacity>\d+)\.')
    AZ_BALANCE = re.compile(r'An instance was launched to aid in balancing the group\'s zones\.')
class ClusterNodeState(Enum):
    """Classification of a managed node, as assigned during Cluster.maintain;
    each state maps to one branch of the maintenance state machine."""
    # node considered dead -> deleted from kube and scaled in
    DEAD = 'dead'
    # backing cloud instance no longer exists -> node deleted from kube
    INSTANCE_TERMINATED = 'instance-terminated'
    # group already at its minimum size -> left alone
    ASG_MIN_SIZE = 'asg-min-size'
    # pods are pending for this node type -> left alone
    POD_PENDING = 'pod-pending'
    # within the idle grace period -> left alone
    GRACE_PERIOD = 'grace-period'
    # within the per-instance-type idle grace period -> left alone
    TYPE_GRACE_PERIOD = 'type-grace-period'
    # idle but schedulable -> cordoned
    IDLE_SCHEDULABLE = 'idle-schedulable'
    # idle and already cordoned -> scaled in
    IDLE_UNSCHEDULABLE = 'idle-unschedulable'
    # has work but cordoned -> uncordoned
    BUSY_UNSCHEDULABLE = 'busy-unschedulable'
    # has work -> left alone
    BUSY = 'busy'
    # under-utilized and safe to drain -> cordoned and drained
    UNDER_UTILIZED_DRAINABLE = 'under-utilized-drainable'
    # under-utilized but unsafe to drain -> no-op
    UNDER_UTILIZED_UNDRAINABLE = 'under-utilized-undrainable'
    # still inside the paid launch hour (see LAUNCH_HOUR_THRESHOLD) -> kept
    LAUNCH_HR_GRACE_PERIOD = 'launch-hr-grace-period'
    # detached from its scaling group -> left alone
    DETACHED = 'detached'
    def __init__(self, aws_regions, aws_access_key, aws_secret_key,
                 azure_client_id, azure_client_secret, azure_subscription_id, azure_tenant_id,
                 azure_resource_group_names, azure_slow_scale_classes, kubeconfig,
                 idle_threshold, type_idle_threshold, pod_namespace,
                 instance_init_time, cluster_name, notifier,
                 use_aws_iam_role=False,
                 drain_utilization_below=0.0,
                 max_scale_in_fraction=0.1,
                 scale_up=True, maintainance=True,
                 datadog_api_key=None,
                 over_provision=5, dry_run=False):
        """
        Wire up kube, AWS and (optionally) Azure clients and store the
        scaling configuration.

        kubeconfig - path to a kubeconfig file; when falsy, in-cluster
            service-account credentials are used instead
        pod_namespace - namespace to watch for pods; None means all
        use_aws_iam_role - when no access keys are given, still create a
            boto3 session relying on the instance IAM role
        maintainance - [sic, kept for interface compatibility] gates the
            maintain() pass in scale_loop
        dry_run - log intended actions instead of performing them
        """
        if kubeconfig:
            # for using locally
            logger.debug('Using kubeconfig %s', kubeconfig)
            self.api = pykube.HTTPClient(
                pykube.KubeConfig.from_file(kubeconfig))
        else:
            # for using on kube
            logger.debug('Using kube service account')
            self.api = pykube.HTTPClient(
                pykube.KubeConfig.from_service_account())
        if pod_namespace is None:
            self.pod_namespace = pykube.all
        else:
            self.pod_namespace = pod_namespace

        self.drain_utilization_below = drain_utilization_below
        self.max_scale_in_fraction = max_scale_in_fraction
        self._drained = {}
        self.session = None
        if aws_access_key and aws_secret_key:
            self.session = boto3.session.Session(
                aws_access_key_id=aws_access_key,
                aws_secret_access_key=aws_secret_key,
                region_name=aws_regions[0])  # provide a default region
        elif use_aws_iam_role is True:
            self.session = boto3.session.Session(region_name=aws_regions[0])  # provide a default region
        self.autoscaling_groups = autoscaling_groups.AutoScalingGroups(
            session=self.session, regions=aws_regions,
            cluster_name=cluster_name)
        self.autoscaling_timeouts = autoscaling_groups.AutoScalingTimeouts(
            self.session)

        azure_regions = []
        resource_groups = []
        self.azure_client = None
        if azure_client_id:
            azure_credentials = ServicePrincipalCredentials(
                client_id=azure_client_id,
                secret=azure_client_secret,
                tenant=azure_tenant_id
            )

            # Setup the Azure client
            resource_client = ResourceManagementClient(azure_credentials, azure_subscription_id)
            resource_client.providers.register('Microsoft.Compute')
            resource_client.providers.register('Microsoft.Network')
            resource_client.providers.register('Microsoft.Insights')

            region_map = {}
            for resource_group_name in azure_resource_group_names:
                resource_group = resource_client.resource_groups.get(resource_group_name)
                location = resource_group.location
                if location in region_map:
                    logger.fatal("{} and {} are both in {}. May only have one resource group per region".format(
                        resource_group_name, region_map[location], location
                    ))
                region_map[location] = resource_group_name
                azure_regions.append(location)
                resource_groups.append(resource_group)

            compute_client = ComputeManagementClient(azure_credentials, azure_subscription_id)
            # bound retries so throttled Azure calls cannot stall the loop
            compute_client.config.retry_policy.policy = azure.AzureBoundedRetry.from_retry(compute_client.config.retry_policy.policy)

            monitor_client = MonitorClient(azure_credentials, azure_subscription_id)
            monitor_client.config.retry_policy.policy = azure.AzureBoundedRetry.from_retry(monitor_client.config.retry_policy.policy)
            self.azure_client = AzureWriteThroughCachedApi(AzureWrapper(compute_client, monitor_client, resource_client))

        self.azure_groups = azure.AzureGroups(resource_groups, azure_slow_scale_classes, self.azure_client)

        # config
        self.azure_resource_group_names = azure_resource_group_names
        self.azure_regions = azure_regions
        self.aws_regions = aws_regions
        self.idle_threshold = idle_threshold
        self.instance_init_time = instance_init_time
        self.type_idle_threshold = type_idle_threshold
        self.over_provision = over_provision

        self.scale_up = scale_up
        self.maintainance = maintainance

        self.notifier = notifier

        # stats are collected even without an API key; initialize() only
        # runs when a key is supplied
        if datadog_api_key:
            datadog.initialize(api_key=datadog_api_key)
            logger.info('Datadog initialized')
        self.stats = datadog.ThreadStats()
        self.stats.start()

        self.dry_run = dry_run
    def scale_loop(self):
        """
        runs one loop of scaling to current needs.
        returns True if successfully scaled.

        One pass = look up kube state, refresh scaling groups and running
        instances, compute pods needing placement, then run the scale-up
        and/or maintenance phases. AWS ClientErrors abort the pass with
        False instead of crashing the loop.
        """
        logger.info("++++++++++++++ Running Scaling Loop ++++++++++++++++")
        try:
            start_time = time.time()

            kube_lookup_start_time = time.time()
            pykube_nodes = pykube.Node.objects(self.api)
            if not pykube_nodes:
                logger.warn('Failed to list nodes. Please check kube configuration. Terminating scale loop.')
                return False

            all_nodes = list(map(KubeNode, pykube_nodes))
            managed_nodes = [node for node in all_nodes if node.is_managed()]

            pods = list(map(KubePod, pykube.Pod.objects(self.api, namespace=self.pod_namespace)))

            # pods that currently occupy (or are assigned to) a node
            running_or_pending_assigned_pods = [
                p for p in pods if (p.status == KubePodStatus.RUNNING or p.status == KubePodStatus.CONTAINER_CREATING) or (
                    p.status == KubePodStatus.PENDING and p.node_name
                )
            ]

            # charge each pod's resource usage to its node
            for node in all_nodes:
                for pod in running_or_pending_assigned_pods:
                    if pod.node_name == node.name:
                        node.count_pod(pod)
            self.stats.gauge('autoscaler.scaling_loop.kube_lookup_time', time.time() - kube_lookup_start_time)

            scaling_group_lookup_start_time = time.time()
            if self.azure_client is not None:
                for resource_group in self.azure_resource_group_names:
                    # Force a refresh of the cache to pick up any new Scale Sets that have been created
                    # or modified externally.
                    self.azure_client.list_scale_sets(resource_group, force_refresh=True)
                    # Force a refresh of the cache in case our quota was adjusted
                    self.azure_client.invalidate_quota_cache(resource_group)
            asgs = self.autoscaling_groups.get_all_groups(all_nodes)
            azure_groups = self.azure_groups.get_all_groups(all_nodes)
            scaling_groups = asgs + azure_groups
            self.stats.gauge('autoscaler.scaling_loop.scaling_group_lookup_time', time.time() - scaling_group_lookup_start_time)

            instance_lookup_start_time = time.time()
            running_insts_map = self.get_running_instances_map(managed_nodes, azure_groups)
            self.stats.gauge('autoscaler.scaling_loop.instance_lookup_time', time.time() - instance_lookup_start_time)

            pods_to_schedule_lookup_start_time = time.time()
            pods_to_schedule = self.get_pods_to_schedule(pods)
            self.stats.gauge(
                'autoscaler.scaling_loop.pods_to_schedule_lookup_time',
                time.time() - pods_to_schedule_lookup_start_time,
            )

            pods_by_node = {}
            for p in running_or_pending_assigned_pods:
                pods_by_node.setdefault(p.node_name, []).append(p)

            if self.scale_up:
                logger.info(
                    "++++++++++++++ Scaling Up Begins ++++++++++++++++")
                self.scale(
                    pods_to_schedule, all_nodes, scaling_groups,
                    running_insts_map)
                logger.info("++++++++++++++ Scaling Up Ends ++++++++++++++++")
            if self.maintainance:
                logger.info(
                    "++++++++++++++ Maintenance Begins ++++++++++++++++")
                self.maintain(
                    managed_nodes, running_insts_map,
                    pods_to_schedule, running_or_pending_assigned_pods,
                    scaling_groups)
                logger.info("++++++++++++++ Maintenance Ends ++++++++++++++++")

            self.stats.gauge('autoscaler.scaling_loop_time', time.time() - start_time)

            return True
        except botocore.exceptions.ClientError as e:
            logger.warn(e)
            return False
    def scale(self, pods_to_schedule, all_nodes, asgs, running_insts_map):
        """
        scale up logic

        First packs each pending pod onto an existing live, schedulable
        node if one fits; whatever cannot be placed is grouped by its
        selector hash and handed to fulfill_pending to grow the matching
        scaling groups.
        """
        # TODO: generalize to azure
        self.autoscaling_timeouts.refresh_timeouts(
            [asg for asg in asgs if asg.provider == 'aws'],
            dry_run=self.dry_run)

        cached_live_nodes = []
        for node in all_nodes:
            # either we know the physical node behind it and know it's alive
            # or we don't know it and assume it's alive
            if (node.instance_id and node.instance_id in running_insts_map) \
                    or (not node.is_managed()):
                cached_live_nodes.append(node)

        # selectors -> pending KubePods
        pending_pods = {}

        # for each pending & unassigned job, try to fit them on current machines or count requested
        # resources towards future machines
        for selectors_hash, pods in pods_to_schedule.items():
            for pod in pods:
                fitting = None
                for node in cached_live_nodes:
                    if node.unschedulable:
                        continue
                    if node.is_match(pod) and node.can_fit(pod.resources):
                        fitting = node
                        break
                if fitting is None:
                    # because a pod may be able to fit in multiple groups
                    # pick a group now
                    selectors = dict(pod.selectors)
                    pending_pods.setdefault(utils.selectors_to_hash(selectors), []).append(pod)
                    logger.info(
                        "{pod} is pending ({selectors_hash})".format(
                            pod=pod, selectors_hash=selectors_hash))
                else:
                    # reserve the pod's resources on the chosen node so the
                    # next pod sees the reduced capacity
                    fitting.count_pod(pod)
                    logger.info("{pod} fits on {node}".format(pod=pod,
                                                              node=fitting))

        # scale each node type to reach the new capacity
        for selectors_hash in set(pending_pods.keys()):
            self.fulfill_pending(asgs,
                                 selectors_hash,
                                 pending_pods.get(selectors_hash, []))

        # TODO: make sure desired capacities of untouched groups are consistent
running instances: 323 | - determines if idle nodes should be drained and terminated 324 | - determines if there are bad nodes in ASGs (did not spin up under 325 | `instance_init_time` seconds) 326 | """ 327 | logger.info("++++++++++++++ Maintaining Nodes & Instances ++++++++++++++++") 328 | 329 | # for each type of instance, we keep one around for longer 330 | # in order to speed up job start up time 331 | idle_selector_hash = collections.Counter() 332 | 333 | pods_by_node = {} 334 | for p in running_or_pending_assigned_pods: 335 | pods_by_node.setdefault(p.node_name, []).append(p) 336 | 337 | stats_time = time.time() 338 | 339 | nodes_to_scale_in = {} 340 | nodes_to_delete = [] 341 | state_counts = dict((state, 0) for state in ClusterNodeState) 342 | for node in cached_managed_nodes: 343 | asg = utils.get_group_for_node(asgs, node) 344 | state = self.get_node_state( 345 | node, asg, pods_by_node.get(node.name, []), pods_to_schedule, 346 | running_insts_map, idle_selector_hash) 347 | 348 | logger.info("node: %-*s state: %s" % (75, node, state)) 349 | state_counts[state] += 1 350 | 351 | # state machine & why doesnt python have case? 352 | if state in (ClusterNodeState.POD_PENDING, ClusterNodeState.BUSY, 353 | ClusterNodeState.GRACE_PERIOD, 354 | ClusterNodeState.TYPE_GRACE_PERIOD, 355 | ClusterNodeState.ASG_MIN_SIZE, 356 | ClusterNodeState.LAUNCH_HR_GRACE_PERIOD, 357 | ClusterNodeState.DETACHED): 358 | # do nothing 359 | pass 360 | elif state == ClusterNodeState.UNDER_UTILIZED_DRAINABLE: 361 | if not self.dry_run: 362 | if not asg: 363 | logger.warn('Cannot find ASG for node %s. Not cordoned.', node) 364 | else: 365 | node.cordon() 366 | node.drain(pods_by_node.get(node.name, []), notifier=self.notifier) 367 | else: 368 | logger.info('[Dry run] Would have drained and cordoned %s', node) 369 | elif state == ClusterNodeState.IDLE_SCHEDULABLE: 370 | if not self.dry_run: 371 | if not asg: 372 | logger.warn('Cannot find ASG for node %s. 
Not cordoned.', node) 373 | else: 374 | node.cordon() 375 | else: 376 | logger.info('[Dry run] Would have cordoned %s', node) 377 | elif state == ClusterNodeState.BUSY_UNSCHEDULABLE: 378 | # this is duplicated in original scale logic 379 | if not self.dry_run: 380 | node.uncordon() 381 | else: 382 | logger.info('[Dry run] Would have uncordoned %s', node) 383 | elif state == ClusterNodeState.IDLE_UNSCHEDULABLE: 384 | # remove it from asg 385 | if not self.dry_run: 386 | nodes_to_delete.append(node) 387 | if not asg: 388 | logger.warn('Cannot find ASG for node %s. Not terminated.', node) 389 | else: 390 | nodes_to_scale_in.setdefault(asg, []).append(node) 391 | else: 392 | logger.info('[Dry run] Would have scaled in %s', node) 393 | elif state == ClusterNodeState.INSTANCE_TERMINATED: 394 | if not self.dry_run: 395 | nodes_to_delete.append(node) 396 | else: 397 | logger.info('[Dry run] Would have deleted %s', node) 398 | elif state == ClusterNodeState.DEAD: 399 | if not self.dry_run: 400 | nodes_to_delete.append(node) 401 | if asg: 402 | nodes_to_scale_in.setdefault(asg, []).append(node) 403 | else: 404 | logger.info('[Dry run] Would have reaped dead node %s', node) 405 | elif state == ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE: 406 | # noop for now 407 | pass 408 | else: 409 | raise Exception("Unhandled state: {}".format(state)) 410 | 411 | for state, count in state_counts.items(): 412 | self.stats.gauge('kubernetes.custom.node.state.{}'.format(state.value.replace('-', '_')), count) 413 | 414 | # these are instances that have been running for a while but it's not properly managed 415 | # i.e. 
not having registered to kube or not having proper meta data set 416 | managed_instance_ids = set(node.instance_id for node in cached_managed_nodes) 417 | instances_to_terminate = {} 418 | unmanaged_instance_count = 0 419 | for asg in asgs: 420 | unmanaged_instance_ids = (asg.instance_ids - managed_instance_ids) 421 | if len(unmanaged_instance_ids) > 0: 422 | if asg.provider == 'azure': 423 | for inst_id in unmanaged_instance_ids: 424 | inst = asg.instances[inst_id] 425 | if (datetime.datetime.now(inst.launch_time.tzinfo) 426 | - inst.launch_time).seconds >= self.instance_init_time: 427 | if not self.dry_run: 428 | logger.info("terminating unmanaged %s" % inst) 429 | instances_to_terminate.setdefault(asg, []).append(inst_id) 430 | unmanaged_instance_count += 1 431 | # TODO: try to delete node from kube as well 432 | # in the case where kubelet may have registered but node 433 | # labels have not been applied yet, so it appears unmanaged 434 | else: 435 | logger.info('[Dry run] Would have terminated unmanaged %s', inst) 436 | else: 437 | unmanaged_running_insts = self.get_running_instances_in_region( 438 | asg.region, list(unmanaged_instance_ids)) 439 | for inst in unmanaged_running_insts: 440 | if (datetime.datetime.now(inst.launch_time.tzinfo) 441 | - inst.launch_time).seconds >= self.instance_init_time: 442 | if not self.dry_run: 443 | asg.client.terminate_instance_in_auto_scaling_group( 444 | InstanceId=inst.id, ShouldDecrementDesiredCapacity=False) 445 | logger.info("terminating unmanaged %s" % inst) 446 | unmanaged_instance_count += 1 447 | # TODO: try to delete node from kube as well 448 | # in the case where kubelet may have registered but node 449 | # labels have not been applied yet, so it appears unmanaged 450 | else: 451 | logger.info( 452 | '[Dry run] Would have terminated unmanaged %s [%s]', inst, asg.region) 453 | self.stats.gauge('kubernetes.custom.node.state.unmanaged', unmanaged_instance_count) 454 | 455 | async_operations = [] 456 | 
total_instances = max(sum(len(asg.instance_ids) for asg in asgs), len(cached_managed_nodes)) 457 | max_allowed_scale_in = int(math.ceil(self.max_scale_in_fraction * total_instances)) 458 | to_scale_in = sum(len(nodes) for nodes in nodes_to_scale_in.values()) + \ 459 | sum(len(instance_ids) for instance_ids in instances_to_terminate.values()) 460 | to_scale_in = max(to_scale_in, len(nodes_to_delete)) 461 | if to_scale_in > max_allowed_scale_in: 462 | logger.error("TOO MANY NODES TO SCALE IN: {}, max allowed is {}".format(to_scale_in, max_allowed_scale_in)) 463 | elif not self.dry_run: 464 | for asg, nodes in nodes_to_scale_in.items(): 465 | async_operations.append(asg.scale_nodes_in(nodes)) 466 | 467 | for asg, instance_ids in instances_to_terminate.items(): 468 | async_operations.append(asg.terminate_instances(instance_ids)) 469 | 470 | for node in nodes_to_delete: 471 | node.delete() 472 | 473 | # Wait for all background scale-in operations to complete 474 | for operation in async_operations: 475 | try: 476 | operation.result() 477 | except CloudError as e: 478 | logger.warn("Error while deleting Azure node: {}".format(e.message)) 479 | except TimeoutError: 480 | logger.warn("Timeout while deleting Azure node") 481 | 482 | def fulfill_pending(self, asgs, selectors_hash, pods): 483 | """ 484 | selectors_hash - string repr of selectors 485 | pods - list of KubePods that are pending 486 | """ 487 | logger.info( 488 | "========= Scaling for %s ========", selectors_hash) 489 | logger.debug("pending: %s", pods[:5]) 490 | 491 | accounted_pods = dict((p, False) for p in pods) 492 | num_unaccounted = len(pods) 493 | 494 | groups = utils.get_groups_for_hash(asgs, selectors_hash) 495 | 496 | groups = self._prioritize_groups(groups) 497 | 498 | async_operations = [] 499 | for group in groups: 500 | logger.debug("group: %s", group) 501 | if (self.autoscaling_timeouts.is_timed_out(group) or group.is_timed_out() or group.max_size == group.desired_capacity) \ 502 | and not 
group.unschedulable_nodes: 503 | continue 504 | 505 | unit_capacity = capacity.get_unit_capacity(group) 506 | new_instance_resources = [] 507 | assigned_pods = [] 508 | for pod, acc in accounted_pods.items(): 509 | if acc or not (unit_capacity - pod.resources).possible or not group.is_taints_tolerated(pod): 510 | continue 511 | 512 | found_fit = False 513 | for i, instance in enumerate(new_instance_resources): 514 | if (instance - pod.resources).possible: 515 | new_instance_resources[i] = instance - pod.resources 516 | assigned_pods[i].append(pod) 517 | found_fit = True 518 | break 519 | if not found_fit: 520 | new_instance_resources.append( 521 | unit_capacity - pod.resources) 522 | assigned_pods.append([pod]) 523 | 524 | # new desired # machines = # running nodes + # machines required to fit jobs that don't 525 | # fit on running nodes. This scaling is conservative but won't 526 | # create starving 527 | units_needed = len(new_instance_resources) 528 | # The pods may not fit because of resource requests or taints. 
Don't scale in that case 529 | if units_needed == 0: 530 | continue 531 | units_needed += self.over_provision 532 | 533 | if self.autoscaling_timeouts.is_timed_out(group) or group.is_timed_out(): 534 | # if a machine is timed out, it cannot be scaled further 535 | # just account for its current capacity (it may have more 536 | # being launched, but we're being conservative) 537 | unavailable_units = max( 538 | 0, units_needed - (group.desired_capacity - group.actual_capacity)) 539 | else: 540 | unavailable_units = max( 541 | 0, units_needed - (group.max_size - group.actual_capacity)) 542 | units_requested = units_needed - unavailable_units 543 | 544 | logger.debug("units_needed: %s", units_needed) 545 | logger.debug("units_requested: %s", units_requested) 546 | 547 | new_capacity = group.actual_capacity + units_requested 548 | if not self.dry_run: 549 | async_operation = group.scale(new_capacity) 550 | async_operations.append(async_operation) 551 | 552 | def notify_if_scaled(future): 553 | if future.result(): 554 | flat_assigned_pods = [] 555 | for instance_pods in assigned_pods: 556 | flat_assigned_pods.extend(instance_pods) 557 | self.notifier.notify_scale(group, units_requested, flat_assigned_pods) 558 | 559 | async_operation.add_done_callback(notify_if_scaled) 560 | else: 561 | logger.info( 562 | '[Dry run] Would have scaled up (%s) to %s', group, new_capacity) 563 | 564 | for i in range(min(len(assigned_pods), units_requested)): 565 | for pod in assigned_pods[i]: 566 | accounted_pods[pod] = True 567 | num_unaccounted -= 1 568 | 569 | logger.debug("remining pending: %s", num_unaccounted) 570 | 571 | if not num_unaccounted: 572 | break 573 | 574 | if num_unaccounted: 575 | logger.warn('Failed to scale sufficiently.') 576 | self.notifier.notify_failed_to_scale(selectors_hash, pods) 577 | 578 | for operation in async_operations: 579 | try: 580 | operation.result() 581 | except CloudError as e: 582 | logger.warn("Error while scaling Scale Set: 
    def get_running_instances_in_region(self, region, instance_ids):
        """
        a generator for getting ec2.Instance objects given a list of
        instance IDs.

        Yields only instances in the "running" state; IDs that no longer
        exist are silently skipped.

        params:
        region - AWS region name; if falsy, nothing is yielded
        instance_ids - list of EC2 instance ID strings
        """
        if not region:
            logger.warn('Instance IDs without region: %s', instance_ids)
            return

        # IDs already yielded before a ClientError interrupted iteration,
        # so the fallback below does not yield them twice
        yielded_ids = set()
        try:
            running_insts = (self.session
                             .resource('ec2', region_name=region)
                             .instances
                             .filter(
                                 InstanceIds=instance_ids,
                                 Filters=[{
                                     'Name': "instance-state-name",
                                     'Values': ["running"]}]
                             ))
            # we have to go through each instance to make sure
            # they actually exist and handle errors otherwise
            # boto collections do not always call DescribeInstance
            # when returning from filter, so it could error during
            # iteration
            for inst in running_insts:
                yield inst
                yielded_ids.add(inst.id)
        except botocore.exceptions.ClientError as e:
            logger.debug('Caught %s', e)
            if str(e).find("InvalidInstanceID.NotFound") == -1:
                # not the "stale instance ID" error -- propagate
                raise e
            elif len(instance_ids) == 1:
                # the single requested instance no longer exists
                return
            else:
                # this should hopefully happen rarely so we resort to slow methods to
                # handle this case: retry each remaining ID individually so
                # one stale ID cannot mask the rest of the batch
                for instance_id in instance_ids:
                    if instance_id in yielded_ids:
                        continue
                    for inst in self.get_running_instances_in_region(region, [instance_id]):
                        yield inst
= {} 644 | for node in nodes: 645 | if node.provider == 'aws': 646 | instance_id_by_region.setdefault(node.region, []).append(node.instance_id) 647 | 648 | for region, instance_ids in instance_id_by_region.items(): 649 | # note that this assumes that all instances have a valid region 650 | # the regions referenced by the nodes may also be outside of the 651 | # list of regions provided by the user 652 | # this should be ok because they will just end up being nodes 653 | # unmanaged by autoscaling groups we know about 654 | region_instances = self.get_running_instances_in_region( 655 | region, instance_ids) 656 | instance_map.update((inst.id, inst) for inst in region_instances) 657 | 658 | return instance_map 659 | 660 | def _get_required_capacity(self, requested, group): 661 | """ 662 | returns the number of nodes within an autoscaling group that should 663 | be provisioned to fit the requested amount of KubeResource. 664 | 665 | TODO: bin packing would probably be better? 666 | 667 | requested - KubeResource 668 | group - AutoScalingGroup 669 | """ 670 | unit_capacity = capacity.get_unit_capacity(group) 671 | return max( 672 | # (peter) should 0.8 be configurable? 673 | int(math.ceil(requested.get(field, 0.0) / unit_capacity.get(field, 1.0))) 674 | for field in ('cpu', 'memory', 'pods') 675 | ) 676 | 677 | def _prioritize_groups(self, groups): 678 | """ 679 | returns the groups sorted in order of where we should try to schedule 680 | things first. 
we currently try to prioritize in the following order: 681 | - region 682 | - single-AZ groups over multi-AZ groups (for faster/cheaper network) 683 | - whether or not the group launches spot instances (prefer spot) 684 | - manually set _GROUP_PRIORITIES 685 | - group name 686 | """ 687 | def sort_key(group): 688 | region = self._GROUP_DEFAULT_PRIORITY 689 | try: 690 | region = (self.azure_regions + self.aws_regions).index(group.region) 691 | except ValueError: 692 | pass 693 | # Some ASGs are pinned to be in a single AZ. Sort them in front of 694 | # multi-ASG groups that won't have this tag set. 695 | pinned_to_az = group.selectors.get('aws/az', 'z') 696 | priority = self._GROUP_PRIORITIES.get( 697 | group.selectors.get('aws/type'), self._GROUP_DEFAULT_PRIORITY) 698 | return (group.global_priority, region, pinned_to_az, not group.is_spot, priority, group.name) 699 | return sorted(groups, key=sort_key) 700 | 701 | def get_node_state(self, node, asg, node_pods, pods_to_schedule, 702 | running_insts_map, idle_selector_hash): 703 | """ 704 | returns the ClusterNodeState for the given node 705 | 706 | params: 707 | node - KubeNode object 708 | asg - AutoScalingGroup object that this node belongs in. can be None. 709 | node_pods - list of KubePods assigned to this node 710 | pods_to_schedule - list of all pending pods 711 | running_inst_map - map of all (instance_id -> ec2.Instance object) 712 | idle_selector_hash - current map of idle nodes by type. 
may be modified 713 | """ 714 | pending_list = [] 715 | for pods in pods_to_schedule.values(): 716 | for pod in pods: 717 | # a pod is considered schedulable onto this node if all the 718 | # node selectors match 719 | # AND it doesn't use pod affinity (which we don't support yet) 720 | if (node.is_match(pod) and 721 | 'scheduler.alpha.kubernetes.io/affinity' not in pod.annotations): 722 | pending_list.append(pod) 723 | # we consider a node to be busy if it's running any non-DaemonSet pods 724 | # TODO: we can be a bit more aggressive in killing pods that are 725 | # replicated 726 | busy_list = [p for p in node_pods if not p.is_mirrored()] 727 | undrainable_list = [p for p in node_pods if not p.is_drainable()] 728 | utilization = sum((p.resources for p in busy_list), KubeResource()) 729 | under_utilized = (self.drain_utilization_below * 730 | node.capacity - utilization).possible 731 | drainable = not undrainable_list 732 | 733 | maybe_inst = running_insts_map.get(node.instance_id) 734 | if maybe_inst: 735 | age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo) 736 | - maybe_inst.launch_time).seconds 737 | launch_hour_offset = age % 3600 738 | else: 739 | age = None 740 | 741 | instance_type = utils.selectors_to_hash( 742 | asg.selectors) if asg else node.instance_type 743 | 744 | type_spare_capacity = (instance_type and self.type_idle_threshold and 745 | idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT) 746 | 747 | if maybe_inst is None: 748 | return ClusterNodeState.INSTANCE_TERMINATED 749 | 750 | if node.is_detached(): 751 | return ClusterNodeState.DETACHED 752 | 753 | if node.is_dead(): 754 | return ClusterNodeState.DEAD 755 | 756 | if asg and len(asg.nodes) <= asg.min_size: 757 | return ClusterNodeState.ASG_MIN_SIZE 758 | 759 | if busy_list and not under_utilized: 760 | if node.unschedulable: 761 | return ClusterNodeState.BUSY_UNSCHEDULABLE 762 | return ClusterNodeState.BUSY 763 | 764 | if pending_list and not node.unschedulable: 765 | # 
logger.warn('PENDING: %s', pending_list) 766 | return ClusterNodeState.POD_PENDING 767 | 768 | if launch_hour_offset < self.LAUNCH_HOUR_THRESHOLD[node.provider] and not node.unschedulable: 769 | return ClusterNodeState.LAUNCH_HR_GRACE_PERIOD 770 | 771 | # elif node.provider == 'azure': 772 | # disabling scale down in azure for now while we ramp up 773 | # TODO: remove once azure is bootstrapped 774 | # state = ClusterNodeState.GRACE_PERIOD 775 | 776 | if (not type_spare_capacity and age <= self.idle_threshold) and not node.unschedulable: 777 | # there is already an instance of this type sitting idle 778 | # so we use the regular idle threshold for the grace period 779 | return ClusterNodeState.GRACE_PERIOD 780 | 781 | if (type_spare_capacity and age <= self.type_idle_threshold) and not node.unschedulable: 782 | # we don't have an instance of this type yet! 783 | # use the type idle threshold for the grace period 784 | # and mark the type as seen 785 | idle_selector_hash[instance_type] += 1 786 | return ClusterNodeState.TYPE_GRACE_PERIOD 787 | 788 | if under_utilized and (busy_list or not node.unschedulable): 789 | # nodes that are under utilized (but not completely idle) 790 | # have their own states to tell if we should drain them 791 | # for better binpacking or not 792 | if drainable: 793 | return ClusterNodeState.UNDER_UTILIZED_DRAINABLE 794 | return ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE 795 | 796 | if node.unschedulable: 797 | return ClusterNodeState.IDLE_UNSCHEDULABLE 798 | return ClusterNodeState.IDLE_SCHEDULABLE 799 | 800 | def get_pods_to_schedule(self, pods): 801 | """ 802 | given a list of KubePod objects, 803 | return a map of (selectors hash -> pods) to be scheduled 804 | """ 805 | pending_unassigned_pods = [ 806 | p for p in pods 807 | if p.status == KubePodStatus.PENDING and (not p.node_name) 808 | ] 809 | 810 | # we only consider a pod to be schedulable if it's pending and 811 | # unassigned and feasible 812 | pods_to_schedule = {} 813 | now = 
datetime.datetime.now(pytz.utc) 814 | for pod in pending_unassigned_pods: 815 | age = (now - pod.creation_time).total_seconds() 816 | self.stats.histogram('autoscaler.scaling_loop.pending_pod_age', age) 817 | 818 | if capacity.is_possible(pod): 819 | pods_to_schedule.setdefault( 820 | utils.selectors_to_hash(pod.selectors), []).append(pod) 821 | else: 822 | recommended_capacity = capacity.max_capacity_for_selectors( 823 | pod.selectors, pod.resources) 824 | logger.warn( 825 | "Pending pod %s cannot fit %s. " 826 | "Please check that requested resource amount is " 827 | "consistent with node selectors (recommended max: %s). " 828 | "Scheduling skipped." % (pod.name, pod.selectors, recommended_capacity)) 829 | self.notifier.notify_invalid_pod_capacity( 830 | pod, recommended_capacity) 831 | return pods_to_schedule 832 | --------------------------------------------------------------------------------