├── .gitignore ├── docs └── kubernetes-ec2-autoscaler-2.png ├── requirements.txt ├── autoscaler ├── __init__.py ├── config.py ├── aws_utils.py ├── capacity.py ├── utils.py ├── notification.py ├── azure.py ├── kube.py ├── azure_api.py ├── autoscaling_groups.py └── cluster.py ├── CONTRIBUTORS.md ├── autoscaler-secret.yaml ├── README.md ├── setup.py ├── production-requirements.txt ├── Dockerfile ├── .travis.yml ├── test ├── data │ ├── kube_config.yaml │ ├── busybox.yaml │ ├── node.yaml │ ├── ds-pod.yaml │ └── rc-pod.yaml ├── test_capacity.py ├── test_azure.py ├── test_azure_api.py └── test_cluster.py ├── LICENSE ├── scaling-controller.yaml ├── autoscaler-dep.yaml ├── data └── capacity.json └── main.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | .mypy_cache 4 | -------------------------------------------------------------------------------- /docs/kubernetes-ec2-autoscaler-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/kubernetes-ec2-autoscaler/HEAD/docs/kubernetes-ec2-autoscaler-2.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r production-requirements.txt 2 | 3 | # for tests 4 | nose>=1.3.7 5 | PyYAML>=3.11 6 | moto>=0.4.25,<1.0.0 7 | mock>=2.0.0 8 | mypy>=0.501 9 | -------------------------------------------------------------------------------- /autoscaler/__init__.py: -------------------------------------------------------------------------------- 1 | from autoscaler.kube import KubePodStatus 2 | from autoscaler.kube import KubePod 3 | from autoscaler.kube import KubeNode 4 | from autoscaler.kube import KubeResource 5 | 6 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: 
import os


class Config(object):
    """Process-wide configuration, resolved from environment variables at import time."""

    # Path to the JSON file mapping instance types to their capacity specs.
    CAPACITY_DATA = os.getenv('CAPACITY_DATA', 'data/capacity.json')

    # CPU (cores) held back from every instance type's advertised capacity.
    CAPACITY_CPU_RESERVE = float(os.getenv('CAPACITY_CPU_RESERVE', 0.0))

    # Kubernetes namespace the autoscaler watches/operates in.
    NAMESPACE = os.getenv('NAMESPACE', 'system')
9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='autoscaler', 4 | version='0.0.1', 5 | packages=find_packages(), 6 | install_requires=[ 7 | 'pykube', 8 | 'requests', 9 | 'ipdb', 10 | 'boto', 11 | 'boto3', 12 | 'botocore', 13 | 'click', 14 | ] 15 | ) 16 | 17 | -------------------------------------------------------------------------------- /production-requirements.txt: -------------------------------------------------------------------------------- 1 | requests[security]>=2.12.0 2 | boto==2.39.0 3 | boto3>=1.3.1 4 | botocore>=1.3.26 5 | click>=6.2 6 | python-dateutil>=2.5.3 7 | cachetools>=2.0.0 8 | JSON-log-formatter>=0.1.0 9 | pytz>=2016.10 10 | 11 | pykube>=0.14.0 12 | azure-mgmt-compute>=0.33.1rc1,<1.0.0 13 | azure-mgmt-resource>=0.31.0,<1.0.0 14 | azure-monitor>=0.3.0,<1.0.0 15 | 16 | # for instrumentation 17 | datadog>=0.14.0 18 | # for error tracking 19 | raven>=5.32.0 20 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6-alpine 2 | 3 | RUN apk --update add --virtual build-dependencies \ 4 | python-dev libffi-dev openssl-dev build-base && \ 5 | pip install --upgrade pip cffi cryptography && \ 6 | apk del build-dependencies && \ 7 | apk add --no-cache bash git && \ 8 | rm -rf /var/cache/apk/* 9 | 10 | COPY production-requirements.txt /app/requirements.txt 11 | RUN pip install -r /app/requirements.txt 12 | COPY . 
def fetch_all(aws_paged_func, kwargs, list_field, next_token=None):
    """Exhaustively page through an AWS list API and return every item.

    Args:
        aws_paged_func: boto3-style callable that accepts a ``NextToken``
            keyword and returns a dict containing ``list_field`` and,
            while more pages remain, a non-empty ``'NextToken'``.
        kwargs: base keyword arguments passed on every call (never mutated).
        list_field: key under which each page stores its items.
        next_token: token to resume from. ``None`` means "first page";
            ``''`` is the historical sentinel for "no more pages" and
            yields ``[]`` immediately.

    Returns:
        The concatenation of ``page[list_field]`` across all pages, in order.
    """
    # Preserve the original sentinel contract: '' short-circuits to [].
    if next_token == '':
        return []

    # Iterate instead of recursing: the original recursed once per page, so
    # a listing with thousands of pages could exhaust the recursion limit.
    items = []
    while True:
        our_kwargs = dict(kwargs)
        if next_token is not None:
            our_kwargs['NextToken'] = next_token
        page_data = aws_paged_func(**our_kwargs)
        items.extend(page_data[list_field])
        # Absent NextToken means this was the last page.
        next_token = page_data.get('NextToken', '')
        if next_token == '':
            return items
/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2016 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
class TestCapacity(unittest.TestCase):
    """Tests for autoscaler.capacity using fixture pod/node manifests."""

    def setUp(self):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        # safe_load: fixtures are plain data, and yaml.load without an
        # explicit Loader is deprecated (and unsafe) in PyYAML >= 5.1.
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            dummy_pod = yaml.safe_load(f.read())
        with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f:
            self.dummy_node = yaml.safe_load(f.read())
        # KubeNode expects these to be strings
        for condition in self.dummy_node['status']['conditions']:
            condition['lastHeartbeatTime'] = str(condition['lastHeartbeatTime'])

        # this isn't actually used here
        # only needed to create the KubePod object...
        self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config'))

        self.dummy_pod = dummy_pod

    def test_can_fit(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        node = KubeNode(pykube.Node(self.api, self.dummy_node))
        assert node.can_fit(pod.resources)

    def test_possible(self):
        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        assert capacity.is_possible(pod)

    def test_impossible(self):
        # t2.micro is too small for the busybox pod's requests, so no
        # matching instance type can fit it.
        self.dummy_pod['spec']['nodeSelector'] = {
            'aws/type': 't2.micro'
        }

        pod = KubePod(pykube.Pod(self.api, self.dummy_pod))
        assert not capacity.is_possible(pod)
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | creationTimestamp: "2016-07-14T06:46:14Z" 5 | name: busybox 6 | namespace: default 7 | resourceVersion: "229323584" 8 | selfLink: /api/v1/namespaces/default/pods/busybox 9 | uid: a85c73b6-498e-11e6-ab0a-0af8d945d5d3 10 | spec: 11 | containers: 12 | - command: 13 | - sleep 14 | - "3600" 15 | image: busybox 16 | imagePullPolicy: Always 17 | name: busybox 18 | resources: 19 | limits: 20 | cpu: 1800m 21 | memory: 1500Mi 22 | requests: 23 | cpu: 1500m 24 | memory: 2500Mi 25 | terminationMessagePath: /dev/termination-log 26 | volumeMounts: 27 | - mountPath: /var/run/secrets/kubernetes.io/serviceaccount 28 | name: default-token-1h0fa 29 | readOnly: true 30 | dnsPolicy: ClusterFirst 31 | imagePullSecrets: 32 | - name: quay-login-secret 33 | nodeName: 10.0.0.228 34 | restartPolicy: Always 35 | securityContext: {} 36 | serviceAccount: default 37 | serviceAccountName: default 38 | terminationGracePeriodSeconds: 30 39 | volumes: 40 | - name: default-token-1h0fa 41 | secret: 42 | secretName: default-token-1h0fa 43 | status: 44 | conditions: 45 | - lastProbeTime: null 46 | lastTransitionTime: 2016-08-23T21:22:43Z 47 | status: "True" 48 | type: Ready 49 | containerStatuses: 50 | - containerID: docker://4643e395f45d65015a98c13a509b4429bd357477d0dfb6719a835ce8135f1c06 51 | image: busybox 52 | imageID: docker://sha256:2b8fd9751c4c0f5dd266fcae00707e67a2545ef34f9a29354585f93dac906749 53 | lastState: 54 | terminated: 55 | containerID: docker://76e9c5151cc98f4d3aa1f73704112a4befcc451104a493671fc89a401e45f907 56 | exitCode: 0 57 | finishedAt: 2016-08-23T21:22:40Z 58 | reason: Completed 59 | startedAt: 2016-08-23T20:22:40Z 60 | name: busybox 61 | ready: true 62 | restartCount: 974 63 | state: 64 | running: 65 | startedAt: 2016-08-23T21:22:42Z 66 | hostIP: 10.0.0.228 67 | phase: Running 68 | podIP: 10.240.112.166 69 | startTime: "2016-07-14T06:46:25Z" 70 | 
-------------------------------------------------------------------------------- /test/data/node.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Node 3 | metadata: 4 | annotations: 5 | aws/sgs: kubernetes-worker 6 | creationTimestamp: "2016-08-25T05:13:16Z" 7 | labels: 8 | aws/ami-id: ami-deadbeef 9 | aws/az: us-west-2c 10 | aws/class: t 11 | aws/id: i-aaaaaaaa 12 | aws/region: us-west-2 13 | aws/type: t2.medium 14 | kubernetes.io/hostname: 10.0.0.228 15 | name: 10.0.0.228 16 | resourceVersion: "234864209" 17 | selfLink: /api/v1/nodes/10.0.0.228 18 | uid: a0f990ae-6a82-11e6-b203-0a0bdd34364d 19 | spec: 20 | externalID: 10.0.0.228 21 | status: 22 | addresses: 23 | - address: 10.0.0.228 24 | type: LegacyHostIP 25 | - address: 10.0.0.228 26 | type: InternalIP 27 | allocatable: 28 | cpu: "2" 29 | memory: 3952Mi 30 | pods: "30" 31 | capacity: 32 | cpu: "2" 33 | memory: 3952Mi 34 | pods: "30" 35 | conditions: 36 | - lastHeartbeatTime: 2016-08-25T21:50:36Z 37 | lastTransitionTime: 2016-08-25T05:13:07Z 38 | message: kubelet has sufficient disk space available 39 | reason: KubeletHasSufficientDisk 40 | status: "False" 41 | type: OutOfDisk 42 | - lastHeartbeatTime: 2016-08-25T21:50:36Z 43 | lastTransitionTime: 2016-08-25T05:13:07Z 44 | message: kubelet is posting ready status 45 | reason: KubeletReady 46 | status: "True" 47 | type: Ready 48 | daemonEndpoints: 49 | kubeletEndpoint: 50 | Port: 10250 51 | images: 52 | - names: 53 | - datadog/docker-dd-agent:kubernetes 54 | sizeBytes: 301845578 55 | - names: 56 | - nvidia/cuda:7.5-cudnn4-devel-ubuntu14.04 57 | sizeBytes: 1353527489 58 | - names: 59 | - newrelic/nrsysmond:latest 60 | sizeBytes: 192693160 61 | - names: 62 | - gcr.io/google_containers/pause:2.0 63 | sizeBytes: 350164 64 | nodeInfo: 65 | bootID: dd1f6c71-caa5-45b8-8751-5dd981bfba76 66 | containerRuntimeVersion: docker://1.11.2 67 | kernelVersion: 3.13.0-88-generic 68 | kubeProxyVersion: v1.2.1 69 | 
kubeletVersion: v1.2.1 70 | machineID: 1f2629f74f7c79e2a4583d1a5674725a 71 | osImage: Ubuntu 14.04.3 LTS 72 | systemUUID: EC228B31-FEB9-F4EA-A8CC-1E437E7C7893 73 | -------------------------------------------------------------------------------- /scaling-controller.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: autoscaler 5 | namespace: system 6 | spec: 7 | replicas: 1 8 | template: 9 | metadata: 10 | labels: 11 | app: autoscaler 12 | openai/do-not-drain: "true" 13 | spec: 14 | containers: 15 | - name: autoscaler 16 | image: quay.io/openai/kubernetes-ec2-autoscaler:azure 17 | env: 18 | - name: AWS_ACCESS_KEY_ID 19 | valueFrom: 20 | secretKeyRef: 21 | name: autoscaler 22 | key: aws-access-key-id 23 | - name: AWS_SECRET_ACCESS_KEY 24 | valueFrom: 25 | secretKeyRef: 26 | name: autoscaler 27 | key: aws-secret-access-key 28 | - name: SLACK_HOOK 29 | valueFrom: 30 | secretKeyRef: 31 | name: autoscaler 32 | key: slack-hook 33 | - name: SLACK_BOT_TOKEN 34 | valueFrom: 35 | secretKeyRef: 36 | name: autoscaler 37 | key: slack-bot-token 38 | - name: DATADOG_API_KEY 39 | valueFrom: 40 | secretKeyRef: 41 | name: autoscaler 42 | key: datadog-api-key 43 | - name: SENTRY_DSN 44 | valueFrom: 45 | secretKeyRef: 46 | name: autoscaler 47 | key: sentry-dsn 48 | - name: PYKUBE_KUBERNETES_SERVICE_HOST 49 | value: 10.100.0.1 50 | # value: kubernetes.default 51 | - name: DATADOG_TAGS 52 | value: env:sci 53 | - name: NAMESPACE 54 | value: system.svc.sci.openai.org 55 | command: 56 | - python 57 | - main.py 58 | - --azure-regions 59 | - us-east,us-south-central 60 | - --aws-regions 61 | - us-west-2,us-east-1,us-west-1 62 | - --cluster-name 63 | - openai-kubernetes 64 | - -vvv 65 | - --type-idle-threshold 66 | - "0" 67 | - --over-provision 68 | - "1" 69 | - --sleep 70 | - "30" 71 | imagePullPolicy: Always 72 | restartPolicy: Always 73 | dnsPolicy: Default # Don't use cluster 
DNS. 74 | nodeSelector: 75 | aws/region: us-west-2 76 | -------------------------------------------------------------------------------- /test/data/ds-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | creationTimestamp: "2016-08-25T05:11:56Z" 5 | generateName: container-monitor- 6 | labels: 7 | app: container-monitor 8 | name: container-monitor-y6w5d 9 | namespace: system 10 | ownerReferences: 11 | - apiVersion: apps/v1 12 | blockOwnerDeletion: true 13 | controller: true 14 | kind: DaemonSet 15 | name: container-monitor 16 | uid: 826c7436-4558-11e6-afb0-0af8d945d5d3 17 | resourceVersion: "232653318" 18 | selfLink: /api/v1/namespaces/system/pods/container-monitor-y6w5d 19 | uid: 718ec371-6a82-11e6-b203-0a0bdd34364d 20 | spec: 21 | containers: 22 | - env: 23 | - name: SLACK_HOOK 24 | valueFrom: 25 | secretKeyRef: 26 | key: slack-hook 27 | name: container-monitor 28 | - name: SLACK_TOKEN 29 | valueFrom: 30 | secretKeyRef: 31 | key: slack-token 32 | name: container-monitor 33 | image: container-monitor 34 | imagePullPolicy: Always 35 | name: container-monitor 36 | resources: {} 37 | terminationMessagePath: /dev/termination-log 38 | volumeMounts: 39 | - mountPath: /var/run/docker.sock 40 | name: dockersocket 41 | - mountPath: /var/run/secrets/kubernetes.io/serviceaccount 42 | name: default-token-lbbq5 43 | readOnly: true 44 | dnsPolicy: ClusterFirst 45 | nodeName: 10.0.0.228 46 | restartPolicy: Always 47 | securityContext: {} 48 | serviceAccount: default 49 | serviceAccountName: default 50 | terminationGracePeriodSeconds: 30 51 | volumes: 52 | - hostPath: 53 | path: /var/run/docker.sock 54 | name: dockersocket 55 | - name: default-token-lbbq5 56 | secret: 57 | secretName: default-token-lbbq5 58 | status: 59 | conditions: 60 | - lastProbeTime: null 61 | lastTransitionTime: 2016-08-25T05:12:21Z 62 | status: "True" 63 | type: Ready 64 | containerStatuses: 65 | - containerID: 
docker://6acb85b56d578202a04125d30db7cb33b180559ecedd612ce61c6f77a45c8f2a 66 | image: container-monitor 67 | imageID: docker://sha256:33985d876f3c4ea686af447dd67f14d2efe035eac8dab4132107d8d75f4ce7d1 68 | lastState: {} 69 | name: container-monitor 70 | ready: true 71 | restartCount: 0 72 | state: 73 | running: 74 | startedAt: 2016-08-25T05:12:20Z 75 | hostIP: 10.0.0.228 76 | phase: Running 77 | podIP: 10.240.112.167 78 | startTime: "2016-08-25T05:11:53Z" 79 | -------------------------------------------------------------------------------- /autoscaler-dep.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: extensions/v1beta1 2 | kind: Deployment 3 | metadata: 4 | name: autoscaler 5 | namespace: kube-system 6 | labels: 7 | k8s-app: autoscaler 8 | spec: 9 | replicas: 1 10 | selector: 11 | matchLabels: 12 | k8s-app: autoscaler 13 | template: 14 | metadata: 15 | labels: 16 | k8s-app: autoscaler 17 | openai/do-not-drain: "true" 18 | # annotations: 19 | # scheduler.alpha.kubernetes.io/tolerations: "[{key: dedicated, value: master}]" 20 | spec: 21 | containers: 22 | - name: autoscaler 23 | image: quay.io/openai/kubernetes-ec2-autoscaler 24 | resources: 25 | limits: 26 | cpu: 500m 27 | memory: 300Mi 28 | requests: 29 | cpu: 100m 30 | memory: 100Mi 31 | env: 32 | - name: CAPACITY_CPU_RESERVE 33 | value: "0.1" 34 | - name: AWS_ACCESS_KEY_ID 35 | valueFrom: 36 | secretKeyRef: 37 | name: autoscaler 38 | key: aws-access-key-id 39 | - name: AWS_SECRET_ACCESS_KEY 40 | valueFrom: 41 | secretKeyRef: 42 | name: autoscaler 43 | key: aws-secret-access-key 44 | - name: SLACK_HOOK 45 | valueFrom: 46 | secretKeyRef: 47 | name: autoscaler 48 | key: slack-hook 49 | - name: SLACK_BOT_TOKEN 50 | valueFrom: 51 | secretKeyRef: 52 | name: autoscaler 53 | key: slack-bot-token 54 | - name: DATADOG_API_KEY 55 | valueFrom: 56 | secretKeyRef: 57 | name: autoscaler 58 | key: datadog-api-key 59 | - name: PYKUBE_KUBERNETES_SERVICE_HOST 60 | value: 
kubernetes.default 61 | #value: 10.100.0.1 62 | - name: DATADOG_TAGS 63 | value: env:sci 64 | command: 65 | - python 66 | - main.py 67 | - --regions 68 | - us-east-1 69 | - --cluster-name 70 | - pipeline.kubernetes.dev.aws.away.black 71 | - -vvv 72 | - --type-idle-threshold 73 | - "0" 74 | - --over-provision 75 | - "1" 76 | - --sleep 77 | - "30" 78 | imagePullPolicy: Always 79 | restartPolicy: Always 80 | # dnsPolicy: Default # Don't use cluster DNS. 81 | nodeSelector: 82 | kubernetes.io/role: master 83 | -------------------------------------------------------------------------------- /test/data/rc-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | creationTimestamp: "2016-08-25T20:36:43Z" 5 | generateName: autoscaler- 6 | labels: 7 | app: autoscaler 8 | name: autoscaler-opnax 9 | namespace: system 10 | ownerReferences: 11 | - apiVersion: apps/v1 12 | blockOwnerDeletion: true 13 | controller: true 14 | kind: ReplicationController 15 | name: autoscaler 16 | uid: bfa94a99-47e6-11e6-8fc1-06a19a97a573 17 | resourceVersion: "234714730" 18 | selfLink: /api/v1/namespaces/system/pods/autoscaler-opnax 19 | uid: a2acf237-6b03-11e6-b203-0a0bdd34364d 20 | spec: 21 | containers: 22 | - command: 23 | - python 24 | - main.py 25 | - --regions 26 | - us-west-2,us-east-1,us-west-1 27 | - --cluster-name 28 | - openai-kubernetes 29 | - -vvv 30 | env: 31 | - name: AWS_ACCESS_KEY_ID 32 | valueFrom: 33 | secretKeyRef: 34 | key: aws-access-key-id 35 | name: autoscaler 36 | - name: AWS_SECRET_ACCESS_KEY 37 | valueFrom: 38 | secretKeyRef: 39 | key: aws-secret-access-key 40 | name: autoscaler 41 | - name: SLACK_HOOK 42 | valueFrom: 43 | secretKeyRef: 44 | key: slack-hook 45 | name: autoscaler 46 | - name: PYKUBE_KUBERNETES_SERVICE_HOST 47 | value: kubernetes.default 48 | image: autoscaler 49 | imagePullPolicy: Always 50 | name: autoscaler 51 | resources: {} 52 | terminationMessagePath: 
"""
module to handle capacity of resources
"""
import json

from autoscaler.config import Config
from autoscaler.kube import KubeResource

# RESOURCE_SPEC denotes the amount of resources that are available
# to workload pods on a new, clean node, i.e. resources used by system pods
# have to be accounted for in the capacity data file.
# NOTE: this file read happens at import time; a missing/invalid
# Config.CAPACITY_DATA file makes the module unimportable.
with open(Config.CAPACITY_DATA, 'r') as f:
    data = json.loads(f.read())
    RESOURCE_SPEC = {}
    for instance_type, resource_spec in data.items():
        # Hold back the configured CPU reserve from every instance type.
        resource_spec['cpu'] -= Config.CAPACITY_CPU_RESERVE
        resource = KubeResource(**resource_spec)
        RESOURCE_SPEC[instance_type] = resource

# Node-selector keys that pin a pod to an exact instance type / class.
DEFAULT_TYPE_SELECTOR_KEYS = ('aws/type', 'azure/type')
DEFAULT_CLASS_SELECTOR_KEYS = ('aws/class', 'azure/class')
COMPUTING_SELECTOR_KEY = 'openai/computing'


def is_possible(pod):
    """
    returns whether the pod is possible under the maximum allowable capacity

    I.e. whether some known instance type matching the pod's node selectors
    can provide every resource the pod requests.
    """
    max_pod_capacity = max_capacity_for_selectors(pod.selectors, pod.resources)
    if not max_pod_capacity:
        return False
    return (max_pod_capacity - pod.resources).possible


def max_capacity_for_selectors(selectors, resource_requests):
    """
    returns the maximum capacity that is possible for the given selectors

    Returns a KubeResource from RESOURCE_SPEC, or None when no instance
    type matches the selectors.
    """
    # Exact instance type (e.g. 'm4.large'), first matching selector key wins.
    selector = ''
    for key in DEFAULT_TYPE_SELECTOR_KEYS:
        if key in selectors:
            selector = selectors[key]
            break
    # Instance class prefix (e.g. 'm4' or Azure 'D3_v2'), same precedence.
    class_ = ''
    for key in DEFAULT_CLASS_SELECTOR_KEYS:
        if key in selectors:
            class_ = selectors[key]
            break

    unit_caps = RESOURCE_SPEC

    # HACK: we modify our types with -modifier for special groups
    # e.g. c4.8xlarge-public
    # our selectors don't have dashes otherwise, so remove the modifier
    selector, _, _ = selector.partition('-')
    class_, _, _ = class_.partition('-')
    # Azure classes in RESOURCE_SPEC are prefixed 'Standard_'.
    azure_class = 'Standard_{}'.format(class_)

    # if an instance type was specified
    if selector in unit_caps:
        return unit_caps[selector]

    max_capacity = None
    for type_, resource in unit_caps.items():
        if (not class_ or type_.startswith(class_) or
                type_.startswith(azure_class)):
            # NOTE(review): this keeps `resource` when it exceeds the current
            # candidate in some dimension OR when it merely covers the
            # requests, so the result is "a sufficiently large matching
            # capacity" rather than a strict maximum -- confirm intent
            # against KubeResource.__sub__/.possible before changing.
            if not max_capacity or (resource - max_capacity).possible or (resource - resource_requests).possible:
                max_capacity = resource

    return max_capacity


def get_unit_capacity(group):
    """
    returns the KubeResource provided by one unit in the
    AutoScalingGroup or KubeNode

    Raises KeyError if group.instance_type is not in the capacity data.
    """
    return RESOURCE_SPEC[group.instance_type]
| "c4.large": {"cpu": 2, "memory": 3947372544, "pods": 30}, 17 | "p2.xlarge": {"cpu": 2, "memory": 65498251264, "pods": 30}, 18 | "p2.8xlarge": {"cpu": 16, "memory": 523986010112, "pods": 30}, 19 | "p2.16xlarge": {"cpu": 32, "memory": 785979015168, "pods": 30}, 20 | "Standard_DS1": {"cpu": 1, "memory": 3758096384, "pods": 30}, 21 | "Standard_D1_v2": {"cpu": 1, "memory": 3758096384, "pods": 30}, 22 | "Standard_D2_v2": {"cpu": 2, "memory": 7516192768, "pods": 30}, 23 | "Standard_D3_v2": {"cpu": 4, "memory": 15032385536, "pods": 30}, 24 | "Standard_D4_v2": {"cpu": 8, "memory": 30064771072, "pods": 30}, 25 | "Standard_D5_v2": {"cpu": 16, "memory": 60129542144, "pods": 30}, 26 | "Standard_D11_v2": {"cpu": 2, "memory": 15032385536, "pods": 30}, 27 | "Standard_D12_v2": {"cpu": 4, "memory": 30064771072, "pods": 30}, 28 | "Standard_D13_v2": {"cpu": 8, "memory": 60129542144, "pods": 30}, 29 | "Standard_D14_v2": {"cpu": 16, "memory": 120259084288, "pods": 30}, 30 | "Standard_D15_v2": {"cpu": 20, "memory": 150323855360, "pods": 30}, 31 | "Standard_DS15_v2": {"cpu": 20, "memory": 150323855360, "pods": 30}, 32 | "Standard_NC6": {"cpu": 6, "memory": 60129542144, "alpha.kubernetes.io/nvidia-gpu": 1, "pods": 30}, 33 | "Standard_NC12": {"cpu": 12, "memory": 120259084288, "alpha.kubernetes.io/nvidia-gpu": 2, "pods": 30}, 34 | "Standard_NC24": {"cpu": 24, "memory": 240518168576, "alpha.kubernetes.io/nvidia-gpu": 4, "pods": 30}, 35 | "Standard_ND6s": {"cpu": 6, "memory": 120259084288, "alpha.kubernetes.io/nvidia-gpu": 1, "pods": 30}, 36 | "Standard_ND12s": {"cpu": 12, "memory": 240518168576, "alpha.kubernetes.io/nvidia-gpu": 2, "pods": 30}, 37 | "Standard_ND24s": {"cpu": 24, "memory": 481036337152, "alpha.kubernetes.io/nvidia-gpu": 4, "pods": 30}, 38 | "Standard_H8": {"cpu": 8, "memory": 57982058496, "pods": 30}, 39 | "Standard_H16": {"cpu": 16, "memory": 120259084288, "pods": 30}, 40 | "Standard_H8m": {"cpu": 8, "memory": 120259084288, "pods": 30}, 41 | "Standard_H16m": {"cpu": 16, 
import json
import re
from abc import ABC

from threading import Lock


class CountDownCallback:
    """A callback that triggers its delegate exactly once, on the Nth call.

    Thread-safe: the countdown is guarded by a lock so concurrent callers
    cannot both observe the zero crossing.
    """

    def __init__(self, count, delegate):
        self._count = count
        self._delegate = delegate
        self._lock = Lock()

    def __call__(self, *args, **kwargs):
        # `with` releases the lock even if the delegate raises; the original
        # bare acquire()/release() pair leaked the lock on exception, which
        # would deadlock every subsequent invocation.
        with self._lock:
            self._count -= 1
            if self._count == 0:
                self._delegate(*args, **kwargs)


class Future(ABC):
    """Minimal future interface: a blocking result and completion callbacks."""

    def result(self):
        pass

    def add_done_callback(self, fn):
        pass


class CompletedFuture(Future):
    """A future that is already resolved with a fixed value."""

    def __init__(self, value):
        self._value = value

    def result(self):
        return self._value

    def add_done_callback(self, fn):
        # Already complete: invoke the callback synchronously.
        fn(self)


class TransformingFuture(Future):
    """Reports a pre-computed value while deferring completion to a delegate.

    result() still waits on (and propagates exceptions from) the delegate.
    """

    def __init__(self, value, delegate):
        self._value = value
        self._delegate = delegate

    def result(self):
        # Block on the delegate for completion/errors, then substitute value.
        self._delegate.result()
        return self._value

    def add_done_callback(self, fn):
        self._delegate.add_done_callback(lambda _: fn(self))


class AllCompletedFuture(Future):
    """Aggregates several futures; completes when every one of them has."""

    def __init__(self, futures):
        self._futures = futures

    def result(self):
        return [future.result() for future in self._futures]

    def add_done_callback(self, fn):
        # Fire fn once, after the last constituent future completes.
        # NOTE: with an empty futures list the callback never fires
        # (CountDownCallback only triggers on an exact zero crossing).
        callback = CountDownCallback(len(self._futures), lambda _: fn(self))
        for future in self._futures:
            future.add_done_callback(callback)


def selectors_to_hash(selectors):
    """Return a canonical (key-sorted) JSON string for a selector dict."""
    return json.dumps(selectors, sort_keys=True)


def get_groups_for_hash(asgs, selectors_hash):
    """
    returns a list of groups from asgs that match the selectors

    ``selectors_hash`` is a string produced by ``selectors_to_hash``.
    """
    selectors = json.loads(selectors_hash)
    return [asg for asg in asgs if asg.is_match_for_selectors(selectors)]


def get_group_for_node(asgs, node):
    """Return the first group that contains ``node``, or None."""
    for asg in asgs:
        if asg.contains(node):
            return asg
    return None


# SI (and Kubernetes power-of-two) suffix multipliers for quantity strings.
SI_suffix = {
    'y': 1e-24,  # yocto
    'z': 1e-21,  # zepto
    'a': 1e-18,  # atto
    'f': 1e-15,  # femto
    'p': 1e-12,  # pico
    'n': 1e-9,  # nano
    'u': 1e-6,  # micro
    'm': 1e-3,  # milli
    'c': 1e-2,  # centi
    'd': 1e-1,  # deci
    'k': 1e3,  # kilo
    'M': 1e6,  # mega
    'G': 1e9,  # giga
    'T': 1e12,  # tera
    'P': 1e15,  # peta
    'E': 1e18,  # exa
    'Z': 1e21,  # zetta
    'Y': 1e24,  # yotta
    # Kube also uses the power of 2 equivalent
    'Ki': 2**10,
    'Mi': 2**20,
    'Gi': 2**30,
    'Ti': 2**40,
    'Pi': 2**50,
    'Ei': 2**60,
}
SI_regex = re.compile(r"([0-9.]+)(%s)?$" % "|".join(SI_suffix.keys()))


def parse_SI(s):
    """Parse a quantity string like '1500m' or '2Gi' into a float.

    Raises ValueError when ``s`` is not a number with an optional
    recognized suffix.
    """
    m = SI_regex.match(s)
    if m is None:
        raise ValueError("Unknown SI quantity: %s" % s)
    num_s, unit = m.groups()
    multiplier = SI_suffix[unit] if unit else 1.  # unitless
    return float(num_s) * multiplier


def parse_resource(resource):
    """Parse a Kubernetes resource quantity (plain number or SI string)."""
    try:
        return float(resource)
    except ValueError:
        return parse_SI(resource)


def parse_bool_label(value):
    """Interpret a label value as a boolean ('1'/'true', case-insensitive)."""
    return str(value).lower() in ('1', 'true')


def get_relevant_selectors(node_selectors):
    """Keep only the 'aws/'- and 'openai/'-prefixed node selectors.

    NOTE(review): 'azure/' selectors are not retained here -- confirm
    whether that is intentional.
    """
    return {k: v for k, v in node_selectors.items()
            if k.startswith('aws/') or k.startswith('openai/')}
@click.command()
@click.option("--cluster-name")
@click.option("--aws-regions", default="us-west-1")
@click.option("--sleep", default=60)
@click.option("--kubeconfig", default=None,
              help='Full path to kubeconfig file. If not provided, '
                   'we assume that we\'re running on kubernetes.')
@click.option("--pod-namespace", default=None,
              help='The namespace to look for out-of-resource pods in. By '
                   'default, this will look in all namespaces.')
@click.option("--idle-threshold", default=3300)
@click.option("--type-idle-threshold", default=3600*24*7)
@click.option("--over-provision", default=5)
@click.option("--max-scale-in-fraction", default=0.1)
@click.option("--drain-utilization", default=0.0)
@click.option("--azure-slow-scale-classes", default="")
@click.option("--azure-resource-groups")
@click.option("--azure-client-id", default=None, envvar='AZURE_CLIENT_ID')
@click.option("--azure-client-secret", default=None, envvar='AZURE_CLIENT_SECRET')
@click.option("--azure-subscription-id", default=None, envvar='AZURE_SUBSCRIPTION_ID')
@click.option("--azure-tenant-id", default=None, envvar='AZURE_TENANT_ID')
@click.option("--aws-access-key", default=None, envvar='AWS_ACCESS_KEY_ID')
@click.option("--aws-secret-key", default=None, envvar='AWS_SECRET_ACCESS_KEY')
@click.option("--use-aws-iam-role", is_flag=True)
@click.option("--datadog-api-key", default=None, envvar='DATADOG_API_KEY')
@click.option("--instance-init-time", default=25 * 60)
@click.option("--no-scale", is_flag=True)
@click.option("--no-maintenance", is_flag=True)
@click.option("--slack-hook", default=None, envvar='SLACK_HOOK',
              help='Slack webhook URL. If provided, post scaling messages '
                   'to Slack.')
@click.option("--slack-bot-token", default=None, envvar='SLACK_BOT_TOKEN',
              help='Slack bot token. If provided, post scaling messages '
                   'to Slack users directly.')
@click.option("--dry-run", is_flag=True)
@click.option('--verbose', '-v',
              help="Sets the debug noise level, specify multiple times "
                   "for more verbosity.",
              type=click.IntRange(0, 3, clamp=True),
              count=True)
def main(cluster_name, aws_regions, azure_resource_groups, azure_slow_scale_classes, sleep, kubeconfig,
         azure_client_id, azure_client_secret, azure_subscription_id, azure_tenant_id,
         aws_access_key, aws_secret_key, use_aws_iam_role, pod_namespace, datadog_api_key,
         idle_threshold, type_idle_threshold, max_scale_in_fraction, drain_utilization,
         over_provision, instance_init_time, no_scale, no_maintenance,
         slack_hook, slack_bot_token, dry_run, verbose):
    """Autoscaler entry point: configure logging, validate credentials,
    build the Cluster, then run the scale loop forever with exponential
    back-off while no scaling activity occurs."""
    logger_handler = logging.StreamHandler(sys.stderr)
    logger_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%dT%H:%M:%S%z'))
    logger.addHandler(logger_handler)
    logger.setLevel(DEBUG_LOGGING_MAP.get(verbose, logging.CRITICAL))

    aws_regions_list = aws_regions.split(',') if aws_regions else []
    # AWS regions are configured by default, so require either static keys
    # or an IAM role before constructing any AWS-backed cluster state.
    if not ((aws_secret_key and aws_access_key) or use_aws_iam_role) and aws_regions_list:
        logger.error("Missing AWS credentials. Please provide aws-access-key and aws-secret-key.")
        sys.exit(1)

    notifier = Notifier(slack_hook, slack_bot_token)
    cluster = Cluster(aws_access_key=aws_access_key,
                      aws_secret_key=aws_secret_key,
                      use_aws_iam_role=use_aws_iam_role,
                      aws_regions=aws_regions_list,
                      azure_client_id=azure_client_id,
                      azure_client_secret=azure_client_secret,
                      azure_subscription_id=azure_subscription_id,
                      azure_tenant_id=azure_tenant_id,
                      azure_resource_group_names=azure_resource_groups.split(',') if azure_resource_groups else [],
                      azure_slow_scale_classes=azure_slow_scale_classes.split(',') if azure_slow_scale_classes else [],
                      kubeconfig=kubeconfig,
                      pod_namespace=pod_namespace,
                      idle_threshold=idle_threshold,
                      instance_init_time=instance_init_time,
                      type_idle_threshold=type_idle_threshold,
                      cluster_name=cluster_name,
                      max_scale_in_fraction=max_scale_in_fraction,
                      drain_utilization_below=drain_utilization,
                      scale_up=not no_scale,
                      maintainance=not no_maintenance,  # sic: kwarg name matches Cluster's interface
                      over_provision=over_provision,
                      datadog_api_key=datadog_api_key,
                      notifier=notifier,
                      dry_run=dry_run,
                      )
    backoff = sleep
    while True:
        scaled = cluster.scale_loop()
        if scaled:
            time.sleep(sleep)
            backoff = sleep
        else:
            # NOTE(review): back-off doubles without an upper bound;
            # presumably acceptable for this daemon — confirm.
            # logger.warn is a deprecated alias; use warning().
            logger.warning("backoff: %s" % backoff)
            backoff *= 2
            time.sleep(backoff)


if __name__ == "__main__":
    main()
# Dedicated structured (JSON) logger for machine-readable scaling events;
# it does not propagate so regular log output is unaffected.
formatter = json_log_formatter.JSONFormatter()
handler = logging.StreamHandler()
handler.setFormatter(formatter)
struct_logger.addHandler(handler)
struct_logger.setLevel(logging.DEBUG)
struct_logger.propagate = False


def _cache_key(notifier, owner, message, pods):
    """Build a dedup cache key for direct-messaging: identical (owner,
    message, pod-set) triples are only delivered once per TTL window."""
    md5 = hashlib.md5()
    md5.update(owner.encode('utf-8'))
    md5.update(message.encode('utf-8'))

    # Sort so the same set of pods hashes identically regardless of order.
    for pod in sorted(pods, key=lambda p: p.uid):
        md5.update(pod.uid.encode('utf-8'))

    return 'v0.md5.{}'.format(md5.hexdigest())


def _generate_pod_string(pods):
    """Render a human-readable list of pods, eliding after the first four."""
    if len(pods) > 5:
        pods_string = '{}, and {} others'.format(
            ', '.join('{}/{}'.format(pod.namespace, pod.name) for pod in pods[:4]),
            len(pods) - 4)
    else:
        pods_string = ', '.join('{}/{}'.format(pod.namespace, pod.name) for pod in pods)
    return pods_string


def struct_log(message, pods, extra=None):
    """Emit one structured log record per affected pod."""
    for pod in pods:
        log_extra = {
            'pod_name': '{}/{}'.format(pod.namespace, pod.name),
            'pod_id': pod.uid,
            '_log_streaming_target_mapping': 'kubernetes-ec2-autoscaler'
        }
        if extra:
            log_extra.update(extra)
        struct_logger.debug(message, extra=log_extra)


class Notifier(object):
    """Delivers scaling notifications to Slack (channel webhook and/or
    direct messages to pod owners) and to the structured log stream."""

    MESSAGE_URL = 'https://slack.com/api/chat.postMessage'

    def __init__(self, hook=None, bot_token=None):
        # hook: incoming-webhook URL for channel posts (optional)
        # bot_token: Slack bot token for direct messages (optional)
        self.hook = hook
        self.bot_token = bot_token

        # De-dup cache for message_owner (30 minute TTL).
        self.cache = TTLCache(maxsize=128, ttl=60*30)

    def _post_hook(self, message):
        """POST `message` to the configured webhook, logging (never
        raising) on failure.

        Originally each caller inlined this block and caught only
        ConnectionError; RequestException also covers timeouts and other
        transport errors so a flaky Slack endpoint cannot crash the
        scaling loop.
        """
        try:
            resp = requests.post(self.hook, json={
                "text": message,
                "username": "kubernetes-ec2-autoscaler",
                "icon_emoji": ":rabbit:",
            })
            logger.debug('SLACK: %s', resp.text)
        except requests.exceptions.RequestException as e:
            logger.critical('Failed to SLACK: %s', e)

    def notify_scale(self, asg, units_requested, pods):
        """Announce a scale-up of `asg` triggered by `pods`."""
        struct_log('scale', pods,
                   extra={'asg': str(asg), 'units_requested': units_requested})

        if not self.hook:
            logger.debug('SLACK_HOOK not configured.')
            return

        pods_string = _generate_pod_string(pods)

        message = 'ASG {}[{}] scaling up by {} to new capacity {}'.format(
            asg.name, asg.region, units_requested, asg.desired_capacity)
        message += '\n'
        message += 'Change triggered by {}'.format(pods_string)

        self._post_hook(message)

        self.message_owners(
            'ASG {}[{}] scaling up'.format(asg.name, asg.region), pods)

    def notify_failed_to_scale(self, selectors_hash, pods):
        """Announce that scaling for `selectors_hash` did not satisfy demand."""
        struct_log('failed to scale', pods,
                   extra={'selectors_hash': selectors_hash})

        if not self.hook:
            logger.debug('SLACK_HOOK not configured.')
            return

        pods_string = _generate_pod_string(pods)

        main_message = 'Failed to scale {} sufficiently. Backing off...'.format(
            json.dumps(selectors_hash))
        message = main_message + '\n'
        message += 'Pods affected: {}'.format(pods_string)

        self._post_hook(message)

        self.message_owners(main_message, pods)

    def notify_invalid_pod_capacity(self, pod, recommended_capacity):
        """Warn that `pod` requests more than any matching node type offers."""
        struct_log('invalid pod capacity', [pod],
                   extra={'recommended_capacity': str(recommended_capacity)})

        if not self.hook:
            logger.debug('SLACK_HOOK not configured.')
            return

        message = ("Pending pod {}/{} cannot fit {}. "
                   "Please check that requested resource amount is "
                   "consistent with node selectors (recommended max: {}). "
                   "Scheduling skipped.".format(pod.namespace, pod.name, json.dumps(pod.selectors), recommended_capacity))

        self._post_hook(message)

        self.message_owners(message, [pod])

    def notify_drained_node(self, node, pods):
        """Announce that `node` was drained, listing the evicted pods."""
        struct_log('drain', pods, extra={'node': str(node)})

        if not self.hook:
            logger.debug('SLACK_HOOK not configured.')
            return

        pods_string = _generate_pod_string(pods)

        message = 'Node {} drained.'.format(node)
        message += '\n'
        message += 'Pod affected: {}'.format(pods_string)

        self._post_hook(message)

    def message_owners(self, message, pods):
        """Group `pods` by owner and DM each owner once."""
        if not self.bot_token:
            logger.debug('SLACK_BOT_TOKEN not configured.')
            return

        pods_by_owner = {}
        for pod in pods:
            if pod.owner:
                pods_by_owner.setdefault(pod.owner, []).append(pod)

        # Renamed loop variable: the original rebound the `pods` parameter.
        for owner, owner_pods in pods_by_owner.items():
            self.message_owner(owner, message, owner_pods)

    @cachedmethod(operator.attrgetter('cache'), key=_cache_key)
    def message_owner(self, owner, message, pods):
        """DM `owner` about `pods`; deduplicated via the TTL cache."""
        attachments = [{
            'pretext': 'Relevant pods',
            'text': ', '.join('{}/{}'.format(pod.namespace, pod.name) for pod in pods)
        }]

        try:
            resp = requests.post(self.MESSAGE_URL, data={
                "text": message,
                "attachments": json.dumps(attachments),
                "token": self.bot_token,
                "channel": "@{}".format(owner),
                "username": "kubernetes-ec2-autoscaler",
                "icon_emoji": ":rabbit:",
            })
            logger.debug('SLACK: %s', resp.text)
        except requests.exceptions.RequestException as e:
            logger.critical('Failed to SLACK: %s', e)
def _default_mock_clients(region, instances=None, quotas=None):
    """Build (compute, monitor, resource) mock Azure clients for tests.

    Mutable containers are created per call (the original used mutable
    default arguments, which are shared across calls).
    """
    if instances is None:
        instances = []
    if quotas is None:
        quotas = {'Dv2': 100, 'NC': 100}
    sizes = [
        VirtualMachineSize(name="Standard_D1_v2", number_of_cores=1),
        VirtualMachineSize(name="Standard_NC24", number_of_cores=24)
    ]
    mock_client = mock.Mock()
    mock_client.virtual_machine_scale_set_vms = mock.Mock()
    mock_client.virtual_machine_scale_set_vms.list = mock.Mock(return_value=instances)
    mock_client.virtual_machine_scale_sets = mock.Mock()
    mock_client.virtual_machine_scale_sets.create_or_update = mock.Mock()
    mock_client.virtual_machine_scale_sets.delete_instances = mock.Mock()
    mock_client.virtual_machine_sizes = mock.Mock()
    mock_client.virtual_machine_sizes.list = mock.Mock(return_value=sizes)
    mock_client.usage = mock.Mock()
    usage_limits = []
    for k, v in quotas.items():
        usage_limits.append(Usage(name=UsageName(value="standard" + k + "Family"), limit=v, current_value=0))
    mock_client.usage.list = mock.Mock(return_value=usage_limits)

    monitor_client = mock.Mock()
    monitor_client.activity_logs = mock.Mock()
    monitor_client.activity_logs.list = mock.Mock(return_value=[])

    azure_resource_group = ResourceGroup(location=region)
    resource_client = mock.Mock()
    resource_client.resource_groups = mock.Mock()
    # FIX: the original stubbed resource_client.activity_logs.get, which
    # looks like a typo — the resource client exposes resource groups, and
    # the monitor client is the one with activity_logs. Stubbing the
    # intended attribute makes the ResourceGroup actually reachable.
    resource_client.resource_groups.get = mock.Mock(return_value=azure_resource_group)

    return (mock_client, monitor_client, resource_client)


class TestCluster(unittest.TestCase):
    """Unit tests for AzureVirtualScaleSet scale-out/scale-in behavior."""

    def test_failed_scale_up(self):
        # A scale set in 'Failed' provisioning state must be skipped; all
        # capacity goes to the healthy set.
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        failed_scale_set = AzureScaleSet(region, resource_group, 'test-scale-set1', instance_type, 0, 'Failed')
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set2', instance_type, 0, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [failed_scale_set, scale_set], [])

        virtual_scale_set.scale(5)

        mock_client.virtual_machine_scale_sets.create_or_update.assert_called_once()
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args[1]['parameters'].sku.capacity, 5)

    def test_scale_up(self):
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [scale_set], [])

        virtual_scale_set.scale(5)

        mock_client.virtual_machine_scale_sets.create_or_update.assert_called_once()
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args[1]['parameters'].sku.capacity, 5)

    def test_priority(self):
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded', priority=-1)
        # Name sorts lexicographically before previous scale set, but priority is after it
        scale_set2 = AzureScaleSet(region, resource_group, 'a-test-scale-set', instance_type, 0, 'Succeeded', priority=1)

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, True, [scale_set, scale_set2], [])

        virtual_scale_set.scale(1)

        self.assertEqual(virtual_scale_set.global_priority, -1)

        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_count, 1)
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args_list[0][0][1], 'test-scale-set')

    def test_slow_scale_up(self):
        # slow_scale=True grows each scale set by at most one instance.
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded')
        scale_set2 = AzureScaleSet(region, resource_group, 'test-scale-set2', instance_type, 0, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, True, [scale_set, scale_set2], [])

        virtual_scale_set.scale(2)

        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_count, 2)
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args_list[0][1]['parameters'].sku.capacity, 1)
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args_list[1][1]['parameters'].sku.capacity, 1)

    def test_tainted_scale_set(self):
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_NC24'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded', no_schedule_taints={'gpu': 'yes'})

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, True, [scale_set], [])

        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f:
            # safe_load: bare yaml.load without a Loader is deprecated and
            # unsafe; this fixture is plain data.
            dummy_pod = yaml.safe_load(f.read())
        pod = KubePod(pykube.Pod(None, dummy_pod))

        self.assertFalse(virtual_scale_set.is_taints_tolerated(pod))

        dummy_pod['spec']['tolerations'] = [{'key': 'gpu', 'operator': 'Exists'}]
        pod = KubePod(pykube.Pod(None, dummy_pod))
        self.assertTrue(virtual_scale_set.is_taints_tolerated(pod))

    def test_out_of_quota(self):
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region)

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded',
                                  timeout_until=datetime.now(pytz.utc) + timedelta(minutes=10), timeout_reason="fake reason")
        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [scale_set], [])
        self.assertTrue(virtual_scale_set.is_timed_out())

    def test_near_quota_limit(self):
        # Requested 10, but subscription quota only allows 5.
        region = 'test'

        mock_client, monitor_client, resource_client = _default_mock_clients(region, quotas={'Dv2': 5})

        instance_type = 'Standard_D1_v2'
        resource_group = 'test-resource-group'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 0, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [scale_set], [])

        virtual_scale_set.scale(10)

        mock_client.virtual_machine_scale_sets.create_or_update.assert_called_once()
        self.assertEqual(mock_client.virtual_machine_scale_sets.create_or_update.call_args[1]['parameters'].sku.capacity, 5)

    def test_scale_in(self):
        region = 'test'
        resource_group = 'test-resource-group'

        instance = VirtualMachineScaleSetVM(location=region)
        instance.vm_id = 'test-vm-id'
        instance.instance_id = 0
        instance.instance_view = VirtualMachineInstanceView()
        instance.instance_view.statuses = []

        mock_client, monitor_client, resource_client = _default_mock_clients(region, instances=[instance])

        TestNode = collections.namedtuple('TestNode', ['instance_id', 'unschedulable'])
        test_node = TestNode(instance_id=instance.vm_id, unschedulable=False)

        instance_type = 'Standard_D1_v2'
        scale_set = AzureScaleSet(region, resource_group, 'test-scale-set', instance_type, 1, 'Succeeded')

        virtual_scale_set = AzureVirtualScaleSet(region, resource_group, AzureWrapper(mock_client, monitor_client, resource_client), instance_type, False, [scale_set], [test_node])

        self.assertEqual(virtual_scale_set.instance_ids, {instance.vm_id})
        self.assertEqual(virtual_scale_set.nodes, [test_node])

        virtual_scale_set.scale_nodes_in([test_node])
        mock_client.virtual_machine_scale_sets.delete_instances.assert_called_once_with(resource_group, scale_set.name, [instance.instance_id])
class AzureBoundedRetry(Retry):
    """
    XXX: Azure sometimes sends us a Retry-After: 1200, even when we still have quota, causing our client to appear to hang.
    Ignore them and just retry after 30secs
    """

    @staticmethod
    def from_retry(retry):
        """Copy the relevant settings off an existing urllib3 Retry."""
        new_retry = AzureBoundedRetry()
        new_retry.total = retry.total
        new_retry.connect = retry.connect
        new_retry.read = retry.read
        new_retry.backoff_factor = retry.backoff_factor
        new_retry.BACKOFF_MAX = retry.BACKOFF_MAX
        new_retry.status_forcelist = retry.status_forcelist
        new_retry.method_whitelist = retry.method_whitelist

        return new_retry

    def get_retry_after(self, response):
        """Clamp Azure's 429 Retry-After to _RETRY_TIME_LIMIT seconds,
        logging the rate-limit headers for diagnosis."""
        retry_after = super().get_retry_after(response)
        # Only clamp genuine throttle responses with an excessive delay.
        if response.status != http.HTTPStatus.TOO_MANY_REQUESTS or retry_after <= _RETRY_TIME_LIMIT:
            return retry_after

        headers = {}
        for header in ['Retry-After',
                       'x-ms-ratelimit-remaining-subscription-reads',
                       'x-ms-ratelimit-remaining-subscription-writes',
                       'x-ms-ratelimit-remaining-tenant-reads',
                       'x-ms-ratelimit-remaining-tenant-writes',
                       'x-ms-ratelimit-remaining-subscription-resource-requests',
                       'x-ms-ratelimit-remaining-subscription-resource-entities-read',
                       'x-ms-ratelimit-remaining-tenant-resource-requests',
                       'x-ms-ratelimit-remaining-tenant-resource-entities-read']:
            value = response.getheader(header)
            if value is not None:
                headers[header] = value

        # logger.warn is a deprecated alias for warning().
        logger.warning("Azure request throttled: {}".format(headers))
        return _RETRY_TIME_LIMIT


class AzureGroups(object):
    """Discovers Azure scale sets and groups them into one virtual scale
    set per (location, instance type) pair."""

    def __init__(self, resource_groups, slow_scale_classes, client: AzureApi):
        self.resource_groups = resource_groups
        self.slow_scale_classes = slow_scale_classes
        self.client = client

    def get_all_groups(self, kube_nodes):
        """Return AzureVirtualScaleSet groups covering all scale sets in
        the configured resource groups, attaching matching kube nodes."""
        groups = []
        if self.client:
            for resource_group in self.resource_groups:
                scale_sets_by_type = {}
                for scale_set in self.client.list_scale_sets(resource_group.name):
                    scale_sets_by_type.setdefault((scale_set.location, scale_set.instance_type), []).append(scale_set)
                for key, scale_sets in scale_sets_by_type.items():
                    location, instance_type = key
                    slow_scale = _get_azure_class(instance_type) in self.slow_scale_classes
                    groups.append(AzureVirtualScaleSet(location, resource_group.name, self.client, instance_type, slow_scale, scale_sets, kube_nodes))

        return groups


# Extracts the size class from an Azure VM type, e.g. "Standard_D1_v2" -> "D".
_CLASS_PAT = re.compile(r'\w+_(?P<class>[A-Z]+).+')


def _get_azure_class(type_):
    m = _CLASS_PAT.match(type_)
    return m.group('class')


_SCALE_SET_SIZE_LIMIT = 300


# Appears as an unbounded scale set. Currently, Azure Scale Sets have a limit of 300 hosts.
class AzureVirtualScaleSet(AutoScalingGroup):
    provider = 'azure'

    def __init__(self, region, resource_group, client: AzureApi, instance_type, slow_scale: bool, scale_sets: List[AzureScaleSet], kube_nodes):
        self.client = client
        self.instance_type = instance_type
        self.tags = {}
        self.name = 'virtual_scale_set_' + instance_type + '_' + region + '_' + resource_group
        self.scale_sets = dict((scale_set.name, scale_set) for scale_set in scale_sets)
        self.desired_capacity = sum(scale_set.capacity for scale_set in scale_sets)

        self.region = region
        self.resource_group = resource_group

        self.selectors = dict(self.tags)
        # HACK: for matching node selectors
        self.selectors['azure/type'] = self.instance_type
        self.selectors['azure/class'] = _get_azure_class(self.instance_type)
        self.slow_scale = slow_scale

        self.min_size = 0
        self.max_size = 10000
        self.is_spot = False

        self.vm_id_to_instance: MutableMapping[str, Tuple[str, AzureScaleSetInstance]] = {}
        self.instances = {}
        self.timeout_until = None
        self.timeout_reason = None
        self._global_priority = None
        self.no_schedule_taints = {}
        for scale_set in scale_sets:
            # Aggregate the latest timeout, lowest priority, and first
            # non-empty taint set across the underlying scale sets.
            if scale_set.timeout_until is not None:
                if self.timeout_until is None or self.timeout_until < scale_set.timeout_until:
                    self.timeout_until = scale_set.timeout_until
                    self.timeout_reason = scale_set.name + ": " + scale_set.timeout_reason
            if scale_set.priority is not None:
                if self._global_priority is None:
                    self._global_priority = scale_set.priority
                else:
                    self._global_priority = min(scale_set.priority, self._global_priority)
            if not self.no_schedule_taints:
                self.no_schedule_taints = scale_set.no_schedule_taints

            if scale_set.capacity == 0:
                continue
            for instance in self.client.list_scale_set_instances(scale_set):
                self.vm_id_to_instance[instance.vm_id] = (scale_set.name, instance)
                self.instances[instance.vm_id] = AzureInstance(instance.vm_id, self.instance_type, instance.launch_time, self.tags)

        self.nodes = [node for node in kube_nodes if node.instance_id in self.vm_id_to_instance]
        self.unschedulable_nodes = [n for n in self.nodes if n.unschedulable]

        self._id = (self.region, self.name)

    def is_timed_out(self):
        """True while a scale set is backing off from a provisioning failure."""
        if self.timeout_until and datetime.now(self.timeout_until.tzinfo) < self.timeout_until:
            logger.warning("{} is timed out until {} because {}".format(self._id, self.timeout_until, self.timeout_reason))
            return True
        return False

    @property
    def global_priority(self):
        if self._global_priority is None:
            return super().global_priority
        return self._global_priority

    def get_azure_instances(self):
        return self.instances.values()

    @property
    def instance_ids(self):
        return self.vm_id_to_instance.keys()

    def set_desired_capacity(self, new_desired_capacity):
        """
        sets the desired capacity of the underlying ASG directly.
        note that this is for internal control.
        for scaling purposes, please use scale() instead.
        """
        scale_out = new_desired_capacity - self.desired_capacity
        assert scale_out >= 0
        if scale_out == 0:
            return CompletedFuture(False)

        remaining_instances = self.client.get_remaining_instances(self.resource_group, self.instance_type)

        futures = []
        # NOTE(review): if some scale sets have priority=None and others
        # an int, this sort key would raise TypeError — presumably
        # priorities are uniformly set or unset per group; confirm.
        for scale_set in sorted(self.scale_sets.values(), key=lambda x: (x.priority, x.name)):
            if scale_set.capacity < _SCALE_SET_SIZE_LIMIT:
                if self.slow_scale:
                    # Slow-scale classes grow one instance per set at a time.
                    new_group_capacity = scale_set.capacity + 1
                else:
                    new_group_capacity = min(_SCALE_SET_SIZE_LIMIT, scale_set.capacity + scale_out, scale_set.capacity + remaining_instances)
                if scale_set.provisioning_state == 'Updating':
                    logger.warning("Update of {} already in progress".format(scale_set.name))
                    continue
                if scale_set.provisioning_state == 'Failed':
                    logger.error("{} failed provisioning. Skipping it for scaling.".format(scale_set.name))
                    continue
                scale_out -= (new_group_capacity - scale_set.capacity)
                remaining_instances -= (new_group_capacity - scale_set.capacity)
                # Update our cached version
                self.scale_sets[scale_set.name].capacity = new_group_capacity
                futures.append(self.client.update_scale_set(scale_set, new_group_capacity))
                logger.info("Scaling Azure Scale Set {} to {}".format(scale_set.name, new_group_capacity))
            if scale_out == 0 or remaining_instances == 0:
                break

        if remaining_instances == 0:
            logger.warning("Out of quota for {}!".format(self.instance_type))

        if scale_out > 0:
            logger.error("Not enough scale sets to reach desired capacity {} for {}".format(new_desired_capacity, self))

        self.desired_capacity = new_desired_capacity - scale_out
        logger.info("ASG: {} new_desired_capacity: {}".format(self, new_desired_capacity))

        return TransformingFuture(True, AllCompletedFuture(futures))

    def terminate_instances(self, vm_ids):
        """Terminate the given VMs, batching the deletes per scale set."""
        vm_ids = list(vm_ids)
        instances = {}
        for vm_id in vm_ids:
            scale_set_name, instance = self.vm_id_to_instance[vm_id]
            # Update our cached copy of the Scale Set
            self.scale_sets[scale_set_name].capacity -= 1
            instances.setdefault(scale_set_name, []).append(instance)
        logger.info('Terminated instances %s', vm_ids)

        futures = []
        for scale_set_name, scale_set_instances in instances.items():
            futures.append(self.client.terminate_scale_set_instances(self.scale_sets[scale_set_name], scale_set_instances))
        return AllCompletedFuture(futures)

    def scale_nodes_in(self, nodes):
        """
        scale down asg by terminating the given node.
        returns a future indicating when the request completes.
        """
        for node in nodes:
            self.nodes.remove(node)
        return self.terminate_instances(node.instance_id for node in nodes)

    def __str__(self):
        return 'AzureVirtualScaleSet({name}, {selectors_hash})'.format(name=self.name, selectors_hash=utils.selectors_to_hash(self.selectors))

    def __repr__(self):
        return str(self)


class AzureInstance(object):
    """Lightweight record describing one VM in a scale set."""

    provider = 'azure'

    def __init__(self, instance_id, instance_type, launch_time, tags):
        self.id = instance_id
        self.instance_type = instance_type
        self.launch_time = launch_time
        self.tags = tags

    def __str__(self):
        return 'AzureInstance({}, {})'.format(self.id, self.instance_type)

    def __repr__(self):
        return str(self)
class TestingFuture:
    """Minimal future double: completion is triggered explicitly by the test."""

    def __init__(self):
        self.callbacks = []

    def add_done_callback(self, fn):
        self.callbacks.append(fn)

    def complete(self):
        """Fire every registered callback, as a real future would on completion."""
        for callback in self.callbacks:
            callback(self)


class TestWriteThroughCache(unittest.TestCase):
    """Exercises caching, invalidation and write-through behavior of AzureWriteThroughCachedApi."""

    @staticmethod
    def _delegate(scale_sets, instances):
        """Build a mock AzureApi whose list methods return the given results."""
        api = mock.Mock(AzureApi)
        api.list_scale_sets = mock.Mock(return_value=scale_sets)
        api.list_scale_set_instances = mock.Mock(return_value=instances)
        return api

    def test_caching(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [instance])
        cache = AzureWriteThroughCachedApi(delegate)

        # Two identical reads of each kind must hit the delegate only once.
        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])

        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])

        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)

    def test_copied(self):
        """Mutating an object returned by the cache must not corrupt the cached copy."""
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        cache = AzureWriteThroughCachedApi(self._delegate([scale_set], [instance]))

        returned_scale_set = cache.list_scale_sets('test_rg')[0]
        self.assertEqual(returned_scale_set.capacity, 1)
        returned_scale_set.capacity = 0
        self.assertEqual(cache.list_scale_sets('test_rg')[0].capacity, 1)

        returned_instance = cache.list_scale_set_instances(scale_set)[0]
        self.assertEqual(returned_instance.vm_id, 'fake_vm')
        returned_instance.vm_id = 'modified'
        self.assertEqual(cache.list_scale_set_instances(scale_set)[0].vm_id, 'fake_vm')

    def test_refresh(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        scale_set2 = AzureScaleSet('eastus', 'test_rg', 'test2', 'Standard_H16', 0, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [instance])
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)

        # force_refresh must bypass the cache and drop stale instance entries.
        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set, scale_set2])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(set(cache.list_scale_sets('test_rg', force_refresh=True)), {updated_scale_set, scale_set2})
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

    def test_update(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [instance])
        delegate.update_scale_set = mock.Mock(return_value=CompletedFuture(None))
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        cache.update_scale_set(scale_set, 0).result()
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)
        delegate.update_scale_set.assert_called_once_with(scale_set, 0)

        # The write invalidated the cache, so the next reads hit the delegate again.
        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(cache.list_scale_sets('test_rg'), [updated_scale_set])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

    def test_inconsistent_delegate(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [])
        delegate.update_scale_set = mock.Mock(return_value=CompletedFuture(None))
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)
        cache.update_scale_set(scale_set, 1).result()
        delegate.update_scale_set.assert_called_once_with(scale_set, 1)

        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(cache.list_scale_sets('test_rg'), [updated_scale_set])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

        # Test that even if there is inconsistency between the list_scale_sets and
        # list_scale_set_instances, the cache doesn't end up with bad data
        delegate.list_scale_set_instances = mock.Mock(return_value=[instance])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [instance])
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

    def test_terminate(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())

        delegate = self._delegate([scale_set], [instance])
        delegate.terminate_scale_set_instances = mock.Mock(return_value=CompletedFuture(None))
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        cache.terminate_scale_set_instances(scale_set, [instance]).result()
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)
        delegate.terminate_scale_set_instances.assert_called_once_with(scale_set, [instance])

        # Termination invalidates the cache: subsequent reads hit the delegate.
        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(cache.list_scale_sets('test_rg'), [updated_scale_set])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)

    def test_terminate_with_concurrent_read(self):
        scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 1, 'Succeeded')
        updated_scale_set = AzureScaleSet('eastus', 'test_rg', 'test', 'Standard_H16', 0, 'Succeeded')
        instance = AzureScaleSetInstance('fake_id', 'fake_vm', datetime.now())
        pending = TestingFuture()

        delegate = self._delegate([scale_set], [instance])
        delegate.terminate_scale_set_instances = mock.Mock(return_value=pending)
        cache = AzureWriteThroughCachedApi(delegate)

        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        cache.terminate_scale_set_instances(scale_set, [instance])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)
        delegate.terminate_scale_set_instances.assert_called_once_with(scale_set, [instance])

        # Call list again concurrently with the delete, and make sure it's still
        # served from the cache
        self.assertEqual(cache.list_scale_sets('test_rg'), [scale_set])
        self.assertEqual(cache.list_scale_set_instances(scale_set), [instance])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(scale_set)

        # Completing the termination invalidates the cache.
        pending.complete()
        delegate.list_scale_sets = mock.Mock(return_value=[updated_scale_set])
        delegate.list_scale_set_instances = mock.Mock(return_value=[])
        self.assertEqual(cache.list_scale_sets('test_rg'), [updated_scale_set])
        self.assertEqual(cache.list_scale_set_instances(updated_scale_set), [])
        delegate.list_scale_sets.assert_called_once_with('test_rg')
        delegate.list_scale_set_instances.assert_called_once_with(updated_scale_set)
'test_rg' 231 | expected = AzureScaleSet(scale_set.location, resource_group, scale_set.name, scale_set.sku.name, scale_set.sku.capacity, 232 | scale_set.provisioning_state, priority=1, no_schedule_taints={'gpu': 'yes'}) 233 | self.assertEqual([expected], api.list_scale_sets(resource_group)) 234 | 235 | compute_client.virtual_machine_scale_sets.list.assert_called_once_with(resource_group) 236 | monitor_client.activity_logs.list.assert_called_once() 237 | 238 | def test_out_of_quota(self): 239 | scale_set = VirtualMachineScaleSet('eastus', {}, sku=Sku('Standard_H16', capacity=1)) 240 | scale_set.name = 'test' 241 | scale_set.provisioning_state = 'Succeeded' 242 | scale_set.id = 'fake_id' 243 | 244 | compute_client = mock.Mock() 245 | compute_client.virtual_machine_scale_sets = mock.Mock() 246 | compute_client.virtual_machine_scale_sets.list = mock.Mock(return_value=[scale_set]) 247 | 248 | reason = "Operation results in exceeding quota limits of Core. Maximum allowed: 800, Current in use: 784, Additional requested: 320." 
249 | message = "{\"error\":{\"code\":\"OperationNotAllowed\",\"message\":\"" + reason + "\"}}" 250 | monitor_client = mock.Mock() 251 | monitor_client.activity_logs = mock.Mock() 252 | now = datetime.now(pytz.utc) 253 | monitor_client.activity_logs.list = mock.Mock(return_value=[EventData('Error', 254 | now, 255 | now, 256 | resource_id=scale_set.id, 257 | status=LocalizableString('Failed'), 258 | properties={'statusCode': 'Conflict', 259 | 'statusMessage': message})]) 260 | 261 | api = AzureWrapper(compute_client, monitor_client, None) 262 | resource_group = 'test_rg' 263 | expected = AzureScaleSet(scale_set.location, resource_group, scale_set.name, scale_set.sku.name, scale_set.sku.capacity, 264 | scale_set.provisioning_state, now + TIMEOUT_PERIOD, reason) 265 | acutal = api.list_scale_sets(resource_group) 266 | self.assertEqual([expected], acutal) 267 | 268 | compute_client.virtual_machine_scale_sets.list.assert_called_once_with(resource_group) 269 | monitor_client.activity_logs.list.assert_called_once() 270 | -------------------------------------------------------------------------------- /autoscaler/kube.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import logging 4 | 5 | from typing import Iterable, Mapping 6 | 7 | from dateutil.parser import parse as dateutil_parse 8 | import pykube.exceptions 9 | 10 | import autoscaler.utils as utils 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class KubePodStatus(object): 16 | RUNNING = 'Running' 17 | PENDING = 'Pending' 18 | CONTAINER_CREATING = 'ContainerCreating' 19 | SUCCEEDED = 'Succeeded' 20 | FAILED = 'Failed' 21 | 22 | _CORDON_LABEL = 'openai/cordoned-by-autoscaler' 23 | 24 | 25 | class KubePod(object): 26 | _DRAIN_GRACE_PERIOD = datetime.timedelta(seconds=60*60) 27 | 28 | def __init__(self, pod): 29 | self.original = pod 30 | 31 | metadata = pod.obj['metadata'] 32 | self.name = metadata['name'] 33 | self.namespace = 
metadata['namespace'] 34 | self.node_name = pod.obj['spec'].get('nodeName') 35 | self.status = pod.obj['status']['phase'] 36 | self.uid = metadata['uid'] 37 | self.selectors = pod.obj['spec'].get('nodeSelector', {}) 38 | self.labels = metadata.get('labels', {}) 39 | self.annotations = metadata.get('annotations', {}) 40 | self.owner_references = metadata.get('ownerReferences', []) 41 | self.owner = self.labels.get('owner', None) 42 | self.creation_time = dateutil_parse(metadata['creationTimestamp']) 43 | self.start_time = dateutil_parse(pod.obj['status']['startTime']) if 'startTime' in pod.obj['status'] else None 44 | self.scheduled_time = None 45 | 46 | for condition in pod.obj['status'].get('conditions', []): 47 | if condition['type'] == 'PodScheduled' and condition['status'] == 'True': 48 | self.scheduled_time = dateutil_parse(condition['lastTransitionTime']) 49 | 50 | # TODO: refactor 51 | requests = [c.get('resources', {}).get('requests', {}) for c in pod.obj['spec']['containers']] 52 | resource_requests = {} 53 | for d in requests: 54 | for k, v in d.items(): 55 | unitless_v = utils.parse_SI(v) 56 | resource_requests[k] = resource_requests.get(k, 0.0) + unitless_v 57 | self.resources = KubeResource(pods=1, **resource_requests) 58 | self.no_schedule_wildcard_toleration = False 59 | self.no_execute_wildcard_toleration = False 60 | self.no_schedule_existential_tolerations = set() 61 | self.no_execute_existential_tolerations = set() 62 | for toleration in pod.obj['spec'].get('tolerations', []): 63 | if toleration.get('operator', 'Equal') == 'Exists': 64 | effect = toleration.get('effect') 65 | if effect is None or effect == 'NoSchedule': 66 | if 'key' not in toleration: 67 | self.no_schedule_wildcard_toleration = True 68 | else: 69 | self.no_schedule_existential_tolerations.add(toleration['key']) 70 | if effect is None or effect == 'NoExecute': 71 | if 'key' not in toleration: 72 | self.no_execute_wildcard_toleration = True 73 | else: 74 | 
self.no_execute_existential_tolerations.add(toleration['key']) 75 | else: 76 | logger.warn("Equality tolerations not implemented. Pod {} has an equality toleration".format(pod)) 77 | 78 | self.required_pod_anti_affinity_expressions = [] 79 | anti_affinity_spec = pod.obj['spec'].get('affinity', {}).get('podAntiAffinity', {}) 80 | required_anti_affinity_expressions = anti_affinity_spec.get('requiredDuringSchedulingIgnoredDuringExecution', []) +\ 81 | anti_affinity_spec.get('requiredDuringSchedulingRequiredDuringExecution', []) 82 | for expression in required_anti_affinity_expressions: 83 | if expression.get('topologyKey') != 'kubernetes.io/hostname': 84 | logger.debug("Pod {} has non-hostname anti-affinity topology. Ignoring".format(pod)) 85 | continue 86 | self.required_pod_anti_affinity_expressions.append(expression['labelSelector']['matchExpressions']) 87 | 88 | def is_mirrored(self): 89 | is_daemonset = False 90 | for reference in self.owner_references: 91 | if reference.get('kind') == 'DaemonSet': 92 | is_daemonset = True 93 | break 94 | return is_daemonset or self.annotations.get('kubernetes.io/config.mirror') 95 | 96 | def is_replicated(self): 97 | return True if len(self.owner_references) > 0 else False 98 | 99 | def is_critical(self): 100 | return utils.parse_bool_label(self.labels.get('openai/do-not-drain')) 101 | 102 | def is_in_drain_grace_period(self): 103 | """ 104 | determines whether the pod is in a grace period for draining 105 | this prevents us from draining pods that are too new 106 | """ 107 | return (self.scheduled_time and 108 | (datetime.datetime.now(self.scheduled_time.tzinfo) - self.scheduled_time) < self._DRAIN_GRACE_PERIOD) 109 | 110 | def is_drainable(self): 111 | """ 112 | a pod is considered drainable if: 113 | - it's a daemon 114 | - it's a non-critical replicated pod that has exceeded grace period 115 | """ 116 | return (self.is_mirrored() or 117 | (self.is_replicated() and not self.is_critical() and not 
def reverse_bytes(value):
    """Reverse a hex string two characters (one byte) at a time,
    e.g. 'aabbcc' -> 'ccbbaa'. Used to convert Azure's mixed-endian VM ids."""
    assert len(value) % 2 == 0
    pairs = [value[i:i + 2] for i in range(0, len(value), 2)]
    return "".join(reversed(pairs))


# Returns True iff all expressions in and_expression match labels on pod
def match_anti_affinity_expression(and_expression: Iterable[Mapping], pod: KubePod):
    for expression in and_expression:
        label_value = pod.labels.get(expression['key'])
        if expression['operator'] == 'In' and label_value not in expression['values']:
            return False
        elif expression['operator'] == 'NotIn' and label_value in expression['values']:
            return False
        elif expression['operator'] == 'Exists' and label_value is None:
            return False
        elif expression['operator'] == 'DoesNotExist' and label_value is not None:
            return False
    return True


class KubeNode(object):
    """Wrapper around a pykube node object with scheduling-related helpers."""

    # Nodes whose Ready condition has not been updated for this long count as dead.
    _HEARTBEAT_GRACE_PERIOD = datetime.timedelta(seconds=60*60)

    def __init__(self, node):
        self.original = node
        self.pykube_node = node

        metadata = node.obj['metadata']
        self.name = metadata['name']
        self.instance_id, self.region, self.instance_type, self.provider = self._get_instance_data()
        self.pods = []

        self.capacity = KubeResource(**node.obj['status']['allocatable'])
        self.used_capacity = KubeResource()
        self.creation_time = dateutil_parse(metadata['creationTimestamp'])
        # Fall back to creation time when no Ready condition is present.
        last_heartbeat_time = self.creation_time
        for condition in node.obj['status'].get('conditions', []):
            if condition.get('type') == 'Ready':
                last_heartbeat_time = dateutil_parse(condition['lastHeartbeatTime'])
        self.last_heartbeat_time = last_heartbeat_time
        self.no_schedule_taints = {}
        self.no_execute_taints = {}
        for taint in node.obj['spec'].get('taints', []):
            # Kubernetes taints may omit 'value'; default to "". Previously the
            # NoSchedule branch used a bare `except:` for this case and the
            # NoExecute branch raised KeyError on value-less taints.
            value = taint.get('value', "")
            if taint['effect'] == 'NoSchedule':
                self.no_schedule_taints[taint['key']] = value
            if taint['effect'] == 'NoExecute':
                self.no_execute_taints[taint['key']] = value

    def _get_instance_data(self):
        """
        returns a tuple (instance id, region, instance type, provider)
        derived from the node's providerID and labels.
        """
        labels = self.original.obj['metadata'].get('labels', {})
        instance_type = labels.get('aws/type', labels.get('beta.kubernetes.io/instance-type'))

        provider = self.original.obj['spec'].get('providerID', '')
        if provider.startswith('aws://'):
            az, instance_id = tuple(provider.split('/')[-2:])
            if az and instance_id:
                # Region is the availability zone minus its trailing letter.
                return (instance_id, az[:-1], instance_type, 'aws')

        if labels.get('aws/id') and labels.get('aws/az'):
            instance_id = labels['aws/id']
            region = labels['aws/az'][:-1]
            return (instance_id, region, instance_type, 'aws')

        assert provider.startswith('azure:////'), provider
        # Id is in wrong order: https://azure.microsoft.com/en-us/blog/accessing-and-using-azure-vm-unique-id/
        big_endian_vm_id = provider.replace('azure:////', '')
        parts = big_endian_vm_id.split('-')
        instance_id = '-'.join([reverse_bytes(parts[0]),
                                reverse_bytes(parts[1]),
                                reverse_bytes(parts[2]),
                                parts[3],
                                parts[4]]).lower()
        instance_type = labels['azure/type']
        return (instance_id, 'placeholder', instance_type, 'azure')

    @property
    def selectors(self):
        return self.original.obj['metadata'].get('labels', {})

    @property
    def unschedulable(self):
        return self.original.obj['spec'].get('unschedulable', False)

    @property
    def can_uncordon(self):
        return utils.parse_bool_label(self.selectors.get(_CORDON_LABEL))

    def drain(self, pods, notifier=None):
        """Delete all drainable, non-mirrored pods on this node."""
        for pod in pods:
            if pod.is_drainable() and not pod.is_mirrored():
                pod.delete()

        logger.info("drained %s", self)
        if notifier:
            notifier.notify_drained_node(self, pods)

    def uncordon(self):
        """Mark the node schedulable again, but only if the autoscaler cordoned it."""
        if not utils.parse_bool_label(self.selectors.get(_CORDON_LABEL)):
            logger.debug('uncordon %s ignored', self)
            return False

        try:
            self.original.reload()
            self.original.obj['spec']['unschedulable'] = False
            self.original.update()
            logger.info("uncordoned %s", self)
            return True
        except pykube.exceptions.HTTPError as ex:
            logger.info("uncordon failed %s %s", self, ex)
            return False

    def cordon(self):
        """Mark the node unschedulable and label it as cordoned by the autoscaler."""
        try:
            self.original.reload()
            self.original.obj['spec']['unschedulable'] = True
            self.original.obj['metadata'].setdefault('labels', {})[_CORDON_LABEL] = 'true'
            self.original.update()
            logger.info("cordoned %s", self)
            return True
        except pykube.exceptions.HTTPError as ex:
            logger.info("cordon failed %s %s", self, ex)
            return False

    def delete(self):
        """Delete the node object from the API server. Returns True on success."""
        try:
            self.original.delete()
            logger.info("deleted %s", self)
            return True
        except pykube.exceptions.HTTPError as ex:
            logger.info("delete failed %s %s", self, ex)
            return False

    def count_pod(self, pod):
        """Account for a pod scheduled on this node."""
        assert isinstance(pod, KubePod)
        self.used_capacity += pod.resources
        self.pods.append(pod)

    def can_fit(self, resources):
        """True when the node has spare allocatable capacity for `resources`."""
        assert isinstance(resources, KubeResource)
        left = self.capacity - (self.used_capacity + resources)
        return left.possible

    def is_match(self, pod: KubePod):
        """
        whether this node matches all the selectors on the pod,
        tolerates the node's taints, and does not violate the pod's
        required anti-affinity against pods already on the node
        """
        for label, value in pod.selectors.items():
            if self.selectors.get(label) != value:
                return False
        for key in self.no_schedule_taints:
            if not (pod.no_schedule_wildcard_toleration or key in pod.no_schedule_existential_tolerations):
                return False
        for key in self.no_execute_taints:
            if not (pod.no_execute_wildcard_toleration or key in pod.no_execute_existential_tolerations):
                return False
        for expression in pod.required_pod_anti_affinity_expressions:
            # Renamed loop variable: the original shadowed the `pod` parameter.
            for existing_pod in self.pods:
                if match_anti_affinity_expression(expression, existing_pod):
                    return False

        return True

    def is_managed(self):
        """
        an instance is managed if we know its instance ID in ec2.
        """
        return self.instance_id is not None

    def is_detached(self):
        return utils.parse_bool_label(self.selectors.get('openai/detached'))

    def is_dead(self):
        return datetime.datetime.now(self.last_heartbeat_time.tzinfo) - self.last_heartbeat_time > self._HEARTBEAT_GRACE_PERIOD

    def __hash__(self):
        return hash(self.name)

    def __eq__(self, other):
        return self.name == other.name

    def __str__(self):
        return "{}: {} ({})".format(self.name, self.instance_id,
                                    utils.selectors_to_hash(self.selectors))


class KubeResource(object):
    """Bag of named resource quantities (cpu, memory, pods, ...) with arithmetic."""

    def __init__(self, **kwargs):
        self.raw = dict((k, utils.parse_resource(v))
                        for (k, v) in kwargs.items())

    def __add__(self, other):
        keys = set(self.raw.keys()) | set(other.raw.keys())
        raw_sum = dict((k, self.raw.get(k, 0) + other.raw.get(k, 0))
                       for k in keys)
        return KubeResource(**raw_sum)

    def __sub__(self, other):
        keys = set(self.raw.keys()) | set(other.raw.keys())
        raw_diff = dict((k, self.raw.get(k, 0) - other.raw.get(k, 0))
                        for k in keys)
        return KubeResource(**raw_diff)

    def __mul__(self, multiplier):
        new_raw = dict((k, v * multiplier) for k, v in self.raw.items())
        return KubeResource(**new_raw)

    def __rmul__(self, multiplier):
        return self.__mul__(multiplier)

    def __cmp__(self, other):
        """
        should return a negative integer if self < other,
        zero if self == other, a positive integer if self > other.

        we consider self to be greater than other if it exceeds
        the resource amount in other in more resource types.
        e.g. if self = {cpu: 4, memory: 1K, gpu: 1},
        other = {cpu: 2, memory: 2K}, then self exceeds the resource
        amount in other in both cpu and gpu, while other exceeds
        the resource amount in self in only memory, so self > other.

        NOTE(review): Python 3 does not invoke __cmp__ for comparison
        operators; callers must invoke it explicitly.
        """
        resource_diff = (self - other).raw
        num_resource_types = len(resource_diff)
        num_eq = sum(1 for v in resource_diff.values() if v == 0)
        num_less = sum(1 for v in resource_diff.values() if v < 0)
        num_more = num_resource_types - num_eq - num_less
        return num_more - num_less

    def __str__(self):
        return str(self.raw)

    def get(self, key, default=None):
        return self.raw.get(key, default)

    @property
    def possible(self):
        """True when no resource quantity is negative."""
        return all([x >= 0 for x in self.raw.values()])
abc import ABC 14 | from azure.mgmt.compute import ComputeManagementClient 15 | from azure.mgmt.compute.models import VirtualMachineScaleSet, Sku 16 | from azure.mgmt.resource import ResourceManagementClient 17 | 18 | from autoscaler.utils import Future 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | PRIORITY_TAG = 'priority' 24 | # Value should be a json map of NoSchedule taint key-values 25 | NO_SCHEDULE_TAINTS_TAG = 'no_schedule_taints' 26 | 27 | 28 | class AzureScaleSet: 29 | def __init__(self, location: str, resource_group: str, name: str, instance_type: str, capacity: int, 30 | provisioning_state: str, timeout_until: datetime = None, timeout_reason: str = None, priority: int = None, 31 | no_schedule_taints: Mapping[str, str] = {}) -> None: 32 | self.name = name 33 | self.instance_type = instance_type 34 | self.capacity = capacity 35 | self.provisioning_state = provisioning_state 36 | self.resource_group = resource_group 37 | self.location = location 38 | self.timeout_until = timeout_until 39 | self.timeout_reason = timeout_reason 40 | self.priority = priority 41 | self.no_schedule_taints = no_schedule_taints 42 | 43 | def __str__(self): 44 | return 'AzureScaleSet({}, {}, {}, {})'.format(self.name, self.instance_type, self.capacity, self.provisioning_state) 45 | 46 | def __repr__(self): 47 | return str(self) 48 | 49 | def _key(self): 50 | return (self.name, self.instance_type, self.capacity, self.provisioning_state, self.resource_group, self.location, 51 | self.timeout_until, self.timeout_reason, self.priority, tuple(self.no_schedule_taints.items())) 52 | 53 | def __eq__(self, other: object) -> bool: 54 | if not isinstance(other, AzureScaleSet): 55 | return False 56 | return self._key() == other._key() 57 | 58 | def __hash__(self) -> int: 59 | return hash(self._key()) 60 | 61 | 62 | class AzureScaleSetInstance: 63 | def __init__(self, instance_id: str, vm_id: str, launch_time: datetime) -> None: 64 | self.instance_id = instance_id 65 | self.vm_id 
= vm_id 66 | self.launch_time = launch_time 67 | 68 | def __str__(self): 69 | return 'AzureScaleSetInstance({}, {}, {})'.format(self.instance_id, self.vm_id, self.launch_time) 70 | 71 | def __repr__(self): 72 | return str(self) 73 | 74 | def _key(self): 75 | return (self.instance_id, self.vm_id, self.launch_time) 76 | 77 | def __eq__(self, other: object) -> bool: 78 | if not isinstance(other, AzureScaleSetInstance): 79 | return False 80 | return self._key() == other._key() 81 | 82 | def __hash__(self) -> int: 83 | return hash(self._key()) 84 | 85 | 86 | class AzureApi(ABC): 87 | def list_scale_sets(self, resource_group_name: str) -> List[AzureScaleSet]: 88 | pass 89 | 90 | def list_scale_set_instances(self, scale_set: AzureScaleSet) -> List[AzureScaleSetInstance]: 91 | pass 92 | 93 | def update_scale_set(self, scale_set: AzureScaleSet, new_capacity: int) -> Future: 94 | pass 95 | 96 | def terminate_scale_set_instances(self, scale_set: AzureScaleSet, instances: List[AzureScaleSetInstance]) -> Future: 97 | pass 98 | 99 | def get_remaining_instances(self, resource_group_name: str, sku: str) -> int: 100 | pass 101 | 102 | 103 | TIMEOUT_PERIOD = timedelta(minutes=15) 104 | 105 | 106 | # Mangles a SKU name into the family name used for quotas 107 | def _azure_sku_family(name: str) -> str: 108 | match = re.match('Standard_(?P[A-Z]{1,2})[0-9]{1,2}_?(?Pv[0-9])?', name) 109 | if match is None: 110 | raise ValueError("SKU not from a recognized family: " + name) 111 | family = match.group('family') 112 | result = "standard" + family 113 | # Special case for one of Azure's new SKUs :( 114 | if family == 'ND': 115 | result += 'S' 116 | if match.group('version') is not None: 117 | result += match.group('version') 118 | result += 'Family' 119 | return result 120 | 121 | 122 | class AzureWrapper(AzureApi): 123 | def __init__(self, compute_client: ComputeManagementClient, monitor_client: MonitorClient, resource_client: ResourceManagementClient) -> None: 124 | self._compute_client = 
class AzureWrapper(AzureApi):
    """AzureApi implementation backed by the live Azure management SDK clients."""

    def __init__(self, compute_client: ComputeManagementClient, monitor_client: MonitorClient, resource_client: ResourceManagementClient) -> None:
        self._compute_client = compute_client
        self._monitor_client = monitor_client
        self._resource_client = resource_client

    def list_scale_sets(self, resource_group_name: str) -> List[AzureScaleSet]:
        """List scale sets in the resource group, annotated with timeout info.

        Scans the last TIMEOUT_PERIOD of activity-log failures so scale sets
        that recently failed to scale can be temporarily avoided.
        """
        fifteen_minutes_ago = datetime.now(pytz.utc) - TIMEOUT_PERIOD
        filter_clause = "eventTimestamp ge '{}' and resourceGroupName eq '{}'".format(fifteen_minutes_ago, resource_group_name)
        select_clause = "authorization,status,subStatus,properties,resourceId,eventTimestamp"

        failures_by_scale_set: MutableMapping[str, List[EventData]] = {}
        for log in self._monitor_client.activity_logs.list(filter=filter_clause, select=select_clause):
            if (log.status and log.status.value == 'Failed') or (log.properties and log.properties.get('statusCode') == 'Conflict'):
                # Failed deletes should not count against scaling up.
                if log.authorization and log.authorization.action and 'delete' in log.authorization.action:
                    continue
                failures_by_scale_set.setdefault(log.resource_id, []).append(log)

        result = []
        for scale_set in self._compute_client.virtual_machine_scale_sets.list(resource_group_name):
            # Newest failure first, so the timeout reflects the latest error.
            failures = sorted(failures_by_scale_set.get(scale_set.id, []), key=lambda x: x.event_timestamp, reverse=True)
            timeout_until = None
            timeout_reason = None
            for failure in failures:
                status_message = json.loads(failure.properties.get('statusMessage', "{}")) if failure.properties else {}
                error_details = status_message.get('error', {})
                if 'message' in error_details:
                    timeout_until = failure.event_timestamp + TIMEOUT_PERIOD
                    timeout_reason = error_details['message']
                    # Stop if we found a message with details
                    break
                if timeout_until is None:
                    timeout_until = failure.event_timestamp + TIMEOUT_PERIOD
                    timeout_reason = failure.sub_status.localized_value

            # NOTE(review): tags can presumably be None on a scale set with no
            # tags at all; guard so an untagged scale set doesn't crash the
            # listing -- confirm against the SDK's model.
            tags = scale_set.tags or {}
            priority = int(tags[PRIORITY_TAG]) if PRIORITY_TAG in tags else None
            no_schedule_taints = json.loads(tags.get(NO_SCHEDULE_TAINTS_TAG, '{}'))

            result.append(AzureScaleSet(scale_set.location, resource_group_name, scale_set.name, scale_set.sku.name,
                                        scale_set.sku.capacity, scale_set.provisioning_state, timeout_until=timeout_until,
                                        timeout_reason=timeout_reason, priority=priority, no_schedule_taints=no_schedule_taints))
        return result

    def list_scale_set_instances(self, scale_set: AzureScaleSet) -> List[AzureScaleSetInstance]:
        """Return one AzureScaleSetInstance per VM in the scale set."""
        result = []
        for instance in self._compute_client.virtual_machine_scale_set_vms.list(scale_set.resource_group, scale_set.name, expand="instanceView"):
            # Fall back to "now" for instances that have not finished provisioning.
            launch_time = datetime.now(pytz.utc)
            for status in instance.instance_view.statuses:
                if status.code == 'ProvisioningState/succeeded':
                    launch_time = status.time
                    break
            result.append(AzureScaleSetInstance(instance.instance_id, instance.vm_id, launch_time))
        return result

    def update_scale_set(self, scale_set: AzureScaleSet, new_capacity: int) -> Future:
        """Asynchronously set the scale set's capacity; returns a Future."""
        parameters = VirtualMachineScaleSet(scale_set.location, sku=Sku(name=scale_set.instance_type, capacity=new_capacity))
        azure_op = self._compute_client.virtual_machine_scale_sets.create_or_update(scale_set.resource_group, scale_set.name,
                                                                                   parameters=parameters)
        return AzureOperationPollerFutureAdapter(azure_op)

    def terminate_scale_set_instances(self, scale_set: AzureScaleSet, instances: List[AzureScaleSetInstance]) -> Future:
        """Asynchronously delete the given instances; returns a Future."""
        future = self._compute_client.virtual_machine_scale_sets.delete_instances(scale_set.resource_group, scale_set.name, [instance.instance_id for instance in instances])
        return AzureOperationPollerFutureAdapter(future)

    def get_remaining_instances(self, resource_group_name: str, sku: str) -> int:
        """Return how many more instances of `sku` the family core quota allows."""
        resource_group = self._resource_client.resource_groups.get(resource_group_name)
        cores_per_instance = None
        for vm_size in self._compute_client.virtual_machine_sizes.list(location=resource_group.location):
            if vm_size.name == sku:
                cores_per_instance = vm_size.number_of_cores
                break

        if cores_per_instance is None:
            # logger.warn is a deprecated alias of logger.warning
            logger.warning("No metadata found for sku: " + sku)
            return 0

        for usage in self._compute_client.usage.list(location=resource_group.location):
            if usage.name.value == _azure_sku_family(sku):
                return (usage.limit - usage.current_value) // cores_per_instance

        logger.warning("No quota found matching: " + sku)
        return 0
class AzureWriteThroughCachedApi(AzureApi):
    """Caching decorator around another AzureApi.

    Reads are served from in-memory caches; writes go straight to the
    delegate and invalidate the affected cache entries once they complete.
    All cache access happens under a single re-entrant lock.
    """

    def __init__(self, delegate: AzureApi) -> None:
        self._delegate = delegate
        self._lock = RLock()
        # (resource_group, scale_set_name) -> cached instance list
        self._instance_cache: MutableMapping[Tuple[str, str], List[AzureScaleSetInstance]] = {}
        # resource_group -> cached scale-set list
        self._scale_set_cache: MutableMapping[str, List[AzureScaleSet]] = {}
        # resource_group -> {sku -> remaining quota}
        self._remaining_instances_cache: MutableMapping[str, MutableMapping[str, int]] = {}

    def invalidate_quota_cache(self, resource_group_name: str) -> None:
        """Drop any cached quota numbers for the resource group."""
        with self._lock:
            self._remaining_instances_cache.pop(resource_group_name, None)

    def list_scale_sets(self, resource_group_name: str, force_refresh=False) -> List[AzureScaleSet]:
        """Return scale sets for the group, from cache unless force_refresh."""
        if not force_refresh:
            with self._lock:
                cached = self._scale_set_cache.get(resource_group_name)
                if cached is not None:
                    return deepcopy(cached)

        fresh = self._delegate.list_scale_sets(resource_group_name)
        with self._lock:
            previous = {entry.name: entry for entry in self._scale_set_cache.get(resource_group_name, [])}
            for scale_set in fresh:
                stale = previous.get(scale_set.name)
                if stale is None:
                    continue

                # A capacity change we did not make means the Scale Set was
                # modified externally, so its instance list is no longer valid.
                if stale.capacity != scale_set.capacity:
                    self._instance_cache.pop((resource_group_name, scale_set.name), None)

            self._scale_set_cache[resource_group_name] = fresh
        return deepcopy(fresh)

    def list_scale_set_instances(self, scale_set: AzureScaleSet) -> List[AzureScaleSetInstance]:
        """Return the scale set's instances, from cache when possible."""
        key = (scale_set.resource_group, scale_set.name)
        with self._lock:
            cached = self._instance_cache.get(key)
            if cached is not None:
                return deepcopy(cached)

        instances = self._delegate.list_scale_set_instances(scale_set)
        # Make sure we don't poison the cache, if our delegate is eventually consistent
        if len(instances) == scale_set.capacity:
            with self._lock:
                self._instance_cache[key] = instances
        return deepcopy(instances)

    def update_scale_set(self, scale_set: AzureScaleSet, new_capacity: int) -> Future:
        """Write through to the delegate; invalidate caches when it completes."""
        future = self._delegate.update_scale_set(scale_set, new_capacity)
        future.add_done_callback(lambda _: self._invalidate(scale_set.resource_group, scale_set.name))
        return future

    def terminate_scale_set_instances(self, scale_set: AzureScaleSet, instances: List[AzureScaleSetInstance]) -> Future:
        """Write through to the delegate; invalidate caches when it completes."""
        future = self._delegate.terminate_scale_set_instances(scale_set, instances)
        future.add_done_callback(lambda _: self._invalidate(scale_set.resource_group, scale_set.name))
        return future

    def get_remaining_instances(self, resource_group_name: str, sku: str) -> int:
        """Return (and memoize) the remaining quota for the sku."""
        with self._lock:
            per_sku = self._remaining_instances_cache.get(resource_group_name)
            if per_sku is not None and sku in per_sku:
                return per_sku[sku]
        remaining = self._delegate.get_remaining_instances(resource_group_name, sku)
        with self._lock:
            self._remaining_instances_cache.setdefault(resource_group_name, {})[sku] = remaining
        return remaining

    def _invalidate(self, resource_group_name: str, scale_set_name: str) -> None:
        """Drop every cache entry a write to the scale set may have stalled."""
        with self._lock:
            self._instance_cache.pop((resource_group_name, scale_set_name), None)
            self._scale_set_cache.pop(resource_group_name, None)
            self._remaining_instances_cache.pop(resource_group_name, None)
-> None: 275 | with self._lock: 276 | if (resource_group_name, scale_set_name) in self._instance_cache: 277 | del self._instance_cache[(resource_group_name, scale_set_name)] 278 | 279 | if resource_group_name in self._scale_set_cache: 280 | del self._scale_set_cache[resource_group_name] 281 | 282 | if resource_group_name in self._remaining_instances_cache: 283 | del self._remaining_instances_cache[resource_group_name] 284 | 285 | 286 | _AZURE_API_MAX_WAIT = 10*60 287 | 288 | 289 | # Adapts an Azure async operation to behave like a Future 290 | class AzureOperationPollerFutureAdapter(Future): 291 | def __init__(self, azure_operation): 292 | self._done = False 293 | self._result = None 294 | self._exception = None 295 | # NOTE: All this complexity with a Condition is here because AzureOperationPoller is not reentrant, 296 | # so a callback added with add_done_callback() could not call result(), if we delegated everything 297 | self._condition = Condition() 298 | self._callbacks = [] 299 | self.azure_operation = azure_operation 300 | azure_operation.add_done_callback(self._handle_completion) 301 | 302 | def _handle_completion(self, result): 303 | with self._condition: 304 | self._done = True 305 | if self.azure_operation._exception is None: 306 | self._result = result 307 | else: 308 | self._exception = self.azure_operation._exception 309 | self._condition.notifyAll() 310 | callbacks = self._callbacks 311 | self._callbacks.clear() 312 | 313 | for callback in callbacks: 314 | callback(self) 315 | 316 | def result(self): 317 | callbacks = [] 318 | try: 319 | with self._condition: 320 | if not self._done: 321 | self._condition.wait(_AZURE_API_MAX_WAIT) 322 | if not self._done: 323 | # We reached the timeout 324 | self._exception = TimeoutError() 325 | self._done = True 326 | callbacks = self._callbacks 327 | self._callbacks.clear() 328 | if self._exception: 329 | raise self._exception 330 | return self._result 331 | finally: 332 | for callback in callbacks: 333 | 
callback(self) 334 | 335 | def add_done_callback(self, fn): 336 | with self._condition: 337 | if self._done: 338 | fn(self) 339 | else: 340 | self._callbacks.append(fn) 341 | -------------------------------------------------------------------------------- /test/test_cluster.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import os.path 4 | import unittest 5 | import copy 6 | from datetime import datetime, timedelta 7 | 8 | import boto3 9 | import pykube 10 | import mock 11 | import moto 12 | import yaml 13 | import pytz 14 | 15 | from autoscaler.cluster import Cluster, ClusterNodeState 16 | from autoscaler.kube import KubePod, KubeNode 17 | import autoscaler.utils as utils 18 | 19 | 20 | class TestCluster(unittest.TestCase): 21 | def setUp(self): 22 | # load dummy kube specs 23 | dir_path = os.path.dirname(os.path.realpath(__file__)) 24 | with open(os.path.join(dir_path, 'data/busybox.yaml'), 'r') as f: 25 | self.dummy_pod = yaml.load(f.read()) 26 | with open(os.path.join(dir_path, 'data/ds-pod.yaml'), 'r') as f: 27 | self.dummy_ds_pod = yaml.load(f.read()) 28 | with open(os.path.join(dir_path, 'data/rc-pod.yaml'), 'r') as f: 29 | self.dummy_rc_pod = yaml.load(f.read()) 30 | with open(os.path.join(dir_path, 'data/node.yaml'), 'r') as f: 31 | self.dummy_node = yaml.load(f.read()) 32 | for condition in self.dummy_node['status']['conditions']: 33 | if condition['type'] == 'Ready' and condition['status'] == 'True': 34 | condition['lastHeartbeatTime'] = datetime.now(condition['lastHeartbeatTime'].tzinfo) 35 | # Convert timestamps to strings to match PyKube 36 | for condition in self.dummy_node['status']['conditions']: 37 | condition['lastHeartbeatTime'] = datetime.isoformat(condition['lastHeartbeatTime']) 38 | condition['lastTransitionTime'] = datetime.isoformat(condition['lastTransitionTime']) 39 | 40 | 41 | # this isn't actually used here 42 | # only needed to create the KubePod object... 
43 | self.api = pykube.HTTPClient(pykube.KubeConfig.from_file('~/.kube/config')) 44 | 45 | # start creating our mock ec2 environment 46 | self.mocks = [moto.mock_ec2(), moto.mock_autoscaling()] 47 | for moto_mock in self.mocks: 48 | moto_mock.start() 49 | 50 | client = boto3.client('autoscaling', region_name='us-west-2') 51 | self.asg_client = client 52 | 53 | client.create_launch_configuration( 54 | LaunchConfigurationName='dummy-lc', 55 | ImageId='ami-deadbeef', 56 | KeyName='dummy-key', 57 | SecurityGroups=[ 58 | 'sg-cafebeef', 59 | ], 60 | InstanceType='t2.medium' 61 | ) 62 | 63 | client.create_auto_scaling_group( 64 | AutoScalingGroupName='dummy-asg', 65 | LaunchConfigurationName='dummy-lc', 66 | MinSize=0, 67 | MaxSize=10, 68 | VPCZoneIdentifier='subnet-beefbeef', 69 | Tags=[ 70 | { 71 | 'Key': 'KubernetesCluster', 72 | 'Value': 'dummy-cluster', 73 | 'PropagateAtLaunch': True 74 | }, 75 | { 76 | 'Key': 'KubernetesRole', 77 | 'Value': 'worker', 78 | 'PropagateAtLaunch': True 79 | } 80 | ] 81 | ) 82 | 83 | # finally our cluster 84 | self.cluster = Cluster( 85 | aws_access_key='fake', 86 | aws_secret_key='fake', 87 | aws_regions=['us-west-2', 'us-east-1', 'us-west-1'], 88 | azure_client_id='', 89 | azure_client_secret='', 90 | azure_subscription_id='', 91 | azure_tenant_id='', 92 | azure_resource_group_names=[], 93 | azure_slow_scale_classes=[], 94 | kubeconfig='~/.kube/config', 95 | pod_namespace=None, 96 | drain_utilization_below=0.3, 97 | idle_threshold=60, 98 | instance_init_time=60, 99 | type_idle_threshold=60, 100 | cluster_name='dummy-cluster', 101 | notifier=mock.Mock(), 102 | dry_run=False, 103 | use_aws_iam_role=False 104 | ) 105 | 106 | def tearDown(self): 107 | for moto_mock in self.mocks: 108 | moto_mock.stop() 109 | 110 | def _spin_up_node(self, launch_time=None): 111 | return self._spin_up_nodes(1, launch_time=launch_time)[0] 112 | 113 | def _spin_up_nodes(self, count, launch_time=None): 114 | assert count <= 256 115 | # spin up dummy ec2 node 116 
| self.asg_client.set_desired_capacity(AutoScalingGroupName='dummy-asg', 117 | DesiredCapacity=count) 118 | response = self.asg_client.describe_auto_scaling_groups() 119 | nodes = [] 120 | for i, instance in enumerate(response['AutoScalingGroups'][0]['Instances']): 121 | instance_id = instance['InstanceId'] 122 | 123 | dummy_node = copy.deepcopy(self.dummy_node) 124 | dummy_node['metadata']['labels']['aws/id'] = instance_id 125 | dummy_node['metadata']['name'] = '10.0.' + str(i) + '.228' 126 | node = KubeNode(pykube.Node(self.api, dummy_node)) 127 | node.cordon = mock.Mock(return_value="mocked stuff") 128 | node.drain = mock.Mock(return_value="mocked stuff") 129 | node.uncordon = mock.Mock(return_value="mocked stuff") 130 | node.delete = mock.Mock(return_value="mocked stuff") 131 | nodes.append(node) 132 | return nodes 133 | 134 | def test_reap_dead_node(self): 135 | node = copy.deepcopy(self.dummy_node) 136 | TestInstance = collections.namedtuple('TestInstance', ['launch_time']) 137 | instance = TestInstance(datetime.now(pytz.utc)) 138 | 139 | ready_condition = None 140 | for condition in node['status']['conditions']: 141 | if condition['type'] == 'Ready': 142 | ready_condition = condition 143 | break 144 | ready_condition['status'] = 'Unknown' 145 | 146 | ready_condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(minutes=30)) 147 | kube_node = KubeNode(pykube.Node(self.api, node)) 148 | kube_node.delete = mock.Mock(return_value="mocked stuff") 149 | self.cluster.maintain([kube_node], {kube_node.instance_id: instance}, {}, [], []) 150 | kube_node.delete.assert_not_called() 151 | 152 | ready_condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(hours=2)) 153 | kube_node = KubeNode(pykube.Node(self.api, node)) 154 | kube_node.delete = mock.Mock(return_value="mocked stuff") 155 | self.cluster.maintain([kube_node], {kube_node.instance_id: instance}, {}, [], []) 156 | 
kube_node.delete.assert_called_once_with() 157 | 158 | def test_max_scale_in(self): 159 | node1 = copy.deepcopy(self.dummy_node) 160 | node2 = copy.deepcopy(self.dummy_node) 161 | TestInstance = collections.namedtuple('TestInstance', ['launch_time']) 162 | instance1 = TestInstance(datetime.now(pytz.utc)) 163 | instance2 = TestInstance(datetime.now(pytz.utc)) 164 | 165 | for node in [node1, node2]: 166 | for condition in node['status']['conditions']: 167 | if condition['type'] == 'Ready': 168 | condition['status'] = 'Unknown' 169 | condition['lastHeartbeatTime'] = datetime.isoformat(datetime.now(pytz.utc) - timedelta(hours=2)) 170 | break 171 | 172 | kube_node1 = KubeNode(pykube.Node(self.api, node1)) 173 | kube_node1.delete = mock.Mock(return_value="mocked stuff") 174 | kube_node2 = KubeNode(pykube.Node(self.api, node2)) 175 | kube_node2.delete = mock.Mock(return_value="mocked stuff") 176 | self.cluster.maintain([kube_node1, kube_node2], {kube_node1.instance_id: instance1, kube_node2.instance_id: instance2}, {}, [], []) 177 | kube_node1.delete.assert_not_called() 178 | kube_node2.delete.assert_not_called() 179 | 180 | def test_scale_up_selector(self): 181 | self.dummy_pod['spec']['nodeSelector'] = { 182 | 'aws/type': 'm4.large' 183 | } 184 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 185 | selectors_hash = utils.selectors_to_hash(pod.selectors) 186 | asgs = self.cluster.autoscaling_groups.get_all_groups([]) 187 | self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) 188 | 189 | response = self.asg_client.describe_auto_scaling_groups() 190 | self.assertEqual(len(response['AutoScalingGroups']), 1) 191 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) 192 | 193 | def test_scale_up(self): 194 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 195 | selectors_hash = utils.selectors_to_hash(pod.selectors) 196 | asgs = self.cluster.autoscaling_groups.get_all_groups([]) 197 | self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) 
198 | 199 | response = self.asg_client.describe_auto_scaling_groups() 200 | self.assertEqual(len(response['AutoScalingGroups']), 1) 201 | self.assertGreater(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) 202 | 203 | def test_scale_up_notification(self): 204 | big_pod_spec = copy.deepcopy(self.dummy_pod) 205 | for container in big_pod_spec['spec']['containers']: 206 | container['resources']['requests']['cpu'] = '100' 207 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 208 | big_pod = KubePod(pykube.Pod(self.api, big_pod_spec)) 209 | selectors_hash = utils.selectors_to_hash(pod.selectors) 210 | asgs = self.cluster.autoscaling_groups.get_all_groups([]) 211 | self.cluster.fulfill_pending(asgs, selectors_hash, [pod, big_pod]) 212 | self.cluster.notifier.notify_scale.assert_called_with(mock.ANY, mock.ANY, [pod]) 213 | 214 | def test_timed_out_group(self): 215 | with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.is_timed_out') as is_timed_out: 216 | with mock.patch('autoscaler.autoscaling_groups.AutoScalingGroup.scale') as scale: 217 | is_timed_out.return_value = True 218 | scale.return_value = utils.CompletedFuture(None) 219 | 220 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 221 | selectors_hash = utils.selectors_to_hash(pod.selectors) 222 | asgs = self.cluster.autoscaling_groups.get_all_groups([]) 223 | self.cluster.fulfill_pending(asgs, selectors_hash, [pod]) 224 | 225 | scale.assert_not_called() 226 | 227 | response = self.asg_client.describe_auto_scaling_groups() 228 | self.assertEqual(len(response['AutoScalingGroups']), 1) 229 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 0) 230 | 231 | def test_scale_down(self): 232 | """ 233 | kube node with daemonset and no pod --> cordon 234 | """ 235 | node = self._spin_up_node() 236 | 237 | all_nodes = [node] 238 | managed_nodes = [n for n in all_nodes if node.is_managed()] 239 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 240 | 
pods_to_schedule = {} 241 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 242 | 243 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 244 | running_or_pending_assigned_pods = [ds_pod] 245 | 246 | self.cluster.idle_threshold = -1 247 | self.cluster.type_idle_threshold = -1 248 | self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 249 | self.cluster.maintain( 250 | managed_nodes, running_insts_map, 251 | pods_to_schedule, running_or_pending_assigned_pods, asgs) 252 | 253 | response = self.asg_client.describe_auto_scaling_groups() 254 | self.assertEqual(len(response['AutoScalingGroups']), 1) 255 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 256 | node.cordon.assert_called_once_with() 257 | 258 | def test_scale_down_launch_grace_period(self): 259 | """ 260 | kube node with daemonset and no pod + launch grace period --> noop 261 | """ 262 | node = self._spin_up_node() 263 | all_nodes = [node] 264 | managed_nodes = [n for n in all_nodes if node.is_managed()] 265 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 266 | pods_to_schedule = {} 267 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 268 | 269 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 270 | running_or_pending_assigned_pods = [ds_pod] 271 | 272 | self.cluster.idle_threshold = -1 273 | self.cluster.type_idle_threshold = -1 274 | self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = 60*30 275 | self.cluster.maintain( 276 | managed_nodes, running_insts_map, 277 | pods_to_schedule, running_or_pending_assigned_pods, asgs) 278 | 279 | response = self.asg_client.describe_auto_scaling_groups() 280 | self.assertEqual(len(response['AutoScalingGroups']), 1) 281 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 282 | node.cordon.assert_not_called() 283 | 284 | def test_scale_down_grace_period(self): 285 | """ 286 | kube node with daemonset and no pod + grace period --> noop 287 | """ 288 | node 
= self._spin_up_node() 289 | all_nodes = [node] 290 | managed_nodes = [n for n in all_nodes if node.is_managed()] 291 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 292 | pods_to_schedule = {} 293 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 294 | 295 | # kube node with daemonset and no pod --> cordon 296 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 297 | running_or_pending_assigned_pods = [ds_pod] 298 | 299 | self.cluster.maintain( 300 | managed_nodes, running_insts_map, 301 | pods_to_schedule, running_or_pending_assigned_pods, asgs) 302 | 303 | response = self.asg_client.describe_auto_scaling_groups() 304 | self.assertEqual(len(response['AutoScalingGroups']), 1) 305 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 306 | node.cordon.assert_not_called() 307 | 308 | def test_scale_down_busy(self): 309 | """ 310 | kube node with daemonset and pod/rc-pod --> noop 311 | """ 312 | node = self._spin_up_node() 313 | all_nodes = [node] 314 | managed_nodes = [n for n in all_nodes if node.is_managed()] 315 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 316 | pods_to_schedule = {} 317 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 318 | 319 | # kube node with daemonset and pod --> noop 320 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 321 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 322 | rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) 323 | 324 | pod_scenarios = [ 325 | # kube node with daemonset and pod --> noop 326 | [ds_pod, pod], 327 | # kube node with daemonset and rc pod --> noop 328 | [ds_pod, rc_pod] 329 | ] 330 | 331 | # make sure we're not on grace period 332 | self.cluster.idle_threshold = -1 333 | self.cluster.type_idle_threshold = -1 334 | 335 | for pods in pod_scenarios: 336 | state = self.cluster.get_node_state( 337 | node, asgs[0], pods, pods_to_schedule, 338 | running_insts_map, 
collections.Counter()) 339 | self.assertEqual(state, ClusterNodeState.BUSY) 340 | 341 | self.cluster.maintain( 342 | managed_nodes, running_insts_map, 343 | pods_to_schedule, pods, asgs) 344 | 345 | response = self.asg_client.describe_auto_scaling_groups() 346 | self.assertEqual(len(response['AutoScalingGroups']), 1) 347 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 348 | node.cordon.assert_not_called() 349 | 350 | def test_scale_down_under_utilized_undrainable(self): 351 | """ 352 | kube node with daemonset and pod/rc-pod --> noop 353 | """ 354 | node = self._spin_up_node() 355 | all_nodes = [node] 356 | managed_nodes = [n for n in all_nodes if node.is_managed()] 357 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 358 | pods_to_schedule = {} 359 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 360 | 361 | # create some undrainable pods 362 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 363 | for container in self.dummy_pod['spec']['containers']: 364 | container.pop('resources', None) 365 | pod = KubePod(pykube.Pod(self.api, self.dummy_pod)) 366 | self.dummy_rc_pod['metadata']['labels']['openai/do-not-drain'] = 'true' 367 | for container in self.dummy_rc_pod['spec']['containers']: 368 | container.pop('resources', None) 369 | rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) 370 | 371 | pod_scenarios = [ 372 | # kube node with daemonset and pod with no resource ask --> noop 373 | [ds_pod, pod], 374 | # kube node with daemonset and critical rc pod --> noop 375 | [ds_pod, rc_pod] 376 | ] 377 | 378 | # make sure we're not on grace period 379 | self.cluster.idle_threshold = -1 380 | self.cluster.type_idle_threshold = -1 381 | self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 382 | 383 | for pods in pod_scenarios: 384 | state = self.cluster.get_node_state( 385 | node, asgs[0], pods, pods_to_schedule, 386 | running_insts_map, collections.Counter()) 387 | 
self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE) 388 | 389 | self.cluster.maintain( 390 | managed_nodes, running_insts_map, 391 | pods_to_schedule, pods, asgs) 392 | 393 | response = self.asg_client.describe_auto_scaling_groups() 394 | self.assertEqual(len(response['AutoScalingGroups']), 1) 395 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 396 | node.cordon.assert_not_called() 397 | 398 | def test_scale_down_under_utilized_drainable(self): 399 | """ 400 | kube node with daemonset and rc-pod --> cordon+drain 401 | """ 402 | node = self._spin_up_node() 403 | all_nodes = [node] 404 | managed_nodes = [n for n in all_nodes if node.is_managed()] 405 | running_insts_map = self.cluster.get_running_instances_map(managed_nodes, []) 406 | pods_to_schedule = {} 407 | asgs = self.cluster.autoscaling_groups.get_all_groups(all_nodes) 408 | 409 | # create some undrainable pods 410 | ds_pod = KubePod(pykube.Pod(self.api, self.dummy_ds_pod)) 411 | for container in self.dummy_rc_pod['spec']['containers']: 412 | container.pop('resources', None) 413 | rc_pod = KubePod(pykube.Pod(self.api, self.dummy_rc_pod)) 414 | pods = [ds_pod, rc_pod] 415 | 416 | # make sure we're not on grace period 417 | self.cluster.idle_threshold = -1 418 | self.cluster.type_idle_threshold = -1 419 | self.cluster.LAUNCH_HOUR_THRESHOLD['aws'] = -1 420 | 421 | state = self.cluster.get_node_state( 422 | node, asgs[0], pods, pods_to_schedule, 423 | running_insts_map, collections.Counter()) 424 | self.assertEqual(state, ClusterNodeState.UNDER_UTILIZED_DRAINABLE) 425 | 426 | self.cluster.maintain( 427 | managed_nodes, running_insts_map, 428 | pods_to_schedule, pods, asgs) 429 | 430 | response = self.asg_client.describe_auto_scaling_groups() 431 | self.assertEqual(len(response['AutoScalingGroups']), 1) 432 | self.assertEqual(response['AutoScalingGroups'][0]['DesiredCapacity'], 1) 433 | node.cordon.assert_called_once_with() 434 | node.drain.assert_called_once_with(pods, 
notifier=mock.ANY) 435 | 436 | def test_prioritization(self): 437 | TestingGroup = collections.namedtuple('TestingGroup', ['region', 'name', 'selectors', 'global_priority', 'is_spot']) 438 | high_pri = TestingGroup('test', 'test', {}, -1, False) 439 | low_pri = TestingGroup('test', 'test', {}, 0, False) 440 | 441 | self.assertEqual([high_pri, low_pri], list(self.cluster._prioritize_groups([low_pri, high_pri]))) 442 | -------------------------------------------------------------------------------- /autoscaler/autoscaling_groups.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import re 4 | from concurrent.futures import ThreadPoolExecutor 5 | 6 | import botocore 7 | import pytz 8 | 9 | import autoscaler.aws_utils as aws_utils 10 | import autoscaler.utils as utils 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class AutoScalingGroups(object): 16 | _BOTO_CLIENT_TYPE = 'autoscaling' 17 | 18 | _CLUSTER_KEY = 'KubernetesCluster' 19 | _ROLE_KEYS = ('KubernetesRole', 'Role') 20 | _WORKER_ROLE_VALUES = ('worker', 'kubernetes-minion') 21 | 22 | def __init__(self, session, regions, cluster_name=None): 23 | """ 24 | cluster_name - if set, filter ASGs by cluster_name in tag field 25 | _CLUSTER_KEY 26 | """ 27 | self.session = session 28 | self.regions = regions 29 | self.cluster_name = cluster_name 30 | 31 | @staticmethod 32 | def get_all_raw_groups_and_launch_configs(client): 33 | raw_groups = aws_utils.fetch_all( 34 | client.describe_auto_scaling_groups, {'MaxRecords': 100}, 'AutoScalingGroups') 35 | all_launch_configs = {} 36 | batch_size = 50 37 | for launch_config_idx in range(0, len(raw_groups), batch_size): 38 | groups = raw_groups[launch_config_idx*batch_size:(launch_config_idx+1)*batch_size] 39 | kwargs = { 40 | 'LaunchConfigurationNames': [g['LaunchConfigurationName'] for g in groups] 41 | } 42 | launch_configs = aws_utils.fetch_all( 43 | 
client.describe_launch_configurations, 44 | kwargs, 'LaunchConfigurations') 45 | all_launch_configs.update((lc['LaunchConfigurationName'], lc) 46 | for lc in launch_configs) 47 | return raw_groups, all_launch_configs 48 | 49 | def get_all_groups(self, kube_nodes): 50 | groups = [] 51 | with ThreadPoolExecutor(max_workers=max(1, len(self.regions))) as executor: 52 | raw_groups_and_launch_configs = {} 53 | for region in self.regions: 54 | client = self.session.client(self._BOTO_CLIENT_TYPE, 55 | region_name=region) 56 | raw_groups_and_launch_configs[region] = executor.submit( 57 | AutoScalingGroups.get_all_raw_groups_and_launch_configs, client) 58 | 59 | for region in self.regions: 60 | raw_groups, launch_configs = raw_groups_and_launch_configs[region].result() 61 | 62 | client = self.session.client(self._BOTO_CLIENT_TYPE, 63 | region_name=region) 64 | for raw_group in sorted(raw_groups, key=lambda g: g['AutoScalingGroupName']): 65 | if self.cluster_name: 66 | cluster_name = None 67 | role = None 68 | for tag in raw_group['Tags']: 69 | if tag['Key'] == self._CLUSTER_KEY: 70 | cluster_name = tag['Value'] 71 | elif tag['Key'] in self._ROLE_KEYS: 72 | role = tag['Value'] 73 | if cluster_name != self.cluster_name or role not in self._WORKER_ROLE_VALUES: 74 | continue 75 | 76 | groups.append(AutoScalingGroup( 77 | client, region, kube_nodes, raw_group, 78 | launch_configs[raw_group['LaunchConfigurationName']])) 79 | 80 | return groups 81 | 82 | 83 | class AutoScalingTimeouts(object): 84 | _TIMEOUT = 3600 # 1 hour 85 | _SPOT_REQUEST_TIMEOUT = 300 # 5 minutes 86 | _MAX_OUTBIDS_IN_INTERVAL = 60*20 # 20 minutes 87 | _SPOT_HISTORY_PERIOD = 60*60*5 # 5 hours 88 | 89 | def __init__(self, session): 90 | """ 91 | """ 92 | self.session = session 93 | 94 | # ASGs to avoid because of recent launch failures 95 | # e.g. 
a region running out of capacity 96 | # try to favor other regions 97 | self._timeouts = {} 98 | self._last_activities = {} 99 | 100 | # ASGs to avoid because of spot pricing history 101 | self._spot_timeouts = {} 102 | self._spot_price_history = {} 103 | 104 | def refresh_timeouts(self, asgs, dry_run=False): 105 | """ 106 | refresh timeouts on ASGs using new data from aws 107 | """ 108 | self.time_out_spot_asgs(asgs) 109 | 110 | asgs_by_region = {} 111 | for asg in asgs: 112 | asgs_by_region.setdefault(asg.region, []).append(asg) 113 | 114 | for region, regional_asgs in asgs_by_region.items(): 115 | client = self.session.client('autoscaling', region_name=region) 116 | start_time_cutoff = None 117 | newest_completed_activity = None 118 | activities = {} 119 | for activity in self.iter_activities(client): 120 | if newest_completed_activity is None and activity['Progress'] == 100: 121 | newest_completed_activity = activity 122 | if activity['ActivityId'] == self._last_activities.get(region, None): 123 | break 124 | if start_time_cutoff is None: 125 | start_time_cutoff = ( 126 | datetime.datetime.now(activity['StartTime'].tzinfo) - 127 | datetime.timedelta(seconds=self._TIMEOUT)) 128 | if activity['StartTime'] < start_time_cutoff: 129 | # skip events that are too old to cut down the time 130 | # it takes the first time to go through events 131 | break 132 | activities.setdefault(activity['AutoScalingGroupName'], []).append(activity) 133 | 134 | self._last_activities[region] = newest_completed_activity['ActivityId'] 135 | for asg in regional_asgs: 136 | self.reconcile_limits(asg, activities.get(asg.name, []), dry_run=dry_run) 137 | 138 | def iter_activities(self, client): 139 | next_token = None 140 | while True: 141 | kwargs = {} 142 | if next_token: 143 | kwargs['NextToken'] = next_token 144 | data = client.describe_scaling_activities(**kwargs) 145 | for item in data['Activities']: 146 | yield item 147 | next_token = data.get('NextToken') 148 | if not next_token: 149 
| break 150 | 151 | def revert_capacity(self, asg, entry, dry_run): 152 | """ 153 | try to decrease desired capacity to the original 154 | capacity before the capacity increase that caused 155 | the ASG activity entry. 156 | """ 157 | cause_m = AutoScalingCauseMessages.LAUNCH_INSTANCE.search(entry.get('Cause', '')) 158 | if cause_m: 159 | original_capacity = int(cause_m.group('original_capacity')) 160 | if asg.desired_capacity > original_capacity: 161 | # we tried to go over capacity and failed 162 | # now set the desired capacity back to a normal range 163 | if not dry_run: 164 | asg.set_desired_capacity(original_capacity) 165 | else: 166 | logger.info('[Dry run] Would have set desired capacity to %s', original_capacity) 167 | return True 168 | return False 169 | 170 | def time_out_asg(self, asg, entry): 171 | self._timeouts[asg._id] = ( 172 | entry['StartTime'] + datetime.timedelta(seconds=self._TIMEOUT)) 173 | logger.info('%s is timed out until %s', 174 | asg.name, self._timeouts[asg._id]) 175 | 176 | def reconcile_limits(self, asg, activities, dry_run=False): 177 | """ 178 | makes sure the ASG has valid capacity by processing errors 179 | in its recent scaling activities. 180 | marks an ASG as timed out if it recently had a capacity 181 | failure. 
182 | """ 183 | for entry in activities: 184 | status_msg = entry.get('StatusMessage', '') 185 | if entry['StatusCode'] in ('Failed', 'Cancelled'): 186 | logger.warn('%s scaling failure: %s', asg, entry) 187 | 188 | m = AutoScalingErrorMessages.INSTANCE_LIMIT.match(status_msg) 189 | if m: 190 | max_desired_capacity = int(m.group('requested')) - 1 191 | if asg.desired_capacity > max_desired_capacity: 192 | self.time_out_asg(asg, entry) 193 | 194 | # we tried to go over capacity and failed 195 | # now set the desired capacity back to a normal range 196 | if not dry_run: 197 | asg.set_desired_capacity(max_desired_capacity) 198 | else: 199 | logger.info('[Dry run] Would have set desired capacity to %s', max_desired_capacity) 200 | return 201 | 202 | m = AutoScalingErrorMessages.VOLUME_LIMIT.match(status_msg) 203 | if m: 204 | # TODO: decrease desired capacity 205 | self.time_out_asg(asg, entry) 206 | return 207 | 208 | m = AutoScalingErrorMessages.CAPACITY_LIMIT.match(status_msg) 209 | if m: 210 | reverted = self.revert_capacity(asg, entry, dry_run) 211 | if reverted: 212 | self.time_out_asg(asg, entry) 213 | return 214 | 215 | m = AutoScalingErrorMessages.AZ_LIMIT.search(status_msg) 216 | if m and 'only-az' in asg.name: 217 | reverted = self.revert_capacity(asg, entry, dry_run) 218 | if reverted: 219 | self.time_out_asg(asg, entry) 220 | return 221 | 222 | m = AutoScalingErrorMessages.SPOT_REQUEST_CANCELLED.search(status_msg) 223 | if m: 224 | # we cancelled a spot request 225 | # don't carry on to reset timeout 226 | continue 227 | 228 | m = AutoScalingErrorMessages.SPOT_LIMIT.match(status_msg) 229 | if m: 230 | self.time_out_asg(asg, entry) 231 | 232 | if not dry_run: 233 | asg.set_desired_capacity(asg.actual_capacity) 234 | else: 235 | logger.info('[Dry run] Would have set desired capacity to %s', asg.actual_capacity) 236 | return 237 | elif entry['StatusCode'] == 'WaitingForSpotInstanceId': 238 | logger.warn('%s waiting for spot: %s', asg, entry) 239 | 240 | 
balance_cause_m = AutoScalingCauseMessages.AZ_BALANCE.search(entry.get('Cause', '')) 241 | if balance_cause_m: 242 | # sometimes ASGs will launch instances in other az's to 243 | # balance out the group 244 | # ignore these events 245 | # even if we cancel it, the ASG will just attempt to 246 | # launch again 247 | logger.info('ignoring AZ balance launch event') 248 | continue 249 | 250 | now = datetime.datetime.now(entry['StartTime'].tzinfo) 251 | if (now - entry['StartTime']) > datetime.timedelta(seconds=self._SPOT_REQUEST_TIMEOUT): 252 | self.time_out_asg(asg, entry) 253 | 254 | # try to cancel spot request and scale down ASG 255 | spot_request_m = AutoScalingErrorMessages.SPOT_REQUEST_WAITING.search(status_msg) 256 | if spot_request_m: 257 | spot_request_id = spot_request_m.group('request_id') 258 | if not dry_run: 259 | cancelled = self.cancel_spot_request(asg.region, spot_request_id) 260 | if cancelled: 261 | asg.set_desired_capacity(asg.desired_capacity - 1) 262 | else: 263 | logger.info('[Dry run] Would have cancelled spot request %s and decremented desired capacity.', 264 | spot_request_id) 265 | # don't return here so that we can cancel more spot requests 266 | 267 | self._timeouts[asg._id] = None 268 | logger.debug('%s has no timeout', asg.name) 269 | 270 | def is_timed_out(self, asg): 271 | timeout = self._timeouts.get(asg._id) 272 | spot_timeout = self._spot_timeouts.get(asg._id) 273 | 274 | if timeout and datetime.datetime.now(timeout.tzinfo) < timeout: 275 | return True 276 | 277 | if spot_timeout and datetime.datetime.now(pytz.utc) < spot_timeout: 278 | return True 279 | 280 | return False 281 | 282 | def cancel_spot_request(self, region, request_id): 283 | client = self.session.client('ec2', 284 | region_name=region) 285 | response = client.describe_spot_instance_requests( 286 | SpotInstanceRequestIds=[request_id] 287 | ) 288 | if len(response['SpotInstanceRequests']) == 0: 289 | return False 290 | 291 | spot_instance_req = 
response['SpotInstanceRequests'][0] 292 | if spot_instance_req['State'] in ('open', 'active'): 293 | response = client.cancel_spot_instance_requests( 294 | SpotInstanceRequestIds=[request_id] 295 | ) 296 | logger.info('Spot instance request %s cancelled.', request_id) 297 | return True 298 | 299 | return False 300 | 301 | def time_out_spot_asgs(self, asgs): 302 | """ 303 | Using recent spot pricing data from AWS, time out spot instance 304 | ASGs that would be outbid for more than _MAX_OUTBIDS_IN_INTERVAL seconds 305 | """ 306 | region_instance_asg_map = {} 307 | for asg in asgs: 308 | if not asg.is_spot: 309 | continue 310 | 311 | instance_asg_map = region_instance_asg_map.setdefault(asg.region, {}) 312 | instance_type = asg.launch_config['InstanceType'] 313 | instance_asg_map.setdefault(instance_type, []).append(asg) 314 | 315 | now = datetime.datetime.now(pytz.utc) 316 | since = now - datetime.timedelta(seconds=self._SPOT_HISTORY_PERIOD) 317 | 318 | for region, instance_asg_map in region_instance_asg_map.items(): 319 | # Expire old history 320 | history = [item for item in self._spot_price_history.get(region, []) if item['Timestamp'] > since] 321 | if history: 322 | newest_spot_price = max(item['Timestamp'] for item in history) 323 | else: 324 | newest_spot_price = since 325 | client = self.session.client('ec2', region_name=region) 326 | kwargs = { 327 | 'StartTime': newest_spot_price, 328 | 'InstanceTypes': list(instance_asg_map.keys()), 329 | 'ProductDescriptions': ['Linux/UNIX'] 330 | } 331 | history.extend(aws_utils.fetch_all( 332 | client.describe_spot_price_history, kwargs, 'SpotPriceHistory')) 333 | self._spot_price_history[region] = history 334 | for instance_type, asgs in instance_asg_map.items(): 335 | for asg in asgs: 336 | last_az_bid = {} 337 | outbid_time = {} 338 | bid_price = float(asg.launch_config['SpotPrice']) 339 | for item in history: 340 | if item['InstanceType'] != instance_type: 341 | continue 342 | 343 | if float(item['SpotPrice']) > 
    def time_out_spot_asgs(self, asgs):
        """
        Using recent spot pricing data from AWS, time out spot instance
        ASGs that would be outbid for more than _MAX_OUTBIDS_IN_INTERVAL seconds
        """
        # group the spot ASGs by region, then by instance type, so one
        # price-history request per region covers all of them
        region_instance_asg_map = {}
        for asg in asgs:
            if not asg.is_spot:
                continue

            instance_asg_map = region_instance_asg_map.setdefault(asg.region, {})
            instance_type = asg.launch_config['InstanceType']
            instance_asg_map.setdefault(instance_type, []).append(asg)

        now = datetime.datetime.now(pytz.utc)
        since = now - datetime.timedelta(seconds=self._SPOT_HISTORY_PERIOD)

        for region, instance_asg_map in region_instance_asg_map.items():
            # Expire old history
            history = [item for item in self._spot_price_history.get(region, []) if item['Timestamp'] > since]
            if history:
                # only fetch prices newer than what we already cached
                newest_spot_price = max(item['Timestamp'] for item in history)
            else:
                newest_spot_price = since
            client = self.session.client('ec2', region_name=region)
            kwargs = {
                'StartTime': newest_spot_price,
                'InstanceTypes': list(instance_asg_map.keys()),
                'ProductDescriptions': ['Linux/UNIX']
            }
            history.extend(aws_utils.fetch_all(
                client.describe_spot_price_history, kwargs, 'SpotPriceHistory'))
            self._spot_price_history[region] = history
            for instance_type, asgs in instance_asg_map.items():
                for asg in asgs:
                    # per-AZ accounting: accumulate how long each AZ's spot
                    # price stayed above this group's bid
                    last_az_bid = {}
                    outbid_time = {}
                    bid_price = float(asg.launch_config['SpotPrice'])
                    for item in history:
                        if item['InstanceType'] != instance_type:
                            continue

                        if float(item['SpotPrice']) > bid_price:
                            # we would've been outbid!
                            # NOTE(review): the delta is previous-seen minus
                            # current timestamp — assumes history iterates
                            # newest-first (describe_spot_price_history
                            # order); verify against AWS docs
                            if item['AvailabilityZone'] in last_az_bid:
                                time_diff = (last_az_bid[item['AvailabilityZone']] - item['Timestamp'])
                            else:
                                time_diff = datetime.timedelta(seconds=0)
                            outbid_time[item['AvailabilityZone']] = (
                                outbid_time.get(item['AvailabilityZone'], datetime.timedelta(seconds=0)) +
                                time_diff)
                            last_az_bid[item['AvailabilityZone']] = item['Timestamp']

                    if outbid_time:
                        # average outbid duration across the AZs that were outbid
                        avg_outbid_time = sum(t.total_seconds() for t in outbid_time.values()) / len(outbid_time)
                    else:
                        avg_outbid_time = 0.0
                    if avg_outbid_time > self._MAX_OUTBIDS_IN_INTERVAL:
                        self._spot_timeouts[asg._id] = now + datetime.timedelta(seconds=self._TIMEOUT)
                        logger.info('%s (%s) is spot timed out until %s (would have been outbid for %ss on average)',
                                    asg.name, asg.region, self._spot_timeouts[asg._id], avg_outbid_time)
                    else:
                        self._spot_timeouts[asg._id] = None
raw_group['Instances'] 390 | if inst.get('InstanceId')) 391 | self.nodes = [node for node in kube_nodes 392 | if node.instance_id in self.instance_ids] 393 | self.unschedulable_nodes = [n for n in self.nodes if n.unschedulable] 394 | self.no_schedule_taints = {} 395 | 396 | self._id = (self.region, self.name) 397 | 398 | def _extract_selectors(self, region, launch_config, tags_data): 399 | selectors = { 400 | 'aws/type': launch_config['InstanceType'], 401 | 'aws/class': launch_config['InstanceType'][0], 402 | 'aws/ami-id': launch_config['ImageId'], 403 | 'aws/region': region 404 | } 405 | for tag_data in tags_data: 406 | if tag_data['Key'].startswith('kube/'): 407 | selectors[tag_data['Key'][5:]] = tag_data['Value'] 408 | 409 | # adding kube label counterparts 410 | selectors['beta.kubernetes.io/instance-type'] = selectors['aws/type'] 411 | selectors['failure-domain.beta.kubernetes.io/region'] = selectors['aws/region'] 412 | 413 | return selectors 414 | 415 | def is_timed_out(self): 416 | return False 417 | 418 | @property 419 | def global_priority(self): 420 | return 0 421 | 422 | @property 423 | def actual_capacity(self): 424 | return len(self.nodes) 425 | 426 | def set_desired_capacity(self, new_desired_capacity): 427 | """ 428 | sets the desired capacity of the underlying ASG directly. 429 | note that this is for internal control. 430 | for scaling purposes, please use scale() instead. 431 | """ 432 | logger.info("ASG: {} new_desired_capacity: {}".format( 433 | self, new_desired_capacity)) 434 | self.client.set_desired_capacity(AutoScalingGroupName=self.name, 435 | DesiredCapacity=new_desired_capacity, 436 | HonorCooldown=False) 437 | self.desired_capacity = new_desired_capacity 438 | return utils.CompletedFuture(True) 439 | 440 | def scale(self, new_desired_capacity): 441 | """ 442 | scales the ASG to the new desired capacity. 443 | returns a future with the result True if desired capacity has been increased. 
444 | """ 445 | desired_capacity = min(self.max_size, new_desired_capacity) 446 | num_unschedulable = len(self.unschedulable_nodes) 447 | num_schedulable = self.actual_capacity - num_unschedulable 448 | 449 | logger.info("Desired {}, currently at {}".format( 450 | desired_capacity, self.desired_capacity)) 451 | logger.info("Kube node: {} schedulable, {} unschedulable".format( 452 | num_schedulable, num_unschedulable)) 453 | 454 | # Try to get the number of schedulable nodes up if we don't have enough, regardless of whether 455 | # group's capacity is already at the same as the desired. 456 | if num_schedulable < desired_capacity: 457 | for node in self.unschedulable_nodes: 458 | if node.uncordon(): 459 | num_schedulable += 1 460 | # Uncordon only what we need 461 | if num_schedulable == desired_capacity: 462 | break 463 | 464 | if self.desired_capacity != desired_capacity: 465 | if self.desired_capacity == self.max_size: 466 | logger.info("Desired same as max, desired: {}, schedulable: {}".format( 467 | self.desired_capacity, num_schedulable)) 468 | return utils.CompletedFuture(False) 469 | 470 | scale_up = self.desired_capacity < desired_capacity 471 | # This should be a rare event 472 | # note: this micro-optimization is not worth doing as the race condition here is 473 | # tricky. when ec2 initializes some nodes in the meantime, asg will shutdown 474 | # nodes by its own policy 475 | # scale_down = self.desired_capacity > desired_capacity >= self.actual_capacity 476 | if scale_up: 477 | # should have gotten our num_schedulable to highest value possible 478 | # actually need to grow. 479 | return self.set_desired_capacity(desired_capacity) 480 | 481 | logger.info("Doing nothing: desired_capacity correctly set: {}, schedulable: {}".format( 482 | self.name, num_schedulable)) 483 | return utils.CompletedFuture(False) 484 | 485 | def scale_nodes_in(self, nodes): 486 | """ 487 | scale down asg by terminating the given node. 
    def scale_nodes_in(self, nodes):
        """
        scale down asg by terminating the given node.
        returns a future indicating when the request completes.

        Min-size violations from AWS are logged and swallowed so the
        remaining nodes are still processed; any other ClientError is
        re-raised.
        """
        for node in nodes:
            try:
                # if we somehow end up in a situation where we have
                # more capacity than desired capacity, and the desired
                # capacity is at asg min size, then when we try to
                # terminate the instance while decrementing the desired
                # capacity, the aws api call will fail
                decrement_capacity = self.desired_capacity > self.min_size
                self.client.terminate_instance_in_auto_scaling_group(
                    InstanceId=node.instance_id,
                    ShouldDecrementDesiredCapacity=decrement_capacity)
                self.nodes.remove(node)
                logger.info('Scaled node %s in', node)
            except botocore.exceptions.ClientError as e:
                # only tolerate the specific min-size constraint error
                if str(e).find("Terminating instance without replacement will "
                               "violate group's min size constraint.") == -1:
                    raise e
                logger.error("Failed to terminate instance: %s", e)

        return utils.CompletedFuture(None)
class AutoScalingErrorMessages(object):
    """Compiled patterns for AWS Auto Scaling activity StatusMessage text.

    The named groups ('requested', 'limit', 'request_id') are read via
    m.group(...) in AutoScalingTimeouts.reconcile_limits; the text
    extraction had dropped the '<name>' part of every '(?P<name>...)'
    group, which made the patterns invalid — restored here.
    """
    INSTANCE_LIMIT = re.compile(r'You have requested more instances \((?P<requested>\d+)\) than your current instance limit of (?P<limit>\d+) allows for the specified instance type. Please visit http://aws.amazon.com/contact-us/ec2-request to request an adjustment to this limit. Launching EC2 instance failed.')
    VOLUME_LIMIT = re.compile(r'Instance became unhealthy while waiting for instance to be in InService state. Termination Reason: Client.VolumeLimitExceeded: Volume limit exceeded')
    CAPACITY_LIMIT = re.compile(r'Insufficient capacity\. Launching EC2 instance failed\.')
    SPOT_REQUEST_WAITING = re.compile(r'Placed Spot instance request: (?P<request_id>.+). Waiting for instance\(s\)')
    SPOT_REQUEST_CANCELLED = re.compile(r'Spot instance request: (?P<request_id>.+) has been cancelled\.')
    SPOT_LIMIT = re.compile(r'Max spot instance count exceeded\. Placing Spot instance request failed\.')
    AZ_LIMIT = re.compile(r'We currently do not have sufficient .+ capacity in the Availability Zone you requested (.+)\.')


class AutoScalingCauseMessages(object):
    """Compiled patterns for AWS Auto Scaling activity Cause text.

    'original_capacity' / 'new_capacity' are read by
    AutoScalingTimeouts.revert_capacity; named groups restored as above.
    """
    LAUNCH_INSTANCE = re.compile(r'At \d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ an instance was started in response to a difference between desired and actual capacity, increasing the capacity from (?P<original_capacity>\d+) to (?P<new_capacity>\d+)\.')
    AZ_BALANCE = re.compile(r'An instance was launched to aid in balancing the group\'s zones\.')
class ClusterNodeState(Enum):
    """Classification of a managed node, as assigned during Cluster.maintain;
    each state maps to one branch of the maintenance state machine."""
    # node considered dead -> deleted from kube and scaled in
    DEAD = 'dead'
    # backing cloud instance no longer exists -> node deleted from kube
    INSTANCE_TERMINATED = 'instance-terminated'
    # group already at its minimum size -> left alone
    ASG_MIN_SIZE = 'asg-min-size'
    # pods are pending for this node type -> left alone
    POD_PENDING = 'pod-pending'
    # within the idle grace period -> left alone
    GRACE_PERIOD = 'grace-period'
    # within the per-instance-type idle grace period -> left alone
    TYPE_GRACE_PERIOD = 'type-grace-period'
    # idle but schedulable -> cordoned
    IDLE_SCHEDULABLE = 'idle-schedulable'
    # idle and already cordoned -> scaled in
    IDLE_UNSCHEDULABLE = 'idle-unschedulable'
    # has work but cordoned -> uncordoned
    BUSY_UNSCHEDULABLE = 'busy-unschedulable'
    # has work -> left alone
    BUSY = 'busy'
    # under-utilized and safe to drain -> cordoned and drained
    UNDER_UTILIZED_DRAINABLE = 'under-utilized-drainable'
    # under-utilized but unsafe to drain -> no-op
    UNDER_UTILIZED_UNDRAINABLE = 'under-utilized-undrainable'
    # still inside the paid launch hour (see LAUNCH_HOUR_THRESHOLD) -> kept
    LAUNCH_HR_GRACE_PERIOD = 'launch-hr-grace-period'
    # detached from its scaling group -> left alone
    DETACHED = 'detached'
    def __init__(self, aws_regions, aws_access_key, aws_secret_key,
                 azure_client_id, azure_client_secret, azure_subscription_id, azure_tenant_id,
                 azure_resource_group_names, azure_slow_scale_classes, kubeconfig,
                 idle_threshold, type_idle_threshold, pod_namespace,
                 instance_init_time, cluster_name, notifier,
                 use_aws_iam_role=False,
                 drain_utilization_below=0.0,
                 max_scale_in_fraction=0.1,
                 scale_up=True, maintainance=True,
                 datadog_api_key=None,
                 over_provision=5, dry_run=False):
        """
        Wire up kube, AWS and (optionally) Azure clients and store the
        scaling configuration.

        kubeconfig - path to a kubeconfig file; when falsy, in-cluster
            service-account credentials are used instead
        pod_namespace - namespace to watch for pods; None means all
        use_aws_iam_role - when no access keys are given, still create a
            boto3 session relying on the instance IAM role
        maintainance - [sic, kept for interface compatibility] gates the
            maintain() pass in scale_loop
        dry_run - log intended actions instead of performing them
        """
        if kubeconfig:
            # for using locally
            logger.debug('Using kubeconfig %s', kubeconfig)
            self.api = pykube.HTTPClient(
                pykube.KubeConfig.from_file(kubeconfig))
        else:
            # for using on kube
            logger.debug('Using kube service account')
            self.api = pykube.HTTPClient(
                pykube.KubeConfig.from_service_account())
        if pod_namespace is None:
            self.pod_namespace = pykube.all
        else:
            self.pod_namespace = pod_namespace

        self.drain_utilization_below = drain_utilization_below
        self.max_scale_in_fraction = max_scale_in_fraction
        self._drained = {}
        self.session = None
        if aws_access_key and aws_secret_key:
            self.session = boto3.session.Session(
                aws_access_key_id=aws_access_key,
                aws_secret_access_key=aws_secret_key,
                region_name=aws_regions[0])  # provide a default region
        elif use_aws_iam_role is True:
            self.session = boto3.session.Session(region_name=aws_regions[0])  # provide a default region
        self.autoscaling_groups = autoscaling_groups.AutoScalingGroups(
            session=self.session, regions=aws_regions,
            cluster_name=cluster_name)
        self.autoscaling_timeouts = autoscaling_groups.AutoScalingTimeouts(
            self.session)

        azure_regions = []
        resource_groups = []
        self.azure_client = None
        if azure_client_id:
            azure_credentials = ServicePrincipalCredentials(
                client_id=azure_client_id,
                secret=azure_client_secret,
                tenant=azure_tenant_id
            )

            # Setup the Azure client
            resource_client = ResourceManagementClient(azure_credentials, azure_subscription_id)
            resource_client.providers.register('Microsoft.Compute')
            resource_client.providers.register('Microsoft.Network')
            resource_client.providers.register('Microsoft.Insights')

            region_map = {}
            for resource_group_name in azure_resource_group_names:
                resource_group = resource_client.resource_groups.get(resource_group_name)
                location = resource_group.location
                if location in region_map:
                    logger.fatal("{} and {} are both in {}. May only have one resource group per region".format(
                        resource_group_name, region_map[location], location
                    ))
                region_map[location] = resource_group_name
                azure_regions.append(location)
                resource_groups.append(resource_group)

            compute_client = ComputeManagementClient(azure_credentials, azure_subscription_id)
            # bound retries so throttled Azure calls cannot stall the loop
            compute_client.config.retry_policy.policy = azure.AzureBoundedRetry.from_retry(compute_client.config.retry_policy.policy)

            monitor_client = MonitorClient(azure_credentials, azure_subscription_id)
            monitor_client.config.retry_policy.policy = azure.AzureBoundedRetry.from_retry(monitor_client.config.retry_policy.policy)
            self.azure_client = AzureWriteThroughCachedApi(AzureWrapper(compute_client, monitor_client, resource_client))

        self.azure_groups = azure.AzureGroups(resource_groups, azure_slow_scale_classes, self.azure_client)

        # config
        self.azure_resource_group_names = azure_resource_group_names
        self.azure_regions = azure_regions
        self.aws_regions = aws_regions
        self.idle_threshold = idle_threshold
        self.instance_init_time = instance_init_time
        self.type_idle_threshold = type_idle_threshold
        self.over_provision = over_provision

        self.scale_up = scale_up
        self.maintainance = maintainance

        self.notifier = notifier

        # stats are collected even without an API key; initialize() only
        # runs when a key is supplied
        if datadog_api_key:
            datadog.initialize(api_key=datadog_api_key)
            logger.info('Datadog initialized')
        self.stats = datadog.ThreadStats()
        self.stats.start()

        self.dry_run = dry_run
    def scale_loop(self):
        """
        runs one loop of scaling to current needs.
        returns True if successfully scaled.

        One pass = look up kube state, refresh scaling groups and running
        instances, compute pods needing placement, then run the scale-up
        and/or maintenance phases. AWS ClientErrors abort the pass with
        False instead of crashing the loop.
        """
        logger.info("++++++++++++++ Running Scaling Loop ++++++++++++++++")
        try:
            start_time = time.time()

            kube_lookup_start_time = time.time()
            pykube_nodes = pykube.Node.objects(self.api)
            if not pykube_nodes:
                logger.warn('Failed to list nodes. Please check kube configuration. Terminating scale loop.')
                return False

            all_nodes = list(map(KubeNode, pykube_nodes))
            managed_nodes = [node for node in all_nodes if node.is_managed()]

            pods = list(map(KubePod, pykube.Pod.objects(self.api, namespace=self.pod_namespace)))

            # pods that currently occupy (or are assigned to) a node
            running_or_pending_assigned_pods = [
                p for p in pods if (p.status == KubePodStatus.RUNNING or p.status == KubePodStatus.CONTAINER_CREATING) or (
                    p.status == KubePodStatus.PENDING and p.node_name
                )
            ]

            # charge each pod's resource usage to its node
            for node in all_nodes:
                for pod in running_or_pending_assigned_pods:
                    if pod.node_name == node.name:
                        node.count_pod(pod)
            self.stats.gauge('autoscaler.scaling_loop.kube_lookup_time', time.time() - kube_lookup_start_time)

            scaling_group_lookup_start_time = time.time()
            if self.azure_client is not None:
                for resource_group in self.azure_resource_group_names:
                    # Force a refresh of the cache to pick up any new Scale Sets that have been created
                    # or modified externally.
                    self.azure_client.list_scale_sets(resource_group, force_refresh=True)
                    # Force a refresh of the cache in case our quota was adjusted
                    self.azure_client.invalidate_quota_cache(resource_group)
            asgs = self.autoscaling_groups.get_all_groups(all_nodes)
            azure_groups = self.azure_groups.get_all_groups(all_nodes)
            scaling_groups = asgs + azure_groups
            self.stats.gauge('autoscaler.scaling_loop.scaling_group_lookup_time', time.time() - scaling_group_lookup_start_time)

            instance_lookup_start_time = time.time()
            running_insts_map = self.get_running_instances_map(managed_nodes, azure_groups)
            self.stats.gauge('autoscaler.scaling_loop.instance_lookup_time', time.time() - instance_lookup_start_time)

            pods_to_schedule_lookup_start_time = time.time()
            pods_to_schedule = self.get_pods_to_schedule(pods)
            self.stats.gauge(
                'autoscaler.scaling_loop.pods_to_schedule_lookup_time',
                time.time() - pods_to_schedule_lookup_start_time,
            )

            pods_by_node = {}
            for p in running_or_pending_assigned_pods:
                pods_by_node.setdefault(p.node_name, []).append(p)

            if self.scale_up:
                logger.info(
                    "++++++++++++++ Scaling Up Begins ++++++++++++++++")
                self.scale(
                    pods_to_schedule, all_nodes, scaling_groups,
                    running_insts_map)
                logger.info("++++++++++++++ Scaling Up Ends ++++++++++++++++")
            if self.maintainance:
                logger.info(
                    "++++++++++++++ Maintenance Begins ++++++++++++++++")
                self.maintain(
                    managed_nodes, running_insts_map,
                    pods_to_schedule, running_or_pending_assigned_pods,
                    scaling_groups)
                logger.info("++++++++++++++ Maintenance Ends ++++++++++++++++")

            self.stats.gauge('autoscaler.scaling_loop_time', time.time() - start_time)

            return True
        except botocore.exceptions.ClientError as e:
            logger.warn(e)
            return False
    def scale(self, pods_to_schedule, all_nodes, asgs, running_insts_map):
        """
        scale up logic

        First packs each pending pod onto an existing live, schedulable
        node if one fits; whatever cannot be placed is grouped by its
        selector hash and handed to fulfill_pending to grow the matching
        scaling groups.
        """
        # TODO: generalize to azure
        self.autoscaling_timeouts.refresh_timeouts(
            [asg for asg in asgs if asg.provider == 'aws'],
            dry_run=self.dry_run)

        cached_live_nodes = []
        for node in all_nodes:
            # either we know the physical node behind it and know it's alive
            # or we don't know it and assume it's alive
            if (node.instance_id and node.instance_id in running_insts_map) \
                    or (not node.is_managed()):
                cached_live_nodes.append(node)

        # selectors -> pending KubePods
        pending_pods = {}

        # for each pending & unassigned job, try to fit them on current machines or count requested
        # resources towards future machines
        for selectors_hash, pods in pods_to_schedule.items():
            for pod in pods:
                fitting = None
                for node in cached_live_nodes:
                    if node.unschedulable:
                        continue
                    if node.is_match(pod) and node.can_fit(pod.resources):
                        fitting = node
                        break
                if fitting is None:
                    # because a pod may be able to fit in multiple groups
                    # pick a group now
                    selectors = dict(pod.selectors)
                    pending_pods.setdefault(utils.selectors_to_hash(selectors), []).append(pod)
                    logger.info(
                        "{pod} is pending ({selectors_hash})".format(
                            pod=pod, selectors_hash=selectors_hash))
                else:
                    # reserve the pod's resources on the chosen node so the
                    # next pod sees the reduced capacity
                    fitting.count_pod(pod)
                    logger.info("{pod} fits on {node}".format(pod=pod,
                                                              node=fitting))

        # scale each node type to reach the new capacity
        for selectors_hash in set(pending_pods.keys()):
            self.fulfill_pending(asgs,
                                 selectors_hash,
                                 pending_pods.get(selectors_hash, []))

        # TODO: make sure desired capacities of untouched groups are consistent
running instances: 323 | - determines if idle nodes should be drained and terminated 324 | - determines if there are bad nodes in ASGs (did not spin up under 325 | `instance_init_time` seconds) 326 | """ 327 | logger.info("++++++++++++++ Maintaining Nodes & Instances ++++++++++++++++") 328 | 329 | # for each type of instance, we keep one around for longer 330 | # in order to speed up job start up time 331 | idle_selector_hash = collections.Counter() 332 | 333 | pods_by_node = {} 334 | for p in running_or_pending_assigned_pods: 335 | pods_by_node.setdefault(p.node_name, []).append(p) 336 | 337 | stats_time = time.time() 338 | 339 | nodes_to_scale_in = {} 340 | nodes_to_delete = [] 341 | state_counts = dict((state, 0) for state in ClusterNodeState) 342 | for node in cached_managed_nodes: 343 | asg = utils.get_group_for_node(asgs, node) 344 | state = self.get_node_state( 345 | node, asg, pods_by_node.get(node.name, []), pods_to_schedule, 346 | running_insts_map, idle_selector_hash) 347 | 348 | logger.info("node: %-*s state: %s" % (75, node, state)) 349 | state_counts[state] += 1 350 | 351 | # state machine & why doesnt python have case? 352 | if state in (ClusterNodeState.POD_PENDING, ClusterNodeState.BUSY, 353 | ClusterNodeState.GRACE_PERIOD, 354 | ClusterNodeState.TYPE_GRACE_PERIOD, 355 | ClusterNodeState.ASG_MIN_SIZE, 356 | ClusterNodeState.LAUNCH_HR_GRACE_PERIOD, 357 | ClusterNodeState.DETACHED): 358 | # do nothing 359 | pass 360 | elif state == ClusterNodeState.UNDER_UTILIZED_DRAINABLE: 361 | if not self.dry_run: 362 | if not asg: 363 | logger.warn('Cannot find ASG for node %s. Not cordoned.', node) 364 | else: 365 | node.cordon() 366 | node.drain(pods_by_node.get(node.name, []), notifier=self.notifier) 367 | else: 368 | logger.info('[Dry run] Would have drained and cordoned %s', node) 369 | elif state == ClusterNodeState.IDLE_SCHEDULABLE: 370 | if not self.dry_run: 371 | if not asg: 372 | logger.warn('Cannot find ASG for node %s. 
Not cordoned.', node) 373 | else: 374 | node.cordon() 375 | else: 376 | logger.info('[Dry run] Would have cordoned %s', node) 377 | elif state == ClusterNodeState.BUSY_UNSCHEDULABLE: 378 | # this is duplicated in original scale logic 379 | if not self.dry_run: 380 | node.uncordon() 381 | else: 382 | logger.info('[Dry run] Would have uncordoned %s', node) 383 | elif state == ClusterNodeState.IDLE_UNSCHEDULABLE: 384 | # remove it from asg 385 | if not self.dry_run: 386 | nodes_to_delete.append(node) 387 | if not asg: 388 | logger.warn('Cannot find ASG for node %s. Not terminated.', node) 389 | else: 390 | nodes_to_scale_in.setdefault(asg, []).append(node) 391 | else: 392 | logger.info('[Dry run] Would have scaled in %s', node) 393 | elif state == ClusterNodeState.INSTANCE_TERMINATED: 394 | if not self.dry_run: 395 | nodes_to_delete.append(node) 396 | else: 397 | logger.info('[Dry run] Would have deleted %s', node) 398 | elif state == ClusterNodeState.DEAD: 399 | if not self.dry_run: 400 | nodes_to_delete.append(node) 401 | if asg: 402 | nodes_to_scale_in.setdefault(asg, []).append(node) 403 | else: 404 | logger.info('[Dry run] Would have reaped dead node %s', node) 405 | elif state == ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE: 406 | # noop for now 407 | pass 408 | else: 409 | raise Exception("Unhandled state: {}".format(state)) 410 | 411 | for state, count in state_counts.items(): 412 | self.stats.gauge('kubernetes.custom.node.state.{}'.format(state.value.replace('-', '_')), count) 413 | 414 | # these are instances that have been running for a while but it's not properly managed 415 | # i.e. 
not having registered to kube or not having proper meta data set 416 | managed_instance_ids = set(node.instance_id for node in cached_managed_nodes) 417 | instances_to_terminate = {} 418 | unmanaged_instance_count = 0 419 | for asg in asgs: 420 | unmanaged_instance_ids = (asg.instance_ids - managed_instance_ids) 421 | if len(unmanaged_instance_ids) > 0: 422 | if asg.provider == 'azure': 423 | for inst_id in unmanaged_instance_ids: 424 | inst = asg.instances[inst_id] 425 | if (datetime.datetime.now(inst.launch_time.tzinfo) 426 | - inst.launch_time).seconds >= self.instance_init_time: 427 | if not self.dry_run: 428 | logger.info("terminating unmanaged %s" % inst) 429 | instances_to_terminate.setdefault(asg, []).append(inst_id) 430 | unmanaged_instance_count += 1 431 | # TODO: try to delete node from kube as well 432 | # in the case where kubelet may have registered but node 433 | # labels have not been applied yet, so it appears unmanaged 434 | else: 435 | logger.info('[Dry run] Would have terminated unmanaged %s', inst) 436 | else: 437 | unmanaged_running_insts = self.get_running_instances_in_region( 438 | asg.region, list(unmanaged_instance_ids)) 439 | for inst in unmanaged_running_insts: 440 | if (datetime.datetime.now(inst.launch_time.tzinfo) 441 | - inst.launch_time).seconds >= self.instance_init_time: 442 | if not self.dry_run: 443 | asg.client.terminate_instance_in_auto_scaling_group( 444 | InstanceId=inst.id, ShouldDecrementDesiredCapacity=False) 445 | logger.info("terminating unmanaged %s" % inst) 446 | unmanaged_instance_count += 1 447 | # TODO: try to delete node from kube as well 448 | # in the case where kubelet may have registered but node 449 | # labels have not been applied yet, so it appears unmanaged 450 | else: 451 | logger.info( 452 | '[Dry run] Would have terminated unmanaged %s [%s]', inst, asg.region) 453 | self.stats.gauge('kubernetes.custom.node.state.unmanaged', unmanaged_instance_count) 454 | 455 | async_operations = [] 456 | 
total_instances = max(sum(len(asg.instance_ids) for asg in asgs), len(cached_managed_nodes)) 457 | max_allowed_scale_in = int(math.ceil(self.max_scale_in_fraction * total_instances)) 458 | to_scale_in = sum(len(nodes) for nodes in nodes_to_scale_in.values()) + \ 459 | sum(len(instance_ids) for instance_ids in instances_to_terminate.values()) 460 | to_scale_in = max(to_scale_in, len(nodes_to_delete)) 461 | if to_scale_in > max_allowed_scale_in: 462 | logger.error("TOO MANY NODES TO SCALE IN: {}, max allowed is {}".format(to_scale_in, max_allowed_scale_in)) 463 | elif not self.dry_run: 464 | for asg, nodes in nodes_to_scale_in.items(): 465 | async_operations.append(asg.scale_nodes_in(nodes)) 466 | 467 | for asg, instance_ids in instances_to_terminate.items(): 468 | async_operations.append(asg.terminate_instances(instance_ids)) 469 | 470 | for node in nodes_to_delete: 471 | node.delete() 472 | 473 | # Wait for all background scale-in operations to complete 474 | for operation in async_operations: 475 | try: 476 | operation.result() 477 | except CloudError as e: 478 | logger.warn("Error while deleting Azure node: {}".format(e.message)) 479 | except TimeoutError: 480 | logger.warn("Timeout while deleting Azure node") 481 | 482 | def fulfill_pending(self, asgs, selectors_hash, pods): 483 | """ 484 | selectors_hash - string repr of selectors 485 | pods - list of KubePods that are pending 486 | """ 487 | logger.info( 488 | "========= Scaling for %s ========", selectors_hash) 489 | logger.debug("pending: %s", pods[:5]) 490 | 491 | accounted_pods = dict((p, False) for p in pods) 492 | num_unaccounted = len(pods) 493 | 494 | groups = utils.get_groups_for_hash(asgs, selectors_hash) 495 | 496 | groups = self._prioritize_groups(groups) 497 | 498 | async_operations = [] 499 | for group in groups: 500 | logger.debug("group: %s", group) 501 | if (self.autoscaling_timeouts.is_timed_out(group) or group.is_timed_out() or group.max_size == group.desired_capacity) \ 502 | and not 
group.unschedulable_nodes: 503 | continue 504 | 505 | unit_capacity = capacity.get_unit_capacity(group) 506 | new_instance_resources = [] 507 | assigned_pods = [] 508 | for pod, acc in accounted_pods.items(): 509 | if acc or not (unit_capacity - pod.resources).possible or not group.is_taints_tolerated(pod): 510 | continue 511 | 512 | found_fit = False 513 | for i, instance in enumerate(new_instance_resources): 514 | if (instance - pod.resources).possible: 515 | new_instance_resources[i] = instance - pod.resources 516 | assigned_pods[i].append(pod) 517 | found_fit = True 518 | break 519 | if not found_fit: 520 | new_instance_resources.append( 521 | unit_capacity - pod.resources) 522 | assigned_pods.append([pod]) 523 | 524 | # new desired # machines = # running nodes + # machines required to fit jobs that don't 525 | # fit on running nodes. This scaling is conservative but won't 526 | # create starving 527 | units_needed = len(new_instance_resources) 528 | # The pods may not fit because of resource requests or taints. 
Don't scale in that case 529 | if units_needed == 0: 530 | continue 531 | units_needed += self.over_provision 532 | 533 | if self.autoscaling_timeouts.is_timed_out(group) or group.is_timed_out(): 534 | # if a machine is timed out, it cannot be scaled further 535 | # just account for its current capacity (it may have more 536 | # being launched, but we're being conservative) 537 | unavailable_units = max( 538 | 0, units_needed - (group.desired_capacity - group.actual_capacity)) 539 | else: 540 | unavailable_units = max( 541 | 0, units_needed - (group.max_size - group.actual_capacity)) 542 | units_requested = units_needed - unavailable_units 543 | 544 | logger.debug("units_needed: %s", units_needed) 545 | logger.debug("units_requested: %s", units_requested) 546 | 547 | new_capacity = group.actual_capacity + units_requested 548 | if not self.dry_run: 549 | async_operation = group.scale(new_capacity) 550 | async_operations.append(async_operation) 551 | 552 | def notify_if_scaled(future): 553 | if future.result(): 554 | flat_assigned_pods = [] 555 | for instance_pods in assigned_pods: 556 | flat_assigned_pods.extend(instance_pods) 557 | self.notifier.notify_scale(group, units_requested, flat_assigned_pods) 558 | 559 | async_operation.add_done_callback(notify_if_scaled) 560 | else: 561 | logger.info( 562 | '[Dry run] Would have scaled up (%s) to %s', group, new_capacity) 563 | 564 | for i in range(min(len(assigned_pods), units_requested)): 565 | for pod in assigned_pods[i]: 566 | accounted_pods[pod] = True 567 | num_unaccounted -= 1 568 | 569 | logger.debug("remining pending: %s", num_unaccounted) 570 | 571 | if not num_unaccounted: 572 | break 573 | 574 | if num_unaccounted: 575 | logger.warn('Failed to scale sufficiently.') 576 | self.notifier.notify_failed_to_scale(selectors_hash, pods) 577 | 578 | for operation in async_operations: 579 | try: 580 | operation.result() 581 | except CloudError as e: 582 | logger.warn("Error while scaling Scale Set: 
    def get_running_instances_in_region(self, region, instance_ids):
        """
        a generator for getting ec2.Instance objects given a list of
        instance IDs.

        Yields only instances in the "running" state; IDs that no longer
        exist are silently skipped.

        params:
        region - AWS region name; if falsy, nothing is yielded
        instance_ids - list of EC2 instance ID strings
        """
        if not region:
            logger.warn('Instance IDs without region: %s', instance_ids)
            return

        # IDs already yielded before a ClientError interrupted iteration,
        # so the fallback below does not yield them twice
        yielded_ids = set()
        try:
            running_insts = (self.session
                             .resource('ec2', region_name=region)
                             .instances
                             .filter(
                                 InstanceIds=instance_ids,
                                 Filters=[{
                                     'Name': "instance-state-name",
                                     'Values': ["running"]}]
                             ))
            # we have to go through each instance to make sure
            # they actually exist and handle errors otherwise
            # boto collections do not always call DescribeInstance
            # when returning from filter, so it could error during
            # iteration
            for inst in running_insts:
                yield inst
                yielded_ids.add(inst.id)
        except botocore.exceptions.ClientError as e:
            logger.debug('Caught %s', e)
            if str(e).find("InvalidInstanceID.NotFound") == -1:
                # not the "stale instance ID" error -- propagate
                raise e
            elif len(instance_ids) == 1:
                # the single requested instance no longer exists
                return
            else:
                # this should hopefully happen rarely so we resort to slow methods to
                # handle this case: retry each remaining ID individually so
                # one stale ID cannot mask the rest of the batch
                for instance_id in instance_ids:
                    if instance_id in yielded_ids:
                        continue
                    for inst in self.get_running_instances_in_region(region, [instance_id]):
                        yield inst
= {} 644 | for node in nodes: 645 | if node.provider == 'aws': 646 | instance_id_by_region.setdefault(node.region, []).append(node.instance_id) 647 | 648 | for region, instance_ids in instance_id_by_region.items(): 649 | # note that this assumes that all instances have a valid region 650 | # the regions referenced by the nodes may also be outside of the 651 | # list of regions provided by the user 652 | # this should be ok because they will just end up being nodes 653 | # unmanaged by autoscaling groups we know about 654 | region_instances = self.get_running_instances_in_region( 655 | region, instance_ids) 656 | instance_map.update((inst.id, inst) for inst in region_instances) 657 | 658 | return instance_map 659 | 660 | def _get_required_capacity(self, requested, group): 661 | """ 662 | returns the number of nodes within an autoscaling group that should 663 | be provisioned to fit the requested amount of KubeResource. 664 | 665 | TODO: bin packing would probably be better? 666 | 667 | requested - KubeResource 668 | group - AutoScalingGroup 669 | """ 670 | unit_capacity = capacity.get_unit_capacity(group) 671 | return max( 672 | # (peter) should 0.8 be configurable? 673 | int(math.ceil(requested.get(field, 0.0) / unit_capacity.get(field, 1.0))) 674 | for field in ('cpu', 'memory', 'pods') 675 | ) 676 | 677 | def _prioritize_groups(self, groups): 678 | """ 679 | returns the groups sorted in order of where we should try to schedule 680 | things first. 
we currently try to prioritize in the following order: 681 | - region 682 | - single-AZ groups over multi-AZ groups (for faster/cheaper network) 683 | - whether or not the group launches spot instances (prefer spot) 684 | - manually set _GROUP_PRIORITIES 685 | - group name 686 | """ 687 | def sort_key(group): 688 | region = self._GROUP_DEFAULT_PRIORITY 689 | try: 690 | region = (self.azure_regions + self.aws_regions).index(group.region) 691 | except ValueError: 692 | pass 693 | # Some ASGs are pinned to be in a single AZ. Sort them in front of 694 | # multi-ASG groups that won't have this tag set. 695 | pinned_to_az = group.selectors.get('aws/az', 'z') 696 | priority = self._GROUP_PRIORITIES.get( 697 | group.selectors.get('aws/type'), self._GROUP_DEFAULT_PRIORITY) 698 | return (group.global_priority, region, pinned_to_az, not group.is_spot, priority, group.name) 699 | return sorted(groups, key=sort_key) 700 | 701 | def get_node_state(self, node, asg, node_pods, pods_to_schedule, 702 | running_insts_map, idle_selector_hash): 703 | """ 704 | returns the ClusterNodeState for the given node 705 | 706 | params: 707 | node - KubeNode object 708 | asg - AutoScalingGroup object that this node belongs in. can be None. 709 | node_pods - list of KubePods assigned to this node 710 | pods_to_schedule - list of all pending pods 711 | running_inst_map - map of all (instance_id -> ec2.Instance object) 712 | idle_selector_hash - current map of idle nodes by type. 
may be modified 713 | """ 714 | pending_list = [] 715 | for pods in pods_to_schedule.values(): 716 | for pod in pods: 717 | # a pod is considered schedulable onto this node if all the 718 | # node selectors match 719 | # AND it doesn't use pod affinity (which we don't support yet) 720 | if (node.is_match(pod) and 721 | 'scheduler.alpha.kubernetes.io/affinity' not in pod.annotations): 722 | pending_list.append(pod) 723 | # we consider a node to be busy if it's running any non-DaemonSet pods 724 | # TODO: we can be a bit more aggressive in killing pods that are 725 | # replicated 726 | busy_list = [p for p in node_pods if not p.is_mirrored()] 727 | undrainable_list = [p for p in node_pods if not p.is_drainable()] 728 | utilization = sum((p.resources for p in busy_list), KubeResource()) 729 | under_utilized = (self.drain_utilization_below * 730 | node.capacity - utilization).possible 731 | drainable = not undrainable_list 732 | 733 | maybe_inst = running_insts_map.get(node.instance_id) 734 | if maybe_inst: 735 | age = (datetime.datetime.now(maybe_inst.launch_time.tzinfo) 736 | - maybe_inst.launch_time).seconds 737 | launch_hour_offset = age % 3600 738 | else: 739 | age = None 740 | 741 | instance_type = utils.selectors_to_hash( 742 | asg.selectors) if asg else node.instance_type 743 | 744 | type_spare_capacity = (instance_type and self.type_idle_threshold and 745 | idle_selector_hash[instance_type] < self.TYPE_IDLE_COUNT) 746 | 747 | if maybe_inst is None: 748 | return ClusterNodeState.INSTANCE_TERMINATED 749 | 750 | if node.is_detached(): 751 | return ClusterNodeState.DETACHED 752 | 753 | if node.is_dead(): 754 | return ClusterNodeState.DEAD 755 | 756 | if asg and len(asg.nodes) <= asg.min_size: 757 | return ClusterNodeState.ASG_MIN_SIZE 758 | 759 | if busy_list and not under_utilized: 760 | if node.unschedulable: 761 | return ClusterNodeState.BUSY_UNSCHEDULABLE 762 | return ClusterNodeState.BUSY 763 | 764 | if pending_list and not node.unschedulable: 765 | # 
logger.warn('PENDING: %s', pending_list) 766 | return ClusterNodeState.POD_PENDING 767 | 768 | if launch_hour_offset < self.LAUNCH_HOUR_THRESHOLD[node.provider] and not node.unschedulable: 769 | return ClusterNodeState.LAUNCH_HR_GRACE_PERIOD 770 | 771 | # elif node.provider == 'azure': 772 | # disabling scale down in azure for now while we ramp up 773 | # TODO: remove once azure is bootstrapped 774 | # state = ClusterNodeState.GRACE_PERIOD 775 | 776 | if (not type_spare_capacity and age <= self.idle_threshold) and not node.unschedulable: 777 | # there is already an instance of this type sitting idle 778 | # so we use the regular idle threshold for the grace period 779 | return ClusterNodeState.GRACE_PERIOD 780 | 781 | if (type_spare_capacity and age <= self.type_idle_threshold) and not node.unschedulable: 782 | # we don't have an instance of this type yet! 783 | # use the type idle threshold for the grace period 784 | # and mark the type as seen 785 | idle_selector_hash[instance_type] += 1 786 | return ClusterNodeState.TYPE_GRACE_PERIOD 787 | 788 | if under_utilized and (busy_list or not node.unschedulable): 789 | # nodes that are under utilized (but not completely idle) 790 | # have their own states to tell if we should drain them 791 | # for better binpacking or not 792 | if drainable: 793 | return ClusterNodeState.UNDER_UTILIZED_DRAINABLE 794 | return ClusterNodeState.UNDER_UTILIZED_UNDRAINABLE 795 | 796 | if node.unschedulable: 797 | return ClusterNodeState.IDLE_UNSCHEDULABLE 798 | return ClusterNodeState.IDLE_SCHEDULABLE 799 | 800 | def get_pods_to_schedule(self, pods): 801 | """ 802 | given a list of KubePod objects, 803 | return a map of (selectors hash -> pods) to be scheduled 804 | """ 805 | pending_unassigned_pods = [ 806 | p for p in pods 807 | if p.status == KubePodStatus.PENDING and (not p.node_name) 808 | ] 809 | 810 | # we only consider a pod to be schedulable if it's pending and 811 | # unassigned and feasible 812 | pods_to_schedule = {} 813 | now = 
datetime.datetime.now(pytz.utc) 814 | for pod in pending_unassigned_pods: 815 | age = (now - pod.creation_time).total_seconds() 816 | self.stats.histogram('autoscaler.scaling_loop.pending_pod_age', age) 817 | 818 | if capacity.is_possible(pod): 819 | pods_to_schedule.setdefault( 820 | utils.selectors_to_hash(pod.selectors), []).append(pod) 821 | else: 822 | recommended_capacity = capacity.max_capacity_for_selectors( 823 | pod.selectors, pod.resources) 824 | logger.warn( 825 | "Pending pod %s cannot fit %s. " 826 | "Please check that requested resource amount is " 827 | "consistent with node selectors (recommended max: %s). " 828 | "Scheduling skipped." % (pod.name, pod.selectors, recommended_capacity)) 829 | self.notifier.notify_invalid_pod_capacity( 830 | pod, recommended_capacity) 831 | return pods_to_schedule 832 | --------------------------------------------------------------------------------