├── .DS_Store
├── .gitignore
├── .idea
├── .gitignore
├── deployment.xml
├── energy-aware-recommender.iml
├── inspectionProfiles
│ ├── Project_Default.xml
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
├── remote-mappings.xml
└── vcs.xml
├── Dockerfile
├── LICENSE
├── PromClient.py
├── README.md
├── dashboards
├── .DS_Store
├── clever-dashboard-w-IPS.json
├── clever-dashboard.json
└── clever-sysbench-kubecon22.json
├── main.py
├── manifests
├── clever.yaml
├── random.yaml
└── sysbench.yaml
├── recommender.py
├── requirements.txt
├── scripts
├── set_cpu_freq.sh
└── watch_vpa.sh
├── testPromClient.py
└── utils.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sustainable-computing-io/clever/fc20b3e8947978ec13c744daa4f936d16af4d3fa/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/.idea/deployment.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/.idea/energy-aware-recommender.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/remote-mappings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6.8
2 |
3 | WORKDIR /root/src/clever
4 |
5 | COPY requirements.txt ./
6 | RUN pip install --no-cache-dir -r requirements.txt
7 |
8 | COPY . .
9 |
10 | CMD [ "python", "./main.py" ]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/PromClient.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | import requests
4 | import os
5 | import json
6 |
7 | requests.packages.urllib3.disable_warnings()
8 |
class PromClient:
    """Small client for instant queries against a Prometheus HTTP API.

    The server address and an optional bearer token come from the constructor
    arguments or, when those are falsy, from the ``PROM_HOST`` and
    ``PROM_TOKEN`` environment variables.
    """

    # Class-level defaults. ``prom_address`` and ``prom_token`` are always
    # reassigned per instance by ``__init__``; the other attributes are not
    # read by any method of this class.
    now = None
    start = None
    prom_address = "http://127.0.0.1:9090"
    prom_token = None
    step = '15s'
    chunk_sz = 900

    def __init__(self, prom_address=None, prom_token=None):
        """Resolve the Prometheus address and token.

        Args:
            prom_address: Base URL of the Prometheus server. Falls back to the
                ``PROM_HOST`` environment variable when falsy.
            prom_token: Bearer token sent with every query. Falls back to the
                ``PROM_TOKEN`` environment variable when falsy; may stay unset.

        Raises:
            ValueError: If no address is given and ``PROM_HOST`` is not set.
        """
        self.prom_address = prom_address or os.getenv("PROM_HOST")
        self.prom_token = prom_token or os.getenv("PROM_TOKEN")

        if not self.prom_address:
            raise ValueError(
                "Please appropriately configure environment variables $PROM_HOST, $PROM_TOKEN to successfully run the crawler and profiler!")

    def get_query(self, my_query):
        """Run an instant query and return its result entries.

        Args:
            my_query: PromQL expression sent as the ``query`` parameter of
                ``<prom_address>/api/v1/query``.

        Returns:
            The non-empty ``data.result`` value of the response, or ``None``
            when the request fails, the server reports a non-success status,
            the body cannot be interpreted, or the result is empty. Failures
            are printed rather than raised.
        """
        headers = {"content-type": "application/json; charset=UTF-8"}
        if self.prom_token:
            headers['Authorization'] = 'Bearer {}'.format(self.prom_token)

        # Only the network call lives in this try block so that transport
        # errors are not confused with response-parsing errors below.
        try:
            # NOTE: verify=False turns off TLS certificate verification.
            response = requests.get('{0}/api/v1/query'.format(self.prom_address),
                                    params={'query': my_query},
                                    headers=headers, verify=False)
        except requests.exceptions.RequestException as e:
            print(e)
            return None

        try:
            # Decode the body once instead of re-parsing it for every field.
            payload = response.json()

            if payload['status'] != "success":
                print("Error processing the request: " + payload['status'])
                print("The Error is: " + payload['error'])
                return None

            results = payload['data']['result']
        except (ValueError, KeyError, TypeError):
            # ValueError: body is not valid JSON. KeyError/TypeError: the JSON
            # does not have the expected layout. Unlike a bare ``except``, this
            # lets KeyboardInterrupt and SystemExit propagate.
            print(response)
            return None

        # A missing or empty result is reported as None.
        if not results:
            return None
        return results
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CLEVER
2 | Container Level Energy-efficient VPA Recommender for Kubernetes
3 |
4 | ## Pre-requisites
5 | - Baremetal Node OS - RedHat 8
6 | - Kubernetes 1.22+
7 | - Kepler v0.2
8 | - Prometheus release-0.11
9 | - Kubernetes Vertical Pod Autoscaler (VPA) 0.11
10 |
11 | ## Installation
12 | ### Install Kepler
13 | - Follow the instructions in the [Kepler](https://github.com/sustainable-computing-io/kepler) repository to install Kepler as a DaemonSet on the nodes of the Kubernetes cluster.
14 |
15 | ### Install Prometheus & Grafana Dashboard
16 | - Follow the instructions in the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) repository to install both Prometheus and Grafana on the Kubernetes cluster.
17 | - Import the [Grafana Dashboard](https://grafana.com/docs/grafana/v9.0/dashboards/export-import/). The dashboard is available in the `dashboards/clever-dashboard.json` file.
18 | - Access Prometheus UI and Grafana Dashboard via `kubectl port-forward` command following the [Access UIs tutorial](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/access-ui.md).
19 |
20 | ### Install VPA
21 | - Follow the instructions [here](https://github.com/kubernetes/autoscaler/blob/master/vertical-pod-autoscaler/README.md) to install the VPA on the Kubernetes Cluster.
22 |
23 | ### Install CLEVER
24 | - Clone the CLEVER repository
25 | ```bash
26 | git clone https://github.com/sustainable-computing-io/clever.git
27 | ```
28 |
29 | - Deploy CLEVER Recommender to run as an alternative recommender for VPA.
30 | ```bash
31 | kubectl apply -f manifests/clever.yaml
32 | ```
33 |
34 | ## Tutorial
35 | - Deploy the example application that selects the CLEVER recommender for VPA.
36 | ```bash
37 | kubectl apply -f manifests/random.yaml
38 | ```
39 |
40 | - The example application defines a VPA Custom Resource with the following configuration:
41 | ```yaml
42 | apiVersion: "autoscaling.k8s.io/v1"
43 | kind: VerticalPodAutoscaler
44 | metadata:
45 | name: random-vpa
46 | spec:
47 | recommenders:
48 | - name: clever
49 | targetRef:
50 | apiVersion: "apps/v1"
51 | kind: Deployment
52 | name: random
53 | resourcePolicy:
54 | containerPolicies:
55 | - containerName: '*'
56 | minAllowed:
57 | cpu: 100m
58 | maxAllowed:
59 | cpu: 16
60 | controlledResources: ["cpu"]
61 | ```
62 |
63 | - Monitor the recommended CPU requests for the example application by watching the VPA object.
64 | ```bash
65 | watch -n 0.1 ./scripts/watch_vpa.sh random-vpa
66 | ```
67 |
68 | - Change the node CPU frequencies to observe the effect on the recommended CPU requests.
69 | ```bash
70 | ./scripts/set_cpu_freq.sh 1GHz
71 | ```
72 |
--------------------------------------------------------------------------------
/dashboards/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sustainable-computing-io/clever/fc20b3e8947978ec13c744daa4f936d16af4d3fa/dashboards/.DS_Store
--------------------------------------------------------------------------------
/dashboards/clever-dashboard-w-IPS.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": {
7 | "type": "grafana",
8 | "uid": "-- Grafana --"
9 | },
10 | "enable": true,
11 | "hide": true,
12 | "iconColor": "rgba(0, 211, 255, 1)",
13 | "name": "Annotations & Alerts",
14 | "target": {
15 | "limit": 100,
16 | "matchAny": false,
17 | "tags": [],
18 | "type": "dashboard"
19 | },
20 | "type": "dashboard"
21 | }
22 | ]
23 | },
24 | "editable": true,
25 | "fiscalYearStartMonth": 0,
26 | "graphTooltip": 0,
27 | "id": 27,
28 | "links": [],
29 | "liveNow": false,
30 | "panels": [
31 | {
32 | "datasource": {
33 | "type": "prometheus",
34 | "uid": "P1809F7CD0C75ACF3"
35 | },
36 | "description": "For the node with more than 1 CPU, choose the maximum frequency.",
37 | "fieldConfig": {
38 | "defaults": {
39 | "color": {
40 | "mode": "palette-classic"
41 | },
42 | "custom": {
43 | "axisCenteredZero": false,
44 | "axisColorMode": "text",
45 | "axisLabel": "",
46 | "axisPlacement": "auto",
47 | "barAlignment": 0,
48 | "drawStyle": "line",
49 | "fillOpacity": 0,
50 | "gradientMode": "none",
51 | "hideFrom": {
52 | "legend": false,
53 | "tooltip": false,
54 | "viz": false
55 | },
56 | "lineInterpolation": "linear",
57 | "lineWidth": 1,
58 | "pointSize": 5,
59 | "scaleDistribution": {
60 | "type": "linear"
61 | },
62 | "showPoints": "auto",
63 | "spanNulls": false,
64 | "stacking": {
65 | "group": "A",
66 | "mode": "none"
67 | },
68 | "thresholdsStyle": {
69 | "mode": "off"
70 | }
71 | },
72 | "mappings": [],
73 | "thresholds": {
74 | "mode": "absolute",
75 | "steps": [
76 | {
77 | "color": "green",
78 | "value": null
79 | },
80 | {
81 | "color": "red",
82 | "value": 80
83 | }
84 | ]
85 | },
86 | "unit": "hertz"
87 | },
88 | "overrides": []
89 | },
90 | "gridPos": {
91 | "h": 9,
92 | "w": 24,
93 | "x": 0,
94 | "y": 0
95 | },
96 | "id": 4,
97 | "options": {
98 | "legend": {
99 | "calcs": [],
100 | "displayMode": "list",
101 | "placement": "bottom",
102 | "showLegend": true
103 | },
104 | "tooltip": {
105 | "mode": "single",
106 | "sort": "none"
107 | }
108 | },
109 | "targets": [
110 | {
111 | "datasource": {
112 | "type": "prometheus",
113 | "uid": "P1809F7CD0C75ACF3"
114 | },
115 | "editorMode": "builder",
116 | "expr": "max(node_cpu_scaling_frequency_hertz{instance=\"$node\"})",
117 | "legendFormat": "Current CPU Frequency ",
118 | "range": true,
119 | "refId": "A"
120 | },
121 | {
122 | "datasource": {
123 | "type": "prometheus",
124 | "uid": "P1809F7CD0C75ACF3"
125 | },
126 | "editorMode": "builder",
127 | "expr": "max(node_cpu_frequency_max_hertz{instance=\"$node\"})",
128 | "hide": false,
129 | "legendFormat": "Max CPU Frequency",
130 | "range": true,
131 | "refId": "B"
132 | },
133 | {
134 | "datasource": {
135 | "type": "prometheus",
136 | "uid": "P1809F7CD0C75ACF3"
137 | },
138 | "editorMode": "builder",
139 | "expr": "max(node_cpu_frequency_min_hertz{instance=\"$node\"})",
140 | "hide": false,
141 | "legendFormat": "Min CPU Frequency",
142 | "range": true,
143 | "refId": "C"
144 | }
145 | ],
146 | "title": "Current CPU Frequency (Max of all CPUs) for Node $node",
147 | "type": "timeseries"
148 | },
149 | {
150 | "datasource": {
151 | "type": "prometheus",
152 | "uid": "P1809F7CD0C75ACF3"
153 | },
154 | "fieldConfig": {
155 | "defaults": {
156 | "color": {
157 | "mode": "palette-classic"
158 | },
159 | "custom": {
160 | "axisCenteredZero": false,
161 | "axisColorMode": "text",
162 | "axisLabel": "",
163 | "axisPlacement": "auto",
164 | "barAlignment": 0,
165 | "drawStyle": "line",
166 | "fillOpacity": 0,
167 | "gradientMode": "none",
168 | "hideFrom": {
169 | "legend": false,
170 | "tooltip": false,
171 | "viz": false
172 | },
173 | "lineInterpolation": "linear",
174 | "lineWidth": 1,
175 | "pointSize": 5,
176 | "scaleDistribution": {
177 | "type": "linear"
178 | },
179 | "showPoints": "auto",
180 | "spanNulls": false,
181 | "stacking": {
182 | "group": "A",
183 | "mode": "none"
184 | },
185 | "thresholdsStyle": {
186 | "mode": "off"
187 | }
188 | },
189 | "mappings": [],
190 | "max": 4,
191 | "min": 0,
192 | "thresholds": {
193 | "mode": "absolute",
194 | "steps": [
195 | {
196 | "color": "green",
197 | "value": null
198 | },
199 | {
200 | "color": "red",
201 | "value": 80
202 | }
203 | ]
204 | }
205 | },
206 | "overrides": []
207 | },
208 | "gridPos": {
209 | "h": 10,
210 | "w": 24,
211 | "x": 0,
212 | "y": 9
213 | },
214 | "id": 2,
215 | "options": {
216 | "legend": {
217 | "calcs": [],
218 | "displayMode": "list",
219 | "placement": "bottom",
220 | "showLegend": true
221 | },
222 | "tooltip": {
223 | "mode": "single",
224 | "sort": "none"
225 | }
226 | },
227 | "targets": [
228 | {
229 | "datasource": {
230 | "type": "prometheus",
231 | "uid": "P1809F7CD0C75ACF3"
232 | },
233 | "editorMode": "builder",
234 | "expr": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\", resource=\"cpu\", container=\"$container\"}",
235 | "legendFormat": "{{pod}}",
236 | "range": true,
237 | "refId": "A"
238 | }
239 | ],
240 | "title": "Managed Container CPU Requests",
241 | "type": "timeseries"
242 | },
243 | {
244 | "datasource": {
245 | "type": "prometheus",
246 | "uid": "P1809F7CD0C75ACF3"
247 | },
248 | "description": "",
249 | "fieldConfig": {
250 | "defaults": {
251 | "color": {
252 | "mode": "palette-classic"
253 | },
254 | "custom": {
255 | "axisCenteredZero": false,
256 | "axisColorMode": "text",
257 | "axisLabel": "",
258 | "axisPlacement": "auto",
259 | "barAlignment": 0,
260 | "drawStyle": "line",
261 | "fillOpacity": 0,
262 | "gradientMode": "none",
263 | "hideFrom": {
264 | "legend": false,
265 | "tooltip": false,
266 | "viz": false
267 | },
268 | "lineInterpolation": "linear",
269 | "lineWidth": 1,
270 | "pointSize": 5,
271 | "scaleDistribution": {
272 | "type": "linear"
273 | },
274 | "showPoints": "auto",
275 | "spanNulls": false,
276 | "stacking": {
277 | "group": "A",
278 | "mode": "none"
279 | },
280 | "thresholdsStyle": {
281 | "mode": "off"
282 | }
283 | },
284 | "mappings": [],
285 | "max": 10000000000,
286 | "min": 0,
287 | "thresholds": {
288 | "mode": "absolute",
289 | "steps": [
290 | {
291 | "color": "green",
292 | "value": null
293 | },
294 | {
295 | "color": "red",
296 | "value": 80
297 | }
298 | ]
299 | }
300 | },
301 | "overrides": []
302 | },
303 | "gridPos": {
304 | "h": 9,
305 | "w": 24,
306 | "x": 0,
307 | "y": 19
308 | },
309 | "id": 6,
310 | "options": {
311 | "legend": {
312 | "calcs": [],
313 | "displayMode": "list",
314 | "placement": "bottom",
315 | "showLegend": true
316 | },
317 | "tooltip": {
318 | "mode": "single",
319 | "sort": "none"
320 | }
321 | },
322 | "targets": [
323 | {
324 | "datasource": {
325 | "type": "prometheus",
326 | "uid": "P1809F7CD0C75ACF3"
327 | },
328 | "editorMode": "code",
329 | "expr": "pod_cpu_instructions{pod_namespace=\"$namespace\"} / 3",
330 | "legendFormat": "{{pod_name}}",
331 | "range": true,
332 | "refId": "A"
333 | }
334 | ],
335 | "title": "Container Actual IPS (Instructions/second)",
336 | "type": "timeseries"
337 | }
338 | ],
339 | "schemaVersion": 37,
340 | "style": "dark",
341 | "tags": [],
342 | "templating": {
343 | "list": [
344 | {
345 | "current": {
346 | "selected": false,
347 | "text": "default",
348 | "value": "default"
349 | },
350 | "datasource": {
351 | "type": "prometheus",
352 | "uid": "P1809F7CD0C75ACF3"
353 | },
354 | "definition": "label_values(kube_pod_container_resource_requests, namespace)",
355 | "hide": 0,
356 | "includeAll": false,
357 | "label": "Namespace",
358 | "multi": false,
359 | "name": "namespace",
360 | "options": [],
361 | "query": {
362 | "query": "label_values(kube_pod_container_resource_requests, namespace)",
363 | "refId": "StandardVariableQuery"
364 | },
365 | "refresh": 1,
366 | "regex": "",
367 | "skipUrlSync": false,
368 | "sort": 0,
369 | "type": "query"
370 | },
371 | {
372 | "current": {
373 | "selected": true,
374 | "text": "sysbench",
375 | "value": "sysbench"
376 | },
377 | "datasource": {
378 | "type": "prometheus",
379 | "uid": "P1809F7CD0C75ACF3"
380 | },
381 | "definition": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)",
382 | "hide": 0,
383 | "includeAll": false,
384 | "label": "Container",
385 | "multi": false,
386 | "name": "container",
387 | "options": [],
388 | "query": {
389 | "query": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)",
390 | "refId": "StandardVariableQuery"
391 | },
392 | "refresh": 1,
393 | "regex": "",
394 | "skipUrlSync": false,
395 | "sort": 0,
396 | "type": "query"
397 | },
398 | {
399 | "current": {
400 | "selected": false,
401 | "text": "clever.ibm.cloud",
402 | "value": "clever.ibm.cloud"
403 | },
404 | "datasource": {
405 | "type": "prometheus",
406 | "uid": "P1809F7CD0C75ACF3"
407 | },
408 | "definition": "label_values(node_cpu_frequency_max_hertz, instance)",
409 | "hide": 0,
410 | "includeAll": false,
411 | "label": "Node",
412 | "multi": false,
413 | "name": "node",
414 | "options": [],
415 | "query": {
416 | "query": "label_values(node_cpu_frequency_max_hertz, instance)",
417 | "refId": "StandardVariableQuery"
418 | },
419 | "refresh": 1,
420 | "regex": "",
421 | "skipUrlSync": false,
422 | "sort": 0,
423 | "type": "query"
424 | }
425 | ]
426 | },
427 | "time": {
428 | "from": "now-5m",
429 | "to": "now"
430 | },
431 | "timepicker": {},
432 | "timezone": "",
433 | "title": "Clever VPA Recommender",
434 | "uid": "ePYMOfnVk",
435 | "version": 5,
436 | "weekStart": ""
437 | }
--------------------------------------------------------------------------------
/dashboards/clever-dashboard.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": {
7 | "type": "grafana",
8 | "uid": "-- Grafana --"
9 | },
10 | "enable": true,
11 | "hide": true,
12 | "iconColor": "rgba(0, 211, 255, 1)",
13 | "name": "Annotations & Alerts",
14 | "target": {
15 | "limit": 100,
16 | "matchAny": false,
17 | "tags": [],
18 | "type": "dashboard"
19 | },
20 | "type": "dashboard"
21 | }
22 | ]
23 | },
24 | "editable": true,
25 | "fiscalYearStartMonth": 0,
26 | "graphTooltip": 0,
27 | "id": 29,
28 | "links": [],
29 | "liveNow": false,
30 | "panels": [
31 | {
32 | "datasource": {
33 | "type": "prometheus",
34 | "uid": "P1809F7CD0C75ACF3"
35 | },
36 | "description": "For the node with more than 1 CPU, choose the maximum frequency.",
37 | "fieldConfig": {
38 | "defaults": {
39 | "color": {
40 | "mode": "palette-classic"
41 | },
42 | "custom": {
43 | "axisLabel": "",
44 | "axisPlacement": "auto",
45 | "barAlignment": 0,
46 | "drawStyle": "line",
47 | "fillOpacity": 0,
48 | "gradientMode": "none",
49 | "hideFrom": {
50 | "legend": false,
51 | "tooltip": false,
52 | "viz": false
53 | },
54 | "lineInterpolation": "linear",
55 | "lineWidth": 1,
56 | "pointSize": 5,
57 | "scaleDistribution": {
58 | "type": "linear"
59 | },
60 | "showPoints": "auto",
61 | "spanNulls": false,
62 | "stacking": {
63 | "group": "A",
64 | "mode": "none"
65 | },
66 | "thresholdsStyle": {
67 | "mode": "off"
68 | }
69 | },
70 | "mappings": [],
71 | "thresholds": {
72 | "mode": "absolute",
73 | "steps": [
74 | {
75 | "color": "green",
76 | "value": null
77 | },
78 | {
79 | "color": "red",
80 | "value": 80
81 | }
82 | ]
83 | },
84 | "unit": "hertz"
85 | },
86 | "overrides": []
87 | },
88 | "gridPos": {
89 | "h": 9,
90 | "w": 24,
91 | "x": 0,
92 | "y": 0
93 | },
94 | "id": 4,
95 | "options": {
96 | "legend": {
97 | "calcs": [],
98 | "displayMode": "list",
99 | "placement": "bottom"
100 | },
101 | "tooltip": {
102 | "mode": "single",
103 | "sort": "none"
104 | }
105 | },
106 | "targets": [
107 | {
108 | "datasource": {
109 | "type": "prometheus",
110 | "uid": "P1809F7CD0C75ACF3"
111 | },
112 | "editorMode": "builder",
113 | "expr": "max(node_cpu_scaling_frequency_hertz{instance=\"$node\"})",
114 | "legendFormat": "Current CPU Frequency ",
115 | "range": true,
116 | "refId": "A"
117 | },
118 | {
119 | "datasource": {
120 | "type": "prometheus",
121 | "uid": "P1809F7CD0C75ACF3"
122 | },
123 | "editorMode": "builder",
124 | "expr": "max(node_cpu_frequency_max_hertz{instance=\"$node\"})",
125 | "hide": false,
126 | "legendFormat": "Max CPU Frequency",
127 | "range": true,
128 | "refId": "B"
129 | },
130 | {
131 | "datasource": {
132 | "type": "prometheus",
133 | "uid": "P1809F7CD0C75ACF3"
134 | },
135 | "editorMode": "builder",
136 | "expr": "max(node_cpu_frequency_min_hertz{instance=\"$node\"})",
137 | "hide": false,
138 | "legendFormat": "Min CPU Frequency",
139 | "range": true,
140 | "refId": "C"
141 | }
142 | ],
143 | "title": "Current CPU Frequency (Max of all CPUs) for Node $node",
144 | "type": "timeseries"
145 | },
146 | {
147 | "datasource": {
148 | "type": "prometheus",
149 | "uid": "P1809F7CD0C75ACF3"
150 | },
151 | "fieldConfig": {
152 | "defaults": {
153 | "color": {
154 | "mode": "palette-classic"
155 | },
156 | "custom": {
157 | "axisLabel": "",
158 | "axisPlacement": "auto",
159 | "barAlignment": 0,
160 | "drawStyle": "line",
161 | "fillOpacity": 0,
162 | "gradientMode": "none",
163 | "hideFrom": {
164 | "legend": false,
165 | "tooltip": false,
166 | "viz": false
167 | },
168 | "lineInterpolation": "linear",
169 | "lineWidth": 1,
170 | "pointSize": 5,
171 | "scaleDistribution": {
172 | "type": "linear"
173 | },
174 | "showPoints": "auto",
175 | "spanNulls": false,
176 | "stacking": {
177 | "group": "A",
178 | "mode": "none"
179 | },
180 | "thresholdsStyle": {
181 | "mode": "off"
182 | }
183 | },
184 | "mappings": [],
185 | "thresholds": {
186 | "mode": "absolute",
187 | "steps": [
188 | {
189 | "color": "green",
190 | "value": null
191 | },
192 | {
193 | "color": "red",
194 | "value": 80
195 | }
196 | ]
197 | }
198 | },
199 | "overrides": []
200 | },
201 | "gridPos": {
202 | "h": 10,
203 | "w": 24,
204 | "x": 0,
205 | "y": 9
206 | },
207 | "id": 2,
208 | "options": {
209 | "legend": {
210 | "calcs": [],
211 | "displayMode": "list",
212 | "placement": "bottom"
213 | },
214 | "tooltip": {
215 | "mode": "single",
216 | "sort": "none"
217 | }
218 | },
219 | "targets": [
220 | {
221 | "datasource": {
222 | "type": "prometheus",
223 | "uid": "P1809F7CD0C75ACF3"
224 | },
225 | "editorMode": "builder",
226 | "expr": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\", resource=\"cpu\", container=\"$container\"}",
227 | "legendFormat": "{{pod}}",
228 | "range": true,
229 | "refId": "A"
230 | }
231 | ],
232 | "title": "Managed Container CPU Requests",
233 | "type": "timeseries"
234 | }
235 | ],
236 | "schemaVersion": 36,
237 | "style": "dark",
238 | "tags": [],
239 | "templating": {
240 | "list": [
241 | {
242 | "current": {
243 | "selected": true,
244 | "text": "default",
245 | "value": "default"
246 | },
247 | "datasource": {
248 | "type": "prometheus",
249 | "uid": "P1809F7CD0C75ACF3"
250 | },
251 | "definition": "label_values(kube_pod_container_resource_requests, namespace)",
252 | "hide": 0,
253 | "includeAll": false,
254 | "label": "Namespace",
255 | "multi": false,
256 | "name": "namespace",
257 | "options": [],
258 | "query": {
259 | "query": "label_values(kube_pod_container_resource_requests, namespace)",
260 | "refId": "StandardVariableQuery"
261 | },
262 | "refresh": 1,
263 | "regex": "",
264 | "skipUrlSync": false,
265 | "sort": 0,
266 | "type": "query"
267 | },
268 | {
269 | "current": {
270 | "selected": false,
271 | "text": "hamster",
272 | "value": "hamster"
273 | },
274 | "datasource": {
275 | "type": "prometheus",
276 | "uid": "P1809F7CD0C75ACF3"
277 | },
278 | "definition": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)",
279 | "hide": 0,
280 | "includeAll": false,
281 | "label": "Container",
282 | "multi": false,
283 | "name": "container",
284 | "options": [],
285 | "query": {
286 | "query": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)",
287 | "refId": "StandardVariableQuery"
288 | },
289 | "refresh": 1,
290 | "regex": "",
291 | "skipUrlSync": false,
292 | "sort": 0,
293 | "type": "query"
294 | },
295 | {
296 | "current": {
297 | "selected": false,
298 | "text": "load-test.ibm.cloud",
299 | "value": "load-test.ibm.cloud"
300 | },
301 | "datasource": {
302 | "type": "prometheus",
303 | "uid": "P1809F7CD0C75ACF3"
304 | },
305 | "definition": "label_values(node_cpu_frequency_max_hertz, instance)",
306 | "hide": 0,
307 | "includeAll": false,
308 | "label": "Node",
309 | "multi": false,
310 | "name": "node",
311 | "options": [],
312 | "query": {
313 | "query": "label_values(node_cpu_frequency_max_hertz, instance)",
314 | "refId": "StandardVariableQuery"
315 | },
316 | "refresh": 1,
317 | "regex": "",
318 | "skipUrlSync": false,
319 | "sort": 0,
320 | "type": "query"
321 | }
322 | ]
323 | },
324 | "time": {
325 | "from": "now-15m",
326 | "to": "now"
327 | },
328 | "timepicker": {},
329 | "timezone": "",
330 | "title": "Clever VPA Recommender",
331 | "uid": "ePYMOfnVk",
332 | "version": 13,
333 | "weekStart": ""
334 | }
--------------------------------------------------------------------------------
/dashboards/clever-sysbench-kubecon22.json:
--------------------------------------------------------------------------------
1 | {
2 | "annotations": {
3 | "list": [
4 | {
5 | "builtIn": 1,
6 | "datasource": {
7 | "type": "grafana",
8 | "uid": "-- Grafana --"
9 | },
10 | "enable": true,
11 | "hide": true,
12 | "iconColor": "rgba(0, 211, 255, 1)",
13 | "name": "Annotations & Alerts",
14 | "target": {
15 | "limit": 100,
16 | "matchAny": false,
17 | "tags": [],
18 | "type": "dashboard"
19 | },
20 | "type": "dashboard"
21 | }
22 | ]
23 | },
24 | "editable": true,
25 | "fiscalYearStartMonth": 0,
26 | "graphTooltip": 0,
27 | "id": 28,
28 | "links": [],
29 | "liveNow": false,
30 | "panels": [
31 | {
32 | "datasource": {
33 | "type": "prometheus",
34 | "uid": "P1809F7CD0C75ACF3"
35 | },
36 | "description": "For the node with more than 1 CPU, choose the maximum frequency.",
37 | "fieldConfig": {
38 | "defaults": {
39 | "color": {
40 | "mode": "palette-classic"
41 | },
42 | "custom": {
43 | "axisCenteredZero": false,
44 | "axisColorMode": "text",
45 | "axisLabel": "",
46 | "axisPlacement": "auto",
47 | "barAlignment": 0,
48 | "drawStyle": "line",
49 | "fillOpacity": 0,
50 | "gradientMode": "none",
51 | "hideFrom": {
52 | "legend": false,
53 | "tooltip": false,
54 | "viz": false
55 | },
56 | "lineInterpolation": "linear",
57 | "lineWidth": 1,
58 | "pointSize": 5,
59 | "scaleDistribution": {
60 | "type": "linear"
61 | },
62 | "showPoints": "auto",
63 | "spanNulls": false,
64 | "stacking": {
65 | "group": "A",
66 | "mode": "none"
67 | },
68 | "thresholdsStyle": {
69 | "mode": "off"
70 | }
71 | },
72 | "mappings": [],
73 | "thresholds": {
74 | "mode": "absolute",
75 | "steps": [
76 | {
77 | "color": "green",
78 | "value": null
79 | },
80 | {
81 | "color": "red",
82 | "value": 80
83 | }
84 | ]
85 | },
86 | "unit": "hertz"
87 | },
88 | "overrides": []
89 | },
90 | "gridPos": {
91 | "h": 9,
92 | "w": 24,
93 | "x": 0,
94 | "y": 0
95 | },
96 | "id": 4,
97 | "options": {
98 | "legend": {
99 | "calcs": [],
100 | "displayMode": "list",
101 | "placement": "bottom",
102 | "showLegend": true
103 | },
104 | "tooltip": {
105 | "mode": "single",
106 | "sort": "none"
107 | }
108 | },
109 | "targets": [
110 | {
111 | "datasource": {
112 | "type": "prometheus",
113 | "uid": "P1809F7CD0C75ACF3"
114 | },
115 | "editorMode": "builder",
116 | "expr": "max(node_cpu_scaling_frequency_hertz{instance=\"$node\"})",
117 | "legendFormat": "Current CPU Frequency ",
118 | "range": true,
119 | "refId": "A"
120 | },
121 | {
122 | "datasource": {
123 | "type": "prometheus",
124 | "uid": "P1809F7CD0C75ACF3"
125 | },
126 | "editorMode": "builder",
127 | "expr": "max(node_cpu_frequency_max_hertz{instance=\"$node\"})",
128 | "hide": false,
129 | "legendFormat": "Max CPU Frequency",
130 | "range": true,
131 | "refId": "B"
132 | },
133 | {
134 | "datasource": {
135 | "type": "prometheus",
136 | "uid": "P1809F7CD0C75ACF3"
137 | },
138 | "editorMode": "builder",
139 | "expr": "max(node_cpu_frequency_min_hertz{instance=\"$node\"})",
140 | "hide": false,
141 | "legendFormat": "Min CPU Frequency",
142 | "range": true,
143 | "refId": "C"
144 | }
145 | ],
146 | "title": "Current CPU Frequency (Max of all CPUs) for Node $node",
147 | "type": "timeseries"
148 | },
149 | {
150 | "datasource": {
151 | "type": "prometheus",
152 | "uid": "P1809F7CD0C75ACF3"
153 | },
154 | "fieldConfig": {
155 | "defaults": {
156 | "color": {
157 | "mode": "palette-classic"
158 | },
159 | "custom": {
160 | "axisCenteredZero": false,
161 | "axisColorMode": "text",
162 | "axisLabel": "",
163 | "axisPlacement": "auto",
164 | "barAlignment": 0,
165 | "drawStyle": "line",
166 | "fillOpacity": 0,
167 | "gradientMode": "none",
168 | "hideFrom": {
169 | "legend": false,
170 | "tooltip": false,
171 | "viz": false
172 | },
173 | "lineInterpolation": "linear",
174 | "lineWidth": 1,
175 | "pointSize": 5,
176 | "scaleDistribution": {
177 | "type": "linear"
178 | },
179 | "showPoints": "auto",
180 | "spanNulls": false,
181 | "stacking": {
182 | "group": "A",
183 | "mode": "none"
184 | },
185 | "thresholdsStyle": {
186 | "mode": "off"
187 | }
188 | },
189 | "mappings": [],
190 | "max": 1,
191 | "min": 0,
192 | "thresholds": {
193 | "mode": "absolute",
194 | "steps": [
195 | {
196 | "color": "green",
197 | "value": null
198 | },
199 | {
200 | "color": "red",
201 | "value": 80
202 | }
203 | ]
204 | }
205 | },
206 | "overrides": []
207 | },
208 | "gridPos": {
209 | "h": 10,
210 | "w": 24,
211 | "x": 0,
212 | "y": 9
213 | },
214 | "id": 2,
215 | "options": {
216 | "legend": {
217 | "calcs": [],
218 | "displayMode": "list",
219 | "placement": "bottom",
220 | "showLegend": true
221 | },
222 | "tooltip": {
223 | "mode": "single",
224 | "sort": "none"
225 | }
226 | },
227 | "targets": [
228 | {
229 | "datasource": {
230 | "type": "prometheus",
231 | "uid": "P1809F7CD0C75ACF3"
232 | },
233 | "editorMode": "builder",
234 | "expr": "max by(container) (cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\", resource=\"cpu\", container=\"$container\"})",
235 | "legendFormat": "{{pod}}",
236 | "range": true,
237 | "refId": "A"
238 | }
239 | ],
240 | "title": "Managed Container CPU Requests",
241 | "type": "timeseries"
242 | },
243 | {
244 | "datasource": {
245 | "type": "prometheus",
246 | "uid": "P1809F7CD0C75ACF3"
247 | },
248 | "description": "",
249 | "fieldConfig": {
250 | "defaults": {
251 | "color": {
252 | "mode": "palette-classic"
253 | },
254 | "custom": {
255 | "axisCenteredZero": false,
256 | "axisColorMode": "text",
257 | "axisLabel": "",
258 | "axisPlacement": "auto",
259 | "barAlignment": 0,
260 | "drawStyle": "line",
261 | "fillOpacity": 0,
262 | "gradientMode": "none",
263 | "hideFrom": {
264 | "legend": false,
265 | "tooltip": false,
266 | "viz": false
267 | },
268 | "lineInterpolation": "linear",
269 | "lineWidth": 1,
270 | "pointSize": 5,
271 | "scaleDistribution": {
272 | "type": "linear"
273 | },
274 | "showPoints": "auto",
275 | "spanNulls": false,
276 | "stacking": {
277 | "group": "A",
278 | "mode": "none"
279 | },
280 | "thresholdsStyle": {
281 | "mode": "off"
282 | }
283 | },
284 | "mappings": [],
285 | "max": 1000000000,
286 | "min": 0,
287 | "thresholds": {
288 | "mode": "absolute",
289 | "steps": [
290 | {
291 | "color": "green",
292 | "value": null
293 | },
294 | {
295 | "color": "red",
296 | "value": 80
297 | }
298 | ]
299 | }
300 | },
301 | "overrides": []
302 | },
303 | "gridPos": {
304 | "h": 9,
305 | "w": 24,
306 | "x": 0,
307 | "y": 19
308 | },
309 | "id": 6,
310 | "options": {
311 | "legend": {
312 | "calcs": [],
313 | "displayMode": "list",
314 | "placement": "bottom",
315 | "showLegend": true
316 | },
317 | "tooltip": {
318 | "mode": "single",
319 | "sort": "none"
320 | }
321 | },
322 | "targets": [
323 | {
324 | "datasource": {
325 | "type": "prometheus",
326 | "uid": "P1809F7CD0C75ACF3"
327 | },
328 | "editorMode": "builder",
329 | "expr": "max(pod_cpu_instructions{pod_namespace=\"$namespace\"} / 3)",
330 | "legendFormat": "sysbench",
331 | "range": true,
332 | "refId": "A"
333 | }
334 | ],
335 | "title": "Container Actual IPS (Instructions/second)",
336 | "type": "timeseries"
337 | }
338 | ],
339 | "schemaVersion": 37,
340 | "style": "dark",
341 | "tags": [],
342 | "templating": {
343 | "list": [
344 | {
345 | "current": {
346 | "selected": false,
347 | "text": "default",
348 | "value": "default"
349 | },
350 | "datasource": {
351 | "type": "prometheus",
352 | "uid": "P1809F7CD0C75ACF3"
353 | },
354 | "definition": "label_values(kube_pod_container_resource_requests, namespace)",
355 | "hide": 0,
356 | "includeAll": false,
357 | "label": "Namespace",
358 | "multi": false,
359 | "name": "namespace",
360 | "options": [],
361 | "query": {
362 | "query": "label_values(kube_pod_container_resource_requests, namespace)",
363 | "refId": "StandardVariableQuery"
364 | },
365 | "refresh": 1,
366 | "regex": "",
367 | "skipUrlSync": false,
368 | "sort": 0,
369 | "type": "query"
370 | },
371 | {
372 | "current": {
373 | "selected": false,
374 | "text": "sysbench",
375 | "value": "sysbench"
376 | },
377 | "datasource": {
378 | "type": "prometheus",
379 | "uid": "P1809F7CD0C75ACF3"
380 | },
381 | "definition": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)",
382 | "hide": 0,
383 | "includeAll": false,
384 | "label": "Container",
385 | "multi": false,
386 | "name": "container",
387 | "options": [],
388 | "query": {
389 | "query": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)",
390 | "refId": "StandardVariableQuery"
391 | },
392 | "refresh": 1,
393 | "regex": "",
394 | "skipUrlSync": false,
395 | "sort": 0,
396 | "type": "query"
397 | },
398 | {
399 | "current": {
400 | "selected": false,
401 | "text": "clever.ibm.cloud",
402 | "value": "clever.ibm.cloud"
403 | },
404 | "datasource": {
405 | "type": "prometheus",
406 | "uid": "P1809F7CD0C75ACF3"
407 | },
408 | "definition": "label_values(node_cpu_frequency_max_hertz, instance)",
409 | "hide": 0,
410 | "includeAll": false,
411 | "label": "Node",
412 | "multi": false,
413 | "name": "node",
414 | "options": [],
415 | "query": {
416 | "query": "label_values(node_cpu_frequency_max_hertz, instance)",
417 | "refId": "StandardVariableQuery"
418 | },
419 | "refresh": 1,
420 | "regex": "",
421 | "skipUrlSync": false,
422 | "sort": 0,
423 | "type": "query"
424 | }
425 | ]
426 | },
427 | "time": {
428 | "from": "now-5m",
429 | "to": "now"
430 | },
431 | "timepicker": {},
432 | "timezone": "",
433 | "title": "Clever VPA Recommender",
434 | "uid": "ePYMOfnVk",
435 | "version": 11,
436 | "weekStart": ""
437 | }
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | from kubernetes import client, config
3 | from kubernetes.client.rest import ApiException
4 |
5 | from utils import *
6 | from recommender import *
7 |
# Current Recommender Name
# Passed to selects_recommender() in the main loop to pick the VPAs whose
# spec lists this recommender.
RECOMMENDER_NAME = "clever"
# Seconds to sleep between two iterations of the main loop.
SLEEP_WINDOW = 60

# VPA resources
# API group used when listing and patching the VPA custom objects.
DOMAIN = "autoscaling.k8s.io"
# Compared against the lower-cased kinds of the installed CRDs at start-up.
VPA_NAME = "verticalpodautoscaler"
VPA_PLURAL = "verticalpodautoscalers"
# NOTE(review): the two checkpoint names are not referenced in the visible
# part of main.py -- confirm whether they are still needed.
VPA_CHECKPOINT_NAME = "verticalpodautoscalercheckpoint"
VPA_CHECKPOINT_PLURAL = "verticalpodautoscalercheckpoints"

# PROMETHEUS Queries
MAX_CPU_FREQUENCY_QUERY = "node_cpu_frequency_max_hertz"
# NOTE(review): MIN_CPU_FREQUENCY_QUERY is not referenced in the visible part
# of main.py -- confirm whether it is still needed.
MIN_CPU_FREQUENCY_QUERY = "node_cpu_frequency_min_hertz"
LATEST_CPU_FREQUENCY_QUERY = "node_cpu_scaling_frequency_hertz"

# Keep the latest node frequencies and the VPA default requests in cache
# The two frequency caches are replaced with Prometheus query results when the
# script starts.
MAX_NODE_CPU_FREQUENCY = {}
LATEST_NODE_CPU_FREQUENCY = {}
# Keyed by VPA name; an entry is added the first time the main loop sees a VPA.
ACTIVE_VPA_DEFAULT_CPU_REQUESTS = {}
28 |
# Entry point: poll Prometheus for node CPU frequencies and patch the VPAs that
# selected this recommender whenever a node's frequency differs from the cache.
if __name__ == '__main__':
    # NOTE(review): PromClient, time and get_all_node_homogeneous_frequencies
    # are not imported explicitly in this file; presumably they arrive through
    # "from utils import *" -- confirm.

    # KUBERNETES_PORT present -> load the in-cluster config, otherwise fall
    # back to the local kubeconfig.
    if 'KUBERNETES_PORT' in os.environ:
        config.load_incluster_config()
    else:
        config.load_kube_config()

    # Get the api instance to interact with the cluster
    api_client = client.api_client.ApiClient()
    v1 = client.ApiextensionsV1Api(api_client)
    corev1 = client.CoreV1Api(api_client)
    crds = client.CustomObjectsApi(api_client)

    # Initialize the prometheus client
    prom_client = PromClient()

    # Initialize the node CPU frequency cache. A None result is treated as a
    # failed query and aborts the start-up.
    MAX_NODE_CPU_FREQUENCY = get_all_node_homogeneous_frequencies(prom_client, MAX_CPU_FREQUENCY_QUERY)
    if MAX_NODE_CPU_FREQUENCY is None:
        print("Prometheus Query {} at Endpoint {} failed.".format(MAX_CPU_FREQUENCY_QUERY, prom_client.prom_address))
        raise SystemExit(-1)

    LATEST_NODE_CPU_FREQUENCY = get_all_node_homogeneous_frequencies(prom_client, LATEST_CPU_FREQUENCY_QUERY)
    if LATEST_NODE_CPU_FREQUENCY is None:
        print("Prometheus Query {} at Endpoint {} failed.".format(LATEST_CPU_FREQUENCY_QUERY, prom_client.prom_address))
        raise SystemExit(-1)
    print("Initialized the node CPU frequency cache {}".format(LATEST_NODE_CPU_FREQUENCY))

    # Get the VPA CRD
    current_crds = [x['spec']['names']['kind'].lower() for x in v1.list_custom_resource_definition().to_dict()['items']]
    if VPA_NAME not in current_crds:
        print("VerticalPodAutoscaler CRD is not created!")
        raise SystemExit(-1)

    while True:
        print("Checking the frequency and the target IPS")
        # Updating the default VPA CPU cache.
        vpas = crds.list_cluster_custom_object(group=DOMAIN, version="v1", plural=VPA_PLURAL)
        selectedVpas = selects_recommender(vpas, RECOMMENDER_NAME)

        # Update the container default requests for selectedVpas
        # Keep the mapping between nodes and vpas, which manage pods on those nodes.
        node_vpas = {}
        for vpa in selectedVpas:
            vpa_name = vpa["metadata"]["name"]
            default_request, vpa_nodes = get_vpa_detailed_info(corev1, vpa)

            # Only the request seen on first discovery is cached as the default.
            if vpa_name not in ACTIVE_VPA_DEFAULT_CPU_REQUESTS:
                ACTIVE_VPA_DEFAULT_CPU_REQUESTS[vpa_name] = default_request
                print("Updating the default CPU request cache for newly discovered VPA {}".format(vpa_name))
                print(ACTIVE_VPA_DEFAULT_CPU_REQUESTS)

            # Select VPAs per node (each node once, even if it hosts several pods).
            for node in set(vpa_nodes.values()):
                node_vpas.setdefault(node, []).append(vpa)
        print("Discovering VPAs running on the following nodes.")
        print(node_vpas)

        # Obtain the latest node cpu frequencies
        CUR_NODE_CPU_FREQUENCY = get_all_node_homogeneous_frequencies(prom_client, LATEST_CPU_FREQUENCY_QUERY)
        print("Current node CPU frequencies\n {}".format(CUR_NODE_CPU_FREQUENCY))

        # NOTE(review): LATEST_NODE_CPU_FREQUENCY is never refreshed after
        # start-up, so the comparison below is always against the start-up
        # frequencies: a node that returns to its start-up frequency is not
        # detected as changed. Confirm whether the cache should be updated.
        if CUR_NODE_CPU_FREQUENCY is None:
            # A failed query must not crash the loop; retry in the next round.
            print("Prometheus Query {} at Endpoint {} failed.".format(LATEST_CPU_FREQUENCY_QUERY, prom_client.prom_address))
        elif CUR_NODE_CPU_FREQUENCY != LATEST_NODE_CPU_FREQUENCY:
            # Select nodes with frequency changes.
            nodes_with_frequency_changes = find_node_with_frequency_changes(CUR_NODE_CPU_FREQUENCY, LATEST_NODE_CPU_FREQUENCY)
            print("Discover nodes with frequency changes {}".format(nodes_with_frequency_changes))

            # Keyed by VPA name so a VPA spanning several changed nodes is
            # patched only once.
            vpas_to_update = {}
            for node in nodes_with_frequency_changes:
                if node not in node_vpas:
                    print("Frequency changes on node {} does not impact any vpa managed pods!".format(node))
                    continue

                for vpa in node_vpas[node]:
                    vpas_to_update[vpa["metadata"]["name"]] = vpa

            print("These VPAs {} are impacted by the following nodes with frequency changes {}.".format(vpas_to_update.keys(), nodes_with_frequency_changes))
            for vpa in vpas_to_update.values():
                vpa_name = vpa["metadata"]["name"]
                vpa_namespace = vpa["metadata"]["namespace"]

                print("Recommend sizes according to current frequency for vpas on nodes with frequency changes!")

                recommendations = get_recommendation(vpa, corev1, CUR_NODE_CPU_FREQUENCY, MAX_NODE_CPU_FREQUENCY, ACTIVE_VPA_DEFAULT_CPU_REQUESTS[vpa_name])
                print("Recommendations for VPA {} are {}".format(vpa_name, recommendations))

                if not recommendations:
                    print("No new recommendations obtained, so skip updating the vpa object {}".format(vpa_name))
                    continue

                # Update the recommendations.
                patched_vpa = {"recommendation": {"containerRecommendations": recommendations}}
                body = {"status": patched_vpa}

                # Update the VPA object
                # API call doc: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CustomObjectsApi.md#patch_namespaced_custom_object
                try:
                    vpa_updated = crds.patch_namespaced_custom_object(group=DOMAIN, version="v1", plural=VPA_PLURAL,
                                                                      namespace=vpa_namespace, name=vpa_name,
                                                                      body=body)
                    print("Successfully patched VPA object with the recommendation: %s" %
                          vpa_updated['status']['recommendation']['containerRecommendations'])
                except ApiException as e:
                    print("Exception when calling CustomObjectsApi->patch_namespaced_custom_object: %s\n" % e)

        print("Sleeping for {} seconds".format(SLEEP_WINDOW))
        print("=====================================================================================================")
        time.sleep(SLEEP_WINDOW)
148 |
149 |
--------------------------------------------------------------------------------
/manifests/clever.yaml:
--------------------------------------------------------------------------------
1 | kind: ClusterRole
2 | apiVersion: rbac.authorization.k8s.io/v1
3 | metadata:
4 | name: clever-role
5 | rules:
6 | - apiGroups:
7 | - ""
8 | resources:
9 | - pods
10 | - customresourcedefinitions
11 | verbs:
12 | - '*'
13 | - apiGroups:
14 | - apiextensions.k8s.io
15 | resources:
16 | - customresourcedefinitions
17 | verbs:
18 | - '*'
19 | - apiGroups:
20 | - autoscaling.k8s.io
21 | resources:
22 | - verticalpodautoscalers
23 | - verticalpodautoscalercheckpoints
24 | verbs:
25 | - '*'
26 | - apiGroups:
27 | - rbac.authorization.k8s.io
28 | resources:
29 | - clusterrolebindings
30 | verbs:
31 | - '*'
32 | - apiGroups:
33 | - apps
34 | resources:
35 | - deployments
36 | verbs:
37 | - "*"
38 | ---
39 | apiVersion: v1
40 | kind: ServiceAccount
41 | metadata:
42 | name: clever
43 | namespace: kube-system
44 | ---
45 | kind: ClusterRoleBinding
46 | apiVersion: rbac.authorization.k8s.io/v1
47 | metadata:
48 | name: clever-role-binding
49 | subjects:
50 | - kind: ServiceAccount
51 | name: clever
52 | namespace: kube-system
53 | roleRef:
54 | apiGroup: rbac.authorization.k8s.io
55 | kind: ClusterRole
56 | name: clever-role
57 | ---
58 | apiVersion: apps/v1
59 | kind: Deployment
60 | metadata:
61 | name: clever
62 | namespace: kube-system
63 | spec:
64 | replicas: 1
65 | selector:
66 | matchLabels:
67 | app: clever
68 | template:
69 | metadata:
70 | labels:
71 | app: clever
72 | spec:
73 | containers:
74 | - name: clever
75 | image: quay.io/chenw615/clever:latest
76 | env:
77 | - name: PROM_HOST
78 | value: "http://prometheus-k8s.monitoring.svc.cluster.local:9090"
79 | imagePullPolicy: Always
80 | serviceAccountName: clever
81 | serviceAccount: clever
--------------------------------------------------------------------------------
/manifests/random.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: "autoscaling.k8s.io/v1"
2 | kind: VerticalPodAutoscaler
3 | metadata:
4 | name: random-vpa
5 | spec:
6 | recommenders:
7 | - name: clever
8 | targetRef:
9 | apiVersion: "apps/v1"
10 | kind: Deployment
11 | name: random
12 | resourcePolicy:
13 | containerPolicies:
14 | - containerName: '*'
15 | minAllowed:
16 | cpu: 100m
17 | maxAllowed:
18 | cpu: 16
19 | controlledResources: ["cpu"]
20 | ---
21 | apiVersion: apps/v1
22 | kind: Deployment
23 | metadata:
24 | name: random
25 | spec:
26 | selector:
27 | matchLabels:
28 | app: random
29 | replicas: 2
30 | template:
31 | metadata:
32 | labels:
33 | app: random
34 | spec:
35 | securityContext:
36 | runAsNonRoot: true
37 | runAsUser: 65534 # nobody
38 | containers:
39 | - name: hamster
40 | image: k8s.gcr.io/ubuntu-slim:0.1
41 | resources:
42 | requests:
43 | cpu: 1
44 | memory: 500Mi
45 | command: ["/bin/sh"]
46 | args:
47 | - "-c"
48 | - "cat /dev/random"
--------------------------------------------------------------------------------
/manifests/sysbench.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: "autoscaling.k8s.io/v1"
2 | kind: VerticalPodAutoscaler
3 | metadata:
4 | name: sysbench-vpa
5 | spec:
6 | recommenders:
7 | - name: clever
8 | targetRef:
9 | apiVersion: "apps/v1"
10 | kind: Deployment
11 | name: sysbench
12 | resourcePolicy:
13 | containerPolicies:
14 | - containerName: '*'
15 | minAllowed:
16 | cpu: 100m
17 | maxAllowed:
18 | cpu: 16
19 | controlledResources: ["cpu"]
20 | ---
21 | apiVersion: apps/v1
22 | kind: Deployment
23 | metadata:
24 | name: sysbench
25 | spec:
26 | selector:
27 | matchLabels:
28 | app: sysbench
29 | replicas: 2
30 | template:
31 | metadata:
32 | labels:
33 | app: sysbench
34 | spec:
35 | securityContext:
36 | runAsNonRoot: true
37 | runAsUser: 65534 # nobody
38 | containers:
39 | - name: sysbench
40 | image: severalnines/sysbench
41 | resources:
42 | requests:
43 | cpu: 250m
44 | memory: 500Mi
45 | limits:
46 | cpu: 250m
47 | memory: 500Mi
48 | command:
49 | - sysbench
50 | - cpu
51 | - --threads=1
52 | - --time=100000
53 | - run
54 |
--------------------------------------------------------------------------------
/recommender.py:
--------------------------------------------------------------------------------
1 |
# Namespace used by get_vpa_detailed_info() when the VPA's targetRef carries no
# "namespace" key.
DEFAULT_NAMESPACE="default"
# NOTE(review): DELTA is not referenced in the visible part of this file --
# confirm its meaning at the use site.
DELTA = 0.2
4 |
def selects_recommender(vpas, recommender_name):
    """Return the VPA objects that list ``recommender_name`` as a recommender.

    Args:
        vpas: Mapping with an "items" list of VPA objects; each object is read
            through its "spec" and "metadata" -> "name" keys.
        recommender_name: Name compared against the "name" of every entry in
            ``spec["recommenders"]``.

    Returns:
        The matching VPA objects in input order. VPAs without a "recommenders"
        key are skipped; a VPA is appended once per matching entry.
    """
    matches = []
    for candidate in vpas["items"]:
        spec = candidate["spec"]
        if "recommenders" not in spec:
            continue

        chosen = spec["recommenders"]
        print("VPA {} has chosen {} recommenders".format(candidate["metadata"]["name"], len(chosen)))
        print(spec)
        matches.extend(candidate for entry in chosen if entry["name"] == recommender_name)

    return matches
20 |
def get_consistent_max_val(request_dict):
    """Report whether all container CPU requests agree and return the largest.

    Args:
        request_dict: Nested mapping ``{pod_name: {container_name: request}}``.

    Returns:
        A tuple ``(is_consistent, max_val)``. ``is_consistent`` is True when
        every container has the same request (or there are no containers);
        ``max_val`` is the largest request, or -1 when there are no containers.
    """
    requests = [
        request
        for containers in request_dict.values()
        for request in containers.values()
    ]
    if not requests:
        return True, -1

    # Compare every value against the maximum. Counting how often a running
    # maximum gets replaced depends on iteration order and misses e.g. [2, 1].
    max_val = max(requests)
    is_consistent = all(request == max_val for request in requests)
    return is_consistent, max_val
37 |
38 |
def get_vpa_detailed_info(corev1, vpa):
    """Collect the default CPU request and the pod placement of a VPA's target.

    Target pods are looked up with the label selector ``app=<targetRef name>``.

    Args:
        corev1: Object exposing
            ``list_namespaced_pod(namespace=..., label_selector=...)``.
        vpa: VPA object as a dict; ``spec["targetRef"]`` and
            ``metadata["name"]`` are read.

    Returns:
        A tuple ``(vpa_container_cpu_request, vpa_pod_nodes)``: the largest CPU
        request over all containers of the target pods (-1 when no pod was
        found), and a dict mapping each pod name to its node name.
    """
    # Get the VPA spec
    vpa_spec = vpa["spec"]

    # example target_ref {'apiVersion': 'apps/v1', 'kind': 'Deployment', 'name': 'hamster'}
    target_ref = vpa_spec["targetRef"]
    print(target_ref)

    # NOTE(review): when targetRef has no "namespace" key the pods are looked
    # up in DEFAULT_NAMESPACE rather than in the VPA's own namespace -- confirm
    # this is intended for VPAs deployed outside "default".
    if "namespace" in target_ref:
        target_namespace = target_ref["namespace"]
    else:
        target_namespace = DEFAULT_NAMESPACE

    # Get the target pods
    target_pods = corev1.list_namespaced_pod(namespace=target_namespace, label_selector="app=" + target_ref["name"])

    # Retrieve the target containers
    vpa_pod_nodes = {}
    all_container_cpu_requests = {}
    for pod in target_pods.items:
        pod_name = pod.metadata.name
        vpa_pod_nodes[pod_name] = pod.spec.node_name

        pod_requests = {}
        for container in pod.spec.containers:
            # A container may declare no resources, no requests or no CPU
            # request at all; use 1 core by default in all of these cases.
            resources = container.resources
            declared = (resources.requests if resources is not None else None) or {}
            pod_requests[container.name] = str2resource("cpu", declared.get("cpu", "1"))
        all_container_cpu_requests[pod_name] = pod_requests

    # Get the maximum default request if there are many containers.
    is_consistent, max_cpu_val = get_consistent_max_val(all_container_cpu_requests)

    if not is_consistent:
        print("Warning: the containers managed by {} do not have consistent CPU requests!".format(vpa["metadata"]["name"]))

    return max_cpu_val, vpa_pod_nodes
77 |
def resource2str(resource, value):
    """Format a numeric resource amount as a quantity string.

    Args:
        resource: Resource name; "cpu" (case-insensitive) selects the CPU
            format, anything else is formatted as memory.
        value: CPU amount, or memory amount in bytes.

    Returns:
        For CPU: ``value * 1000`` truncated to an integer with an "m" suffix
        when ``value`` is below 1, otherwise ``str(value)``. For memory: the
        value divided by powers of 1024 and truncated, with a "B", "k", "Mi"
        or "Gi" suffix ("B" keeps the value unscaled).
    """
    if resource.lower() == "cpu":
        return (str(int(value * 1000)) + "m") if value < 1 else str(value)

    # Memory is in bytes
    kib = 1024
    mib = kib * 1024
    gib = mib * 1024
    if value < kib:
        return str(value) + "B"
    if value < mib:
        return str(int(value / 1024)) + "k"
    if value < gib:
        return str(int(value / 1024 / 1024)) + "Mi"
    return str(int(value / 1024 / 1024 / 1024)) + "Gi"
95 |
96 | # Convert a resource (CPU, Memory) string to a float value
def str2resource(resource, value):
    """Convert a resource quantity string to a float.

    Args:
        resource: Resource name; "cpu" (case-insensitive) selects CPU parsing,
            anything else is parsed as memory.
        value: Quantity string such as "250m", "2", "128Ki" or "1Gi".
            Non-string values are returned unchanged.

    Returns:
        CPU quantities in cores (a trailing "m" means millicores); memory
        quantities in bytes.

    Raises:
        ValueError: if the numeric part of the string cannot be parsed.
    """
    if not isinstance(value, str):
        return value

    text = value.strip()
    if resource.lower() == "cpu":
        if text.endswith("m"):
            return float(text[:-1]) / 1000
        return float(text)

    # Memory suffixes are matched case-insensitively. "k" is kept 1024-based
    # so that strings produced by resource2str parse back to the same value.
    memory_units = (
        ("ki", 1024),
        ("mi", 1024 ** 2),
        ("gi", 1024 ** 3),
        ("ti", 1024 ** 4),
        ("k", 1024),
        ("b", 1),
    )
    lowered = text.lower()
    for suffix, factor in memory_units:
        if lowered.endswith(suffix):
            return float(text[:-len(suffix)]) * factor
    return float(text)
117 |
def bound_var(var, min_value, max_value):
    """Clamp ``var`` to ``[min_value, max_value]``.

    The lower bound is checked first, so it wins if the bounds are inverted
    and ``var`` is below ``min_value``.
    """
    if var < min_value:
        return min_value
    if var > max_value:
        return max_value
    return var
125 |
126 | # Find the nodes with frequency changes in the last iteration
def find_node_with_frequency_changes(cur_node_frequencies, prev_node_frequencies):
    """Return the nodes whose frequency is new or differs from the previous one.

    A node is reported when it is absent from ``prev_node_frequencies`` or when
    its current value is not equal to the previous value. The result follows
    the iteration order of ``cur_node_frequencies``.
    """
    return [
        node
        for node, frequency in cur_node_frequencies.items()
        if node not in prev_node_frequencies.keys()
        or not (frequency == prev_node_frequencies[node])
    ]
139 |
def get_recommendation(vpa, corev1, node_frequencies, max_node_frequencies, vpa_default_request):
    """
    Build the CPU container recommendations for one VPA.

    For every target pod the default request is scaled by
    ``max_node_frequency / node_frequency`` of the node the pod runs on. When
    the same container name appears in several pods, the largest scaled value
    is kept as the uncapped target. For each container policy that controls
    "cpu", the target and the ``(1 - DELTA)`` / ``(1 + DELTA)`` bounds are
    clamped to the policy's ``minAllowed`` / ``maxAllowed``.

    Args:
        vpa: VPA object as a dict; ``spec.targetRef`` and
            ``spec.resourcePolicy.containerPolicies`` are read.
        corev1: Kubernetes core API client; only ``list_namespaced_pod`` is used.
        node_frequencies: mapping of node name to its current frequency.
        max_node_frequencies: mapping of node name to its maximum frequency.
        vpa_default_request: default CPU request (in cores) to scale.

    Returns:
        A list of dicts with the keys "containerName", "lowerBound", "target",
        "uncappedTarget" and "upperBound".
    """
    vpa_spec = vpa["spec"]

    # example target_ref {'apiVersion': 'apps/v1', 'kind': 'Deployment', 'name': 'hamster'}
    target_ref = vpa_spec["targetRef"]
    print(target_ref)

    target_namespace = target_ref.get("namespace", DEFAULT_NAMESPACE)

    # Pods are matched through an "app=<target name>" label.
    target_pods = corev1.list_namespaced_pod(namespace=target_namespace,
                                             label_selector="app=" + target_ref["name"])

    # Largest frequency-scaled request seen per container name.
    uncapped_targets = {}
    for pod in target_pods.items:
        pod_node = pod.spec.node_name
        # A pod that is not scheduled yet has no node name, and a node may be
        # missing from the frequency data; skip such pods instead of crashing.
        if pod_node not in node_frequencies or pod_node not in max_node_frequencies:
            print("Warning: no frequency data for node {} of pod {}; skipping it.".format(
                pod_node, pod.metadata.name))
            continue

        node_frequency = float(node_frequencies[pod_node])
        if node_frequency <= 0:
            # A non-positive frequency cannot be used as a divisor.
            print("Warning: invalid frequency {} for node {}; skipping pod {}.".format(
                node_frequencies[pod_node], pod_node, pod.metadata.name))
            continue

        uncapped_target = vpa_default_request * float(max_node_frequencies[pod_node]) / node_frequency
        for container in pod.spec.containers:
            previous = uncapped_targets.get(container.name)
            if previous is None:
                uncapped_targets[container.name] = uncapped_target
            else:
                uncapped_targets[container.name] = max(uncapped_target, previous)

    recommendations = []
    for container_policy in vpa_spec["resourcePolicy"]["containerPolicies"]:
        controlled_resources = container_policy["controlledResources"]
        max_allowed = container_policy["maxAllowed"]
        min_allowed = container_policy["minAllowed"]

        for resource in controlled_resources:
            # Only CPU recommendations are produced.
            if resource != "cpu":
                continue

            for container_name, uncapped_target in uncapped_targets.items():
                min_allowed_value = str2resource(resource, min_allowed[resource])
                max_allowed_value = str2resource(resource, max_allowed[resource])

                # Clamp the target and both bounds to the allowed range.
                target = bound_var(uncapped_target, min_allowed_value, max_allowed_value)
                lower_bound = bound_var(uncapped_target * (1 - DELTA), min_allowed_value, max_allowed_value)
                upper_bound = bound_var(uncapped_target * (1 + DELTA), min_allowed_value, max_allowed_value)

                # Convert the numeric values to quantity strings.
                recommendations.append({
                    "containerName": container_name,
                    "lowerBound": {resource: resource2str(resource, lower_bound)},
                    "target": {resource: resource2str(resource, target)},
                    "uncappedTarget": {resource: resource2str(resource, uncapped_target)},
                    "upperBound": {resource: resource2str(resource, upper_bound)},
                })
    return recommendations
209 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | cachetools==4.2.4
2 | certifi==2022.9.14
3 | charset-normalizer==2.0.12
4 | google-auth==2.11.1
5 | idna==3.4
6 | kubernetes==24.2.0
7 | oauthlib==3.2.1
8 | pyasn1==0.4.8
9 | pyasn1-modules==0.2.8
10 | python-dateutil==2.8.2
11 | PyYAML==6.0
12 | requests==2.27.1
13 | requests-oauthlib==1.3.1
14 | rsa==4.9
15 | six==1.16.0
16 | urllib3==1.26.12
17 | websocket-client==1.3.1
18 |
--------------------------------------------------------------------------------
/scripts/set_cpu_freq.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Pin the CPU frequency by setting both the lower (-d) and the upper (-u)
# limit to the same value.
# Usage: set_cpu_freq.sh <frequency>

if [ -z "${1}" ]; then
    echo "Usage: $0 <frequency>" >&2
    exit 1
fi

FREQ="${1}"
cpupower frequency-set -d "${FREQ}"
cpupower frequency-set -u "${FREQ}"
--------------------------------------------------------------------------------
/scripts/watch_vpa.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Print the CPU target of the first container recommendation of a VPA.
# Usage: watch_vpa.sh [vpa-name]
# Without a name, kubectl is invoked without one, as before.

COLUMN_SPEC="custom-columns=:status.recommendation.containerRecommendations[0].target.cpu"

# Pass the name as a single quoted argument, but only when one was given.
VPA_ARGS=()
if [ -n "${1}" ]; then
    VPA_ARGS=("${1}")
fi

echo "\$kubectl get vpa ${1} --no-headers -o \"${COLUMN_SPEC}\""
kubectl get vpa "${VPA_ARGS[@]}" --no-headers -o "${COLUMN_SPEC}"
echo -e "\n"
--------------------------------------------------------------------------------
/testPromClient.py:
--------------------------------------------------------------------------------
1 | from utils import *
2 |
# Manual check: query the Prometheus endpoint at 127.0.0.1:39090 and print the
# per-node maximum CPU frequency.
if __name__ == '__main__':
    # Metric names; only the maximum-frequency query is used below.
    max_cpu_frequency_query = "node_cpu_frequency_max_hertz"
    min_cpu_frequency_query = "node_cpu_frequency_min_hertz"
    latest_cpu_frequency_query = "node_cpu_scaling_frequency_hertz"
    pod_ips_query = "pod_energy_stat"

    prom_address = "http://127.0.0.1:39090"
    prom_client = PromClient(prom_address)

    all_node_homogeneous_max_frequencies = get_all_node_homogeneous_frequencies(
        prom_client, max_cpu_frequency_query
    )
    print(all_node_homogeneous_max_frequencies)
15 |
16 |
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | from PromClient import *
2 |
def parse_frequency_dict(cpu_frequency_data):
    """Group frequency samples by node and CPU index.

    Each element must provide ``metric.instance``, ``metric.cpu`` and a
    ``value`` sequence whose second item is the sample value.

    Returns:
        ``{instance: {cpu: value}}``; a later sample for the same
        instance/cpu pair overwrites an earlier one.
    """
    frequencies_by_node = {}
    for sample in cpu_frequency_data:
        labels = sample["metric"]
        instance = labels["instance"]
        cpu = labels["cpu"]
        per_cpu = frequencies_by_node.setdefault(instance, {})
        per_cpu[cpu] = sample["value"][1]
    return frequencies_by_node
14 |
def get_homogeneous_value(node_frequencies):
    """Return one representative frequency for a node.

    Args:
        node_frequencies: mapping of CPU index to a frequency value that
            ``int()`` can convert.

    Returns:
        The highest per-CPU frequency, rounded by ``get_rounded_frequency``,
        or ``-1`` when the mapping is empty.
    """
    # Return the sentinel before rounding: rounding -1 would yield 0, and the
    # "no data" condition would then be indistinguishable from a real value.
    if not node_frequencies:
        return -1

    highest_frequency = max(int(raw) for raw in node_frequencies.values())
    return get_rounded_frequency(highest_frequency)


def get_rounded_frequency(frequency):
    """Round ``frequency`` to the nearest multiple of 1,000,000,000.

    Uses the built-in ``round``, so exact halves go to the even multiple.
    """
    return round(frequency / 1000000000) * 1000000000
31 |
def get_all_node_homogeneous_frequencies(prom_cient, prometheus_query):
    """Query Prometheus and reduce the per-CPU samples to one value per node.

    Args:
        prom_cient: client object exposing ``get_query(query)``.
        prometheus_query: query string passed to ``get_query``.

    Returns:
        ``{node: frequency}``, or ``None`` when the query returns ``None`` or
        any node's value is reported as ``-1`` by ``get_homogeneous_value``.
    """
    samples = prom_cient.get_query(prometheus_query)
    if samples is None:
        return None

    homogeneous_by_node = {}
    for node, cpu_frequencies in parse_frequency_dict(samples).items():
        node_frequency = get_homogeneous_value(cpu_frequencies)
        if node_frequency == -1:
            # One unusable node invalidates the whole result.
            return None
        homogeneous_by_node[node] = node_frequency
    return homogeneous_by_node
--------------------------------------------------------------------------------