├── .DS_Store ├── .gitignore ├── .idea ├── .gitignore ├── deployment.xml ├── energy-aware-recommender.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── remote-mappings.xml └── vcs.xml ├── Dockerfile ├── LICENSE ├── PromClient.py ├── README.md ├── dashboards ├── .DS_Store ├── clever-dashboard-w-IPS.json ├── clever-dashboard.json └── clever-sysbench-kubecon22.json ├── main.py ├── manifests ├── clever.yaml ├── random.yaml └── sysbench.yaml ├── recommender.py ├── requirements.txt ├── scripts ├── set_cpu_freq.sh └── watch_vpa.sh ├── testPromClient.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/clever/fc20b3e8947978ec13c744daa4f936d16af4d3fa/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 36 | -------------------------------------------------------------------------------- /.idea/energy-aware-recommender.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 38 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/remote-mappings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6.8 2 | 3 | WORKDIR /root/src/clever 4 | 5 | COPY requirements.txt ./ 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY . . 9 | 10 | CMD [ "python", "./main.py" ] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /PromClient.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | import os 5 | import json 6 | 7 | requests.packages.urllib3.disable_warnings() 8 | 9 | class PromClient: 10 | now = None 11 | start = None 12 | prom_address = "http://127.0.0.1:9090" 13 | prom_token = None 14 | step = '15s' 15 | chunk_sz = 900 16 | 17 | def __init__(self, prom_address=None, prom_token=None): 18 | self.prom_address = prom_address or os.getenv("PROM_HOST") 19 | self.prom_token = prom_token or os.getenv("PROM_TOKEN") 20 | 21 | if not self.prom_address: 22 | raise ValueError( 23 | "Please appropriately configure environment variables $PROM_HOST, $PROM_TOKEN to successfully run the crawler and profiler!") 24 | 25 | def get_query(self, my_query): 26 | try: 27 | if self.prom_token: 28 | headers = {"content-type": "application/json; charset=UTF-8", 29 | 'Authorization': 'Bearer {}'.format(self.prom_token)} 30 | else: 31 | headers = {"content-type": "application/json; charset=UTF-8"} 32 | response = requests.get('{0}/api/v1/query'.format(self.prom_address), 33 | params={'query': my_query}, 34 | headers=headers, verify=False) 35 | 36 | except requests.exceptions.RequestException as e: 37 | print(e) 38 | return None 39 | 40 | try: 41 | if response.json()['status'] != "success": 42 | print("Error processing the request: " + response.json()['status']) 43 | print("The Error is: " + response.json()['error']) 44 | return None 45 | 46 | results = response.json()['data']['result'] 47 | 48 | if (results is None): 49 | # print("the results[] came back empty!") 50 | return None 51 | 52 | length = len(results) 53 | if length > 0: 54 | return results 55 | else: 56 | # print("the results[] has no entries!") 57 | return None 58 | except: 59 | print(response) 60 | return None -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CLEVER 2 | Container Level Energy-efficient VPA Recommender for Kubernetes 3 | 4 | ## Pre-requisites 5 | - Baremetal Node OS - RedHat 8 6 | - Kubernetes 1.22+ 7 | - Kepler v0.2 8 | - Prometheus release-0.11 9 | - Kubernetes Vertical Pod Autoscaler (VPA) 0.11 10 | 11 | ## Installation 12 | ### Install Kepler 13 | - Follow the instructions in the [Kepler](https://github.com/sustainable-computing-io/kepler) to install Kepler as DaemonSets on nodes of the Kubernetes Cluster. 14 | 15 | ### Install Prometheus & Grafana Dashboard 16 | - Follow the instructions in the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) to install both Prometheus and Grafana on the Kubernetes Cluster. 17 | - Import the [Grafana Dashboard](https://grafana.com/docs/grafana/v9.0/dashboards/export-import/). The dashboard is available in the `dashboards/clever-dashboard.json` folder. 18 | - Access Prometheus UI and Grafana Dashboard via `kubectl port-forward` command following the [Access UIs tutorial](https://github.com/prometheus-operator/kube-prometheus/blob/main/docs/access-ui.md). 19 | 20 | ### Install VPA 21 | - Follow the instructions [here](https://github.com/kubernetes/autoscaler/blob/master/vertical-pod-autoscaler/README.md) to install the VPA on the Kubernetes Cluster. 22 | 23 | ### Install CLEVER 24 | - Clone the CLEVER repository 25 | ```bash 26 | git clone https://github.com/sustainable-computing-io/clever.git 27 | ``` 28 | 29 | - Deploy CLEVER Recommender to run as an alternative recommender for VPA. 30 | ```bash 31 | kubectl apply -f manifests/clever.yaml 32 | ``` 33 | 34 | ## Tutorial 35 | - Deploy the example application that selects the CLEVER recommender for VPA. 36 | ```bash 37 | kubectl apply -f manifests/random.yaml 38 | ``` 39 | 40 | - The example application defines a VPA Custom Resource with the following configuration: 41 | ```yaml 42 | apiVersion: "autoscaling.k8s.io/v1" 43 | kind: VerticalPodAutoscaler 44 | metadata: 45 | name: random-vpa 46 | spec: 47 | recommenders: 48 | - name: clever 49 | targetRef: 50 | apiVersion: "apps/v1" 51 | kind: Deployment 52 | name: random 53 | resourcePolicy: 54 | containerPolicies: 55 | - containerName: '*' 56 | minAllowed: 57 | cpu: 100m 58 | maxAllowed: 59 | cpu: 16 60 | controlledResources: ["cpu"] 61 | ``` 62 | 63 | - Monitor the recommended CPU requests for the example application by watching the VPA object. 64 | ```bash 65 | watch -n 0.1 ./scripts/vpa.sh random-vpa 66 | ``` 67 | 68 | - Change the node CPU frequencies to observe the effect on the recommended CPU requests. 69 | ```bash 70 | ./scripts/set_cpu_freq.sh 1GHz 71 | ``` 72 | -------------------------------------------------------------------------------- /dashboards/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/clever/fc20b3e8947978ec13c744daa4f936d16af4d3fa/dashboards/.DS_Store -------------------------------------------------------------------------------- /dashboards/clever-dashboard-w-IPS.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 27, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "datasource": { 33 | "type": "prometheus", 34 | "uid": "P1809F7CD0C75ACF3" 35 | }, 36 | "description": "For the node with more than 1 CPU, choose the maximum frequency.", 37 | "fieldConfig": { 38 | "defaults": { 39 | "color": { 40 | "mode": "palette-classic" 41 | }, 42 | "custom": { 43 | "axisCenteredZero": false, 44 | "axisColorMode": "text", 45 | "axisLabel": "", 46 | "axisPlacement": "auto", 47 | "barAlignment": 0, 48 | "drawStyle": "line", 49 | "fillOpacity": 0, 50 | "gradientMode": "none", 51 | "hideFrom": { 52 | "legend": false, 53 | "tooltip": false, 54 | "viz": false 55 | }, 56 | "lineInterpolation": "linear", 57 | "lineWidth": 1, 58 | "pointSize": 5, 59 | "scaleDistribution": { 60 | "type": "linear" 61 | }, 62 | "showPoints": "auto", 63 | "spanNulls": false, 64 | "stacking": { 65 | "group": "A", 66 | "mode": "none" 67 | }, 68 | "thresholdsStyle": { 69 | "mode": "off" 70 | } 71 | }, 72 | "mappings": [], 73 | "thresholds": { 74 | "mode": "absolute", 75 | "steps": [ 76 | { 77 | "color": "green", 78 | "value": null 79 | }, 80 | { 81 | "color": "red", 82 | "value": 80 83 | } 84 | ] 85 | }, 86 | "unit": "hertz" 87 | }, 88 | "overrides": [] 89 | }, 90 | "gridPos": { 91 | "h": 9, 92 | "w": 24, 93 | "x": 0, 94 | "y": 0 95 | }, 96 | "id": 4, 97 | "options": { 98 | "legend": { 99 | "calcs": [], 100 | "displayMode": "list", 101 | "placement": "bottom", 102 | "showLegend": true 103 | }, 104 | "tooltip": { 105 | "mode": "single", 106 | "sort": "none" 107 | } 108 | }, 109 | "targets": [ 110 | { 111 | "datasource": { 112 | "type": "prometheus", 113 | "uid": "P1809F7CD0C75ACF3" 114 | }, 115 | "editorMode": "builder", 116 | "expr": "max(node_cpu_scaling_frequency_hertz{instance=\"$node\"})", 117 | "legendFormat": "Current CPU Frequency ", 118 | "range": true, 119 | "refId": "A" 120 | }, 121 | { 122 | "datasource": { 123 | "type": "prometheus", 124 | "uid": "P1809F7CD0C75ACF3" 125 | }, 126 | "editorMode": "builder", 127 | "expr": "max(node_cpu_frequency_max_hertz{instance=\"$node\"})", 128 | "hide": false, 129 | "legendFormat": "Max CPU Frequency", 130 | "range": true, 131 | "refId": "B" 132 | }, 133 | { 134 | "datasource": { 135 | "type": "prometheus", 136 | "uid": "P1809F7CD0C75ACF3" 137 | }, 138 | "editorMode": "builder", 139 | "expr": "max(node_cpu_frequency_min_hertz{instance=\"$node\"})", 140 | "hide": false, 141 | "legendFormat": "Min CPU Frequency", 142 | "range": true, 143 | "refId": "C" 144 | } 145 | ], 146 | "title": "Current CPU Frequency (Max of all CPUs) for Node $node", 147 | "type": "timeseries" 148 | }, 149 | { 150 | "datasource": { 151 | "type": "prometheus", 152 | "uid": "P1809F7CD0C75ACF3" 153 | }, 154 | "fieldConfig": { 155 | "defaults": { 156 | "color": { 157 | "mode": "palette-classic" 158 | }, 159 | "custom": { 160 | "axisCenteredZero": false, 161 | "axisColorMode": "text", 162 | "axisLabel": "", 163 | "axisPlacement": "auto", 164 | "barAlignment": 0, 165 | "drawStyle": "line", 166 | "fillOpacity": 0, 167 | "gradientMode": "none", 168 | "hideFrom": { 169 | "legend": false, 170 | "tooltip": false, 171 | "viz": false 172 | }, 173 | "lineInterpolation": "linear", 174 | "lineWidth": 1, 175 | "pointSize": 5, 176 | "scaleDistribution": { 177 | "type": "linear" 178 | }, 179 | "showPoints": "auto", 180 | "spanNulls": false, 181 | "stacking": { 182 | "group": "A", 183 | "mode": "none" 184 | }, 185 | "thresholdsStyle": { 186 | "mode": "off" 187 | } 188 | }, 189 | "mappings": [], 190 | "max": 4, 191 | "min": 0, 192 | "thresholds": { 193 | "mode": "absolute", 194 | "steps": [ 195 | { 196 | "color": "green", 197 | "value": null 198 | }, 199 | { 200 | "color": "red", 201 | "value": 80 202 | } 203 | ] 204 | } 205 | }, 206 | "overrides": [] 207 | }, 208 | "gridPos": { 209 | "h": 10, 210 | "w": 24, 211 | "x": 0, 212 | "y": 9 213 | }, 214 | "id": 2, 215 | "options": { 216 | "legend": { 217 | "calcs": [], 218 | "displayMode": "list", 219 | "placement": "bottom", 220 | "showLegend": true 221 | }, 222 | "tooltip": { 223 | "mode": "single", 224 | "sort": "none" 225 | } 226 | }, 227 | "targets": [ 228 | { 229 | "datasource": { 230 | "type": "prometheus", 231 | "uid": "P1809F7CD0C75ACF3" 232 | }, 233 | "editorMode": "builder", 234 | "expr": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\", resource=\"cpu\", container=\"$container\"}", 235 | "legendFormat": "{{pod}}", 236 | "range": true, 237 | "refId": "A" 238 | } 239 | ], 240 | "title": "Managed Container CPU Requests", 241 | "type": "timeseries" 242 | }, 243 | { 244 | "datasource": { 245 | "type": "prometheus", 246 | "uid": "P1809F7CD0C75ACF3" 247 | }, 248 | "description": "", 249 | "fieldConfig": { 250 | "defaults": { 251 | "color": { 252 | "mode": "palette-classic" 253 | }, 254 | "custom": { 255 | "axisCenteredZero": false, 256 | "axisColorMode": "text", 257 | "axisLabel": "", 258 | "axisPlacement": "auto", 259 | "barAlignment": 0, 260 | "drawStyle": "line", 261 | "fillOpacity": 0, 262 | "gradientMode": "none", 263 | "hideFrom": { 264 | "legend": false, 265 | "tooltip": false, 266 | "viz": false 267 | }, 268 | "lineInterpolation": "linear", 269 | "lineWidth": 1, 270 | "pointSize": 5, 271 | "scaleDistribution": { 272 | "type": "linear" 273 | }, 274 | "showPoints": "auto", 275 | "spanNulls": false, 276 | "stacking": { 277 | "group": "A", 278 | "mode": "none" 279 | }, 280 | "thresholdsStyle": { 281 | "mode": "off" 282 | } 283 | }, 284 | "mappings": [], 285 | "max": 10000000000, 286 | "min": 0, 287 | "thresholds": { 288 | "mode": "absolute", 289 | "steps": [ 290 | { 291 | "color": "green", 292 | "value": null 293 | }, 294 | { 295 | "color": "red", 296 | "value": 80 297 | } 298 | ] 299 | } 300 | }, 301 | "overrides": [] 302 | }, 303 | "gridPos": { 304 | "h": 9, 305 | "w": 24, 306 | "x": 0, 307 | "y": 19 308 | }, 309 | "id": 6, 310 | "options": { 311 | "legend": { 312 | "calcs": [], 313 | "displayMode": "list", 314 | "placement": "bottom", 315 | "showLegend": true 316 | }, 317 | "tooltip": { 318 | "mode": "single", 319 | "sort": "none" 320 | } 321 | }, 322 | "targets": [ 323 | { 324 | "datasource": { 325 | "type": "prometheus", 326 | "uid": "P1809F7CD0C75ACF3" 327 | }, 328 | "editorMode": "code", 329 | "expr": "pod_cpu_instructions{pod_namespace=\"$namespace\"} / 3", 330 | "legendFormat": "{{pod_name}}", 331 | "range": true, 332 | "refId": "A" 333 | } 334 | ], 335 | "title": "Container Actual IPS (Instructions/second)", 336 | "type": "timeseries" 337 | } 338 | ], 339 | "schemaVersion": 37, 340 | "style": "dark", 341 | "tags": [], 342 | "templating": { 343 | "list": [ 344 | { 345 | "current": { 346 | "selected": false, 347 | "text": "default", 348 | "value": "default" 349 | }, 350 | "datasource": { 351 | "type": "prometheus", 352 | "uid": "P1809F7CD0C75ACF3" 353 | }, 354 | "definition": "label_values(kube_pod_container_resource_requests, namespace)", 355 | "hide": 0, 356 | "includeAll": false, 357 | "label": "Namespace", 358 | "multi": false, 359 | "name": "namespace", 360 | "options": [], 361 | "query": { 362 | "query": "label_values(kube_pod_container_resource_requests, namespace)", 363 | "refId": "StandardVariableQuery" 364 | }, 365 | "refresh": 1, 366 | "regex": "", 367 | "skipUrlSync": false, 368 | "sort": 0, 369 | "type": "query" 370 | }, 371 | { 372 | "current": { 373 | "selected": true, 374 | "text": "sysbench", 375 | "value": "sysbench" 376 | }, 377 | "datasource": { 378 | "type": "prometheus", 379 | "uid": "P1809F7CD0C75ACF3" 380 | }, 381 | "definition": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)", 382 | "hide": 0, 383 | "includeAll": false, 384 | "label": "Container", 385 | "multi": false, 386 | "name": "container", 387 | "options": [], 388 | "query": { 389 | "query": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)", 390 | "refId": "StandardVariableQuery" 391 | }, 392 | "refresh": 1, 393 | "regex": "", 394 | "skipUrlSync": false, 395 | "sort": 0, 396 | "type": "query" 397 | }, 398 | { 399 | "current": { 400 | "selected": false, 401 | "text": "clever.ibm.cloud", 402 | "value": "clever.ibm.cloud" 403 | }, 404 | "datasource": { 405 | "type": "prometheus", 406 | "uid": "P1809F7CD0C75ACF3" 407 | }, 408 | "definition": "label_values(node_cpu_frequency_max_hertz, instance)", 409 | "hide": 0, 410 | "includeAll": false, 411 | "label": "Node", 412 | "multi": false, 413 | "name": "node", 414 | "options": [], 415 | "query": { 416 | "query": "label_values(node_cpu_frequency_max_hertz, instance)", 417 | "refId": "StandardVariableQuery" 418 | }, 419 | "refresh": 1, 420 | "regex": "", 421 | "skipUrlSync": false, 422 | "sort": 0, 423 | "type": "query" 424 | } 425 | ] 426 | }, 427 | "time": { 428 | "from": "now-5m", 429 | "to": "now" 430 | }, 431 | "timepicker": {}, 432 | "timezone": "", 433 | "title": "Clever VPA Recommender", 434 | "uid": "ePYMOfnVk", 435 | "version": 5, 436 | "weekStart": "" 437 | } -------------------------------------------------------------------------------- /dashboards/clever-dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 29, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "datasource": { 33 | "type": "prometheus", 34 | "uid": "P1809F7CD0C75ACF3" 35 | }, 36 | "description": "For the node with more than 1 CPU, choose the maximum frequency.", 37 | "fieldConfig": { 38 | "defaults": { 39 | "color": { 40 | "mode": "palette-classic" 41 | }, 42 | "custom": { 43 | "axisLabel": "", 44 | "axisPlacement": "auto", 45 | "barAlignment": 0, 46 | "drawStyle": "line", 47 | "fillOpacity": 0, 48 | "gradientMode": "none", 49 | "hideFrom": { 50 | "legend": false, 51 | "tooltip": false, 52 | "viz": false 53 | }, 54 | "lineInterpolation": "linear", 55 | "lineWidth": 1, 56 | "pointSize": 5, 57 | "scaleDistribution": { 58 | "type": "linear" 59 | }, 60 | "showPoints": "auto", 61 | "spanNulls": false, 62 | "stacking": { 63 | "group": "A", 64 | "mode": "none" 65 | }, 66 | "thresholdsStyle": { 67 | "mode": "off" 68 | } 69 | }, 70 | "mappings": [], 71 | "thresholds": { 72 | "mode": "absolute", 73 | "steps": [ 74 | { 75 | "color": "green", 76 | "value": null 77 | }, 78 | { 79 | "color": "red", 80 | "value": 80 81 | } 82 | ] 83 | }, 84 | "unit": "hertz" 85 | }, 86 | "overrides": [] 87 | }, 88 | "gridPos": { 89 | "h": 9, 90 | "w": 24, 91 | "x": 0, 92 | "y": 0 93 | }, 94 | "id": 4, 95 | "options": { 96 | "legend": { 97 | "calcs": [], 98 | "displayMode": "list", 99 | "placement": "bottom" 100 | }, 101 | "tooltip": { 102 | "mode": "single", 103 | "sort": "none" 104 | } 105 | }, 106 | "targets": [ 107 | { 108 | "datasource": { 109 | "type": "prometheus", 110 | "uid": "P1809F7CD0C75ACF3" 111 | }, 112 | "editorMode": "builder", 113 | "expr": "max(node_cpu_scaling_frequency_hertz{instance=\"$node\"})", 114 | "legendFormat": "Current CPU Frequency ", 115 | "range": true, 116 | "refId": "A" 117 | }, 118 | { 119 | "datasource": { 120 | "type": "prometheus", 121 | "uid": "P1809F7CD0C75ACF3" 122 | }, 123 | "editorMode": "builder", 124 | "expr": "max(node_cpu_frequency_max_hertz{instance=\"$node\"})", 125 | "hide": false, 126 | "legendFormat": "Max CPU Frequency", 127 | "range": true, 128 | "refId": "B" 129 | }, 130 | { 131 | "datasource": { 132 | "type": "prometheus", 133 | "uid": "P1809F7CD0C75ACF3" 134 | }, 135 | "editorMode": "builder", 136 | "expr": "max(node_cpu_frequency_min_hertz{instance=\"$node\"})", 137 | "hide": false, 138 | "legendFormat": "Min CPU Frequency", 139 | "range": true, 140 | "refId": "C" 141 | } 142 | ], 143 | "title": "Current CPU Frequency (Max of all CPUs) for Node $node", 144 | "type": "timeseries" 145 | }, 146 | { 147 | "datasource": { 148 | "type": "prometheus", 149 | "uid": "P1809F7CD0C75ACF3" 150 | }, 151 | "fieldConfig": { 152 | "defaults": { 153 | "color": { 154 | "mode": "palette-classic" 155 | }, 156 | "custom": { 157 | "axisLabel": "", 158 | "axisPlacement": "auto", 159 | "barAlignment": 0, 160 | "drawStyle": "line", 161 | "fillOpacity": 0, 162 | "gradientMode": "none", 163 | "hideFrom": { 164 | "legend": false, 165 | "tooltip": false, 166 | "viz": false 167 | }, 168 | "lineInterpolation": "linear", 169 | "lineWidth": 1, 170 | "pointSize": 5, 171 | "scaleDistribution": { 172 | "type": "linear" 173 | }, 174 | "showPoints": "auto", 175 | "spanNulls": false, 176 | "stacking": { 177 | "group": "A", 178 | "mode": "none" 179 | }, 180 | "thresholdsStyle": { 181 | "mode": "off" 182 | } 183 | }, 184 | "mappings": [], 185 | "thresholds": { 186 | "mode": "absolute", 187 | "steps": [ 188 | { 189 | "color": "green", 190 | "value": null 191 | }, 192 | { 193 | "color": "red", 194 | "value": 80 195 | } 196 | ] 197 | } 198 | }, 199 | "overrides": [] 200 | }, 201 | "gridPos": { 202 | "h": 10, 203 | "w": 24, 204 | "x": 0, 205 | "y": 9 206 | }, 207 | "id": 2, 208 | "options": { 209 | "legend": { 210 | "calcs": [], 211 | "displayMode": "list", 212 | "placement": "bottom" 213 | }, 214 | "tooltip": { 215 | "mode": "single", 216 | "sort": "none" 217 | } 218 | }, 219 | "targets": [ 220 | { 221 | "datasource": { 222 | "type": "prometheus", 223 | "uid": "P1809F7CD0C75ACF3" 224 | }, 225 | "editorMode": "builder", 226 | "expr": "cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\", resource=\"cpu\", container=\"$container\"}", 227 | "legendFormat": "{{pod}}", 228 | "range": true, 229 | "refId": "A" 230 | } 231 | ], 232 | "title": "Managed Container CPU Requests", 233 | "type": "timeseries" 234 | } 235 | ], 236 | "schemaVersion": 36, 237 | "style": "dark", 238 | "tags": [], 239 | "templating": { 240 | "list": [ 241 | { 242 | "current": { 243 | "selected": true, 244 | "text": "default", 245 | "value": "default" 246 | }, 247 | "datasource": { 248 | "type": "prometheus", 249 | "uid": "P1809F7CD0C75ACF3" 250 | }, 251 | "definition": "label_values(kube_pod_container_resource_requests, namespace)", 252 | "hide": 0, 253 | "includeAll": false, 254 | "label": "Namespace", 255 | "multi": false, 256 | "name": "namespace", 257 | "options": [], 258 | "query": { 259 | "query": "label_values(kube_pod_container_resource_requests, namespace)", 260 | "refId": "StandardVariableQuery" 261 | }, 262 | "refresh": 1, 263 | "regex": "", 264 | "skipUrlSync": false, 265 | "sort": 0, 266 | "type": "query" 267 | }, 268 | { 269 | "current": { 270 | "selected": false, 271 | "text": "hamster", 272 | "value": "hamster" 273 | }, 274 | "datasource": { 275 | "type": "prometheus", 276 | "uid": "P1809F7CD0C75ACF3" 277 | }, 278 | "definition": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)", 279 | "hide": 0, 280 | "includeAll": false, 281 | "label": "Container", 282 | "multi": false, 283 | "name": "container", 284 | "options": [], 285 | "query": { 286 | "query": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)", 287 | "refId": "StandardVariableQuery" 288 | }, 289 | "refresh": 1, 290 | "regex": "", 291 | "skipUrlSync": false, 292 | "sort": 0, 293 | "type": "query" 294 | }, 295 | { 296 | "current": { 297 | "selected": false, 298 | "text": "load-test.ibm.cloud", 299 | "value": "load-test.ibm.cloud" 300 | }, 301 | "datasource": { 302 | "type": "prometheus", 303 | "uid": "P1809F7CD0C75ACF3" 304 | }, 305 | "definition": "label_values(node_cpu_frequency_max_hertz, instance)", 306 | "hide": 0, 307 | "includeAll": false, 308 | "label": "Node", 309 | "multi": false, 310 | "name": "node", 311 | "options": [], 312 | "query": { 313 | "query": "label_values(node_cpu_frequency_max_hertz, instance)", 314 | "refId": "StandardVariableQuery" 315 | }, 316 | "refresh": 1, 317 | "regex": "", 318 | "skipUrlSync": false, 319 | "sort": 0, 320 | "type": "query" 321 | } 322 | ] 323 | }, 324 | "time": { 325 | "from": "now-15m", 326 | "to": "now" 327 | }, 328 | "timepicker": {}, 329 | "timezone": "", 330 | "title": "Clever VPA Recommender", 331 | "uid": "ePYMOfnVk", 332 | "version": 13, 333 | "weekStart": "" 334 | } -------------------------------------------------------------------------------- /dashboards/clever-sysbench-kubecon22.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type": "dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 28, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "datasource": { 33 | "type": "prometheus", 34 | "uid": "P1809F7CD0C75ACF3" 35 | }, 36 | "description": "For the node with more than 1 CPU, choose the maximum frequency.", 37 | "fieldConfig": { 38 | "defaults": { 39 | "color": { 40 | "mode": "palette-classic" 41 | }, 42 | "custom": { 43 | "axisCenteredZero": false, 44 | "axisColorMode": "text", 45 | "axisLabel": "", 46 | "axisPlacement": "auto", 47 | "barAlignment": 0, 48 | "drawStyle": "line", 49 | "fillOpacity": 0, 50 | "gradientMode": "none", 51 | "hideFrom": { 52 | "legend": false, 53 | "tooltip": false, 54 | "viz": false 55 | }, 56 | "lineInterpolation": "linear", 57 | "lineWidth": 1, 58 | "pointSize": 5, 59 | "scaleDistribution": { 60 | "type": "linear" 61 | }, 62 | "showPoints": "auto", 63 | "spanNulls": false, 64 | "stacking": { 65 | "group": "A", 66 | "mode": "none" 67 | }, 68 | "thresholdsStyle": { 69 | "mode": "off" 70 | } 71 | }, 72 | "mappings": [], 73 | "thresholds": { 74 | "mode": "absolute", 75 | "steps": [ 76 | { 77 | "color": "green", 78 | "value": null 79 | }, 80 | { 81 | "color": "red", 82 | "value": 80 83 | } 84 | ] 85 | }, 86 | "unit": "hertz" 87 | }, 88 | "overrides": [] 89 | }, 90 | "gridPos": { 91 | "h": 9, 92 | "w": 24, 93 | "x": 0, 94 | "y": 0 95 | }, 96 | "id": 4, 97 | "options": { 98 | "legend": { 99 | "calcs": [], 100 | "displayMode": "list", 101 | "placement": "bottom", 102 | "showLegend": true 103 | }, 104 | "tooltip": { 105 | "mode": "single", 106 | "sort": "none" 107 | } 108 | }, 109 | "targets": [ 110 | { 111 | "datasource": { 112 | "type": "prometheus", 113 | "uid": "P1809F7CD0C75ACF3" 114 | }, 115 | "editorMode": "builder", 116 | "expr": "max(node_cpu_scaling_frequency_hertz{instance=\"$node\"})", 117 | "legendFormat": "Current CPU Frequency ", 118 | "range": true, 119 | "refId": "A" 120 | }, 121 | { 122 | "datasource": { 123 | "type": "prometheus", 124 | "uid": "P1809F7CD0C75ACF3" 125 | }, 126 | "editorMode": "builder", 127 | "expr": "max(node_cpu_frequency_max_hertz{instance=\"$node\"})", 128 | "hide": false, 129 | "legendFormat": "Max CPU Frequency", 130 | "range": true, 131 | "refId": "B" 132 | }, 133 | { 134 | "datasource": { 135 | "type": "prometheus", 136 | "uid": "P1809F7CD0C75ACF3" 137 | }, 138 | "editorMode": "builder", 139 | "expr": "max(node_cpu_frequency_min_hertz{instance=\"$node\"})", 140 | "hide": false, 141 | "legendFormat": "Min CPU Frequency", 142 | "range": true, 143 | "refId": "C" 144 | } 145 | ], 146 | "title": "Current CPU Frequency (Max of all CPUs) for Node $node", 147 | "type": "timeseries" 148 | }, 149 | { 150 | "datasource": { 151 | "type": "prometheus", 152 | "uid": "P1809F7CD0C75ACF3" 153 | }, 154 | "fieldConfig": { 155 | "defaults": { 156 | "color": { 157 | "mode": "palette-classic" 158 | }, 159 | "custom": { 160 | "axisCenteredZero": false, 161 | "axisColorMode": "text", 162 | "axisLabel": "", 163 | "axisPlacement": "auto", 164 | "barAlignment": 0, 165 | "drawStyle": "line", 166 | "fillOpacity": 0, 167 | "gradientMode": "none", 168 | "hideFrom": { 169 | "legend": false, 170 | "tooltip": false, 171 | "viz": false 172 | }, 173 | "lineInterpolation": "linear", 174 | "lineWidth": 1, 175 | "pointSize": 5, 176 | "scaleDistribution": { 177 | "type": "linear" 178 | }, 179 | "showPoints": "auto", 180 | "spanNulls": false, 181 | "stacking": { 182 | "group": "A", 183 | "mode": "none" 184 | }, 185 | "thresholdsStyle": { 186 | "mode": "off" 187 | } 188 | }, 189 | "mappings": [], 190 | "max": 1, 191 | "min": 0, 192 | "thresholds": { 193 | "mode": "absolute", 194 | "steps": [ 195 | { 196 | "color": "green", 197 | "value": null 198 | }, 199 | { 200 | "color": "red", 201 | "value": 80 202 | } 203 | ] 204 | } 205 | }, 206 | "overrides": [] 207 | }, 208 | "gridPos": { 209 | "h": 10, 210 | "w": 24, 211 | "x": 0, 212 | "y": 9 213 | }, 214 | "id": 2, 215 | "options": { 216 | "legend": { 217 | "calcs": [], 218 | "displayMode": "list", 219 | "placement": "bottom", 220 | "showLegend": true 221 | }, 222 | "tooltip": { 223 | "mode": "single", 224 | "sort": "none" 225 | } 226 | }, 227 | "targets": [ 228 | { 229 | "datasource": { 230 | "type": "prometheus", 231 | "uid": "P1809F7CD0C75ACF3" 232 | }, 233 | "editorMode": "builder", 234 | "expr": "max by(container) (cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests{namespace=\"$namespace\", resource=\"cpu\", container=\"$container\"})", 235 | "legendFormat": "{{pod}}", 236 | "range": true, 237 | "refId": "A" 238 | } 239 | ], 240 | "title": "Managed Container CPU Requests", 241 | "type": "timeseries" 242 | }, 243 | { 244 | "datasource": { 245 | "type": "prometheus", 246 | "uid": "P1809F7CD0C75ACF3" 247 | }, 248 | "description": "", 249 | "fieldConfig": { 250 | "defaults": { 251 | "color": { 252 | "mode": "palette-classic" 253 | }, 254 | "custom": { 255 | "axisCenteredZero": false, 256 | "axisColorMode": "text", 257 | "axisLabel": "", 258 | "axisPlacement": "auto", 259 | "barAlignment": 0, 260 | "drawStyle": "line", 261 | "fillOpacity": 0, 262 | "gradientMode": "none", 263 | "hideFrom": { 264 | "legend": false, 265 | "tooltip": false, 266 | "viz": false 267 | }, 268 | "lineInterpolation": "linear", 269 | "lineWidth": 1, 270 | "pointSize": 5, 271 | "scaleDistribution": { 272 | "type": "linear" 273 | }, 274 | "showPoints": "auto", 275 | "spanNulls": false, 276 | "stacking": { 277 | "group": "A", 278 | "mode": "none" 279 | }, 280 | "thresholdsStyle": { 281 | "mode": "off" 282 | } 283 | }, 284 | "mappings": [], 285 | "max": 1000000000, 286 | "min": 0, 287 | "thresholds": { 288 | "mode": "absolute", 289 | "steps": [ 290 | { 291 | "color": "green", 292 | "value": null 293 | }, 294 | { 295 | "color": "red", 296 | "value": 80 297 | } 298 | ] 299 | } 300 | }, 301 | "overrides": [] 302 | }, 303 | "gridPos": { 304 | "h": 9, 305 | "w": 24, 306 | "x": 0, 307 | "y": 19 308 | }, 309 | "id": 6, 310 | "options": { 311 | "legend": { 312 | "calcs": [], 313 | "displayMode": "list", 314 | "placement": "bottom", 315 | "showLegend": true 316 | }, 317 | "tooltip": { 318 | "mode": "single", 319 | "sort": "none" 320 | } 321 | }, 322 | "targets": [ 323 | { 324 | "datasource": { 325 | "type": "prometheus", 326 | "uid": "P1809F7CD0C75ACF3" 327 | }, 328 | "editorMode": "builder", 329 | "expr": "max(pod_cpu_instructions{pod_namespace=\"$namespace\"} / 3)", 330 | "legendFormat": "sysbench", 331 | "range": true, 332 | "refId": "A" 333 | } 334 | ], 335 | "title": "Container Actual IPS (Instructions/second)", 336 | "type": "timeseries" 337 | } 338 | ], 339 | "schemaVersion": 37, 340 | "style": "dark", 341 | "tags": [], 342 | "templating": { 343 | "list": [ 344 | { 345 | "current": { 346 | "selected": false, 347 | "text": "default", 348 | "value": "default" 349 | }, 350 | "datasource": { 351 | "type": "prometheus", 352 | "uid": "P1809F7CD0C75ACF3" 353 | }, 354 | "definition": "label_values(kube_pod_container_resource_requests, namespace)", 355 | "hide": 0, 356 | "includeAll": false, 357 | "label": "Namespace", 358 | "multi": false, 359 | "name": "namespace", 360 | "options": [], 361 | "query": { 362 | "query": "label_values(kube_pod_container_resource_requests, namespace)", 363 | "refId": "StandardVariableQuery" 364 | }, 365 | "refresh": 1, 366 | "regex": "", 367 | "skipUrlSync": false, 368 | "sort": 0, 369 | "type": "query" 370 | }, 371 | { 372 | "current": { 373 | "selected": false, 374 | "text": "sysbench", 375 | "value": "sysbench" 376 | }, 377 | "datasource": { 378 | "type": "prometheus", 379 | "uid": "P1809F7CD0C75ACF3" 380 | }, 381 | "definition": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)", 382 | "hide": 0, 383 | "includeAll": false, 384 | "label": "Container", 385 | "multi": false, 386 | "name": "container", 387 | "options": [], 388 | "query": { 389 | "query": "label_values(kube_pod_container_resource_requests{namespace=\"$namespace\"}, container)", 390 | "refId": "StandardVariableQuery" 391 | }, 392 | "refresh": 1, 393 | "regex": "", 394 | "skipUrlSync": false, 395 | "sort": 0, 396 | "type": "query" 397 | }, 398 | { 399 | "current": { 400 | "selected": false, 401 | "text": "clever.ibm.cloud", 402 | "value": "clever.ibm.cloud" 403 | }, 404 | "datasource": { 405 | "type": "prometheus", 406 | "uid": "P1809F7CD0C75ACF3" 407 | }, 408 | "definition": "label_values(node_cpu_frequency_max_hertz, instance)", 409 | "hide": 0, 410 | "includeAll": false, 411 | "label": "Node", 412 | "multi": false, 413 | "name": "node", 414 | "options": [], 415 | "query": { 416 | "query": "label_values(node_cpu_frequency_max_hertz, instance)", 417 | "refId": "StandardVariableQuery" 418 | }, 419 | "refresh": 1, 420 | "regex": "", 421 | "skipUrlSync": false, 422 | "sort": 0, 423 | "type": "query" 424 | } 425 | ] 426 | }, 427 | "time": { 428 | "from": "now-5m", 429 | "to": "now" 430 | }, 431 | "timepicker": {}, 432 | "timezone": "", 433 | "title": "Clever VPA Recommender", 434 | "uid": "ePYMOfnVk", 435 | "version": 11, 436 | "weekStart": "" 437 | } -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from kubernetes import client, config 3 | from kubernetes.client.rest import ApiException 4 | 5 | from utils import * 6 | from recommender import * 7 | 8 | # Current Recommender Name 9 | RECOMMENDER_NAME = "clever" 10 | SLEEP_WINDOW = 60 11 | 12 | # VPA resources 13 | DOMAIN = "autoscaling.k8s.io" 14 | VPA_NAME = "verticalpodautoscaler" 15 | VPA_PLURAL = "verticalpodautoscalers" 16 | VPA_CHECKPOINT_NAME = "verticalpodautoscalercheckpoint" 17 | VPA_CHECKPOINT_PLURAL = "verticalpodautoscalercheckpoints" 18 | 19 | # PROMETHEUS Queries 20 | MAX_CPU_FREQUENCY_QUERY = "node_cpu_frequency_max_hertz" 21 | MIN_CPU_FREQUENCY_QUERY = "node_cpu_frequency_min_hertz" 22 | LATEST_CPU_FREQUENCY_QUERY = "node_cpu_scaling_frequency_hertz" 23 | 24 | # Keep the latest node frequencies and the VPA default requests in cache 25 | MAX_NODE_CPU_FREQUENCY = {} 26 | LATEST_NODE_CPU_FREQUENCY = {} 27 | ACTIVE_VPA_DEFAULT_CPU_REQUESTS = {} 28 | 29 | # Press the green button in the gutter to run the script. 30 | if __name__ == '__main__': 31 | if 'KUBERNETES_PORT' in os.environ: 32 | config.load_incluster_config() 33 | else: 34 | config.load_kube_config() 35 | 36 | # Get the api instance to interact with the cluster 37 | api_client = client.api_client.ApiClient() 38 | v1 = client.ApiextensionsV1Api(api_client) 39 | corev1 = client.CoreV1Api(api_client) 40 | crds = client.CustomObjectsApi(api_client) 41 | resource_version = '' 42 | 43 | # Initialize the prometheus client 44 | prom_client = PromClient() 45 | 46 | # Initialize the node CPU frequency cache. 47 | MAX_NODE_CPU_FREQUENCY = get_all_node_homogeneous_frequencies(prom_client, MAX_CPU_FREQUENCY_QUERY) 48 | if MAX_NODE_CPU_FREQUENCY is None: 49 | print("Prometheus Query {} at Endpoint {} failed.".format(MAX_CPU_FREQUENCY_QUERY, prom_client.prom_address)) 50 | exit(-1) 51 | 52 | LATEST_NODE_CPU_FREQUENCY = get_all_node_homogeneous_frequencies(prom_client, LATEST_CPU_FREQUENCY_QUERY) 53 | if LATEST_NODE_CPU_FREQUENCY is None: 54 | print("Prometheus Query {} at Endpoint {} failed.".format(LATEST_CPU_FREQUENCY_QUERY, prom_client.prom_address)) 55 | exit(-1) 56 | print("Initialized the node CPU frequency cache {}".format(LATEST_NODE_CPU_FREQUENCY)) 57 | 58 | # Get the VPA CRD 59 | current_crds = [x['spec']['names']['kind'].lower() for x in v1.list_custom_resource_definition().to_dict()['items']] 60 | if VPA_NAME not in current_crds: 61 | print("VerticalPodAutoscaler CRD is not created!") 62 | exit(-1) 63 | 64 | while True: 65 | print("Checking the frequency and the target IPS") 66 | # Updating the default VPA CPU cache. 67 | vpas = crds.list_cluster_custom_object(group=DOMAIN, version="v1", plural=VPA_PLURAL) 68 | selectedVpas = selects_recommender(vpas, RECOMMENDER_NAME) 69 | 70 | # Update the container default requests for selectedVpas 71 | # Keep the mapping between nodes and vpas, which manage pods on those nodes. 72 | node_vpas = {} 73 | for vpa in selectedVpas: 74 | vpa_name = vpa["metadata"]["name"] 75 | vpa_namespace = vpa["metadata"]["namespace"] 76 | 77 | # Get initial container request. 78 | if vpa_name not in ACTIVE_VPA_DEFAULT_CPU_REQUESTS.keys(): 79 | ACTIVE_VPA_DEFAULT_CPU_REQUESTS[vpa_name], vpa_nodes = get_vpa_detailed_info(corev1, vpa) 80 | print("Updating the default CPU request cache for newly discovered VPA {}".format(vpa_name)) 81 | print(ACTIVE_VPA_DEFAULT_CPU_REQUESTS) 82 | else: 83 | _, vpa_nodes = get_vpa_detailed_info(corev1, vpa) 84 | 85 | # Select VPAs per node. 86 | for node in list(set(vpa_nodes.values())): 87 | if node not in node_vpas.keys(): 88 | node_vpas[node] = [vpa] 89 | else: 90 | node_vpas[node].append(vpa) 91 | print("Discovering VPAs running on the following nodes.") 92 | print(node_vpas) 93 | 94 | # Obtain the latest node cpu frequencies 95 | CUR_NODE_CPU_FREQUENCY = get_all_node_homogeneous_frequencies(prom_client, LATEST_CPU_FREQUENCY_QUERY) 96 | print("Current node CPU frequencies\n {}".format(CUR_NODE_CPU_FREQUENCY)) 97 | 98 | # Check difference between LATEST_NODE_CPU_FREQUENCY and CUR_NODE_CPU_FREQUENCY 99 | if CUR_NODE_CPU_FREQUENCY != LATEST_NODE_CPU_FREQUENCY: 100 | # Select nodes with frequency changes. 101 | nodes_with_frequency_changes = find_node_with_frequency_changes(CUR_NODE_CPU_FREQUENCY, LATEST_NODE_CPU_FREQUENCY) 102 | print("Discover nodes with frequency changes {}".format(nodes_with_frequency_changes)) 103 | 104 | vpas_to_update = {} 105 | for node in nodes_with_frequency_changes: 106 | if node not in node_vpas.keys(): 107 | print("Frequency changes on node {} does not impact any vpa managed pods!") 108 | continue 109 | 110 | cur_node_vpas = node_vpas[node] 111 | for vpa in cur_node_vpas: 112 | vpa_name = vpa["metadata"]["name"] 113 | vpas_to_update[vpa_name] = vpa 114 | 115 | print("These VPAs {} are impacted by the following nodes with frequency changes {}.".format(vpas_to_update.keys(), nodes_with_frequency_changes)) 116 | for vpa in vpas_to_update.values(): 117 | vpa_name = vpa["metadata"]["name"] 118 | vpa_namespace = vpa["metadata"]["namespace"] 119 | 120 | print("Recommend sizes according to current frequency for vpas on nodes with frequency changes!") 121 | 122 | recommendations = get_recommendation(vpa, corev1, CUR_NODE_CPU_FREQUENCY, MAX_NODE_CPU_FREQUENCY, ACTIVE_VPA_DEFAULT_CPU_REQUESTS[vpa_name]) 123 | print("Recommendations for VPA {} are {}".format(vpa_name, recommendations)) 124 | 125 | if not recommendations: 126 | print("No new recommendations obtained, so skip updating the vpa object {}".format(vpa_name)) 127 | continue 128 | 129 | # Update the recommendations. 130 | patched_vpa = {"recommendation": {"containerRecommendations": recommendations}} 131 | body = {"status": patched_vpa} 132 | vpa_api = client.CustomObjectsApi() 133 | 134 | # Update the VPA object 135 | # API call doc: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/CustomObjectsApi.md#patch_namespaced_custom_object 136 | try: 137 | vpa_updated = vpa_api.patch_namespaced_custom_object(group=DOMAIN, version="v1", plural=VPA_PLURAL, 138 | namespace=vpa_namespace, name=vpa_name, 139 | body=body) 140 | print("Successfully patched VPA object with the recommendation: %s" % 141 | vpa_updated['status']['recommendation']['containerRecommendations']) 142 | except ApiException as e: 143 | print("Exception when calling CustomObjectsApi->patch_namespaced_custom_object: %s\n" % e) 144 | 145 | print("Sleeping for {} seconds".format(SLEEP_WINDOW)) 146 | print("=====================================================================================================") 147 | time.sleep(SLEEP_WINDOW) 148 | 149 | -------------------------------------------------------------------------------- /manifests/clever.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRole 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: clever-role 5 | rules: 6 | - apiGroups: 7 | - "" 8 | resources: 9 | - pods 10 | - customresourcedefinitions 11 | verbs: 12 | - '*' 13 | - apiGroups: 14 | - apiextensions.k8s.io 15 | resources: 16 | - customresourcedefinitions 17 | verbs: 18 | - '*' 19 | - apiGroups: 20 | - autoscaling.k8s.io 21 | resources: 22 | - verticalpodautoscalers 23 | - verticalpodautoscalercheckpoints 24 | verbs: 25 | - '*' 26 | - apiGroups: 27 | - rbac.authorization.k8s.io 28 | resources: 29 | - clusterrolebindings 30 | verbs: 31 | - '*' 32 | - apiGroups: 33 | - apps 34 | resources: 35 | - deployments 36 | verbs: 37 | - "*" 38 | --- 39 | apiVersion: v1 40 | kind: ServiceAccount 41 | metadata: 42 | name: clever 43 | namespace: kube-system 44 | --- 45 | kind: ClusterRoleBinding 46 | apiVersion: rbac.authorization.k8s.io/v1 47 | metadata: 48 | name: clever-role-binding 49 | subjects: 50 | - kind: ServiceAccount 51 | name: clever 52 | namespace: kube-system 53 | roleRef: 54 | apiGroup: rbac.authorization.k8s.io 55 | kind: ClusterRole 56 | name: clever-role 57 | --- 58 | apiVersion: apps/v1 59 | kind: Deployment 60 | metadata: 61 | name: clever 62 | namespace: kube-system 63 | spec: 64 | replicas: 1 65 | selector: 66 | matchLabels: 67 | app: clever 68 | template: 69 | metadata: 70 | labels: 71 | app: clever 72 | spec: 73 | containers: 74 | - name: clever 75 | image: quay.io/chenw615/clever:latest 76 | env: 77 | - name: PROM_HOST 78 | value: "http://prometheus-k8s.monitoring.svc.cluster.local:9090" 79 | imagePullPolicy: Always 80 | serviceAccountName: clever 81 | serviceAccount: clever -------------------------------------------------------------------------------- /manifests/random.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "autoscaling.k8s.io/v1" 2 | kind: VerticalPodAutoscaler 3 | metadata: 4 | name: random-vpa 5 | spec: 6 | recommenders: 7 | - name: clever 8 | targetRef: 9 | apiVersion: "apps/v1" 10 | kind: Deployment 11 | name: random 12 | resourcePolicy: 13 | containerPolicies: 14 | - containerName: '*' 15 | minAllowed: 16 | cpu: 100m 17 | maxAllowed: 18 | cpu: 16 19 | controlledResources: ["cpu"] 20 | --- 21 | apiVersion: apps/v1 22 | kind: Deployment 23 | metadata: 24 | name: random 25 | spec: 26 | selector: 27 | matchLabels: 28 | app: random 29 | replicas: 2 30 | template: 31 | metadata: 32 | labels: 33 | app: random 34 | spec: 35 | securityContext: 36 | runAsNonRoot: true 37 | runAsUser: 65534 # nobody 38 | containers: 39 | - name: hamster 40 | image: k8s.gcr.io/ubuntu-slim:0.1 41 | resources: 42 | requests: 43 | cpu: 1 44 | memory: 500Mi 45 | command: ["/bin/sh"] 46 | args: 47 | - "-c" 48 | - "cat /dev/random" -------------------------------------------------------------------------------- /manifests/sysbench.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "autoscaling.k8s.io/v1" 2 | kind: VerticalPodAutoscaler 3 | metadata: 4 | name: sysbench-vpa 5 | spec: 6 | recommenders: 7 | - name: clever 8 | targetRef: 9 | apiVersion: "apps/v1" 10 | kind: Deployment 11 | name: sysbench 12 | resourcePolicy: 13 | containerPolicies: 14 | - containerName: '*' 15 | minAllowed: 16 | cpu: 100m 17 | maxAllowed: 18 | cpu: 16 19 | controlledResources: ["cpu"] 20 | --- 21 | apiVersion: apps/v1 22 | kind: Deployment 23 | metadata: 24 | name: sysbench 25 | spec: 26 | selector: 27 | matchLabels: 28 | app: sysbench 29 | replicas: 2 30 | template: 31 | metadata: 32 | labels: 33 | app: sysbench 34 | spec: 35 | securityContext: 36 | runAsNonRoot: true 37 | runAsUser: 65534 # nobody 38 | containers: 39 | - name: sysbench 40 | image: severalnines/sysbench 41 | resources: 42 | requests: 43 | cpu: 250m 44 | memory: 500Mi 45 | limits: 46 | cpu: 250m 47 | memory: 500Mi 48 | command: 49 | - sysbench 50 | - cpu 51 | - --threads=1 52 | - --time=100000 53 | - run 54 | -------------------------------------------------------------------------------- /recommender.py: -------------------------------------------------------------------------------- 1 | 2 | DEFAULT_NAMESPACE="default" 3 | DELTA = 0.2 4 | 5 | # Select the VPAs that choose the current clever recommender 6 | def selects_recommender(vpas, recommender_name): 7 | selected_vpas = [] 8 | for vpa in vpas["items"]: 9 | vpa_spec = vpa["spec"] 10 | if "recommenders" not in vpa_spec.keys(): 11 | continue 12 | else: 13 | print("VPA {} has chosen {} recommenders".format(vpa["metadata"]["name"], len(vpa_spec["recommenders"]))) 14 | print(vpa_spec) 15 | for recommender in vpa_spec["recommenders"]: 16 | if recommender["name"] == recommender_name: 17 | selected_vpas.append(vpa) 18 | 19 | return selected_vpas 20 | 21 | # Check if all container CPU requests are the same and get the consistent value. 22 | # If some container requests are larger than others, is_consistent would be False. 23 | def get_consistent_max_val(request_dict): 24 | max_val = -1 25 | consistent_cnt = 0 26 | for pod in request_dict.keys(): 27 | for container in request_dict[pod].keys(): 28 | if request_dict[pod][container] > max_val: 29 | max_val = request_dict[pod][container] 30 | consistent_cnt += 1 31 | 32 | is_consistent = True 33 | if consistent_cnt > 1: 34 | is_consistent = False 35 | 36 | return is_consistent, max_val 37 | 38 | 39 | # Only check the default CPU request. If not existed, it will use 1 core by default. 40 | def get_vpa_detailed_info(corev1, vpa): 41 | # Get the VPA spec 42 | vpa_spec = vpa["spec"] 43 | 44 | # example target_ref {'apiVersion': 'apps/v1', 'kind': 'Deployment', 'name': 'hamster'} 45 | target_ref = vpa_spec["targetRef"] 46 | print(target_ref) 47 | 48 | # Retrieve the target pods 49 | if "namespace" in target_ref.keys(): 50 | target_namespace = target_ref["namespace"] 51 | else: 52 | target_namespace = DEFAULT_NAMESPACE 53 | 54 | # Get the target containers 55 | target_pods = corev1.list_namespaced_pod(namespace=target_namespace, label_selector="app=" + target_ref["name"]) 56 | 57 | # Retrieve the target containers 58 | vpa_pod_nodes = {} 59 | all_container_cpu_requests = {} 60 | for pod in target_pods.items: 61 | all_container_cpu_requests[pod.metadata.name] = {} 62 | vpa_pod_nodes[pod.metadata.name] = pod.spec.node_name 63 | for container in pod.spec.containers: 64 | # print(container.name) 65 | # obtain the CPU request and convert it to int 66 | cur_request = str2resource("cpu", container.resources.requests["cpu"]) 67 | all_container_cpu_requests[pod.metadata.name][container.name] = cur_request 68 | 69 | # Get the maximum default request if there are many containers. 70 | is_consistent, max_cpu_val = get_consistent_max_val(all_container_cpu_requests) 71 | vpa_container_cpu_request = max_cpu_val 72 | 73 | if not is_consistent: 74 | print("Warning: the containers managed by {} do not have consistent CPU requests!", vpa["metadata"]["name"]) 75 | 76 | return vpa_container_cpu_request, vpa_pod_nodes 77 | 78 | # resource2str converts a resource (CPU, Memory) value to a string 79 | def resource2str(resource, value): 80 | if resource.lower() == "cpu": 81 | if value < 1: 82 | return str(int(value * 1000)) + "m" 83 | else: 84 | return str(value) 85 | # Memory is in bytes 86 | else: 87 | if value < 1024: 88 | return str(value) + "B" 89 | elif value < 1024 * 1024: 90 | return str(int(value / 1024)) + "k" 91 | elif value < 1024 * 1024 * 1024: 92 | return str(int(value / 1024 / 1024)) + "Mi" 93 | else: 94 | return str(int(value / 1024 / 1024 / 1024)) + "Gi" 95 | 96 | # Convert a resource (CPU, Memory) string to a float value 97 | def str2resource(resource, value): 98 | if type(value) is str: 99 | if resource.lower() == "cpu": 100 | if value[-1] == "m": 101 | return float(value[:-1]) / 1000 102 | else: 103 | return float(value) 104 | else: 105 | if value[-1].lower() == "b": 106 | return float(value[:-1]) 107 | elif value[-1].lower() == "k": 108 | return float(value[:-1]) * 1024 109 | elif value[-2:].lower() == "mi": 110 | return float(value[:-2]) * 1024 * 1024 111 | elif value[-2:].lower() == "gi": 112 | return float(value[:-2]) * 1024 * 1024 * 1024 113 | else: 114 | return float(value) 115 | else: 116 | return value 117 | 118 | def bound_var(var, min_value, max_value): 119 | if var < min_value: 120 | return min_value 121 | elif var > max_value: 122 | return max_value 123 | else: 124 | return var 125 | 126 | # Find the nodes with frequency changes in the last iteration 127 | def find_node_with_frequency_changes(cur_node_frequencies, prev_node_frequencies): 128 | node_with_frequency_changes = [] 129 | for node in cur_node_frequencies.keys(): 130 | # TODO: compare frequencies 131 | if node not in prev_node_frequencies.keys(): 132 | node_with_frequency_changes.append(node) 133 | else: 134 | if cur_node_frequencies[node] == prev_node_frequencies[node]: 135 | continue 136 | else: 137 | node_with_frequency_changes.append(node) 138 | return node_with_frequency_changes 139 | 140 | def get_recommendation(vpa, corev1, node_frequencies, max_node_frequencies, vpa_default_request): 141 | """ 142 | This function takes a VPA and returns a list of recommendations 143 | """ 144 | # Get the VPA spec 145 | vpa_spec = vpa["spec"] 146 | 147 | # example target_ref {'apiVersion': 'apps/v1', 'kind': 'Deployment', 'name': 'hamster'} 148 | target_ref = vpa_spec["targetRef"] 149 | print(target_ref) 150 | 151 | # Retrieve the target pods 152 | if "namespace" in target_ref.keys(): 153 | target_namespace = target_ref["namespace"] 154 | else: 155 | target_namespace = DEFAULT_NAMESPACE 156 | 157 | # Get the target pods 158 | target_pods = corev1.list_namespaced_pod(namespace=target_namespace, label_selector="app=" + target_ref["name"]) 159 | 160 | # Get the target container traces 161 | recommendations = [] 162 | 163 | # Get uncapped target 164 | uncapped_targets = {} 165 | for pod in target_pods.items: 166 | pod_node = pod.spec.node_name 167 | node_frequency = node_frequencies[pod_node] 168 | max_node_frequency = max_node_frequencies[pod_node] 169 | for container in pod.spec.containers: 170 | container_name = container.name 171 | uncapped_target = vpa_default_request * float(max_node_frequency) / float(node_frequency) 172 | if container_name not in uncapped_targets.keys(): 173 | uncapped_targets[container_name] = uncapped_target 174 | else: 175 | uncapped_targets[container_name] = max(uncapped_target, uncapped_targets[container_name]) 176 | 177 | 178 | for containerPolicy in vpa_spec["resourcePolicy"]["containerPolicies"]: 179 | controlled_resources = containerPolicy["controlledResources"] 180 | max_allowed = containerPolicy["maxAllowed"] 181 | min_allowed = containerPolicy["minAllowed"] 182 | 183 | for resource in controlled_resources: 184 | if resource != "cpu": 185 | continue 186 | else: 187 | for container_name in uncapped_targets.keys(): 188 | container_recommendation = {"containerName": container_name, "lowerBound": {}, "target": {}, 189 | "uncappedTarget": {}, "upperBound": {}} 190 | uncapped_target = uncapped_targets[container_name] 191 | lower_bound = uncapped_target * (1 - DELTA) 192 | upper_bound = uncapped_target * (1 + DELTA) 193 | 194 | # If the target is below the lowerbound, set it to the lowerbound 195 | min_allowed_value = str2resource(resource, min_allowed[resource]) 196 | max_allowed_value = str2resource(resource, max_allowed[resource]) 197 | target = bound_var(uncapped_target, min_allowed_value, max_allowed_value) 198 | lower_bound = bound_var(lower_bound, min_allowed_value, max_allowed_value) 199 | upper_bound = bound_var(upper_bound, min_allowed_value, max_allowed_value) 200 | 201 | # Convert CPU/Memory values to millicores/bytes 202 | container_recommendation["lowerBound"][resource] = resource2str(resource, lower_bound) 203 | container_recommendation["target"][resource] = resource2str(resource, target) 204 | container_recommendation["uncappedTarget"][resource] = resource2str(resource, uncapped_target) 205 | container_recommendation["upperBound"][resource] = resource2str(resource, upper_bound) 206 | 207 | recommendations.append(container_recommendation) 208 | return recommendations 209 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cachetools==4.2.4 2 | certifi==2022.9.14 3 | charset-normalizer==2.0.12 4 | google-auth==2.11.1 5 | idna==3.4 6 | kubernetes==24.2.0 7 | oauthlib==3.2.1 8 | pyasn1==0.4.8 9 | pyasn1-modules==0.2.8 10 | python-dateutil==2.8.2 11 | PyYAML==6.0 12 | requests==2.27.1 13 | requests-oauthlib==1.3.1 14 | rsa==4.9 15 | six==1.16.0 16 | urllib3==1.26.12 17 | websocket-client==1.3.1 18 | -------------------------------------------------------------------------------- /scripts/set_cpu_freq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FREQ=${1} 4 | cpupower frequency-set -d ${FREQ} 5 | cpupower frequency-set -u ${FREQ} -------------------------------------------------------------------------------- /scripts/watch_vpa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "\$kubectl get vpa ${1} --no-headers -o \"custom-columns=:status.recommendation.containerRecommendations[0].target.cpu\"" 4 | kubectl get vpa ${1} --no-headers -o "custom-columns=:status.recommendation.containerRecommendations[0].target.cpu" 5 | echo -e "\n" -------------------------------------------------------------------------------- /testPromClient.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | 3 | if __name__ == '__main__': 4 | prom_address = "http://127.0.0.1:39090" 5 | prom_client = PromClient(prom_address) 6 | 7 | max_cpu_frequency_query = "node_cpu_frequency_max_hertz" 8 | min_cpu_frequency_query = "node_cpu_frequency_min_hertz" 9 | latest_cpu_frequency_query = "node_cpu_scaling_frequency_hertz" 10 | 11 | pod_ips_query = "pod_energy_stat" 12 | 13 | all_node_homogeneous_max_frequencies = get_all_node_homogeneous_frequencies(prom_client, max_cpu_frequency_query) 14 | print(all_node_homogeneous_max_frequencies) 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from PromClient import * 2 | 3 | def parse_frequency_dict(cpu_frequency_data): 4 | all_node_frequencies = {} 5 | for cur_element in cpu_frequency_data: 6 | node_name = cur_element["metric"]["instance"] 7 | cpu_idx = cur_element["metric"]["cpu"] 8 | if node_name not in all_node_frequencies.keys(): 9 | all_node_frequencies[node_name] = {} 10 | cur_val = cur_element["value"][1] 11 | all_node_frequencies[node_name][cpu_idx] = cur_val 12 | 13 | return all_node_frequencies 14 | 15 | def get_homogeneous_value(node_frequencies): 16 | cpu_count = 0 17 | homogeneous_frequency = -1 18 | for cpu in node_frequencies.keys(): 19 | if cpu_count == 0: 20 | homogeneous_frequency = int(node_frequencies[cpu]) 21 | else: 22 | if node_frequencies[cpu] != homogeneous_frequency: 23 | homogeneous_frequency = max(int(node_frequencies[cpu]), homogeneous_frequency) 24 | 25 | cpu_count +=1 26 | homogeneous_frequency = get_rounded_frequency(homogeneous_frequency) 27 | return homogeneous_frequency 28 | 29 | def get_rounded_frequency(frequency): 30 | return round(frequency / 1000000000) * 1000000000 31 | 32 | def get_all_node_homogeneous_frequencies(prom_cient, prometheus_query): 33 | frequency_data = prom_cient.get_query(prometheus_query) 34 | if frequency_data is None: 35 | return None 36 | all_node_frequencies = parse_frequency_dict(frequency_data) 37 | all_node_homogeneous_frequencies = {} 38 | for node in all_node_frequencies.keys(): 39 | cur_node_homogeneous_frequency = get_homogeneous_value(all_node_frequencies[node]) 40 | if cur_node_homogeneous_frequency == -1: 41 | return None 42 | all_node_homogeneous_frequencies[node] = cur_node_homogeneous_frequency 43 | return all_node_homogeneous_frequencies --------------------------------------------------------------------------------