├── .dockerignore
├── .flake8
├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
└── workflows
│ ├── build-on-release.yml
│ ├── docker-build-on-tag.yml
│ └── pytest-on-push.yml
├── .gitignore
├── .pre-commit-config.yaml
├── ADOPTERS.md
├── Dockerfile
├── LICENSE
├── README.md
├── build_linux.sh
├── build_local.sh
├── build_release.sh
├── docker
├── README.md
└── aws.Dockerfile
├── docs
├── google-cloud-managed-service-for-prometheus.md
└── krr-in-cluster
│ └── krr-in-cluster-job.yaml
├── enforcer
├── Dockerfile
├── README.md
├── dal
│ ├── robusta_config.py
│ └── supabase_dal.py
├── enforcer_main.py
├── env_vars.py
├── metrics.py
├── model.py
├── params_utils.py
├── patch_manager.py
├── requirements.txt
├── resources
│ ├── kubernetes_resource_loader.py
│ ├── owner_store.py
│ └── recommendation_store.py
└── utils.py
├── examples
├── custom_formatter.py
├── custom_severity_calculator.py
└── custom_strategy.py
├── helm
├── krr-enforcer
│ ├── .helmignore
│ ├── Chart.yaml
│ ├── templates
│ │ ├── enforcer-cert-job.yaml
│ │ ├── enforcer-service-account.yaml
│ │ ├── enforcer.yaml
│ │ └── service-monitor.yaml
│ └── values.yaml
└── upload_chart.sh
├── images
├── krr-datasources.png
├── krr-datasources.svg
├── krr-other-integrations.png
├── krr-other-integrations.svg
├── krr_slack_example.png
├── logo.png
├── screenshot.jpeg
├── ui_recommendation.png
├── ui_screenshot_new.png
└── ui_video.gif
├── intro.txt
├── krr.py
├── poetry.lock
├── pyproject.toml
├── requirements.txt
├── robusta_krr
├── __init__.py
├── api
│ ├── formatters.py
│ ├── models.py
│ └── strategies.py
├── common
│ └── ssl_utils.py
├── core
│ ├── __init__.py
│ ├── abstract
│ │ ├── formatters.py
│ │ ├── metrics.py
│ │ └── strategies.py
│ ├── integrations
│ │ ├── kubernetes
│ │ │ ├── __init__.py
│ │ │ └── config_patch.py
│ │ ├── openshift
│ │ │ ├── __init__.py
│ │ │ └── token.py
│ │ └── prometheus
│ │ │ ├── __init__.py
│ │ │ ├── loader.py
│ │ │ ├── metrics
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── cpu.py
│ │ │ └── memory.py
│ │ │ ├── metrics_service
│ │ │ ├── base_metric_service.py
│ │ │ ├── mimir_metrics_service.py
│ │ │ ├── prometheus_metrics_service.py
│ │ │ ├── thanos_metrics_service.py
│ │ │ └── victoria_metrics_service.py
│ │ │ └── prometheus_utils.py
│ ├── models
│ │ ├── allocations.py
│ │ ├── config.py
│ │ ├── objects.py
│ │ ├── result.py
│ │ └── severity.py
│ └── runner.py
├── formatters
│ ├── __init__.py
│ ├── csv.py
│ ├── csv_raw.py
│ ├── html.py
│ ├── json.py
│ ├── pprint.py
│ ├── table.py
│ └── yaml.py
├── main.py
├── strategies
│ ├── __init__.py
│ ├── simple.py
│ └── simple_limit.py
└── utils
│ ├── batched.py
│ ├── intro.py
│ ├── object_like_dict.py
│ ├── patch.py
│ ├── progress_bar.py
│ ├── resource_units.py
│ ├── service_discovery.py
│ └── version.py
└── tests
├── conftest.py
├── formatters
└── test_csv_formatter.py
├── models
└── test_resource_allocations.py
├── single_namespace_as_group.yaml
├── single_namespace_permissions.yaml
├── test_krr.py
└── test_runner.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | # .dockerignore
2 | __pycache__
3 | *.pyc
4 | *.pyo
5 | *.pyd
6 |
7 | # Exclude development files
8 | .git
9 | .gitignore
10 | Dockerfile
11 | *.md
12 | .vscode
13 |
14 | # Exclude logs and cache
15 | logs/
16 | cache/
17 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | exclude = .git,
4 | __pycache__,
5 | old,
6 | build,
7 | dist,
8 | .venv,
9 | .vscode,
10 | .pytest_cache,
11 | __init__.py,
12 | .mypy_cache,
13 | src/robusta/integrations/kubernetes/autogenerated,
14 | src/robusta/integrations/kubernetes/custom_models.py
15 | ignore = E501, W503, E203
16 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Are you interested in contributing a fix for this?**
27 | Yes/no. If yes, we will provide guidance what parts of the code to modify and help you.
28 |
29 | **Desktop (please complete the following information):**
30 | - OS: [e.g. iOS]
31 | - Browser [e.g. chrome, safari]
32 | - Version [e.g. 22]
33 |
34 | **Smartphone (please complete the following information):**
35 | - Device: [e.g. iPhone6]
36 | - OS: [e.g. iOS8.1]
37 | - Browser [e.g. stock browser, safari]
38 | - Version [e.g. 22]
39 |
40 | **Additional context**
41 | Add any other context about the problem here.
42 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Are you interested in contributing a PR for this?**
20 | Yes/no. If yes, we will provide guidance what parts of the code to modify and help you.
21 |
22 | **Additional context**
23 | Add any other context or screenshots about the feature request here.
24 |
--------------------------------------------------------------------------------
/.github/workflows/build-on-release.yml:
--------------------------------------------------------------------------------
1 | name: Build and Release
2 |
3 | on:
4 | release:
5 | types: [created]
6 |
7 | jobs:
8 | build:
9 | strategy:
10 | matrix:
11 | # we build on macos-13 for x86 builds
12 | os: [ubuntu-latest, windows-latest, macos-latest, macos-13]
13 |
14 | runs-on: ${{ matrix.os }}
15 |
16 | steps:
17 | - uses: actions/checkout@v2
18 |
19 | - name: Set up Python
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: '3.11'
23 |
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install -r requirements.txt
28 | pip install pyinstaller
29 |
30 | - name: Install dependancies (Linux)
31 | if: matrix.os == 'ubuntu-latest'
32 | run: |
33 | sudo apt-get install -y binutils
34 |
35 | - name: Install the Apple certificate and provisioning profile
36 | if: matrix.os == 'macos-latest' || matrix.os == 'macos-13'
37 | env:
38 | BUILD_CERTIFICATE_BASE64: ${{ secrets.BUILD_CERTIFICATE_BASE64 }}
39 | P12_PASSWORD: ${{ secrets.P12_PASSWORD }}
40 | BUILD_PROVISION_PROFILE_BASE64: ${{ secrets.BUILD_PROVISION_PROFILE_BASE64 }}
41 | KEYCHAIN_PASSWORD: ${{ secrets.KEYCHAIN_PASSWORD }}
42 | run: |
43 | # create variables
44 | CERTIFICATE_PATH=$RUNNER_TEMP/build_certificate.p12
45 | PP_PATH=$RUNNER_TEMP/build_pp.mobileprovision
46 | KEYCHAIN_PATH=$RUNNER_TEMP/app-signing.keychain-db
47 |
48 | # import certificate and provisioning profile from secrets
49 | echo -n "$BUILD_CERTIFICATE_BASE64" | base64 --decode -o $CERTIFICATE_PATH
50 | echo -n "$BUILD_PROVISION_PROFILE_BASE64" | base64 --decode -o $PP_PATH
51 |
52 | # create temporary keychain
53 | security create-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH
54 | security set-keychain-settings -lut 21600 $KEYCHAIN_PATH
55 | security unlock-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH
56 |
57 | # import certificate to keychain
58 | security import $CERTIFICATE_PATH -P "$P12_PASSWORD" -A -t cert -f pkcs12 -k $KEYCHAIN_PATH
59 | security list-keychain -d user -s $KEYCHAIN_PATH
60 |
61 | # apply provisioning profile
62 | mkdir -p ~/Library/MobileDevice/Provisioning\ Profiles
63 | cp $PP_PATH ~/Library/MobileDevice/Provisioning\ Profiles
64 |
65 | - name: Set version in code (Unix)
66 | if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest' || matrix.os == 'macos-13'
67 | run: |
68 | awk 'NR==3{$0="__version__ = \"'${{ github.ref_name }}'\""}1' ./robusta_krr/__init__.py > temp && mv temp ./robusta_krr/__init__.py
69 | cat ./robusta_krr/__init__.py
70 |
71 | - name: Set version in code (Windows)
72 | if: matrix.os == 'windows-latest'
73 | run: |
74 | $content = Get-Content -Path .\robusta_krr\__init__.py
75 | $content[2] = "__version__=`"$($env:GITHUB_REF_NAME)`""
76 | $content | Out-File -FilePath .\robusta_krr\__init__.py -Encoding ascii
77 | Get-Content .\robusta_krr\__init__.py
78 | shell: pwsh
79 | env:
80 | GITHUB_REF_NAME: ${{ github.ref_name }}
81 |
82 | - name: Build with PyInstaller
83 | if: matrix.os == 'macos-latest'
84 | shell: bash
85 | run: |
86 | pyinstaller --target-architecture arm64 krr.py
87 | mkdir -p ./dist/krr/grapheme/data
88 | cp $(python -c "import grapheme; print(grapheme.__path__[0] + '/data/grapheme_break_property.json')") ./dist/krr/grapheme/data/grapheme_break_property.json
89 | cp ./intro.txt ./dist/krr/intro.txt
90 |
91 | - name: Build with PyInstaller
92 | if: matrix.os != 'macos-latest'
93 | shell: bash
94 | run: |
95 | pyinstaller krr.py
96 | mkdir -p ./dist/krr/grapheme/data
97 | cp $(python -c "import grapheme; print(grapheme.__path__[0] + '/data/grapheme_break_property.json')") ./dist/krr/grapheme/data/grapheme_break_property.json
98 | cp ./intro.txt ./dist/krr/intro.txt
99 |
100 | - name: Zip the application (Unix)
101 | if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest' || matrix.os == 'macos-13'
102 | run: |
103 | cd dist
104 | zip -r krr-${{ matrix.os }}-${{ github.ref_name }}.zip krr
105 | mv krr-${{ matrix.os }}-${{ github.ref_name }}.zip ../
106 | cd ..
107 |
108 | - name: Zip the application (Windows)
109 | if: matrix.os == 'windows-latest'
110 | run: |
111 | Set-Location -Path dist
112 | Compress-Archive -Path krr -DestinationPath krr-${{ matrix.os }}-${{ github.ref_name }}.zip -Force
113 | Move-Item -Path krr-${{ matrix.os }}-${{ github.ref_name }}.zip -Destination ..\
114 | Set-Location -Path ..
115 |
116 | - name: Upload Release Asset
117 | uses: actions/upload-release-asset@v1.0.2
118 | env:
119 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
120 | with:
121 | upload_url: ${{ github.event.release.upload_url }}
122 | asset_path: ./krr-${{ matrix.os }}-${{ github.ref_name }}.zip
123 | asset_name: krr-${{ matrix.os }}-${{ github.ref_name }}.zip
124 | asset_content_type: application/octet-stream
125 |
126 | - name: Upload build as artifact
127 | uses: actions/upload-artifact@v4
128 | with:
129 | name: krr-${{ matrix.os }}-${{ github.ref_name }}
130 | path: ./krr-${{ matrix.os }}-${{ github.ref_name }}.zip
131 |
132 | - name: Clean up keychain and provisioning profile
133 | if: (matrix.os == 'macos-latest' || matrix.os == 'macos-13') && always()
134 | run: |
135 | security delete-keychain $RUNNER_TEMP/app-signing.keychain-db
136 | rm ~/Library/MobileDevice/Provisioning\ Profiles/build_pp.mobileprovision
137 |
138 | check-latest:
139 | needs: build
140 | runs-on: ubuntu-latest
141 | outputs:
142 | IS_LATEST: ${{ steps.check-latest.outputs.release == github.ref_name }}
143 | steps:
144 | - id: check-latest
145 | uses: pozetroninc/github-action-get-latest-release@v0.7.0
146 | with:
147 | token: ${{ secrets.GITHUB_TOKEN }}
148 | repository: ${{ github.repository }}
149 | excludes: prerelease, draft
150 |
151 | # Define MacOS hash job
152 | mac-hash:
153 | needs: check-latest
154 | runs-on: ubuntu-latest
155 | if: needs.check-latest.outputs.IS_LATEST
156 | outputs:
157 | MAC_BUILD_HASH: ${{ steps.calc-hash.outputs.MAC_BUILD_HASH }}
158 | steps:
159 | - name: Checkout Repository
160 | uses: actions/checkout@v2
161 | - name: Download MacOS artifact
162 | uses: actions/download-artifact@v4
163 | with:
164 | name: krr-macos-latest-${{ github.ref_name }}
165 | - name: Calculate hash
166 | id: calc-hash
167 | run: echo "::set-output name=MAC_BUILD_HASH::$(sha256sum krr-macos-latest-${{ github.ref_name }}.zip | awk '{print $1}')"
168 |
169 | # Define Linux hash job
170 | linux-hash:
171 | needs: check-latest
172 | runs-on: ubuntu-latest
173 | if: needs.check-latest.outputs.IS_LATEST
174 | outputs:
175 | LINUX_BUILD_HASH: ${{ steps.calc-hash.outputs.LINUX_BUILD_HASH }}
176 | steps:
177 | - name: Checkout Repository
178 | uses: actions/checkout@v2
179 | - name: Download Linux artifact
180 | uses: actions/download-artifact@v4
181 | with:
182 | name: krr-ubuntu-latest-${{ github.ref_name }}
183 | - name: Calculate hash
184 | id: calc-hash
185 | run: echo "::set-output name=LINUX_BUILD_HASH::$(sha256sum krr-ubuntu-latest-${{ github.ref_name }}.zip | awk '{print $1}')"
186 |
187 | # Define job to update homebrew formula
188 | update-formula:
189 | needs: [mac-hash, linux-hash]
190 | runs-on: ubuntu-latest
191 | steps:
192 | - name: Checkout homebrew-krr repository
193 | uses: actions/checkout@v2
194 | with:
195 | repository: robusta-dev/homebrew-krr
196 | token: ${{ secrets.MULTIREPO_GITHUB_TOKEN }}
197 | - name: Update krr.rb formula
198 | run: |
199 | MAC_BUILD_HASH=${{ needs.mac-hash.outputs.MAC_BUILD_HASH }}
200 | LINUX_BUILD_HASH=${{ needs.linux-hash.outputs.LINUX_BUILD_HASH }}
201 | TAG_NAME=${{ github.ref_name }}
202 | awk 'NR==6{$0=" url \"https://github.com/robusta-dev/krr/releases/download/'"$TAG_NAME"'/krr-macos-latest-'"$TAG_NAME"'.zip\""}1' ./Formula/krr.rb > temp && mv temp ./Formula/krr.rb
203 | awk 'NR==7{$0=" sha256 \"'$MAC_BUILD_HASH'\""}1' ./Formula/krr.rb > temp && mv temp ./Formula/krr.rb
204 | awk 'NR==9{$0=" url \"https://github.com/robusta-dev/krr/releases/download/'"$TAG_NAME"'/krr-ubuntu-latest-'"$TAG_NAME"'.zip\""}1' ./Formula/krr.rb > temp && mv temp ./Formula/krr.rb
205 | awk 'NR==10{$0=" sha256 \"'$LINUX_BUILD_HASH'\""}1' ./Formula/krr.rb > temp && mv temp ./Formula/krr.rb
206 | - name: Commit and push changes
207 | run: |
208 | git config --local user.email "action@github.com"
209 | git config --local user.name "GitHub Action"
210 | git commit -am "Update formula for release ${TAG_NAME}"
211 | git push
212 |
--------------------------------------------------------------------------------
/.github/workflows/docker-build-on-tag.yml:
--------------------------------------------------------------------------------
1 | name: Docker Build and Push
2 |
3 | on:
4 | push:
5 | tags:
6 | - '*'
7 |
8 | jobs:
9 | build:
10 |
11 | runs-on: ubuntu-latest
12 |
13 | permissions:
14 | contents: 'read'
15 | id-token: 'write'
16 |
17 | steps:
18 | - uses: actions/checkout@v4
19 |
20 | - uses: 'google-github-actions/auth@v2'
21 | with:
22 | project_id: 'genuine-flight-317411'
23 | workload_identity_provider: 'projects/429189597230/locations/global/workloadIdentityPools/github/providers/robusta-repos'
24 |
25 | - name: Set up gcloud CLI
26 | uses: google-github-actions/setup-gcloud@v2
27 | with:
28 | project_id: genuine-flight-317411
29 |
30 | - name: Configure Docker Registry
31 | run: gcloud auth configure-docker us-central1-docker.pkg.dev
32 |
33 | - name: Login to Docker Hub
34 | uses: docker/login-action@v1
35 | with:
36 | username: ${{ secrets.DOCKER_USERNAME }}
37 | password: ${{ secrets.DOCKER_PASSWORD }}
38 |
39 | - name: Set up Docker Buildx
40 | uses: docker/setup-buildx-action@v1
41 |
42 | - name: Build and push Docker images
43 | uses: docker/build-push-action@v2
44 | with:
45 | context: .
46 | platforms: linux/arm64,linux/amd64
47 | push: true
48 | tags: |
49 | robustadev/krr:${{ github.ref_name }}
50 | us-central1-docker.pkg.dev/genuine-flight-317411/devel/krr:${{ github.ref_name }}
51 | build-args: |
52 | BUILDKIT_INLINE_CACHE=1
--------------------------------------------------------------------------------
/.github/workflows/pytest-on-push.yml:
--------------------------------------------------------------------------------
1 | name: Pytest
2 |
3 | on: [push]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 |
10 | steps:
11 | - uses: actions/checkout@v2
12 |
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: '3.9'
17 |
18 | - name: Install dependencies
19 | run: |
20 | python -m pip install --upgrade pip
21 | pip install -r requirements.txt
22 | pip install -e .
23 | pip install pytest
24 |
25 | - name: Test with pytest
26 | run: |
27 | pytest
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | .idea/
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
132 |
133 | .DS_Store
134 | robusta_lib
135 | .idea
136 | .vscode
137 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/ambv/black
3 | rev: 23.1.0
4 | hooks:
5 | - id: black
6 | language_version: python3
7 | args: [--config=pyproject.toml]
8 |
9 | - repo: https://github.com/pre-commit/pre-commit-hooks
10 | rev: v3.3.0
11 | hooks:
12 | - id: trailing-whitespace
13 | - id: end-of-file-fixer
14 |
15 | - repo: https://github.com/pycqa/flake8
16 | rev: 6.0.0
17 | hooks:
18 | - id: flake8
19 | args: [--config=.flake8]
20 |
21 | - repo: https://github.com/pycqa/isort
22 | rev: 5.12.0
23 | hooks:
24 | - id: isort
25 | args: [--settings-path=pyproject.toml]
26 |
27 | - repo: https://github.com/pre-commit/mirrors-mypy
28 | rev: v1.0.1
29 | hooks:
30 | - id: mypy
31 | language: system
32 |
--------------------------------------------------------------------------------
/ADOPTERS.md:
--------------------------------------------------------------------------------
1 | # KRR Adopters
2 |
3 | This is a list of adopters of Robusta KRR operator:
4 |
5 | Everton Arakaki - WAES Platform Consultant for ASML (Semiconductor Industry)
6 |
7 | > I used Robusta KRR in my production clusters, and it took me less than 5 minutes to get very well detailed cpu/memory recommendations. Our applications and platform tooling were discovered automatically; our kubecontext was discovered automatically; and our kube-prometheus-stack was discovered automatically.
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use the official Python 3.9 slim image as the base image
2 | FROM python:3.12-slim as builder
3 | ENV LANG=C.UTF-8
4 | ENV PYTHONDONTWRITEBYTECODE=1
5 | ENV PYTHONUNBUFFERED=1
6 | ENV PATH="/app/venv/bin:$PATH"
7 |
8 | # Install system dependencies required for Poetry
9 | RUN apt-get update && \
10 | dpkg --add-architecture arm64
11 |
12 | # We're installing here libexpat1, to upgrade the package to include a fix to 3 high CVEs. CVE-2024-45491,CVE-2024-45490,CVE-2024-45492
13 | RUN apt-get update \
14 | && apt-get install -y --no-install-recommends libexpat1 \
15 | && rm -rf /var/lib/apt/lists/*
16 |
17 | # Set the working directory
18 | WORKDIR /app
19 |
20 | COPY ./requirements.txt requirements.txt
21 |
22 | RUN pip install --no-cache-dir --upgrade pip
23 | # Install the project dependencies
24 | RUN python -m ensurepip --upgrade
25 | RUN pip install --no-cache-dir -r requirements.txt
26 |
27 | # Copy the rest of the application code
28 | COPY ./krr.py krr.py
29 | COPY ./robusta_krr/ robusta_krr/
30 | COPY ./intro.txt intro.txt
31 |
32 | # Run the application using 'poetry run krr simple'
33 | CMD ["python", "krr.py", "simple"]
34 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Robusta
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/build_linux.sh:
--------------------------------------------------------------------------------
1 | # Remove old build
2 | rm -rf build
3 | rm -rf dist
4 |
5 | # MacOS Build first
6 |
7 | # Active venv
8 | # python -m pip install -r requirements.txt
9 | pip install pyinstaller
10 | apt-get install binutils
11 |
12 | # source .venv/bin/activate
13 |
14 | # Build
15 | pyinstaller krr.py
16 | cd dist
17 | # zip -r "krr-linux-v1.1.0.zip" krr
18 |
19 | # Deactivate venv
20 | # deactivate
--------------------------------------------------------------------------------
/build_local.sh:
--------------------------------------------------------------------------------
1 | # Remove old build
2 | rm -rf build
3 | rm -rf dist
4 |
5 | # Active venv
6 | source .venv/bin/activate
7 | pip install -r requirements.txt
8 | pip install pyinstaller
9 |
10 | # Build
11 | pyinstaller krr.py
12 | cd dist
13 | zip -r "krr-macos-v1.1.0.zip" krr
--------------------------------------------------------------------------------
/build_release.sh:
--------------------------------------------------------------------------------
1 | docker buildx build \
2 | --build-arg BUILDKIT_INLINE_CACHE=1 \
3 | --platform linux/arm64,linux/amd64 \
4 | --tag us-central1-docker.pkg.dev/genuine-flight-317411/devel/krr:${TAG} \
5 | --push \
6 | .
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
1 | # Dockerfiles for specific clouds
2 |
3 | This directory will include Dockerfiles for various cloud providers.
4 |
5 | ## AWS
6 |
7 | For the usage of `krr` container we need the Dockerfile to have `awscli` installed on it.
8 | The `aws.Dockerfile` is a modified `krr` dockerfile which includes:
9 | - installation of curl & zip
10 | - installation of awscli
11 |
12 |
13 |
--------------------------------------------------------------------------------
/docker/aws.Dockerfile:
--------------------------------------------------------------------------------
1 | # Use the official Python 3.9 slim image as the base image
2 | FROM python:3.9-slim as builder
3 |
4 | # Set the working directory
5 | WORKDIR /app
6 |
7 | # Install system dependencies required for Poetry
8 | RUN apt-get update && \
9 | dpkg --add-architecture arm64
10 |
11 | COPY ./requirements.txt requirements.txt
12 |
13 | # Install the project dependencies
14 | RUN pip install --no-cache-dir -r requirements.txt
15 |
16 | # Install curl and unzip for awscli
17 | RUN apt-get -y update; apt-get -y install curl; apt-get -y install unzip
18 |
19 | # Download awscli and unzip it
20 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
21 | unzip awscliv2.zip && \
22 | ./aws/install
23 |
24 | # Copy the rest of the application code
25 | COPY . .
26 |
27 | # Run the application using 'poetry run krr simple'
28 | ENTRYPOINT ["python", "krr.py", "simple"]
29 |
--------------------------------------------------------------------------------
/docs/google-cloud-managed-service-for-prometheus.md:
--------------------------------------------------------------------------------
1 | ## Installation instructions for [Google Managed Service for Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus)
2 |
3 | The following instructions assume that you are running [Google Managed Service for Prometheus (GMP)](https://cloud.google.com/stackdriver/docs/managed-prometheus) in its [managed collection](https://cloud.google.com/stackdriver/docs/managed-prometheus/setup-managed) mode and that you have installed krr.
4 |
5 | krr depends upon 2 [cAdvisor](https://github.com/google/cadvisor) [metrics](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md#prometheus-container-metrics):
6 |
7 | 1. `container_cpu_usage_seconds_total`
8 | 1. `container_memory_working_set_bytes`
9 |
10 |
11 | In order for krr to work with GMP, we need to ensure that cAdvisor is enabled and that the GMP Operator is configured to collect these 2 metrics. This can be combined into a single step that involves revising the GMP Operator configuration file `operatorconfig/config` in Namespace `gmp-public`
12 |
13 | Google provides instructions for enabling [Kubelet/cAdvisor](https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/kubelet-cadvisor). This requires adding a `kubeletScraping` section to the configuration file.
14 |
15 | We must also add a `filter` section to the configuration file. The `filter` matches the 2 metrics that krr uses.
16 |
17 | `operatorconfig.krr.patch.yaml`:
18 | ```YAML
19 | collection:
20 | filter:
21 | matchOneOf:
22 | - '{__name__="container_cpu_usage_seconds_total"}'
23 | - '{__name__="container_memory_working_set_bytes"}'
24 | kubeletScraping:
25 | interval: 30s
26 | ```
27 |
28 | There are various ways to make this Resource change to the cluster.
29 |
30 | You can `kubectl edit` the file and manually add the changes:
31 |
32 | ```bash
33 | KUBE_EDITOR="nano" \
34 | kubectl edit operatorconfig/config \
35 | --namespace=gmp-public
36 | ```
37 |
38 | Or you can `kubectl patch` the file:
39 |
40 | ```bash
41 | kubectl patch operatorconfig/config \
42 | --namespace=gmp-public \
43 | --type=merge \
44 | --patch-file=/path/to/operatorconfig.krr.patch.yaml
45 | ```
46 |
47 | ### Test
48 |
49 | There are multiple ways to confirm that GMP is collecting the metrics needed by krr.
50 |
51 | The simplest is to access Google Cloud Console "Metric Diagnostics" and confirm that the "Metrics" section includes the 2 metrics with (recent) "Metric Data Ingested":
52 |
53 | `https://console.cloud.google.com/monitoring/metrics-diagnostics?project={project}`
54 |
55 | > **NOTE** Replace `{project}` with your Google Cloud Project ID.
56 |
57 | Another way is to deploy the [Frontend UI for GMP](https://cloud.google.com/stackdriver/docs/managed-prometheus/query#promui-deploy) and use the UI to browse the metrics.
58 |
59 | GMP implements the [Prometheus HTTP API](https://prometheus.io/docs/prometheus/latest/querying/api/) and, like krr, we can use this to query the metrics:
60 |
61 | ```bash
62 | PROJECT="..." # Google Cloud Project ID
63 | MONITORING="https://monitoring.googleapis.com/v1"
64 | ENDPOINT="${MONITORING}/projects/${PROJECT}/location/global/prometheus"
65 |
66 | TOKEN=$(gcloud auth print-access-token)
67 |
68 | # Either
69 | QUERY="count({__name__=\"container_cpu_usage_seconds_total\"})"
70 | # Or
71 | QUERY="count({__name__=\"container_memory_working_set_bytes\"})"
72 |
73 | curl \
74 | --silent \
75 | --get \
76 | --header "Authorization: Bearer ${TOKEN}" \
77 | --data-urlencode "query=${QUERY}" \
78 | ${ENDPOINT}/api/v1/query
79 | ```
80 | If you have [jq]() installed, you can filter the results to output only the latest value:
81 | ```bash
82 | | jq -r .data.result[0].value[1]
83 | ```
84 |
85 | ### Run krr
86 |
87 | krr leverages Google [Application Default Credentials (ADC)](https://cloud.google.com/docs/authentication/application-default-credentials). Ensure that ADC credentials are accessible (per Google's documentation) before running krr so that krr can authenticate to GMP.
88 |
89 | ```bash
90 | PROJECT="..." # Google Cloud Project ID
91 | MONITORING="https://monitoring.googleapis.com/v1"
92 | ENDPOINT="${MONITORING}/projects/${PROJECT}/location/global/prometheus"
93 |
94 | python krr.py simple \
95 | --prometheus-url=${ENDPOINT}
96 | ```
97 |
--------------------------------------------------------------------------------
/docs/krr-in-cluster/krr-in-cluster-job.yaml:
--------------------------------------------------------------------------------
1 | kind: ClusterRole
2 | apiVersion: rbac.authorization.k8s.io/v1
3 | metadata:
4 | name: krr-cluster-role
5 | namespace: default
6 | rules:
7 | - apiGroups:
8 | - ""
9 | resources:
10 | - configmaps
11 | - daemonsets
12 | - deployments
13 | - namespaces
14 | - pods
15 | - replicasets
16 | - replicationcontrollers
17 | - services
18 | verbs:
19 | - get
20 | - list
21 | - watch
22 |
23 | - apiGroups:
24 | - ""
25 | resources:
26 | - nodes
27 | verbs:
28 | - get
29 | - list
30 | - watch
31 |
32 | - apiGroups:
33 | - apps
34 | resources:
35 | - daemonsets
36 | - deployments
37 | - deployments/scale
38 | - replicasets
39 | - replicasets/scale
40 | - statefulsets
41 | verbs:
42 | - get
43 | - list
44 | - watch
45 |
46 | - apiGroups:
47 | - extensions
48 | resources:
49 | - daemonsets
50 | - deployments
51 | - deployments/scale
52 | - ingresses
53 | - replicasets
54 | - replicasets/scale
55 | - replicationcontrollers/scale
56 | verbs:
57 | - get
58 | - list
59 | - watch
60 | - apiGroups:
61 | - batch
62 | resources:
63 | - cronjobs
64 | - jobs
65 | verbs:
66 | - get
67 | - list
68 | - watch
69 | - apiGroups:
70 | - "autoscaling"
71 | resources:
72 | - horizontalpodautoscalers
73 | verbs:
74 | - get
75 | - list
76 | - watch
77 |
78 | ---
79 | apiVersion: v1
80 | kind: ServiceAccount
81 | metadata:
82 | name: krr-service-account
83 | namespace: default
84 | ---
85 | apiVersion: rbac.authorization.k8s.io/v1
86 | kind: ClusterRoleBinding
87 | metadata:
88 | name: krr-cluster-role-binding
89 | roleRef:
90 | apiGroup: rbac.authorization.k8s.io
91 | kind: ClusterRole
92 | name: krr-cluster-role
93 | subjects:
94 | - kind: ServiceAccount
95 | name: krr-service-account
96 | namespace: default
97 |
98 | ---
99 | apiVersion: batch/v1
100 | kind: Job
101 | metadata:
102 | name: krr
103 | namespace: default
104 | spec:
105 | template:
106 | spec:
107 | containers:
108 | - command:
109 | - /bin/sh
110 | - -c
111 | - "python krr.py simple --max-workers 3 --width 2048 "
112 | image: robustadev/krr:v1.17.0
113 | imagePullPolicy: Always
114 | name: krr
115 | resources:
116 | limits:
117 | memory: 2Gi
118 | requests:
119 | memory: 1Gi
120 | restartPolicy: Never
121 | serviceAccount: krr-service-account
122 | serviceAccountName: krr-service-account
123 |
--------------------------------------------------------------------------------
/enforcer/Dockerfile:
--------------------------------------------------------------------------------
1 | # Use the official Python 3.9 slim image as the base image
2 | FROM python:3.12-slim
3 | ENV LANG=C.UTF-8
4 | ENV PYTHONDONTWRITEBYTECODE=1
5 | ENV PYTHONUNBUFFERED=1
6 | ENV PATH="/app/venv/bin:$PATH"
7 |
8 | # We're installing here libexpat1, to upgrade the package to include a fix to 3 high CVEs. CVE-2024-45491,CVE-2024-45490,CVE-2024-45492
9 | RUN apt-get update \
10 | && apt-get install -y --no-install-recommends libexpat1 \
11 | && rm -rf /var/lib/apt/lists/*
12 |
13 | # Set the working directory
14 | WORKDIR /app/enforcer
15 |
16 | COPY ./*.py .
17 | COPY ./dal/ dal/
18 | COPY ./resources/ resources/
19 | COPY ./requirements.txt requirements.txt
20 |
21 |
22 | RUN pip install --no-cache-dir --upgrade pip
23 | # Install the project dependencies
24 | RUN python -m ensurepip --upgrade
25 | RUN pip install --no-cache-dir -r requirements.txt
26 |
27 | CMD ["python", "enforcer_main.py"]
28 |
--------------------------------------------------------------------------------
/enforcer/README.md:
--------------------------------------------------------------------------------
1 | # KRR Enforcer - Kubernetes Resource Recommendation Mutation Webhook
2 |
3 | A mutating webhook server that automatically enforces [KRR (Kubernetes Resource Recommender)](https://github.com/robusta-dev/krr) recommendations by patching pod resource requests and limits in real-time.
4 |
5 | ## Features
6 |
7 | - **Automatic Resource Enforcement**: Applies KRR recommendations to pods during pod creation
8 | - **Flexible Enforcement Modes**: Support for enforce/ignore modes per workload
9 | - **REST API**: Query recommendations via HTTP endpoints
10 |
11 | ## Enforcement Modes
12 |
13 | Enforcement can be configured globally or on a per-workload basis.
14 |
15 | ### Global Enforcement Mode
16 | The global default mode is configured via the `KRR_MUTATION_MODE_DEFAULT` environment variable:
17 | - `enforce` - Apply recommendations to all pods by default
18 | - `ignore` - Skip enforcement for all pods by default
19 |
20 | ### Per-Workload Mode
21 | You can override the default mode for specific workloads using the annotation:
22 |
23 | ```yaml
24 | apiVersion: apps/v1
25 | kind: Deployment
26 | metadata:
27 | name: my-app
28 | spec:
29 | template:
30 | metadata:
31 | annotations:
32 | admission.robusta.dev/krr-mutation-mode: enforce # or "ignore"
33 | ```
34 |
35 | **Mode Priority**: Pod annotation > Global default
36 |
37 | ## Webhook Failure Mode
38 |
39 | The webhook uses `failurePolicy: Ignore` by default, meaning if the webhook fails, pods are created without resource optimization rather than being blocked.
40 |
41 |
42 | ## Installation with Helm
43 |
44 | ### Prerequisites
45 | - Helm 3.x
46 | - Prometheus Operator (optional, for metrics collection)
47 | - Robusta UI account - used to store KRR scan results
48 |
49 | ### Certificate
50 |
51 | - Each helm install/upgrade, a new certificate is created and deployed for the admission webhook.
52 | - The certificate is set to expire after 1 year.
53 | - In order to avoid certificate expiration, you must upgrade the enforcer helm release, at least once a year.
54 |
55 | ### Quick Start
56 |
57 | 1. **Add the helm repository** (if available):
58 | ```bash
59 | helm repo add robusta https://robusta-charts.storage.googleapis.com && helm repo update
60 | ```
61 |
62 | 2. **Add cluster configuration**:
63 |
64 | If the enforcer is installed in the same namespace as Robusta, it will automatically detect the Robusta account settings.
65 |
66 | If your Robusta UI sink token, is pulled from a secret (as described [here](https://docs.robusta.dev/master/setup-robusta/configuration-secrets.html#pulling-values-from-kubernetes-secrets)), you should add the same environement variable to the `Enforcer` pod as well.
67 |
68 | If the `Enforcer` is installed on a different namespace, you can provide your Robusta account credentials using env variables:
69 |
70 | Add your robusta credentials and cluster name: (`enforcer-values.yaml`)
71 |
72 | ```yaml
73 | additionalEnvVars:
74 | - name: CLUSTER_NAME
75 | value: my-cluster-name # should be the same as the robusta installation on this cluster
76 | - name: ROBUSTA_UI_TOKEN
77 | value: "MY ROBUSTA UI TOKEN"
78 | # - name: ROBUSTA_UI_TOKEN # or pulled from a secret
79 | # valueFrom:
80 | # secretKeyRef:
81 | # name: robusta-secrets
82 | # key: robustaSinkToken
83 | ```
84 |
85 | 2. **Install with default settings**:
86 | ```bash
87 | helm install krr-enforcer robusta/krr-enforcer -f enforcer-values.yaml
88 | ```
89 |
90 | ### Helm values
91 |
92 | | Parameter | Description | Default |
93 | |-----------|---------------------------------------------------------------------|---------|
94 | | `logLevel` | Log level (DEBUG, INFO, WARN, ERROR) | `INFO` |
95 | | `certificate` | Base64-encoded custom CA certificate - for self signed certificates | `""` |
96 | | `serviceMonitor.enabled` | Enable Prometheus ServiceMonitor | `true` |
97 | | `resources.requests.cpu` | CPU request for the enforcer pod | `100m` |
98 | | `resources.requests.memory` | Memory request for the enforcer pod | `256Mi` |
99 |
100 |
101 | ## Running Locally
102 |
103 | ### Prerequisites
104 | - Python 3.9+
105 | - Access to a Kubernetes cluster
106 | - KRR recommendations data from Robusta UI
107 |
108 | ### Setup
109 |
110 | 1. **Install dependencies**:
111 | ```bash
112 | pip install -r requirements.txt
113 | ```
114 |
115 | 2. **Set environment variables**:
116 | ```bash
117 | export ENFORCER_SSL_KEY_FILE="path/to/tls.key"
118 | export ENFORCER_SSL_CERT_FILE="path/to/tls.crt"
119 | export LOG_LEVEL="DEBUG"
120 | export KRR_MUTATION_MODE_DEFAULT="enforce"
121 | ```
122 |
123 | 3. **Generate TLS certificates**:
124 | ```bash
125 | # Generate private key
126 | openssl genrsa -out tls.key 2048
127 |
128 | # Generate certificate signing request
129 | openssl req -new -key tls.key -out tls.csr \
130 | -subj "/CN=krr-enforcer.krr-system.svc"
131 |
132 | # Generate self-signed certificate
133 | openssl x509 -req -in tls.csr -signkey tls.key -out tls.crt -days 365
134 | ```
135 |
136 | 4. **Run the server**:
137 | ```bash
138 | python enforcer_main.py
139 | ```
140 |
141 | The server will start on `https://localhost:8443` with the following endpoints:
142 |
143 | - `POST /mutate` - Webhook endpoint for Kubernetes admission control
144 | - `GET /health` - Health check endpoint
145 | - `GET /metrics` - Prometheus metrics
146 | - `GET /recommendations/{namespace}/{kind}/{name}` - Query recommendations
147 |
148 | ### Local Development Tips
149 |
150 | - Use `LOG_LEVEL=DEBUG` for detailed request/response logging
151 | - Test webhook locally using tools like `curl` or `httpie`
152 | - Monitor metrics at `https://localhost:8443/metrics`
153 | - Query recommendations: `GET https://localhost:8443/recommendations/default/Deployment/my-app`
154 |
155 | ### Testing the Webhook
156 |
157 | ```bash
158 | # Test health endpoint
159 | curl -k https://localhost:8443/health
160 |
161 | # Test metrics endpoint
162 | curl -k https://localhost:8443/metrics
163 |
164 | # Test recommendations endpoint
165 | curl -k https://localhost:8443/recommendations/default/Deployment/my-app
166 | ```
167 |
168 | ## Metrics
169 |
170 | The enforcer exposes Prometheus metrics at `/metrics`:
171 |
172 | - `krr_pod_admission_mutations_total` - Total pod mutations (with `mutated` label)
173 | - `krr_replicaset_admissions_total` - Total ReplicaSet admissions (with `operation` label)
174 | - `krr_rs_owners_map_size` - Current size of the ReplicaSet owners map
175 | - `krr_admission_duration_seconds` - Duration of admission operations (with `kind` label)
176 |
177 | ## API Endpoints
178 |
179 | ### GET /recommendations/{namespace}/{kind}/{name}
180 |
181 | Retrieve recommendations for a specific workload:
182 |
183 | ```bash
184 | curl -k https://krr-enforcer.krr-system.svc.cluster.local/recommendations/default/Deployment/my-app
185 | ```
186 |
187 | Response:
188 | ```json
189 | {
190 | "namespace": "default",
191 | "kind": "Deployment",
192 | "name": "my-app",
193 | "containers": {
194 | "web": {
195 | "cpu": {
196 | "request": "100m",
197 | "limit": "200m"
198 | },
199 | "memory": {
200 | "request": "128Mi",
201 | "limit": "256Mi"
202 | }
203 | }
204 | }
205 | }
206 | ```
207 |
208 | ## Troubleshooting
209 |
210 | ### Common Issues
211 |
212 | 1. **Certificate Errors**: Ensure TLS certificates are properly configured and valid
213 | 2. **Permission Denied**: Verify the ServiceAccount has proper RBAC permissions
214 | 3. **No Recommendations**: Check that KRR has generated recommendations and they're accessible
215 | 4. **Webhook Timeout**: Increase `timeoutSeconds` in MutatingWebhookConfiguration
216 |
217 | ### Debug Mode
218 |
219 | Enable debug logging to troubleshoot issues:
220 |
221 | ```bash
222 | helm upgrade krr-enforcer ./helm/krr-enforcer --set logLevel=DEBUG
223 | ```
224 |
225 | ### Logs
226 |
227 | Check enforcer logs:
228 | ```bash
229 | kubectl logs -n krr-system deployment/krr-enforcer-krr-enforcer -f
230 | ```
--------------------------------------------------------------------------------
/enforcer/dal/robusta_config.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict
2 | from pydantic import BaseModel
3 |
4 |
5 | class RobustaConfig(BaseModel):
6 | sinks_config: List[Dict[str, Dict]]
7 | global_config: dict
8 |
9 | class RobustaToken(BaseModel):
10 | store_url: str
11 | api_key: str
12 | account_id: str
13 | email: str
14 | password: str
--------------------------------------------------------------------------------
/enforcer/env_vars.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | ROBUSTA_CONFIG_PATH = os.environ.get(
4 | "ROBUSTA_CONFIG_PATH", "/etc/robusta/config/active_playbooks.yaml"
5 | )
6 | ROBUSTA_ACCOUNT_ID = os.environ.get("ROBUSTA_ACCOUNT_ID", "")
7 | STORE_URL = os.environ.get("STORE_URL", "")
8 | STORE_API_KEY = os.environ.get("STORE_API_KEY", "")
9 | STORE_EMAIL = os.environ.get("STORE_EMAIL", "")
10 | STORE_PASSWORD = os.environ.get("STORE_PASSWORD", "")
11 |
12 | DISCOVERY_MAX_BATCHES = int(os.environ.get("DISCOVERY_MAX_BATCHES", 50))
13 | DISCOVERY_BATCH_SIZE = int(os.environ.get("DISCOVERY_BATCH_SIZE", 30000))
14 |
15 | UPDATE_THRESHOLD = float(os.environ.get("UPDATE_THRESHOLD", 20.0))
16 |
17 | SCAN_RELOAD_INTERVAL = int(os.environ.get("SCAN_RELOAD_INTERVAL", 3600))
18 | KRR_MUTATION_MODE_DEFAULT = os.environ.get("KRR_MUTATION_MODE_DEFAULT", "enforce")
19 | REPLICA_SET_CLEANUP_INTERVAL = int(os.environ.get("REPLICA_SET_CLEANUP_INTERVAL", 600))
20 | REPLICA_SET_DELETION_WAIT = int(os.environ.get("REPLICA_SET_DELETION_WAIT", 600))
21 | SCAN_AGE_HOURS_THRESHOLD = int(os.environ.get("SCAN_AGE_HOURS_THRESHOLD", 360)) # 15 days
22 |
23 | ENFORCER_SSL_KEY_FILE = os.environ.get("ENFORCER_SSL_KEY_FILE", "")
24 | ENFORCER_SSL_CERT_FILE = os.environ.get("ENFORCER_SSL_CERT_FILE", "")
--------------------------------------------------------------------------------
/enforcer/metrics.py:
--------------------------------------------------------------------------------
1 | from prometheus_client import Counter, Histogram, Gauge
2 |
3 | # Prometheus metrics
4 | pod_admission_mutations = Counter(
5 | 'krr_pod_admission_mutations_total',
6 | 'Total pod admission mutations',
7 | ['mutated', 'reason'] # labels: 'true' or 'false', reason for success/failure
8 | )
9 |
10 | replicaset_admissions = Counter(
11 | 'krr_replicaset_admissions_total',
12 | 'Total replicaset admissions',
13 | ['operation'] # labels: CREATE, DELETE, etc.
14 | )
15 |
16 | rs_owners_size = Gauge(
17 | 'krr_rs_owners_map_size',
18 | 'Current size of the rs_owners map'
19 | )
20 |
21 | admission_duration = Histogram(
22 | 'krr_admission_duration_seconds',
23 | 'Duration of admission operations',
24 | ['kind'] # labels: Pod, ReplicaSet
25 | )
--------------------------------------------------------------------------------
/enforcer/model.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Optional, Dict, Any, List
3 |
4 | from pydantic import BaseModel
5 |
6 |
7 | class PodOwner(BaseModel):
8 | kind: str
9 | name: str
10 | namespace: str
11 |
12 | class RsOwner(BaseModel):
13 | rs_name: str
14 | namespace: str
15 | owner_name: str
16 | owner_kind: str
17 | deletion_ts: Optional[float] = None
18 |
19 | class Resources(BaseModel):
20 | request: float
21 | limit: Optional[float]
22 |
23 |
24 | class ContainerRecommendation(BaseModel):
25 | cpu: Optional[Resources] = None
26 | memory: Optional[Resources] = None
27 |
28 | @staticmethod
29 | def build(recommendation: Dict[str, Any]) -> Optional["ContainerRecommendation"]:
30 | resource_recommendation = ContainerRecommendation()
31 | content: List[Dict] = recommendation["content"]
32 | for container_resource in content:
33 | resource = container_resource["resource"]
34 | if resource not in ["memory", "cpu"]:
35 | continue
36 |
37 | recommended: Dict[str, Any] = container_resource["recommended"]
38 | request = recommended.get("request", 0.0)
39 | limit = recommended.get("limit", None)
40 |
41 | if request == 0.0:
42 | logging.debug("skipping container recommendations without request, %s", recommendation)
43 | return None
44 |
45 | if request == "?" or limit == "?":
46 | logging.debug("skipping container recommendations with '?', %s", recommendation)
47 | return None
48 |
49 | resources = Resources(request=request, limit=limit)
50 | if resource == "memory":
51 | resource_recommendation.memory = resources
52 | elif resource == "cpu":
53 | resource_recommendation.cpu = resources
54 |
55 | return resource_recommendation
56 |
57 |
58 | class WorkloadRecommendation(BaseModel):
59 | workload_key: str
60 | container_recommendations: Dict[str, ContainerRecommendation] = {}
61 |
62 | def get(self, container: str) -> Optional[ContainerRecommendation]:
63 | return self.container_recommendations.get(container, None)
64 |
65 |
66 | def add(self, container: str, recommendation: ContainerRecommendation):
67 | self.container_recommendations[container] = recommendation
--------------------------------------------------------------------------------
/enforcer/params_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 | from typing import Dict, Optional
5 |
6 | from pydantic.types import SecretStr
7 |
8 | def get_env_replacement(value: str) -> Optional[str]:
9 | env_values = re.findall(r"{{[ ]*env\.(.*)[ ]*}}", value)
10 | if env_values:
11 | env_var_value = os.environ.get(env_values[0].strip(), None)
12 | if not env_var_value:
13 | msg = f"ENV var replacement {env_values[0]} does not exist for param: {value}"
14 | logging.error(msg)
15 | raise Exception(msg)
16 | return env_var_value
17 | return None
18 |
19 |
20 | def replace_env_vars_values(values: Dict) -> Dict:
21 | for key, value in values.items():
22 | if isinstance(value, str):
23 | env_var_value = get_env_replacement(value)
24 | if env_var_value:
25 | values[key] = env_var_value
26 | elif isinstance(value, SecretStr):
27 | env_var_value = get_env_replacement(value.get_secret_value())
28 | if env_var_value:
29 | values[key] = SecretStr(env_var_value)
30 | elif isinstance(value, dict):
31 | env_var_value = replace_env_vars_values(value)
32 | if env_var_value:
33 | values[key] = env_var_value
34 |
35 | return values
36 |
--------------------------------------------------------------------------------
/enforcer/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.109.2
2 | uvicorn==0.27.1
3 | pydantic==2.6.1
4 | supabase==2.5
5 | PyYAML==6.0.1
6 | cachetools==5.3.3
7 | prometheus-client==0.20.0
8 | kubernetes==26.1.0
9 |
--------------------------------------------------------------------------------
/enforcer/resources/kubernetes_resource_loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from typing import List
4 |
5 | from enforcer.env_vars import DISCOVERY_MAX_BATCHES, DISCOVERY_BATCH_SIZE
6 | from kubernetes import client
7 | from kubernetes.client import V1ReplicaSetList
8 | from kubernetes import config
9 |
10 | from enforcer.model import RsOwner
11 |
12 | if os.getenv("KUBERNETES_SERVICE_HOST"):
13 | config.load_incluster_config()
14 | else:
15 | config.load_kube_config()
16 |
17 |
18 | class KubernetesResourceLoader:
19 |
20 | @staticmethod
21 | def load_replicasets() -> List[RsOwner]:
22 | cluster_rs: List[RsOwner] = []
23 | continue_ref = None
24 | for batch_num in range(DISCOVERY_MAX_BATCHES):
25 | replicasets: V1ReplicaSetList = client.AppsV1Api().list_replica_set_for_all_namespaces(
26 | limit=DISCOVERY_BATCH_SIZE, _continue=continue_ref
27 | )
28 |
29 | for replicaset in replicasets.items:
30 | owner_references = replicaset.metadata.owner_references
31 | if owner_references:
32 | rs_owner = owner_references[0]
33 | if len(owner_references) > 1:
34 | logging.warning(f"ReplicasSet with multiple owner_references: {owner_references}")
35 | controllers = [owner for owner in owner_references if owner.get("controller", False)]
36 | if controllers:
37 | rs_owner = controllers[0]
38 |
39 | cluster_rs.append(RsOwner(
40 | rs_name=replicaset.metadata.name,
41 | namespace=replicaset.metadata.namespace,
42 | owner_name=rs_owner.name,
43 | owner_kind=rs_owner.kind,
44 | ))
45 |
46 | continue_ref = replicasets.metadata._continue
47 | if not continue_ref:
48 | break
49 |
50 | if batch_num == DISCOVERY_MAX_BATCHES - 1:
51 | replicas_limit = DISCOVERY_MAX_BATCHES * DISCOVERY_BATCH_SIZE
52 | logging.warning(f"Reached replicas loading limit: {replicas_limit}.")
53 |
54 | return cluster_rs
55 |
--------------------------------------------------------------------------------
/enforcer/resources/owner_store.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import threading
3 | import time
4 | from typing import Dict, Any, Optional, List
5 |
6 | from enforcer.env_vars import REPLICA_SET_CLEANUP_INTERVAL, REPLICA_SET_DELETION_WAIT
7 | from enforcer.metrics import rs_owners_size
8 | from enforcer.model import PodOwner, RsOwner
9 | from enforcer.resources.kubernetes_resource_loader import KubernetesResourceLoader
10 |
11 |
12 | class OwnerStore:
13 |
14 | def __init__(self):
15 | self.rs_owners: Dict[str, RsOwner] = {}
16 | self._rs_owners_lock = threading.Lock()
17 | self._owners_loaded = threading.Event()
18 | self._loading_in_progress = threading.Lock()
19 | self.cleanup_interval = REPLICA_SET_CLEANUP_INTERVAL
20 | self._stop_event = threading.Event()
21 | self._cleanup_thread = threading.Thread(target=self._periodic_cleanup, daemon=True)
22 | self._cleanup_thread.start()
23 |
24 | def _rs_key(self, rs_name: str, namespace: str) -> str:
25 | return f"{namespace}/{rs_name}"
26 |
27 | def finalize_owner_initialization(self):
28 | """Initialize rs_owners on-demand, thread-safe, only once."""
29 | if self._owners_loaded.is_set():
30 | return # Already loaded
31 |
32 | # Try to acquire the loading lock without blocking
33 | if not self._loading_in_progress.acquire(blocking=False):
34 | # Another thread is loading, just return
35 | return
36 |
37 | try:
38 | if self._owners_loaded.is_set():
39 | return
40 |
41 | replica_sets_owners: List[RsOwner] = KubernetesResourceLoader.load_replicasets()
42 | loaded_owners: Dict[str, RsOwner] = {}
43 | for owner in replica_sets_owners:
44 | loaded_owners[self._rs_key(owner.rs_name, owner.namespace)] = owner
45 |
46 | with self._rs_owners_lock:
47 | self.rs_owners.update(loaded_owners)
48 | rs_owners_size.set(len(self.rs_owners))
49 |
50 | self._owners_loaded.set()
51 | logging.info(f"Loaded {len(loaded_owners)} ReplicaSet owners")
52 |
53 | except Exception:
54 | logging.exception(f"Failed to load ReplicaSet owners")
55 | finally:
56 | self._loading_in_progress.release()
57 |
58 | @staticmethod
59 | def get_pod_name(metadata: Dict[str, Any]) -> str:
60 | # if the pod's name is randomized, the name is under generateName
61 | return metadata.get("name") or metadata.get("generateName")
62 |
63 | def get_pod_owner(self, pod: Dict[str, Any]) -> Optional[PodOwner]:
64 | metadata = pod.get("metadata", {})
65 | owner_references = metadata.get("ownerReferences", [])
66 | namespace: str = metadata.get("namespace")
67 |
68 | try:
69 | if not owner_references: # pod has no owner, standalone pod. Return the pod
70 | return PodOwner(
71 | kind="Pod", namespace=namespace, name=self.get_pod_name(pod)
72 | )
73 |
74 | # get only owners with controller == true
75 | controllers = [owner for owner in owner_references if owner.get("controller", False)]
76 | if controllers:
77 | if len(controllers) > 1:
78 | logging.warning(f"Multiple controllers found for {pod}")
79 |
80 | controller = controllers[0]
81 | controller_kind: str = controller.get("kind")
82 | if controller_kind == "ReplicaSet":
83 | with self._rs_owners_lock:
84 | rs_owner = self.rs_owners.get(self._rs_key(controller.get("name"), namespace), None)
85 | return PodOwner(
86 | name=rs_owner.owner_name,
87 | namespace=rs_owner.namespace,
88 | kind=rs_owner.owner_kind,
89 | ) if rs_owner else None
90 | else: # Pod owner is a k8s workload: Job, StatefulSet, DaemonSet
91 | return PodOwner(kind=controller_kind, name=controller.get("name"), namespace=namespace)
92 | except Exception:
93 | logging.exception(f"Failed to get pod owner for {pod}")
94 |
95 | return None
96 |
97 | def handle_rs_admission(self, request: Dict[str, Any]):
98 | logging.debug(f"handle_rs_admission %s", request)
99 | operation = request.get("operation")
100 | if operation == "DELETE":
101 | old_object = request.get("oldObject") or {} # delete has old object
102 | metadata = old_object.get("metadata", {})
103 | rs_name = metadata.get("name")
104 | namespace = metadata.get("namespace")
105 | if rs_name and namespace:
106 | with self._rs_owners_lock:
107 | rs_owner = self.rs_owners.get(self._rs_key(rs_name, namespace), None)
108 | if rs_owner:
109 | rs_owner.deletion_ts = time.time()
110 | elif operation == "CREATE":
111 | self._add_rs_owner(request)
112 |
113 | def _add_rs_owner(self, rs_create_request: Dict[str, Any]):
114 | metadata = rs_create_request.get("object", {}).get("metadata", {})
115 | owner_references = metadata.get("ownerReferences", [])
116 | if len(owner_references):
117 | rs_owner = RsOwner(
118 | rs_name=metadata.get("name"),
119 | namespace=metadata.get("namespace"),
120 | owner_name=owner_references[0].get("name"),
121 | owner_kind=owner_references[0].get("kind"),
122 | )
123 | with self._rs_owners_lock:
124 | self.rs_owners[self._rs_key(rs_owner.rs_name, rs_owner.namespace)] = rs_owner
125 | else:
126 | logging.warning(f"No owner references for {rs_create_request}")
127 |
128 |
129 | def _cleanup_deleted_replica_sets(self):
130 | current_time = time.time()
131 |
132 | with self._rs_owners_lock:
133 | # Delete rs owners that were deleted more than REPLICA_SET_DELETION_WAIT seconds ago
134 | keys_to_delete = [
135 | key for key, rs_owner in self.rs_owners.items()
136 | if rs_owner.deletion_ts is not None and (current_time - rs_owner.deletion_ts) >= REPLICA_SET_DELETION_WAIT
137 | ]
138 |
139 | for key in keys_to_delete:
140 | del self.rs_owners[key]
141 |
142 | def _periodic_cleanup(self):
143 | while not self._stop_event.wait(self.cleanup_interval):
144 | try:
145 | self._cleanup_deleted_replica_sets()
146 | logging.debug("Deleted replicasets cleanup completed")
147 | except Exception as e:
148 | logging.exception(f"Failed to cleanup deleted replicasets")
149 |
150 | def get_rs_owners_count(self) -> int:
151 | with self._rs_owners_lock:
152 | return len(self.rs_owners)
153 |
154 | def stop(self):
155 | self._stop_event.set()
156 | self._cleanup_thread.join()
--------------------------------------------------------------------------------
/enforcer/resources/recommendation_store.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import threading
3 | from typing import Dict, Optional, Tuple
4 |
5 | from enforcer.dal.supabase_dal import SupabaseDal
6 | from enforcer.env_vars import SCAN_RELOAD_INTERVAL
7 | from enforcer.model import WorkloadRecommendation, ContainerRecommendation
8 |
9 |
10 | class RecommendationStore:
11 |
12 | def __init__(self, dal: SupabaseDal):
13 | self.dal = dal
14 | self.recommendations: Dict[str, WorkloadRecommendation] = {}
15 | self.scan_id: Optional[str] = None
16 | self._recommendations_lock = threading.Lock()
17 | self._reload_recommendations()
18 |
19 | self.reload_interval = SCAN_RELOAD_INTERVAL
20 | self._stop_event = threading.Event()
21 | self._reload_thread = threading.Thread(target=self._periodic_reload, daemon=True)
22 | self._reload_thread.start()
23 |
24 |
25 | def _load_recommendations(self, current_stored_scan: Optional[str]) -> Tuple[Optional[str], Optional[Dict[str, WorkloadRecommendation]]]:
26 | latest_scan_id, latest_scan = self.dal.get_latest_krr_scan(current_stored_scan)
27 |
28 | if not latest_scan:
29 | return None, None
30 |
31 | # group workload containers recommendations, into WorkloadRecommendation object
32 | scan_recommendations: Dict[str, WorkloadRecommendation] = {}
33 | for container_recommendation in latest_scan:
34 | try:
35 | store_key = self._store_key(
36 | name=container_recommendation["name"],
37 | namespace=container_recommendation["namespace"],
38 | kind=container_recommendation["kind"],
39 | )
40 |
41 | recommendation = ContainerRecommendation.build(container_recommendation)
42 | if recommendation: # if a valid recommendation was created, connect it to the workload
43 | workload_recommendation: WorkloadRecommendation = scan_recommendations.get(store_key, None)
44 | if not workload_recommendation:
45 | workload_recommendation = WorkloadRecommendation(workload_key=store_key)
46 | scan_recommendations[store_key] = workload_recommendation
47 |
48 | workload_recommendation.add(container_recommendation["container"], recommendation)
49 | except Exception:
50 | logging.exception(f"Failed to load container recommendation: {container_recommendation}")
51 |
52 | return latest_scan_id, scan_recommendations
53 |
54 | def _store_key(self, name: str, namespace: str, kind: str) -> str:
55 | return f"{namespace}/{name}/{kind}"
56 |
57 | def _reload_recommendations(self):
58 | scan_id, new_recommendations = self._load_recommendations(self.scan_id)
59 | if new_recommendations is not None:
60 | with self._recommendations_lock:
61 | self.recommendations = new_recommendations
62 | self.scan_id = scan_id
63 | logging.info("Recommendations reloaded successfully")
64 | logging.debug("Loaded recommendations: %s", new_recommendations)
65 |
66 | def _periodic_reload(self):
67 | while not self._stop_event.wait(self.reload_interval):
68 | try:
69 | self._reload_recommendations()
70 | except Exception as e:
71 | logging.error(f"Failed to reload recommendations: {e}")
72 |
73 | def stop(self):
74 | self._stop_event.set()
75 | self._reload_thread.join()
76 |
77 | def get_recommendations(self, name: str, namespace: str, kind: str) -> Optional[WorkloadRecommendation]:
78 | with self._recommendations_lock:
79 | return self.recommendations.get(self._store_key(name, namespace, kind))
80 |
81 |
--------------------------------------------------------------------------------
/enforcer/utils.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import os
3 |
4 | import certifi
5 |
6 | CUSTOM_CERTIFICATE_PATH = "/tmp/custom_ca.pem"
7 |
8 |
9 | def append_custom_certificate(custom_ca: str) -> None:
10 | with open(certifi.where(), "ab") as outfile:
11 | outfile.write(base64.b64decode(custom_ca))
12 |
13 | os.environ["WEBSOCKET_CLIENT_CA_BUNDLE"] = certifi.where()
14 |
15 |
16 | def create_temporary_certificate(custom_ca: str) -> None:
17 | with open(certifi.where(), "rb") as base_cert:
18 | base_cert_content = base_cert.read()
19 |
20 | with open(CUSTOM_CERTIFICATE_PATH, "wb") as outfile:
21 | outfile.write(base_cert_content)
22 | outfile.write(base64.b64decode(custom_ca))
23 |
24 | os.environ["REQUESTS_CA_BUNDLE"] = CUSTOM_CERTIFICATE_PATH
25 | os.environ["WEBSOCKET_CLIENT_CA_BUNDLE"] = CUSTOM_CERTIFICATE_PATH
26 | certifi.where = lambda: CUSTOM_CERTIFICATE_PATH
27 |
28 |
29 | def add_custom_certificate(custom_ca: str) -> bool:
30 | if not custom_ca:
31 | return False
32 |
33 | # NOTE: Sometimes (Openshift) the certifi.where() is not writable, so we need to
34 | # use a temporary file in case of PermissionError.
35 | try:
36 | append_custom_certificate(custom_ca)
37 | except PermissionError:
38 | create_temporary_certificate(custom_ca)
39 |
40 | return True
41 |
--------------------------------------------------------------------------------
/examples/custom_formatter.py:
--------------------------------------------------------------------------------
1 | # This is an example on how to create your own custom formatter
2 |
3 | from __future__ import annotations
4 |
5 | import robusta_krr
6 | from robusta_krr.api import formatters
7 | from robusta_krr.api.models import Result
8 |
9 |
10 | # This is a custom formatter
11 | # It will be available to the CLI as `my_formatter`
12 | # Rich console will be enabled in this case, so the output will be colored and formatted
13 | @formatters.register(rich_console=True)
14 | def my_formatter(result: Result) -> str:
15 | # Return custom formatter
16 | return "Custom formatter"
17 |
18 |
19 | # Running this file will register the formatter and make it available to the CLI
20 | # Run it as `python ./custom_formatter.py simple --formater my_formatter`
21 | if __name__ == "__main__":
22 | robusta_krr.run()
23 |
--------------------------------------------------------------------------------
/examples/custom_severity_calculator.py:
--------------------------------------------------------------------------------
1 | # This is an example on how to create your own custom formatter
2 |
3 | from __future__ import annotations
4 |
5 | from typing import Optional
6 |
7 | import robusta_krr
8 | from robusta_krr.api.models import ResourceType, Severity, register_severity_calculator
9 |
10 |
11 | @register_severity_calculator(ResourceType.CPU)
12 | def percentage_severity_calculator(
13 | current: Optional[float], recommended: Optional[float], resource_type: ResourceType
14 | ) -> Severity:
15 | """
16 | This is an example on how to create your own custom severity calculator
17 | You can use this decorator to bind a severity calculator function to a resource type.
18 | The function will be called with the current value, the recommended value and the resource type.
19 | The function should return a Severity enum value.
20 |
21 | If you have the same calculation for multiple resource types, you can use the `bind_calculator` decorator multiple times.
22 | Then, the function will be called for each resource type and you can use the resource type to distinguish between them.
23 |
24 | Keep in mind that you can not choose the strategy for the resource type using CLI - the last one created for the resource type will be used.
25 | """
26 |
27 | if current is None and recommended is None:
28 | return Severity.GOOD
29 | if current is None or recommended is None:
30 | return Severity.WARNING
31 |
32 | diff = abs(current - recommended) / current
33 | if diff >= 0.5:
34 | return Severity.CRITICAL
35 | elif diff >= 0.25:
36 | return Severity.WARNING
37 | elif diff >= 0.1:
38 | return Severity.OK
39 | else:
40 | return Severity.GOOD
41 |
42 |
43 | # Running this file will register the formatter and make it available to the CLI
44 | # Run it as `python ./custom_formatter.py simple --formater my_formatter`
45 | if __name__ == "__main__":
46 | robusta_krr.run()
47 |
--------------------------------------------------------------------------------
/examples/custom_strategy.py:
--------------------------------------------------------------------------------
1 | # This is an example on how to create your own custom strategy
2 |
3 | import pydantic as pd
4 |
5 | import robusta_krr
6 | from robusta_krr.api.models import K8sObjectData, MetricsPodData, ResourceRecommendation, ResourceType, RunResult
7 | from robusta_krr.api.strategies import BaseStrategy, StrategySettings
8 | from robusta_krr.core.integrations.prometheus.metrics import MaxMemoryLoader, PercentileCPULoader
9 |
10 |
11 | # Providing description to the settings will make it available in the CLI help
12 | class CustomStrategySettings(StrategySettings):
13 | param_1: float = pd.Field(99, gt=0, description="First example parameter")
14 | param_2: float = pd.Field(105_000, gt=0, description="Second example parameter")
15 |
16 |
17 | class CustomStrategy(BaseStrategy[CustomStrategySettings]):
18 | """
19 | A custom strategy that uses the provided parameters for CPU and memory.
20 | Made only in order to demonstrate how to create a custom strategy.
21 | """
22 |
23 | display_name = "custom" # The name of the strategy
24 | rich_console = True # Whether to use rich console for the CLI
25 | metrics = [PercentileCPULoader(90), MaxMemoryLoader] # The metrics to use for the strategy
26 |
27 | def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult:
28 | return {
29 | ResourceType.CPU: ResourceRecommendation(request=self.settings.param_1, limit=None),
30 | ResourceType.Memory: ResourceRecommendation(request=self.settings.param_2, limit=self.settings.param_2),
31 | }
32 |
33 |
34 | # Running this file will register the strategy and make it available to the CLI
35 | # Run it as `python ./custom_strategy.py my_strategy`
36 | if __name__ == "__main__":
37 | robusta_krr.run()
38 |
--------------------------------------------------------------------------------
/helm/krr-enforcer/.helmignore:
--------------------------------------------------------------------------------
1 | # Patterns to ignore when building packages.
2 | # This supports shell glob matching, relative path matching, and
3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store
5 | # Common VCS dirs
6 | .git/
7 | .gitignore
8 | .bzr/
9 | .bzrignore
10 | .hg/
11 | .hgignore
12 | .svn/
13 | # Common backup files
14 | *.swp
15 | *.bak
16 | *.tmp
17 | *.orig
18 | *~
19 | # Various IDEs
20 | .project
21 | .idea/
22 | *.tmproj
23 | .vscode/
24 |
--------------------------------------------------------------------------------
/helm/krr-enforcer/Chart.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v2
2 | name: krr-enforcer
3 | description: KRR enforcer - auto apply KRR recommendations
4 | type: application
5 |
6 | version: 0.3.1
7 | appVersion: 0.3.1
8 |
--------------------------------------------------------------------------------
/helm/krr-enforcer/templates/enforcer-cert-job.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: batch/v1
2 | kind: Job
3 | metadata:
4 | name: {{ .Release.Name }}-krr-enforcer-cert-job
5 | namespace: {{ .Release.Namespace }}
6 | labels:
7 | app.kubernetes.io/component: krr-enforcer-cert-job
8 | annotations:
9 | helm.sh/hook: pre-install,pre-upgrade
10 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
11 | helm.sh/hook-weight: "-5"
12 | spec:
13 | backoffLimit: 3
14 | template:
15 | metadata:
16 | labels:
17 | app.kubernetes.io/name: krr-enforcer
18 | app.kubernetes.io/instance: {{ .Release.Name }}
19 | app.kubernetes.io/component: krr-enforcer-cert-job
20 | spec:
21 | serviceAccountName: {{ .Release.Name }}-krr-enforcer-cert-job
22 | restartPolicy: OnFailure
23 | volumes:
24 | - name: workdir
25 | emptyDir: {}
26 | containers:
27 | - name: cert-job
28 | image: "bitnami/kubectl:1.30"
29 | workingDir: /tmp/certs
30 | volumeMounts:
31 | - name: workdir
32 | mountPath: /tmp/certs
33 | command:
34 | - /bin/bash
35 | - -c
36 | - |
37 | set -e
38 |
39 | # Generate a CA key and certificate
40 | echo "Generating CA certificate..."
41 | openssl genrsa -out ca.key 2048
42 | openssl req -x509 -new -nodes -key ca.key -subj "/CN=robusta-krr-enforcer-ca" -days 365 -out ca.crt
43 |
44 | # Generate a server key and certificate signing request (CSR)
45 | echo "Generating server certificate..."
46 | SERVICE_NAME={{ .Release.Name }}-krr-enforcer
47 | NAMESPACE={{ .Release.Namespace }}
48 | DNS_NAME=${SERVICE_NAME}.${NAMESPACE}.svc
49 |
50 | openssl genrsa -out server.key 2048
51 | cat > server.conf < server-ext.conf </dev/null 2>&1; then
145 | echo "Restarting enforcer deployment..."
146 | kubectl rollout restart deployment ${SERVICE_NAME} -n ${NAMESPACE}
147 | else
148 | echo "Deployment ${SERVICE_NAME} does not exist yet, skipping restart"
149 | fi
150 |
151 | echo "Job completed successfully!"
152 | ---
153 | apiVersion: v1
154 | kind: ServiceAccount
155 | metadata:
156 | name: {{ .Release.Name }}-krr-enforcer-cert-job
157 | namespace: {{ .Release.Namespace }}
158 | labels:
159 | app.kubernetes.io/component: krr-enforcer-cert-job
160 | annotations:
161 | helm.sh/hook: pre-install,pre-upgrade
162 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
163 | helm.sh/hook-weight: "-6"
164 | ---
165 | apiVersion: rbac.authorization.k8s.io/v1
166 | kind: ClusterRole
167 | metadata:
168 | name: {{ .Release.Name }}-krr-enforcer-cert-job
169 | labels:
170 | app.kubernetes.io/component: krr-enforcer-cert-job
171 | annotations:
172 | helm.sh/hook: pre-install,pre-upgrade
173 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
174 | helm.sh/hook-weight: "-6"
175 | rules:
176 | - apiGroups: [""]
177 | resources: ["secrets"]
178 | verbs: ["create", "get", "update", "patch"]
179 | - apiGroups: ["admissionregistration.k8s.io"]
180 | resources: ["mutatingwebhookconfigurations"]
181 | verbs: ["create", "get", "update", "patch"]
182 | - apiGroups: ["apps"]
183 | resources: ["deployments"]
184 | verbs: ["get", "patch"]
185 | ---
186 | apiVersion: rbac.authorization.k8s.io/v1
187 | kind: ClusterRoleBinding
188 | metadata:
189 | name: {{ .Release.Name }}-krr-enforcer-cert-job
190 | labels:
191 | app.kubernetes.io/component: krr-enforcer-cert-job
192 | annotations:
193 | helm.sh/hook: pre-install,pre-upgrade
194 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded
195 | helm.sh/hook-weight: "-6"
196 | roleRef:
197 | apiGroup: rbac.authorization.k8s.io
198 | kind: ClusterRole
199 | name: {{ .Release.Name }}-krr-enforcer-cert-job
200 | subjects:
201 | - kind: ServiceAccount
202 | name: {{ .Release.Name }}-krr-enforcer-cert-job
203 | namespace: {{ .Release.Namespace }}
204 |
--------------------------------------------------------------------------------
/helm/krr-enforcer/templates/enforcer-service-account.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: ServiceAccount
3 | metadata:
4 | name: {{ .Release.Name }}-krr-enforcer
5 | namespace: {{ .Release.Namespace }}
6 | labels:
7 | app.kubernetes.io/component: krr-enforcer
8 | {{- with .Values.serviceAccount.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | ---
13 | apiVersion: rbac.authorization.k8s.io/v1
14 | kind: ClusterRole
15 | metadata:
16 | name: {{ .Release.Name }}-krr-enforcer
17 | labels:
18 | app.kubernetes.io/component: krr-enforcer
19 | rules:
20 | - apiGroups: [""]
21 | resources: ["pods"]
22 | verbs: ["get", "list", "watch"]
23 | - apiGroups: ["apps"]
24 | resources: ["replicasets"]
25 | verbs: ["get", "list", "watch"]
26 | ---
27 | apiVersion: rbac.authorization.k8s.io/v1
28 | kind: ClusterRoleBinding
29 | metadata:
30 | name: {{ .Release.Name }}-krr-enforcer
31 | labels:
32 | app.kubernetes.io/component: krr-enforcer
33 | roleRef:
34 | apiGroup: rbac.authorization.k8s.io
35 | kind: ClusterRole
36 | name: {{ .Release.Name }}-krr-enforcer
37 | subjects:
38 | - kind: ServiceAccount
39 | name: {{ .Release.Name }}-krr-enforcer
40 | namespace: {{ .Release.Namespace }}
41 |
--------------------------------------------------------------------------------
/helm/krr-enforcer/templates/enforcer.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: apps/v1
2 | kind: Deployment
3 | metadata:
4 | name: {{ .Release.Name }}-krr-enforcer
5 | namespace: {{ .Release.Namespace }}
6 | labels:
7 | app.kubernetes.io/component: krr-enforcer
8 | {{- with .Values.annotations }}
9 | annotations:
10 | {{- toYaml . | nindent 4 }}
11 | {{- end }}
12 | spec:
13 | replicas: 1
14 | selector:
15 | matchLabels:
16 | app.kubernetes.io/name: krr-enforcer
17 | app.kubernetes.io/instance: {{ .Release.Name }}
18 | app.kubernetes.io/component: krr-enforcer
19 | template:
20 | metadata:
21 | labels:
22 | app.kubernetes.io/name: krr-enforcer
23 | app.kubernetes.io/instance: {{ .Release.Name }}
24 | app.kubernetes.io/component: krr-enforcer
25 | annotations:
26 | admission.robusta.dev/krr-mutation-mode: ignore
27 | {{- with .Values.annotations }}
28 | {{- toYaml . | nindent 8 }}
29 | {{- end }}
30 | spec:
31 | serviceAccountName: {{ .Release.Name }}-krr-enforcer
32 | {{- with .Values.imagePullSecrets }}
33 | imagePullSecrets:
34 | {{- toYaml . | nindent 8 }}
35 | {{- end }}
36 | {{- with .Values.nodeSelector }}
37 | nodeSelector:
38 | {{- toYaml . | nindent 8 }}
39 | {{- end }}
40 | {{- with .Values.tolerations }}
41 | tolerations:
42 | {{- toYaml . | nindent 8 }}
43 | {{- end }}
44 | {{- with .Values.priorityClassName }}
45 | priorityClassName: {{ . }}
46 | {{- end }}
47 | {{- with .Values.securityContext.pod }}
48 | securityContext:
49 | {{- toYaml . | nindent 8 }}
50 | {{- end }}
51 | volumes:
52 | - name: playbooks-config-secret
53 | secret:
54 | secretName: robusta-playbooks-config-secret
55 | optional: true
56 | - name: certs
57 | secret:
58 | secretName: {{ .Release.Name }}-krr-enforcer-certs
59 | containers:
60 | - name: enforcer
61 | {{- if .Values.fullImage }}
62 | image: "{{ .Values.fullImage }}"
63 | {{- else }}
64 | image: "{{ .Values.image.repository }}/{{ .Values.image.name }}:{{ .Values.image.tag }}"
65 | {{- end }}
66 | imagePullPolicy: {{ .Values.imagePullPolicy }}
67 | {{- with .Values.securityContext.container }}
68 | securityContext:
69 | {{- toYaml . | nindent 12 }}
70 | {{- end }}
71 | ports:
72 | - name: https
73 | containerPort: 8443
74 | protocol: TCP
75 | volumeMounts:
76 | - name: certs
77 | mountPath: /etc/webhook/certs
78 | readOnly: true
79 | - name: playbooks-config-secret
80 | mountPath: /etc/robusta/config
81 | livenessProbe:
82 | httpGet:
83 | path: /health
84 | port: https
85 | scheme: HTTPS
86 | initialDelaySeconds: 30
87 | periodSeconds: 10
88 | readinessProbe:
89 | httpGet:
90 | path: /health
91 | port: https
92 | scheme: HTTPS
93 | initialDelaySeconds: 5
94 | periodSeconds: 5
95 | resources:
96 | {{- if .Values.resources.requests }}
97 | requests:
98 | {{- if .Values.resources.requests.cpu }}
99 | cpu: {{ .Values.resources.requests.cpu }}
100 | {{- end }}
101 | {{- if .Values.resources.requests.memory }}
102 | memory: {{ .Values.resources.requests.memory }}
103 | {{- end }}
104 | {{- end }}
105 | {{- if .Values.resources.limits }}
106 | limits:
107 | {{- if .Values.resources.limits.cpu }}
108 | cpu: {{ .Values.resources.limits.cpu }}
109 | {{- end }}
110 | {{- if .Values.resources.limits.memory }}
111 | memory: {{ .Values.resources.limits.memory }}
112 | {{- end }}
113 | {{- end }}
114 | env:
115 | - name: ENFORCER_SSL_KEY_FILE
116 | value: "/etc/webhook/certs/tls.key"
117 | - name: ENFORCER_SSL_CERT_FILE
118 | value: "/etc/webhook/certs/tls.crt"
119 | - name: LOG_LEVEL
120 | value: {{ .Values.logLevel | quote }}
121 | {{- if .Values.certificate }}
122 | - name: CERTIFICATE
123 | value: {{ .Values.certificate | quote }}
124 | {{- end }}
125 | {{- if .Values.additionalEnvVars }}
126 | {{- toYaml .Values.additionalEnvVars | nindent 12 }}
127 | {{- end }}
128 |
129 | ---
130 | apiVersion: v1
131 | kind: Service
132 | metadata:
133 | name: {{ .Release.Name }}-krr-enforcer
134 | namespace: {{ .Release.Namespace }}
135 | labels:
136 | app.kubernetes.io/name: krr-enforcer
137 | app.kubernetes.io/instance: {{ .Release.Name }}
138 | app.kubernetes.io/component: krr-enforcer
139 | {{- with .Values.service.annotations }}
140 | annotations:
141 | {{- toYaml . | nindent 4 }}
142 | {{- end }}
143 | spec:
144 | type: ClusterIP
145 | ports:
146 | - port: 443
147 | targetPort: https
148 | protocol: TCP
149 | name: https
150 | selector:
151 | app.kubernetes.io/name: krr-enforcer
152 | app.kubernetes.io/instance: {{ .Release.Name }}
153 | app.kubernetes.io/component: krr-enforcer
154 |
--------------------------------------------------------------------------------
/helm/krr-enforcer/templates/service-monitor.yaml:
--------------------------------------------------------------------------------
1 | {{- if .Values.serviceMonitor.enabled }}
2 | apiVersion: monitoring.coreos.com/v1
3 | kind: ServiceMonitor
4 | metadata:
5 | name: {{ .Release.Name }}-krr-enforcer
6 | namespace: {{ .Release.Namespace }}
7 | labels:
8 | app.kubernetes.io/name: krr-enforcer
9 | app.kubernetes.io/instance: {{ .Release.Name }}
10 | app.kubernetes.io/component: krr-enforcer
11 | {{- with .Values.serviceMonitor.labels }}
12 | {{- toYaml . | nindent 4 }}
13 | {{- end }}
14 | {{- with .Values.serviceMonitor.annotations }}
15 | annotations:
16 | {{- toYaml . | nindent 4 }}
17 | {{- end }}
18 | spec:
19 | selector:
20 | matchLabels:
21 | app.kubernetes.io/name: krr-enforcer
22 | app.kubernetes.io/instance: {{ .Release.Name }}
23 | app.kubernetes.io/component: krr-enforcer
24 | endpoints:
25 | - port: https
26 | path: /metrics
27 | scheme: https
28 | tlsConfig:
29 | insecureSkipVerify: true
30 | {{- with .Values.serviceMonitor.interval }}
31 | interval: {{ . }}
32 | {{- end }}
33 | {{- with .Values.serviceMonitor.scrapeTimeout }}
34 | scrapeTimeout: {{ . }}
35 | {{- end }}
36 | {{- end }}
--------------------------------------------------------------------------------
/helm/krr-enforcer/values.yaml:
--------------------------------------------------------------------------------
1 | certificate: "" # base64 encoded
2 | logLevel: INFO
3 |
4 | # fullImage: ~ # full image path can be used to override image.repository/image.name:image.tag
5 |
6 | image:
7 | repository: us-central1-docker.pkg.dev/genuine-flight-317411/devel
8 | name: krr-enforcer
9 | tag: 0.3.1
10 | imagePullPolicy: IfNotPresent
11 | resources:
12 | requests:
13 | cpu: 100m
14 | memory: 256Mi
15 | limits:
16 | cpu: ~
17 | additionalEnvVars: []
18 | priorityClassName: ""
19 | tolerations: []
20 | annotations: {}
21 | nodeSelector: ~
22 | imagePullSecrets: []
23 | securityContext:
24 | container:
25 | allowPrivilegeEscalation: false
26 | capabilities: {}
27 | privileged: false
28 | readOnlyRootFilesystem: false
29 | runAsUser: 1000
30 | pod: {}
31 | service:
32 | annotations: {}
33 | serviceAccount:
34 | annotations: {}
35 | serviceMonitor:
36 | enabled: true
37 | interval: 30s
38 | scrapeTimeout: 10s
39 | labels: {}
40 | annotations: {}
41 |
--------------------------------------------------------------------------------
/helm/upload_chart.sh:
--------------------------------------------------------------------------------
1 | rm -rf ./tmp
2 | mkdir ./tmp
3 | cd ./tmp
4 | helm package ../krr-enforcer
5 | mkdir krr-enforcer
6 | mv *.tgz ./krr-enforcer
7 | curl https://robusta-charts.storage.googleapis.com/index.yaml > index.yaml
8 | helm repo index --merge index.yaml --url https://robusta-charts.storage.googleapis.com ./krr-enforcer
9 | gsutil rsync -r krr-enforcer gs://robusta-charts
10 | gsutil setmeta -h "Cache-Control:max-age=0" gs://robusta-charts/index.yaml
11 | cd ../
12 | rm -rf ./tmp
13 |
--------------------------------------------------------------------------------
/images/krr-datasources.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/krr-datasources.png
--------------------------------------------------------------------------------
/images/krr-other-integrations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/krr-other-integrations.png
--------------------------------------------------------------------------------
/images/krr_slack_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/krr_slack_example.png
--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/logo.png
--------------------------------------------------------------------------------
/images/screenshot.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/screenshot.jpeg
--------------------------------------------------------------------------------
/images/ui_recommendation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/ui_recommendation.png
--------------------------------------------------------------------------------
/images/ui_screenshot_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/ui_screenshot_new.png
--------------------------------------------------------------------------------
/images/ui_video.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/ui_video.gif
--------------------------------------------------------------------------------
/intro.txt:
--------------------------------------------------------------------------------
1 | [bold magenta]
2 | _____ _ _ _ _______ _____
3 | | __ \ | | | | | |/ / __ \| __ \
4 | | |__) |___ | |__ _ _ ___| |_ __ _ | ' /| |__) | |__) |
5 | | _ // _ \| '_ \| | | / __| __/ _` | | < | _ /| _ /
6 | | | \ \ (_) | |_) | |_| \__ \ || (_| | | . \| | \ \| | \ \
7 | |_| \_\___/|_.__/ \__,_|___/\__\__,_| |_|\_\_| \_\_| \_\
8 |
9 |
10 | Thanks for using Robusta KRR. If you have any questions or feedback, please feel free to reach out to us at
11 | https://github.com/robusta-dev/krr/issues
12 |
13 | Watch our latest video to optimize your workloads and save costs: https://www.youtube.com/watch?v=TYRA2QcDIuI
14 |
15 | [/bold magenta]
--------------------------------------------------------------------------------
/krr.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from robusta_krr.common.ssl_utils import add_custom_certificate
4 |
5 | ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "")
6 |
7 | if add_custom_certificate(ADDITIONAL_CERTIFICATE):
8 | print("added custom certificate")
9 |
10 | # DO NOT ADD ANY CODE ABOVE THIS
11 | # ADDING IMPORTS BEFORE ADDING THE CUSTOM CERTS MIGHT INIT HTTP CLIENTS THAT DOESN'T RESPECT THE CUSTOM CERT
12 |
13 | from robusta_krr import run
14 |
15 | if __name__ == "__main__":
16 | run()
17 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "robusta-krr"
3 | version = "1.8.2-dev"
4 | description = "Robusta's Resource Recommendation engine for Kubernetes"
5 | authors = ["Pavel Zhukov <33721692+LeaveMyYard@users.noreply.github.com>"]
6 | license = "MIT"
7 | readme = "README.md"
8 | packages = [{ include = "robusta_krr" }]
9 |
10 | [tool.black]
11 | line-length = 120
12 | target-version = ['py39']
13 |
14 | [tool.isort]
15 | line_length = 120
16 | multi_line_output = 3
17 | include_trailing_comma = true
18 |
19 | [tool.mypy]
20 | plugins = "numpy.typing.mypy_plugin,pydantic.mypy"
21 |
22 | [tool.poetry.scripts]
23 | krr = "robusta_krr.main:run"
24 |
25 | [tool.poetry.dependencies]
26 | python = ">=3.9,<=3.12.3"
27 | typer = { extras = ["all"], version = "^0.7.0" }
28 | pydantic = "^1.10.7"
29 | kubernetes = "^26.1.0"
30 | prometheus-api-client = "0.5.3"
31 | numpy = ">=1.26.4,<1.27.0"
32 | alive-progress = "^3.1.2"
33 | prometrix = "0.2.0"
34 | slack-sdk = "^3.21.3"
35 | pandas = "2.2.2"
36 | requests = "2.32.0"
37 | pyyaml = "6.0.1"
38 | typing-extensions = "4.6.0"
39 | idna = "3.7"
40 | urllib3 = "^1.26.20"
41 | setuptools = "^70.0.0"
42 | zipp = "^3.19.1"
43 | tenacity = "^9.0.0"
44 |
45 |
46 |
47 | [tool.poetry.group.dev.dependencies]
48 | mypy = "^1.0.1"
49 | black = "^23.1.0"
50 | isort = "^5.12.0"
51 | flake8 = "^6.0.0"
52 | types-pyyaml = "^6.0.12.8"
53 | types-cachetools = "^5.3.0.4"
54 | types-requests = "^2.28.11.15"
55 | pyinstaller = "^5.9.0"
56 | pytest = "^7.2.2"
57 |
58 | [build-system]
59 | requires = ["poetry-core"]
60 | build-backend = "poetry.core.masonry.api"
61 |
62 |
63 | [project]
64 | name = "robusta_krr"
65 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | about-time==4.2.1 ; python_version >= "3.9" and python_full_version < "3.13"
2 | alive-progress==3.1.5 ; python_version >= "3.9" and python_full_version < "3.13"
3 | boto3==1.34.62 ; python_version >= "3.9" and python_full_version < "3.13"
4 | botocore==1.34.62 ; python_version >= "3.9" and python_full_version < "3.13"
5 | cachetools==5.3.3 ; python_version >= "3.9" and python_full_version < "3.13"
6 | certifi==2024.2.2 ; python_version >= "3.9" and python_full_version < "3.13"
7 | charset-normalizer==3.3.2 ; python_version >= "3.9" and python_full_version < "3.13"
8 | click==8.1.7 ; python_version >= "3.9" and python_full_version < "3.13"
9 | colorama==0.4.6 ; python_version >= "3.9" and python_full_version < "3.13"
10 | commonmark==0.9.1 ; python_version >= "3.9" and python_full_version < "3.13"
11 | contourpy==1.2.0 ; python_version >= "3.9" and python_full_version < "3.13"
12 | cycler==0.12.1 ; python_version >= "3.9" and python_full_version < "3.13"
13 | dateparser==1.2.0 ; python_version >= "3.9" and python_full_version < "3.13"
14 | fonttools==4.49.0 ; python_version >= "3.9" and python_full_version < "3.13"
15 | google-auth==2.28.2 ; python_version >= "3.9" and python_full_version < "3.13"
16 | grapheme==0.6.0 ; python_version >= "3.9" and python_full_version < "3.13"
17 | httmock==1.4.0 ; python_version >= "3.9" and python_full_version < "3.13"
18 | idna==3.7 ; python_version >= "3.9" and python_full_version < "3.13"
19 | importlib-resources==6.3.0 ; python_version >= "3.9" and python_version < "3.10"
20 | jmespath==1.0.1 ; python_version >= "3.9" and python_full_version < "3.13"
21 | kiwisolver==1.4.5 ; python_version >= "3.9" and python_full_version < "3.13"
22 | kubernetes==26.1.0 ; python_version >= "3.9" and python_full_version < "3.13"
23 | matplotlib==3.8.3 ; python_version >= "3.9" and python_full_version < "3.13"
24 | numpy==1.26.4 ; python_version >= "3.9" and python_full_version < "3.13"
25 | oauthlib==3.2.2 ; python_version >= "3.9" and python_full_version < "3.13"
26 | packaging==24.0 ; python_version >= "3.9" and python_full_version < "3.13"
27 | pandas==2.2.2 ; python_version >= "3.9" and python_full_version < "3.13"
28 | pillow==10.3.0 ; python_version >= "3.9" and python_full_version < "3.13"
29 | prometheus-api-client==0.5.3 ; python_version >= "3.9" and python_full_version < "3.13"
30 | prometrix==0.1.17 ; python_version >= "3.9" and python_full_version < "3.13"
31 | pyasn1-modules==0.3.0 ; python_version >= "3.9" and python_full_version < "3.13"
32 | pyasn1==0.5.1 ; python_version >= "3.9" and python_full_version < "3.13"
33 | pydantic==1.10.15 ; python_version >= "3.9" and python_full_version < "3.13"
34 | pygments==2.17.2 ; python_version >= "3.9" and python_full_version < "3.13"
35 | pyparsing==3.1.2 ; python_version >= "3.9" and python_full_version < "3.13"
36 | python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_full_version < "3.13"
37 | pytz==2024.1 ; python_version >= "3.9" and python_full_version < "3.13"
38 | pyyaml==6.0.1 ; python_version >= "3.9" and python_full_version < "3.13"
39 | regex==2023.12.25 ; python_version >= "3.9" and python_full_version < "3.13"
40 | requests-oauthlib==1.4.0 ; python_version >= "3.9" and python_full_version < "3.13"
41 | requests==2.32.0 ; python_version >= "3.9" and python_full_version < "3.13"
42 | rich==12.6.0 ; python_version >= "3.9" and python_full_version < "3.13"
43 | rsa==4.9 ; python_version >= "3.9" and python_full_version < "3.13"
44 | s3transfer==0.10.0 ; python_version >= "3.9" and python_full_version < "3.13"
45 | setuptools==70.3.0 ; python_version >= "3.9" and python_full_version < "3.13"
46 | shellingham==1.5.4 ; python_version >= "3.9" and python_full_version < "3.13"
47 | six==1.16.0 ; python_version >= "3.9" and python_full_version < "3.13"
48 | slack-sdk==3.27.1 ; python_version >= "3.9" and python_full_version < "3.13"
49 | typer[all]==0.7.0 ; python_version >= "3.9" and python_full_version < "3.13"
50 | typing-extensions==4.6.0 ; python_version >= "3.9" and python_full_version < "3.13"
51 | tzdata==2024.1 ; python_version >= "3.9" and python_full_version < "3.13"
52 | tzlocal==5.2 ; python_version >= "3.9" and python_full_version < "3.13"
53 | urllib3==1.26.19 ; python_version >= "3.9" and python_full_version < "3.13"
54 | websocket-client==1.7.0 ; python_version >= "3.9" and python_full_version < "3.13"
55 | zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13"
56 | tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13"
--------------------------------------------------------------------------------
/robusta_krr/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import run
2 |
3 | __version__ = "dev"
4 | __all__ = ["run", "__version__"]
5 |
--------------------------------------------------------------------------------
/robusta_krr/api/formatters.py:
--------------------------------------------------------------------------------
1 | from robusta_krr.core.abstract.formatters import find, list_available, register
2 |
3 | __all__ = ["register", "find", "list_available"]
4 |
--------------------------------------------------------------------------------
/robusta_krr/api/models.py:
--------------------------------------------------------------------------------
1 | from robusta_krr.core.abstract.strategies import MetricsPodData, PodsTimeData, ResourceRecommendation, RunResult
2 | from robusta_krr.core.models.allocations import RecommendationValue, ResourceAllocations, ResourceType
3 | from robusta_krr.core.models.objects import K8sObjectData, PodData
4 | from robusta_krr.core.models.result import ResourceScan, Result
5 | from robusta_krr.core.models.severity import Severity, register_severity_calculator
6 |
7 | __all__ = [
8 | "ResourceType",
9 | "ResourceAllocations",
10 | "RecommendationValue",
11 | "K8sObjectData",
12 | "PodData",
13 | "Result",
14 | "Severity",
15 | "register_severity_calculator",
16 | "ResourceScan",
17 | "ResourceRecommendation",
18 | "PodsTimeData",
19 | "MetricsPodData",
20 | "RunResult",
21 | ]
22 |
--------------------------------------------------------------------------------
/robusta_krr/api/strategies.py:
--------------------------------------------------------------------------------
1 | from robusta_krr.core.abstract.strategies import BaseStrategy, StrategySettings
2 |
3 | __all__ = ["BaseStrategy", "StrategySettings"]
4 |
--------------------------------------------------------------------------------
/robusta_krr/common/ssl_utils.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import os
3 |
4 | import certifi
5 |
6 | CUSTOM_CERTIFICATE_PATH = "/tmp/custom_ca.pem"
7 |
8 |
9 | def append_custom_certificate(custom_ca: str) -> None:
10 | with open(certifi.where(), "ab") as outfile:
11 | outfile.write(base64.b64decode(custom_ca))
12 |
13 | os.environ["WEBSOCKET_CLIENT_CA_BUNDLE"] = certifi.where()
14 |
15 |
16 | def create_temporary_certificate(custom_ca: str) -> None:
17 | with open(certifi.where(), "rb") as base_cert:
18 | base_cert_content = base_cert.read()
19 |
20 | with open(CUSTOM_CERTIFICATE_PATH, "wb") as outfile:
21 | outfile.write(base_cert_content)
22 | outfile.write(base64.b64decode(custom_ca))
23 |
24 | os.environ["REQUESTS_CA_BUNDLE"] = CUSTOM_CERTIFICATE_PATH
25 | os.environ["WEBSOCKET_CLIENT_CA_BUNDLE"] = CUSTOM_CERTIFICATE_PATH
26 | certifi.where = lambda: CUSTOM_CERTIFICATE_PATH
27 |
28 |
29 | def add_custom_certificate(custom_ca: str) -> bool:
30 | if not custom_ca:
31 | return False
32 |
33 | # NOTE: Sometimes (Openshift) the certifi.where() is not writable, so we need to
34 | # use a temporary file in case of PermissionError.
35 | try:
36 | append_custom_certificate(custom_ca)
37 | except PermissionError:
38 | create_temporary_certificate(custom_ca)
39 |
40 | return True
41 |
--------------------------------------------------------------------------------
/robusta_krr/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/robusta_krr/core/__init__.py
--------------------------------------------------------------------------------
/robusta_krr/core/abstract/formatters.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Any, Callable, Optional
4 |
5 | from robusta_krr.core.models.result import Result
6 |
7 | FormatterFunc = Callable[[Result], Any]
8 |
9 | FORMATTERS_REGISTRY: dict[str, FormatterFunc] = {}
10 |
11 |
12 | # NOTE: Here asterisk is used to make the argument `rich_console` keyword-only
13 | # This is done to avoid the following usage, where it is unclear what the boolean value is for:
14 | # @register("My Formatter", True)
15 | # def my_formatter(result: Result) -> str:
16 | # return "My formatter"
17 | #
18 | # Instead, the following usage is enforced:
19 | # @register("My Formatter", rich_console=True)
20 | # def my_formatter(result: Result) -> str:
21 | # return "My formatter"
22 |
23 |
24 | def register(
25 | display_name: Optional[str] = None, *, rich_console: bool = False
26 | ) -> Callable[[FormatterFunc], FormatterFunc]:
27 | """
28 | A decorator to register a formatter function.
29 |
30 | Args:
31 | display_name (str, optional): The name to use for the formatter in the registry.
32 | rich_console (bool): Whether or not the formatter is for a rich console. Defaults to False.
33 |
34 | Returns:
35 | Callable[[FormatterFunc], FormatterFunc]: The decorator function.
36 | """
37 |
38 | def decorator(func: FormatterFunc) -> FormatterFunc:
39 | name = display_name or func.__name__
40 |
41 | FORMATTERS_REGISTRY[name] = func
42 |
43 | func.__display_name__ = name # type: ignore
44 | func.__rich_console__ = rich_console # type: ignore
45 |
46 | return func
47 |
48 | return decorator
49 |
50 |
51 | def find(name: str) -> FormatterFunc:
52 | """
53 | Find a formatter by name in the registry.
54 |
55 | Args:
56 | name (str): The name of the formatter.
57 |
58 | Returns:
59 | FormatterFunc: The formatter function.
60 |
61 | Raises:
62 | ValueError: If a formatter with the given name does not exist.
63 | """
64 |
65 | try:
66 | return FORMATTERS_REGISTRY[name]
67 | except KeyError as e:
68 | raise ValueError(f"Formatter '{name}' not found") from e
69 |
70 |
71 | def list_available() -> list[str]:
72 | """
73 | List available formatters in the registry.
74 |
75 | Returns:
76 | list[str]: A list of the names of the available formatters.
77 | """
78 |
79 | return list(FORMATTERS_REGISTRY)
80 |
81 |
82 | __all__ = ["register", "find"]
83 |
--------------------------------------------------------------------------------
/robusta_krr/core/abstract/metrics.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from abc import ABC, abstractmethod
3 |
4 | from robusta_krr.core.abstract.strategies import PodsTimeData
5 | from robusta_krr.core.models.objects import K8sObjectData
6 |
7 |
8 | class BaseMetric(ABC):
9 | """
10 | This abstraction is done for a future use.
11 | Currently we only scrape metrics from Prometheus,
12 | but in the future we may want to support other metric sources like Datadog, etc.
13 |
14 | TODO: When we want to support other metric sources, we should maybe rethink an interface here.
15 | """
16 |
17 | @abstractmethod
18 | async def load_data(
19 | self, object: K8sObjectData, period: datetime.timedelta, step: datetime.timedelta
20 | ) -> PodsTimeData:
21 | ...
22 |
--------------------------------------------------------------------------------
/robusta_krr/core/abstract/strategies.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import abc
4 | import datetime
5 | from textwrap import dedent
6 | from typing import TYPE_CHECKING, Annotated, Generic, Literal, Optional, Sequence, TypeVar, get_args
7 |
8 | import numpy as np
9 | import pydantic as pd
10 | from numpy.typing import NDArray
11 |
12 | from robusta_krr.core.models.result import K8sObjectData, ResourceType
13 |
14 | if TYPE_CHECKING:
15 | from robusta_krr.core.abstract.metrics import BaseMetric # noqa: F401
16 | from robusta_krr.core.integrations.prometheus.metrics import PrometheusMetric
17 |
18 | SelfRR = TypeVar("SelfRR", bound="ResourceRecommendation")
19 |
20 |
21 | class ResourceRecommendation(pd.BaseModel):
22 | """A class to represent resource recommendation with optional request and limit values.
23 |
24 | The NaN values are used to represent undefined values: the strategy did not provide a recommendation for the resource.
25 | None values are used to represent the strategy says that value should not be set.
26 | """
27 |
28 | request: Optional[float]
29 | limit: Optional[float]
30 | info: Optional[str] = pd.Field(
31 | None, description="Additional information about the recommendation."
32 | )
33 |
34 | @classmethod
35 | def undefined(cls: type[SelfRR], info: Optional[str] = None) -> SelfRR:
36 | return cls(request=float("NaN"), limit=float("NaN"), info=info)
37 |
38 |
39 | class StrategySettings(pd.BaseModel):
40 | """A class to represent strategy settings with configurable history and timeframe duration.
41 |
42 | It is used in CLI to generate the help, parameters and validate values.
43 | Description is used to generate the help.
44 | Other pydantic features can be used to validate the values.
45 |
46 | Nested classes are not supported here.
47 | """
48 |
49 | history_duration: float = pd.Field(
50 | 24 * 7 * 2, ge=1, description="The duration of the history data to use (in hours)."
51 | )
52 | timeframe_duration: float = pd.Field(1.25, gt=0, description="The step for the history data (in minutes).")
53 |
54 | @property
55 | def history_timedelta(self) -> datetime.timedelta:
56 | return datetime.timedelta(hours=self.history_duration)
57 |
58 | @property
59 | def timeframe_timedelta(self) -> datetime.timedelta:
60 | return datetime.timedelta(minutes=self.timeframe_duration)
61 |
62 | def history_range_enough(self, history_range: tuple[datetime.timedelta, datetime.timedelta]) -> bool:
63 | """Override this function to check if the history range is enough for the strategy."""
64 |
65 | return True
66 |
67 |
68 | # A type alias for a numpy array of shape (N, 2).
69 | ArrayNx2 = Annotated[NDArray[np.float64], Literal["N", 2]]
70 |
71 |
72 | PodsTimeData = dict[str, ArrayNx2] # Mapping: pod -> [(time, value)]
73 | MetricsPodData = dict[str, PodsTimeData]
74 |
75 | RunResult = dict[ResourceType, ResourceRecommendation]
76 |
77 | SelfBS = TypeVar("SelfBS", bound="BaseStrategy")
78 | _StrategySettings = TypeVar("_StrategySettings", bound=StrategySettings)
79 |
80 |
81 | # An abstract base class for strategy implementation.
82 | # This class requires implementation of a 'run' method for calculating recommendation.
83 | # Make a subclass if you want to create a concrete strategy.
84 | class BaseStrategy(abc.ABC, Generic[_StrategySettings]):
85 | """An abstract base class for strategy implementation.
86 |
87 | This class is generic, and requires a type for the settings.
88 | This settings type will be used for the settings property of the strategy.
89 | It will be used to generate CLI parameters for this strategy, validated by pydantic.
90 |
91 | This class requires implementation of a 'run' method for calculating recommendation.
92 | Additionally, it provides a 'description' property for generating a description for the strategy.
93 | Description property uses the docstring of the strategy class and the settings of the strategy.
94 |
95 | The name of the strategy is the name of the class in lowercase, without the 'Strategy' suffix, if exists.
96 | If you want to change the name of the strategy, you can change the display_name class attribute.
97 |
98 | The strategy will automatically be registered in the strategy registry using __subclasses__ mechanism.
99 | """
100 |
101 | display_name: str
102 | rich_console: bool = False
103 |
104 | # TODO: this should be BaseMetric, but currently we only support Prometheus
105 | @property
106 | @abc.abstractmethod
107 | def metrics(self) -> Sequence[type[PrometheusMetric]]:
108 | pass
109 |
110 | def __init__(self, settings: _StrategySettings):
111 | self.settings = settings
112 |
113 | def __str__(self) -> str:
114 | return self.display_name.title()
115 |
116 | @property
117 | def description(self) -> Optional[str]:
118 | """
119 | Generate a description for the strategy.
120 | You can use Rich's markdown syntax to format the description.
121 | """
122 | raise NotImplementedError()
123 |
124 | # Abstract method that needs to be implemented by subclass.
125 | # This method is intended to calculate resource recommendation based on history data and kubernetes object data.
126 | @abc.abstractmethod
127 | def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult:
128 | pass
129 |
130 | # This method is intended to return a strategy by its name.
131 | @classmethod
132 | def find(cls: type[SelfBS], name: str) -> type[SelfBS]:
133 | strategies = cls.get_all()
134 | if name.lower() in strategies:
135 | return strategies[name.lower()]
136 |
137 | raise ValueError(f"Unknown strategy name: {name}. Available strategies: {', '.join(strategies)}")
138 |
139 | # This method is intended to return all the available strategies.
140 | @classmethod
141 | def get_all(cls: type[SelfBS]) -> dict[str, type[SelfBS]]:
142 | from robusta_krr import strategies as _ # noqa: F401
143 |
144 | return {sub_cls.display_name.lower(): sub_cls for sub_cls in cls.__subclasses__()}
145 |
146 | # This method is intended to return the type of settings used in strategy.
147 | @classmethod
148 | def get_settings_type(cls) -> type[StrategySettings]:
149 | return get_args(cls.__orig_bases__[0])[0] # type: ignore
150 |
151 |
152 | AnyStrategy = BaseStrategy[StrategySettings]
153 |
154 |
155 | __all__ = [
156 | "AnyStrategy",
157 | "BaseStrategy",
158 | "StrategySettings",
159 | "PodsTimeData",
160 | "MetricsPodData",
161 | "K8sObjectData",
162 | "ResourceType",
163 | ]
164 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/kubernetes/config_patch.py:
--------------------------------------------------------------------------------
1 | # NOTE: This is a workaround for the issue described here:
2 | # https://github.com/kubernetes-client/python/pull/1863
3 |
4 | from __future__ import annotations
5 |
6 | from typing import Optional
7 |
8 | from kubernetes.client import configuration
9 | from kubernetes.config import kube_config
10 |
11 |
12 | class KubeConfigLoader(kube_config.KubeConfigLoader):
13 | def _load_cluster_info(self):
14 | super()._load_cluster_info()
15 |
16 | if "proxy-url" in self._cluster:
17 | self.proxy = self._cluster["proxy-url"]
18 |
19 | def _set_config(self, client_configuration: Configuration):
20 | super()._set_config(client_configuration)
21 |
22 | key = "proxy"
23 | if key in self.__dict__:
24 | setattr(client_configuration, key, getattr(self, key))
25 |
26 |
27 | class Configuration(configuration.Configuration):
28 | def __init__(
29 | self,
30 | proxy: Optional[str] = None,
31 | **kwargs,
32 | ):
33 | super().__init__(**kwargs)
34 |
35 | self.proxy = proxy
36 |
37 |
38 | configuration.Configuration = Configuration
39 | kube_config.KubeConfigLoader = KubeConfigLoader
40 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/openshift/__init__.py:
--------------------------------------------------------------------------------
1 | from .token import TOKEN_LOCATION, load_token
2 |
3 | __all__ = ["TOKEN_LOCATION", "load_token"]
4 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/openshift/token.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from robusta_krr.core.models.config import settings
4 |
5 | # NOTE: This one should be mounted if openshift is enabled (done by Robusta Runner)
6 | TOKEN_LOCATION = '/var/run/secrets/kubernetes.io/serviceaccount/token'
7 |
8 |
9 | def load_token() -> Optional[str]:
10 | if not settings.openshift:
11 | return None
12 |
13 | try:
14 | with open(TOKEN_LOCATION, 'r') as file:
15 | return file.read()
16 | except FileNotFoundError:
17 | return None
18 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/__init__.py:
--------------------------------------------------------------------------------
1 | from .loader import PrometheusMetricsLoader
2 | from .metrics_service.prometheus_metrics_service import PrometheusDiscovery, PrometheusNotFound
3 | from .prometheus_utils import ClusterNotSpecifiedException
4 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/loader.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import datetime
4 | import logging
5 | from concurrent.futures import ThreadPoolExecutor
6 | from typing import TYPE_CHECKING, Optional, Dict, Any
7 |
8 | from kubernetes import config as k8s_config
9 | from kubernetes.client.api_client import ApiClient
10 | from kubernetes.client.exceptions import ApiException
11 | from prometrix import MetricsNotFound, PrometheusNotFound
12 |
13 | from robusta_krr.core.models.config import settings
14 | from robusta_krr.core.models.objects import K8sObjectData, PodData
15 |
16 | from .metrics_service.prometheus_metrics_service import PrometheusMetricsService
17 | from .metrics_service.thanos_metrics_service import ThanosMetricsService
18 | from .metrics_service.victoria_metrics_service import VictoriaMetricsService
19 | from .metrics_service.mimir_metrics_service import MimirMetricsService
20 |
21 | if TYPE_CHECKING:
22 | from robusta_krr.core.abstract.strategies import BaseStrategy, MetricsPodData
23 |
24 | logger = logging.getLogger("krr")
25 |
26 | class PrometheusMetricsLoader:
27 | def __init__(self, *, cluster: Optional[str] = None) -> None:
28 | """
29 | Initializes the Prometheus Loader.
30 |
31 | Args:
32 | cluster (Optional[str]): The name of the cluster. Defaults to None.
33 | """
34 |
35 | self.executor = ThreadPoolExecutor(settings.max_workers)
36 | self.api_client = settings.get_kube_client(context=cluster)
37 | loader = self.get_metrics_service(api_client=self.api_client, cluster=cluster)
38 | if loader is None:
39 | raise PrometheusNotFound(
40 | f"Wasn't able to connect to any Prometheus service in {cluster or 'inner'} cluster\n"
41 | "Try using port-forwarding and/or setting the url manually (using the -p flag.).\n"
42 | "For more information, see 'Giving the Explicit Prometheus URL' at https://github.com/robusta-dev/krr?tab=readme-ov-file#usage"
43 | )
44 |
45 | self.loader = loader
46 |
47 | logger.info(f"{self.loader.name()} connected successfully for {cluster or 'default'} cluster")
48 |
49 | def get_metrics_service(
50 | self,
51 | api_client: Optional[ApiClient] = None,
52 | cluster: Optional[str] = None,
53 | ) -> Optional[PrometheusMetricsService]:
54 | if settings.prometheus_url is not None:
55 | logger.info("Prometheus URL is specified, will not auto-detect a metrics service")
56 | metrics_to_check = [PrometheusMetricsService]
57 | else:
58 | logger.info("No Prometheus URL is specified, trying to auto-detect a metrics service")
59 | metrics_to_check = [VictoriaMetricsService, ThanosMetricsService, MimirMetricsService, PrometheusMetricsService]
60 |
61 | for metric_service_class in metrics_to_check:
62 | service_name = metric_service_class.name()
63 | try:
64 | loader = metric_service_class(api_client=api_client, cluster=cluster, executor=self.executor)
65 | loader.check_connection()
66 | except MetricsNotFound as e:
67 | logger.info(f"{service_name} not found: {e}")
68 | except ApiException as e:
69 | logger.warning(
70 | f"Unable to automatically discover a {service_name} in the cluster ({e}). "
71 | "Try specifying how to connect to Prometheus via cli options"
72 | )
73 | else:
74 | logger.info(f"{service_name} found")
75 | loader.validate_cluster_name()
76 | return loader
77 |
78 | return None
79 |
80 | async def get_history_range(
81 | self, history_duration: datetime.timedelta
82 | ) -> Optional[tuple[datetime.datetime, datetime.datetime]]:
83 | return await self.loader.get_history_range(history_duration)
84 |
85 | async def load_pods(self, object: K8sObjectData, period: datetime.timedelta) -> list[PodData]:
86 | try:
87 | return await self.loader.load_pods(object, period)
88 | except Exception as e:
89 | logger.exception(f"Failed to load pods for {object}: {e}")
90 | return []
91 |
92 | async def get_cluster_summary(self) -> Dict[str, Any]:
93 | try:
94 | return await self.loader.get_cluster_summary()
95 | except Exception as e:
96 | logger.exception(f"Failed to get cluster summary: {e}")
97 | return {}
98 |
99 | async def gather_data(
100 | self,
101 | object: K8sObjectData,
102 | strategy: BaseStrategy,
103 | period: datetime.timedelta,
104 | *,
105 | step: datetime.timedelta = datetime.timedelta(minutes=30),
106 | ) -> MetricsPodData:
107 | """
108 | Gathers data from Prometheus for a specified object and resource.
109 |
110 | Args:
111 | object (K8sObjectData): The Kubernetes object.
112 | resource (ResourceType): The resource type.
113 | period (datetime.timedelta): The time period for which to gather data.
114 | step (datetime.timedelta, optional): The time step between data points. Defaults to 30 minutes.
115 |
116 | Returns:
117 | ResourceHistoryData: The gathered resource history data.
118 | """
119 |
120 | return {
121 | MetricLoader.__name__: await self.loader.gather_data(object, MetricLoader, period, step)
122 | for MetricLoader in strategy.metrics
123 | }
124 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import PrometheusMetric
2 | from .cpu import CPUAmountLoader, CPULoader, PercentileCPULoader
3 | from .memory import MaxMemoryLoader, MemoryAmountLoader, MemoryLoader, MaxOOMKilledMemoryLoader
4 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/metrics/cpu.py:
--------------------------------------------------------------------------------
1 | from robusta_krr.core.models.objects import K8sObjectData
2 |
3 | from .base import PrometheusMetric, QueryType
4 |
5 |
6 | class CPULoader(PrometheusMetric):
7 | """
8 | A metric loader for loading CPU usage metrics.
9 | """
10 |
11 | query_type: QueryType = QueryType.QueryRange
12 |
13 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
14 | pods_selector = "|".join(pod.name for pod in object.pods)
15 | cluster_label = self.get_prometheus_cluster_label()
16 | return f"""
17 | max(
18 | rate(
19 | container_cpu_usage_seconds_total{{
20 | namespace="{object.namespace}",
21 | pod=~"{pods_selector}",
22 | container="{object.container}"
23 | {cluster_label}
24 | }}[{step}]
25 | )
26 | ) by (container, pod, job)
27 | """
28 |
29 |
30 | def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]:
31 | """
32 | A factory for creating percentile CPU usage metric loaders.
33 | """
34 |
35 | if not 0 <= percentile <= 100:
36 | raise ValueError("percentile must be between 0 and 100")
37 |
38 | class PercentileCPULoader(PrometheusMetric):
39 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
40 | pods_selector = "|".join(pod.name for pod in object.pods)
41 | cluster_label = self.get_prometheus_cluster_label()
42 | return f"""
43 | quantile_over_time(
44 | {round(percentile / 100, 2)},
45 | max(
46 | rate(
47 | container_cpu_usage_seconds_total{{
48 | namespace="{object.namespace}",
49 | pod=~"{pods_selector}",
50 | container="{object.container}"
51 | {cluster_label}
52 | }}[{step}]
53 | )
54 | ) by (container, pod, job)
55 | [{duration}:{step}]
56 | )
57 | """
58 |
59 | return PercentileCPULoader
60 |
61 |
62 | class CPUAmountLoader(PrometheusMetric):
63 | """
64 | A metric loader for loading CPU points count.
65 | """
66 |
67 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
68 | pods_selector = "|".join(pod.name for pod in object.pods)
69 | cluster_label = self.get_prometheus_cluster_label()
70 | return f"""
71 | count_over_time(
72 | max(
73 | container_cpu_usage_seconds_total{{
74 | namespace="{object.namespace}",
75 | pod=~"{pods_selector}",
76 | container="{object.container}"
77 | {cluster_label}
78 | }}
79 | ) by (container, pod, job)
80 | [{duration}:{step}]
81 | )
82 | """
83 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/metrics/memory.py:
--------------------------------------------------------------------------------
1 | from robusta_krr.core.models.objects import K8sObjectData
2 |
3 | from .base import PrometheusMetric, QueryType
4 |
5 |
6 | class MemoryLoader(PrometheusMetric):
7 | """
8 | A metric loader for loading memory usage metrics.
9 | """
10 |
11 | query_type: QueryType = QueryType.QueryRange
12 |
13 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
14 | pods_selector = "|".join(pod.name for pod in object.pods)
15 | cluster_label = self.get_prometheus_cluster_label()
16 | return f"""
17 | max(
18 | container_memory_working_set_bytes{{
19 | namespace="{object.namespace}",
20 | pod=~"{pods_selector}",
21 | container="{object.container}"
22 | {cluster_label}
23 | }}
24 | ) by (container, pod, job)
25 | """
26 |
27 |
28 | class MaxMemoryLoader(PrometheusMetric):
29 | """
30 | A metric loader for loading max memory usage metrics.
31 | """
32 |
33 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
34 | pods_selector = "|".join(pod.name for pod in object.pods)
35 | cluster_label = self.get_prometheus_cluster_label()
36 | return f"""
37 | max_over_time(
38 | max(
39 | container_memory_working_set_bytes{{
40 | namespace="{object.namespace}",
41 | pod=~"{pods_selector}",
42 | container="{object.container}"
43 | {cluster_label}
44 | }}
45 | ) by (container, pod, job)
46 | [{duration}:{step}]
47 | )
48 | """
49 |
50 |
51 | class MemoryAmountLoader(PrometheusMetric):
52 | """
53 | A metric loader for loading memory points count.
54 | """
55 |
56 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
57 | pods_selector = "|".join(pod.name for pod in object.pods)
58 | cluster_label = self.get_prometheus_cluster_label()
59 | return f"""
60 | count_over_time(
61 | max(
62 | container_memory_working_set_bytes{{
63 | namespace="{object.namespace}",
64 | pod=~"{pods_selector}",
65 | container="{object.container}"
66 | {cluster_label}
67 | }}
68 | ) by (container, pod, job)
69 | [{duration}:{step}]
70 | )
71 | """
72 |
73 | # TODO: Need to battle test if this one is correct.
74 | class MaxOOMKilledMemoryLoader(PrometheusMetric):
75 | """
76 | A metric loader for loading the maximum memory limits that were surpassed by the OOMKilled event.
77 | """
78 |
79 | warning_on_no_data = False
80 |
81 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str:
82 | pods_selector = "|".join(pod.name for pod in object.pods)
83 | cluster_label = self.get_prometheus_cluster_label()
84 | return f"""
85 | max_over_time(
86 | max(
87 | max(
88 | kube_pod_container_resource_limits{{
89 | resource="memory",
90 | namespace="{object.namespace}",
91 | pod=~"{pods_selector}",
92 | container="{object.container}"
93 | {cluster_label}
94 | }}
95 | ) by (pod, container, job)
96 | * on(pod, container, job) group_left(reason)
97 | max(
98 | kube_pod_container_status_last_terminated_reason{{
99 | reason="OOMKilled",
100 | namespace="{object.namespace}",
101 | pod=~"{pods_selector}",
102 | container="{object.container}"
103 | {cluster_label}
104 | }}
105 | ) by (pod, container, job, reason)
106 | ) by (container, pod, job)
107 | [{duration}:{step}]
108 | )
109 | """
110 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import datetime
3 | from concurrent.futures import ThreadPoolExecutor
4 | from typing import List, Optional, Dict, Any
5 |
6 | from kubernetes.client.api_client import ApiClient
7 |
8 | from robusta_krr.core.abstract.strategies import PodsTimeData
9 | from robusta_krr.core.models.config import settings
10 | from robusta_krr.core.models.objects import K8sObjectData
11 |
12 | from ..metrics import PrometheusMetric
13 |
14 |
15 | class MetricsService(abc.ABC):
16 | def __init__(
17 | self,
18 | api_client: Optional[ApiClient] = None,
19 | cluster: Optional[str] = None,
20 | executor: Optional[ThreadPoolExecutor] = None,
21 | ) -> None:
22 | self.api_client = api_client
23 | self.cluster = cluster or "default"
24 | self.executor = executor
25 |
26 | @abc.abstractmethod
27 | def check_connection(self):
28 | ...
29 |
30 | @classmethod
31 | def name(cls) -> str:
32 | classname = cls.__name__
33 | return classname.replace("MetricsService", "") if classname != MetricsService.__name__ else classname
34 |
35 | @abc.abstractmethod
36 | def get_cluster_names(self) -> Optional[List[str]]:
37 | ...
38 |
39 | @abc.abstractmethod
40 | async def get_cluster_summary(self) -> Dict[str, Any]:
41 | ...
42 |
43 | @abc.abstractmethod
44 | async def gather_data(
45 | self,
46 | object: K8sObjectData,
47 | LoaderClass: type[PrometheusMetric],
48 | period: datetime.timedelta,
49 | step: datetime.timedelta = datetime.timedelta(minutes=30),
50 | ) -> PodsTimeData:
51 | ...
52 |
53 | def get_prometheus_cluster_label(self) -> str:
54 | """
55 | Generates the cluster label for querying a centralized Prometheus
56 |
57 | Returns:
58 | str: a promql safe label string for querying the cluster.
59 | """
60 | if settings.prometheus_cluster_label is None:
61 | return ""
62 | return f', {settings.prometheus_label}="{settings.prometheus_cluster_label}"'
63 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/metrics_service/mimir_metrics_service.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from kubernetes.client import ApiClient
4 | from prometrix import MetricsNotFound
5 |
6 | from robusta_krr.utils.service_discovery import MetricsServiceDiscovery
7 |
8 | from .prometheus_metrics_service import PrometheusMetricsService
9 |
10 | class MimirMetricsDiscovery(MetricsServiceDiscovery):
11 | def find_metrics_url(self, *, api_client: Optional[ApiClient] = None) -> Optional[str]:
12 | """
13 | Finds the Mimir Metrics URL using selectors.
14 | Args:
15 | api_client (Optional[ApiClient]): A Kubernetes API client. Defaults to None.
16 | Returns:
17 | Optional[str]: The discovered Mimir Metrics URL, or None if not found.
18 | """
19 | return super().find_url(
20 | selectors=[
21 | "app.kubernetes.io/name=mimir,app.kubernetes.io/component=query-frontend",
22 | ]
23 | )
24 |
25 |
26 | class MimirMetricsService(PrometheusMetricsService):
27 | """
28 | A class for fetching metrics from Mimir Metrics.
29 | """
30 |
31 | service_discovery = MimirMetricsDiscovery
32 | url_postfix = "/prometheus"
33 | additional_headers = {"X-Scope-OrgID": "anonymous"}
34 |
35 | def check_connection(self):
36 | """
37 | Checks the connection to Prometheus.
38 | Raises:
39 | MimirMetricsNotFound: If the connection to Mimir Metrics cannot be established.
40 | """
41 | try:
42 | super().check_connection()
43 | except MetricsNotFound as e:
44 | # This is to clarify which metrics service had the issue and not say its a prometheus issue
45 | raise MetricsNotFound(
46 | f"Couldn't connect to Mimir Metrics found under {self.prometheus.url}\nCaused by {e.__class__.__name__}: {e})"
47 | ) from e
48 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/metrics_service/thanos_metrics_service.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from kubernetes.client import ApiClient
4 | from prometrix import MetricsNotFound, ThanosMetricsNotFound
5 |
6 | from robusta_krr.utils.service_discovery import MetricsServiceDiscovery
7 |
8 | from .prometheus_metrics_service import PrometheusMetricsService
9 |
10 |
11 | class ThanosMetricsDiscovery(MetricsServiceDiscovery):
12 | def find_metrics_url(self, *, api_client: Optional[ApiClient] = None) -> Optional[str]:
13 | """
14 | Finds the Thanos URL using selectors.
15 | Args:
16 | api_client (Optional[ApiClient]): A Kubernetes API client. Defaults to None.
17 | Returns:
18 | Optional[str]: The discovered Thanos URL, or None if not found.
19 | """
20 |
21 | return super().find_url(
22 | selectors=[
23 | "app.kubernetes.io/component=query,app.kubernetes.io/name=thanos",
24 | "app.kubernetes.io/name=thanos-query",
25 | "app=thanos-query",
26 | "app=thanos-querier",
27 | ]
28 | )
29 |
30 |
31 | class ThanosMetricsService(PrometheusMetricsService):
32 | """
33 | A class for fetching metrics from Thanos.
34 | """
35 |
36 | service_discovery = ThanosMetricsDiscovery
37 |
38 | def check_connection(self):
39 | """
40 | Checks the connection to Prometheus.
41 | Raises:
42 | ThanosMetricsNotFound: If the connection to Thanos cannot be established.
43 | """
44 | try:
45 | super().check_connection()
46 | except MetricsNotFound as e:
47 | # This is to clarify which metrics service had the issue and not say its a prometheus issue
48 | raise ThanosMetricsNotFound(
49 | f"Couldn't connect to Thanos found under {self.prometheus.url}\nCaused by {e.__class__.__name__}: {e})"
50 | ) from e
51 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/metrics_service/victoria_metrics_service.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from kubernetes.client import ApiClient
4 | from prometrix import MetricsNotFound, VictoriaMetricsNotFound
5 |
6 | from robusta_krr.utils.service_discovery import MetricsServiceDiscovery
7 |
8 | from .prometheus_metrics_service import PrometheusMetricsService
9 |
10 |
11 | class VictoriaMetricsDiscovery(MetricsServiceDiscovery):
12 | def find_metrics_url(self, *, api_client: Optional[ApiClient] = None) -> Optional[str]:
13 | """
14 | Finds the Victoria Metrics URL using selectors.
15 | Args:
16 | api_client (Optional[ApiClient]): A Kubernetes API client. Defaults to None.
17 | Returns:
18 | Optional[str]: The discovered Victoria Metrics URL, or None if not found.
19 | """
20 | url = super().find_url(
21 | selectors=[
22 | "app.kubernetes.io/name=vmsingle",
23 | "app.kubernetes.io/name=victoria-metrics-single",
24 | ]
25 | )
26 | if url is None:
27 | url = super().find_url(
28 | selectors=[
29 | "app.kubernetes.io/name=vmselect",
30 | "app=vmselect",
31 | ]
32 | )
33 | if url is not None:
34 | url = f"{url}/select/0/prometheus/"
35 | return url
36 |
37 |
38 | class VictoriaMetricsService(PrometheusMetricsService):
39 | """
40 | A class for fetching metrics from Victoria Metrics.
41 | """
42 |
43 | service_discovery = VictoriaMetricsDiscovery
44 |
45 | @classmethod
46 | def name(cls) -> str:
47 | return "Victoria Metrics"
48 |
49 | def check_connection(self):
50 | """
51 | Checks the connection to Prometheus.
52 | Raises:
53 | VictoriaMetricsNotFound: If the connection to Victoria Metrics cannot be established.
54 | """
55 | try:
56 | super().check_connection()
57 | except MetricsNotFound as e:
58 | # This is to clarify which metrics service had the issue and not say its a prometheus issue
59 | raise VictoriaMetricsNotFound(
60 | f"Couldn't connect to Victoria Metrics found under {self.prometheus.url}\nCaused by {e.__class__.__name__}: {e})"
61 | ) from e
62 |
--------------------------------------------------------------------------------
/robusta_krr/core/integrations/prometheus/prometheus_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TYPE_CHECKING
4 |
5 | import boto3
6 | from prometrix import AWSPrometheusConfig, CoralogixPrometheusConfig, PrometheusConfig, VictoriaMetricsPrometheusConfig
7 |
8 | from robusta_krr.core.models.config import settings
9 |
10 | if TYPE_CHECKING:
11 | from robusta_krr.core.integrations.prometheus.metrics_service.prometheus_metrics_service import (
12 | PrometheusMetricsService,
13 | )
14 |
15 |
16 | class ClusterNotSpecifiedException(Exception):
17 | """
18 | An exception raised when a prometheus requires a cluster label but an invalid one is provided.
19 | """
20 |
21 | pass
22 |
23 |
24 | def generate_prometheus_config(
25 | url: str, headers: dict[str, str], metrics_service: PrometheusMetricsService
26 | ) -> PrometheusConfig:
27 | from .metrics_service.victoria_metrics_service import VictoriaMetricsService
28 |
29 | baseconfig = {
30 | "url": url,
31 | "disable_ssl": not settings.prometheus_ssl_enabled,
32 | "headers": headers,
33 | }
34 |
35 | # aws config
36 | if settings.eks_managed_prom:
37 | session = boto3.Session(profile_name=settings.eks_managed_prom_profile_name)
38 | credentials = session.get_credentials()
39 | region = settings.eks_managed_prom_region if settings.eks_managed_prom_region else session.region_name
40 |
41 | if settings.eks_access_key and settings.eks_secret_key:
42 | # when we have both access key and secret key, don't try to read credentials which can fail
43 | access_key = settings.eks_access_key
44 | secret_key = settings.eks_secret_key.get_secret_value()
45 | else:
46 | # we need at least one parameter from credentials, but we should use whatever we can from settings (this has higher precedence)
47 | credentials = credentials.get_frozen_credentials()
48 | access_key = settings.eks_access_key if settings.eks_access_key else credentials.access_key
49 | secret_key = settings.eks_secret_key.get_secret_value() if settings.eks_secret_key else credentials.secret_key
50 |
51 | service_name = settings.eks_service_name if settings.eks_secret_key else "aps"
52 | if not region:
53 | raise Exception("No eks region specified")
54 |
55 | return AWSPrometheusConfig(
56 | access_key=access_key,
57 | secret_access_key=secret_key,
58 | aws_region=region,
59 | service_name=service_name,
60 | **baseconfig,
61 | )
62 | # coralogix config
63 | if settings.coralogix_token:
64 | return CoralogixPrometheusConfig(**baseconfig, prometheus_token=settings.coralogix_token.get_secret_value())
65 | if isinstance(metrics_service, VictoriaMetricsService):
66 | return VictoriaMetricsPrometheusConfig(**baseconfig)
67 | return PrometheusConfig(**baseconfig)
68 |
--------------------------------------------------------------------------------
/robusta_krr/core/models/allocations.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import enum
4 | import math
5 | from typing import Literal, Optional, TypeVar, Union
6 |
7 | import pydantic as pd
8 | from kubernetes.client.models import V1Container
9 |
10 | from robusta_krr.utils import resource_units
11 |
12 |
13 | class ResourceType(str, enum.Enum):
14 | """The type of resource.
15 |
16 | Just add new types here and they will be automatically supported.
17 | """
18 |
19 | CPU = "cpu"
20 | Memory = "memory"
21 |
22 |
23 | RecommendationValue = Union[float, Literal["?"], None]
24 | RecommendationValueRaw = Union[float, str, None]
25 |
26 | Self = TypeVar("Self", bound="ResourceAllocations")
27 |
28 | NONE_LITERAL = "unset"
29 | NAN_LITERAL = "?"
30 |
31 | def format_recommendation_value(value: RecommendationValue) -> str:
32 | if value is None:
33 | return NONE_LITERAL
34 | elif isinstance(value, str):
35 | return NAN_LITERAL
36 | else:
37 | return resource_units.format(value)
38 |
39 | def format_diff(allocated, recommended, selector, multiplier=1, colored=False) -> str:
40 | if recommended is None or isinstance(recommended.value, str) or selector != "requests":
41 | return ""
42 | else:
43 | reccomended_val = recommended.value if isinstance(recommended.value, (int, float)) else 0
44 | allocated_val = allocated if isinstance(allocated, (int, float)) else 0
45 | diff_val = reccomended_val - allocated_val
46 | if colored:
47 | diff_sign = "[green]+[/green]" if diff_val >= 0 else "[red]-[/red]"
48 | else:
49 | diff_sign = "+" if diff_val >= 0 else "-"
50 | return f"{diff_sign}{format_recommendation_value(abs(diff_val) * multiplier)}"
51 |
52 | class ResourceAllocations(pd.BaseModel):
53 | requests: dict[ResourceType, RecommendationValue]
54 | limits: dict[ResourceType, RecommendationValue]
55 | info: dict[ResourceType, Optional[str]] = {}
56 |
57 | @staticmethod
58 | def __parse_resource_value(value: RecommendationValueRaw) -> RecommendationValue:
59 | if value is None:
60 | return None
61 |
62 | if isinstance(value, str):
63 | return float(resource_units.parse(value))
64 |
65 | if math.isnan(value):
66 | return "?"
67 |
68 | return float(value)
69 |
70 | @pd.validator("requests", "limits", pre=True)
71 | def validate_requests(
72 | cls, value: dict[ResourceType, RecommendationValueRaw]
73 | ) -> dict[ResourceType, RecommendationValue]:
74 | return {
75 | resource_type: cls.__parse_resource_value(resource_value) for resource_type, resource_value in value.items()
76 | }
77 |
78 | @classmethod
79 | def from_container(cls: type[Self], container: V1Container) -> Self:
80 | """Get the resource allocations from a Kubernetes container.
81 |
82 | Args:
83 | container: The Kubernetes container.
84 |
85 | Returns:
86 | The resource allocations.
87 | """
88 |
89 | return cls(
90 | requests={
91 | ResourceType.CPU: container.resources.requests.get("cpu")
92 | if container.resources and container.resources.requests
93 | else None,
94 | ResourceType.Memory: container.resources.requests.get("memory")
95 | if container.resources and container.resources.requests
96 | else None,
97 | },
98 | limits={
99 | ResourceType.CPU: container.resources.limits.get("cpu")
100 | if container.resources and container.resources.limits
101 | else None,
102 | ResourceType.Memory: container.resources.limits.get("memory")
103 | if container.resources and container.resources.limits
104 | else None,
105 | },
106 | )
107 |
--------------------------------------------------------------------------------
/robusta_krr/core/models/config.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import sys
5 | from typing import Any, Literal, Optional, Union
6 |
7 | import pydantic as pd
8 | from kubernetes import config
9 | from kubernetes.config.config_exception import ConfigException
10 | from rich.console import Console
11 | from rich.logging import RichHandler
12 |
13 | from robusta_krr.core.abstract import formatters
14 | from robusta_krr.core.abstract.strategies import AnyStrategy, BaseStrategy
15 | from robusta_krr.core.models.objects import KindLiteral
16 |
17 | logger = logging.getLogger("krr")
18 |
19 |
20 | class Config(pd.BaseSettings):
21 | quiet: bool = pd.Field(False)
22 | verbose: bool = pd.Field(False)
23 |
24 | clusters: Union[list[str], Literal["*"], None] = None
25 | kubeconfig: Optional[str] = None
26 | impersonate_user: Optional[str] = None
27 | impersonate_group: Optional[str] = None
28 | namespaces: Union[list[str], Literal["*"]] = pd.Field("*")
29 | resources: Union[list[KindLiteral], Literal["*"]] = pd.Field("*")
30 | selector: Optional[str] = None
31 |
32 | # Value settings
33 | cpu_min_value: int = pd.Field(10, ge=0) # in millicores
34 | memory_min_value: int = pd.Field(100, ge=0) # in megabytes
35 |
36 | # Prometheus Settings
37 | prometheus_url: Optional[str] = pd.Field(None)
38 | prometheus_auth_header: Optional[pd.SecretStr] = pd.Field(None)
39 | prometheus_other_headers: dict[str, pd.SecretStr] = pd.Field(default_factory=dict)
40 | prometheus_ssl_enabled: bool = pd.Field(False)
41 | prometheus_cluster_label: Optional[str] = pd.Field(None)
42 | prometheus_label: Optional[str] = pd.Field(None)
43 | eks_managed_prom: bool = pd.Field(False)
44 | eks_managed_prom_profile_name: Optional[str] = pd.Field(None)
45 | eks_access_key: Optional[str] = pd.Field(None)
46 | eks_secret_key: Optional[pd.SecretStr] = pd.Field(None)
47 | eks_service_name: Optional[str] = pd.Field(None)
48 | eks_managed_prom_region: Optional[str] = pd.Field(None)
49 | coralogix_token: Optional[pd.SecretStr] = pd.Field(None)
50 | openshift: bool = pd.Field(False)
51 |
52 | # Threading settings
53 | max_workers: int = pd.Field(6, ge=1)
54 |
55 | # Logging Settings
56 | format: str
57 | show_cluster_name: bool
58 | strategy: str
59 | log_to_stderr: bool
60 | width: Optional[int] = pd.Field(None, ge=1)
61 | show_severity: bool = True
62 |
63 | # Output Settings
64 | file_output: Optional[str] = pd.Field(None)
65 | file_output_dynamic: bool = pd.Field(False)
66 | slack_output: Optional[str] = pd.Field(None)
67 |
68 | other_args: dict[str, Any]
69 |
70 | # Internal
71 | inside_cluster: bool = False
72 | _logging_console: Optional[Console] = pd.PrivateAttr(None)
73 |
74 | def __init__(self, **kwargs: Any) -> None:
75 | super().__init__(**kwargs)
76 |
77 | @property
78 | def Formatter(self) -> formatters.FormatterFunc:
79 | return formatters.find(self.format)
80 |
81 | @pd.validator("prometheus_url")
82 | def validate_prometheus_url(cls, v: Optional[str]):
83 | if v is None:
84 | return None
85 |
86 | if not v.startswith("https://") and not v.startswith("http://"):
87 | raise Exception("--prometheus-url must start with https:// or http://")
88 |
89 | v = v.removesuffix("/")
90 |
91 | return v
92 |
93 | @pd.validator("prometheus_other_headers", pre=True)
94 | def validate_prometheus_other_headers(cls, headers: Union[list[str], dict[str, str]]) -> dict[str, str]:
95 | if isinstance(headers, dict):
96 | return headers
97 |
98 | return {k.strip().lower(): v.strip() for k, v in [header.split(":") for header in headers]}
99 |
100 | @pd.validator("namespaces")
101 | def validate_namespaces(cls, v: Union[list[str], Literal["*"]]) -> Union[list[str], Literal["*"]]:
102 | if v == []:
103 | return "*"
104 |
105 | if isinstance(v, list):
106 | for val in v:
107 | if val.startswith("*"):
108 | raise ValueError("Namespace's values cannot start with an asterisk (*)")
109 |
110 | return [val.lower() for val in v]
111 |
112 | @pd.validator("resources", pre=True)
113 | def validate_resources(cls, v: Union[list[str], Literal["*"]]) -> Union[list[str], Literal["*"]]:
114 | if v == []:
115 | return "*"
116 |
117 | # NOTE: KindLiteral.__args__ is a tuple of all possible values of KindLiteral
118 | # So this will preserve the big and small letters of the resource
119 | return [next(r for r in KindLiteral.__args__ if r.lower() == val.lower()) for val in v]
120 |
121 | def create_strategy(self) -> AnyStrategy:
122 | StrategyType = AnyStrategy.find(self.strategy)
123 | StrategySettingsType = StrategyType.get_settings_type()
124 | return StrategyType(StrategySettingsType(**self.other_args)) # type: ignore
125 |
126 | @pd.validator("strategy")
127 | def validate_strategy(cls, v: str) -> str:
128 | BaseStrategy.find(v) # NOTE: raises if strategy is not found
129 | return v
130 |
131 | @pd.validator("format")
132 | def validate_format(cls, v: str) -> str:
133 | formatters.find(v) # NOTE: raises if strategy is not found
134 | return v
135 |
136 | @property
137 | def context(self) -> Optional[str]:
138 | return self.clusters[0] if self.clusters != "*" and self.clusters else None
139 |
140 | @property
141 | def logging_console(self) -> Console:
142 | if getattr(self, "_logging_console") is None:
143 | self._logging_console = Console(file=sys.stderr if self.log_to_stderr else sys.stdout, width=self.width)
144 | return self._logging_console
145 |
146 | def load_kubeconfig(self) -> None:
147 | try:
148 | config.load_kube_config(config_file=self.kubeconfig, context=self.context)
149 | self.inside_cluster = False
150 | except ConfigException:
151 | config.load_incluster_config()
152 | self.inside_cluster = True
153 |
154 | def get_kube_client(self, context: Optional[str] = None):
155 | if context is None:
156 | return None
157 |
158 | api_client = config.new_client_from_config(context=context, config_file=self.kubeconfig)
159 | if self.impersonate_user is not None:
160 | # trick copied from https://github.com/kubernetes-client/python/issues/362
161 | api_client.set_default_header("Impersonate-User", self.impersonate_user)
162 | if self.impersonate_group is not None:
163 | api_client.set_default_header("Impersonate-Group", self.impersonate_group)
164 | return api_client
165 |
166 | @staticmethod
167 | def set_config(config: Config) -> None:
168 | global _config
169 |
170 | _config = config
171 | logging.basicConfig(
172 | level="NOTSET",
173 | format="%(message)s",
174 | datefmt="[%X]",
175 | handlers=[RichHandler(console=config.logging_console)],
176 | )
177 | logging.getLogger("").setLevel(logging.CRITICAL)
178 | logger.setLevel(logging.DEBUG if config.verbose else logging.CRITICAL if config.quiet else logging.INFO)
179 |
180 | @staticmethod
181 | def get_config() -> Optional[Config]:
182 | return _config
183 |
184 |
185 | # NOTE: This class is just a proxy for _config.
186 | # Import settings from this module and use it like it is just a config object.
187 | class _Settings(Config): # Config here is used for type checking
188 | def __init__(self) -> None:
189 | pass
190 |
191 | def __getattr__(self, name: str):
192 | if _config is None:
193 | raise AttributeError("Config is not set")
194 |
195 | return getattr(_config, name)
196 |
197 |
198 | _config: Optional[Config] = None
199 | settings = _Settings()
200 |
--------------------------------------------------------------------------------
/robusta_krr/core/models/objects.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Literal, Optional
4 |
5 | import pydantic as pd
6 |
7 | from robusta_krr.core.models.allocations import ResourceAllocations
8 | from robusta_krr.utils.batched import batched
9 | from kubernetes.client.models import V1LabelSelector
10 |
11 | KindLiteral = Literal["Deployment", "DaemonSet", "StatefulSet", "Job", "CronJob", "Rollout", "DeploymentConfig", "StrimziPodSet"]
12 |
13 |
14 | class PodData(pd.BaseModel):
15 | name: str
16 | deleted: bool
17 |
18 | def __hash__(self) -> int:
19 | return hash(self.name)
20 |
21 |
22 | class HPAData(pd.BaseModel):
23 | min_replicas: Optional[int]
24 | max_replicas: int
25 | current_replicas: Optional[int]
26 | desired_replicas: int
27 | target_cpu_utilization_percentage: Optional[float]
28 | target_memory_utilization_percentage: Optional[float]
29 |
30 |
31 | PodWarning = Literal[
32 | "NoPrometheusPods",
33 | "NoPrometheusCPUMetrics",
34 | "NoPrometheusMemoryMetrics",
35 | ]
36 |
37 |
38 | class K8sObjectData(pd.BaseModel):
39 | # NOTE: Here None means that we are running inside the cluster
40 | cluster: Optional[str]
41 | name: str
42 | container: str
43 | pods: list[PodData] = []
44 | hpa: Optional[HPAData]
45 | namespace: str
46 | kind: KindLiteral
47 | allocations: ResourceAllocations
48 | warnings: set[PodWarning] = set()
49 | labels: Optional[dict[str, str]]
50 | annotations: Optional[dict[str, str]]
51 |
52 | _api_resource = pd.PrivateAttr(None)
53 |
54 | def __str__(self) -> str:
55 | return f"{self.kind} {self.namespace}/{self.name}/{self.container}"
56 |
57 | def __hash__(self) -> int:
58 | return hash(str(self))
59 |
60 | def add_warning(self, warning: PodWarning) -> None:
61 | self.warnings.add(warning)
62 |
63 | @property
64 | def current_pods_count(self) -> int:
65 | return len([pod for pod in self.pods if not pod.deleted])
66 |
67 | @property
68 | def deleted_pods_count(self) -> int:
69 | return len([pod for pod in self.pods if pod.deleted])
70 |
71 | @property
72 | def pods_count(self) -> int:
73 | return len(self.pods)
74 |
75 | @property
76 | def selector(self) -> V1LabelSelector:
77 | if self._api_resource is None:
78 | raise ValueError("api_resource is not set")
79 |
80 | if self.kind == 'CronJob':
81 | return self._api_resource.spec.job_template.spec.selector
82 | else:
83 | return self._api_resource.spec.selector
84 |
85 | def split_into_batches(self, n: int) -> list[K8sObjectData]:
86 | """
87 | Batch this object into n objects, splitting the pods into batches of size n.
88 | """
89 |
90 | if self.pods_count <= n:
91 | return [self]
92 |
93 | return [
94 | K8sObjectData(
95 | cluster=self.cluster,
96 | name=self.name,
97 | container=self.container,
98 | pods=batch,
99 | hpa=self.hpa,
100 | namespace=self.namespace,
101 | kind=self.kind,
102 | allocations=self.allocations,
103 | labels=self.labels,
104 | annotations=self.annotations,
105 | )
106 | for batch in batched(self.pods, n)
107 | ]
108 |
--------------------------------------------------------------------------------
/robusta_krr/core/models/result.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Any, Optional, Union
4 |
5 | import pydantic as pd
6 |
7 | from robusta_krr.core.abstract import formatters
8 | from robusta_krr.core.models.allocations import RecommendationValue, ResourceAllocations, ResourceType
9 | from robusta_krr.core.models.objects import K8sObjectData
10 | from robusta_krr.core.models.severity import Severity
11 | from robusta_krr.core.models.config import Config
12 |
13 |
14 | class Recommendation(pd.BaseModel):
15 | value: RecommendationValue
16 | severity: Severity
17 |
18 |
19 | class ResourceRecommendation(pd.BaseModel):
20 | requests: dict[ResourceType, Union[RecommendationValue, Recommendation]]
21 | limits: dict[ResourceType, Union[RecommendationValue, Recommendation]]
22 | info: dict[ResourceType, Optional[str]]
23 |
24 |
25 | class ResourceScan(pd.BaseModel):
26 | object: K8sObjectData
27 | recommended: ResourceRecommendation
28 | severity: Severity
29 |
30 | @classmethod
31 | def calculate(cls, object: K8sObjectData, recommendation: ResourceAllocations) -> ResourceScan:
32 | recommendation_processed = ResourceRecommendation(requests={}, limits={}, info={})
33 |
34 | for resource_type in ResourceType:
35 | recommendation_processed.info[resource_type] = recommendation.info.get(resource_type)
36 |
37 | for selector in ["requests", "limits"]:
38 | current = getattr(object.allocations, selector).get(resource_type)
39 | recommended = getattr(recommendation, selector).get(resource_type)
40 |
41 | current_severity = Severity.calculate(current, recommended, resource_type)
42 |
43 | #TODO: consider... changing field after model created doesn't validate it.
44 | getattr(recommendation_processed, selector)[resource_type] = Recommendation(
45 | value=recommended, severity=current_severity
46 | )
47 |
48 | for severity in [Severity.CRITICAL, Severity.WARNING, Severity.OK, Severity.GOOD, Severity.UNKNOWN]:
49 | for selector in ["requests", "limits"]:
50 | for recommendation_request in getattr(recommendation_processed, selector).values():
51 | if recommendation_request.severity == severity:
52 | return cls(object=object, recommended=recommendation_processed, severity=severity)
53 |
54 | return cls(object=object, recommended=recommendation_processed, severity=Severity.UNKNOWN)
55 |
56 |
57 | class StrategyData(pd.BaseModel):
58 | name: str
59 | settings: dict[str, Any]
60 |
61 |
62 | class Result(pd.BaseModel):
63 | scans: list[ResourceScan]
64 | score: int = 0
65 | resources: list[str] = ["cpu", "memory"]
66 | description: Optional[str] = None
67 | strategy: StrategyData
68 | errors: list[dict[str, Any]] = pd.Field(default_factory=list)
69 | clusterSummary: dict[str, Any] = {}
70 | config: Optional[Config] = pd.Field(default_factory=Config.get_config)
71 |
72 | def __init__(self, *args, **kwargs) -> None:
73 | super().__init__(*args, **kwargs)
74 | self.score = self.__calculate_score()
75 |
76 | def format(self, formatter: Union[formatters.FormatterFunc, str]) -> Any:
77 | """Format the result.
78 |
79 | Args:
80 | formatter: The formatter to use.
81 |
82 | Returns:
83 | The formatted result.
84 | """
85 |
86 | formatter = formatters.find(formatter) if isinstance(formatter, str) else formatter
87 | return formatter(self)
88 |
89 | @staticmethod
90 | def __scan_cost(scan: ResourceScan) -> float:
91 | return 0.7 if scan.severity == Severity.WARNING else 1 if scan.severity == Severity.CRITICAL else 0
92 |
93 | def __calculate_score(self) -> int:
94 | """Get the score of the result.
95 |
96 | Returns:
97 | The score of the result.
98 | """
99 |
100 | score = sum(self.__scan_cost(scan) for scan in self.scans)
101 | return int((len(self.scans) - score) / len(self.scans) * 100) if self.scans else 0
102 |
103 | @property
104 | def score_letter(self) -> str:
105 | return (
106 | "F"
107 | if self.score < 30
108 | else "D"
109 | if self.score < 55
110 | else "C"
111 | if self.score < 70
112 | else "B"
113 | if self.score < 90
114 | else "A"
115 | )
116 |
--------------------------------------------------------------------------------
/robusta_krr/core/models/severity.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import enum
4 | from typing import Callable, Optional
5 |
6 | from robusta_krr.core.models.allocations import RecommendationValue, ResourceType
7 |
8 |
9 | class Severity(str, enum.Enum):
10 | """
11 | The severity of the scan.
12 |
13 | The severity is calculated based on the difference between the current value and the recommended value.
14 | You can override the severity calculation function by using the `bind_calculator` decorator from the same module.
15 | """
16 |
17 | UNKNOWN = "UNKNOWN"
18 | GOOD = "GOOD"
19 | OK = "OK"
20 | WARNING = "WARNING"
21 | CRITICAL = "CRITICAL"
22 |
23 | @property
24 | def color(self) -> str:
25 | return {
26 | self.UNKNOWN: "dim",
27 | self.GOOD: "green",
28 | self.OK: "gray",
29 | self.WARNING: "yellow",
30 | self.CRITICAL: "red",
31 | }[self]
32 |
33 | @classmethod
34 | def calculate(
35 | cls, current: RecommendationValue, recommended: RecommendationValue, resource_type: ResourceType
36 | ) -> Severity:
37 | if isinstance(recommended, str) or isinstance(current, str):
38 | return cls.UNKNOWN
39 |
40 | return calculate_severity(current, recommended, resource_type)
41 |
42 |
43 | def register_severity_calculator(resource_type: ResourceType) -> Callable[[SeverityCalculator], SeverityCalculator]:
44 | """
45 | Bind a severity calculator function to a resource type.
46 | Use this decorator to bind a severity calculator function to a resource type.
47 |
48 | Example:
49 | >>> @bind_severity_calculator(ResourceType.CPU)
50 | >>> def cpu_severity_calculator(current: Optional[float], recommended: Optional[float], resource_type: ResourceType) -> Severity:
51 | >>> if current is None and recommended is None:
52 | >>> return Severity.GOOD
53 | >>> if current is None or recommended is None:
54 | >>> return Severity.WARNING
55 | >>>
56 | >>> return Severity.CRITICAL if abs(current - recommended) >= 0.5 else Severity.GOOD
57 | """
58 |
59 | def decorator(func: SeverityCalculator) -> SeverityCalculator:
60 | SEVERITY_CALCULATORS_REGISTRY[resource_type] = func
61 | return func
62 |
63 | return decorator
64 |
65 |
66 | SeverityCalculator = Callable[[Optional[float], Optional[float], ResourceType], Severity]
67 | SEVERITY_CALCULATORS_REGISTRY: dict[ResourceType, SeverityCalculator] = {}
68 |
69 |
70 | def calculate_severity(current: Optional[float], recommended: Optional[float], resource_type: ResourceType) -> Severity:
71 | """
72 | Calculate the severity of the scan based on the current value and the recommended value.
73 |
74 | This function will use the severity calculator function that is bound to the resource type.
75 | If there is no calculator function bound to the resource type, it will use the default severity calculator function.
76 | """
77 |
78 | return SEVERITY_CALCULATORS_REGISTRY.get(resource_type, default_severity_calculator)(
79 | current, recommended, resource_type
80 | )
81 |
82 |
83 | def default_severity_calculator(
84 | current: Optional[float], recommended: Optional[float], resource_type: ResourceType
85 | ) -> Severity:
86 | return Severity.UNKNOWN
87 |
88 |
89 | @register_severity_calculator(ResourceType.CPU)
90 | def cpu_severity_calculator(
91 | current: Optional[float], recommended: Optional[float], resource_type: ResourceType
92 | ) -> Severity:
93 | if current is None and recommended is None:
94 | return Severity.GOOD
95 | if current is None or recommended is None:
96 | return Severity.WARNING
97 |
98 | diff = abs(current - recommended)
99 |
100 | if diff >= 0.5:
101 | return Severity.CRITICAL
102 | elif diff >= 0.25:
103 | return Severity.WARNING
104 | elif diff >= 0.1:
105 | return Severity.OK
106 | else:
107 | return Severity.GOOD
108 |
109 |
110 | @register_severity_calculator(ResourceType.Memory)
111 | def memory_severity_calculator(
112 | current: Optional[float], recommended: Optional[float], resource_type: ResourceType
113 | ) -> Severity:
114 | if current is None and recommended is None:
115 | return Severity.GOOD
116 | if current is None or recommended is None:
117 | return Severity.WARNING
118 |
119 | diff = abs(current - recommended) / 1024 / 1024
120 |
121 | if diff >= 500:
122 | return Severity.CRITICAL
123 | elif diff >= 250:
124 | return Severity.WARNING
125 | elif diff >= 100:
126 | return Severity.OK
127 | else:
128 | return Severity.GOOD
129 |
--------------------------------------------------------------------------------
/robusta_krr/formatters/__init__.py:
--------------------------------------------------------------------------------
1 | from .json import json
2 | from .pprint import pprint
3 | from .table import table
4 | from .yaml import yaml
5 | from .csv import csv
6 | from .csv_raw import csv_raw
7 | from .html import html
8 |
--------------------------------------------------------------------------------
/robusta_krr/formatters/csv.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import io
3 | import itertools
4 | import logging
5 | from typing import Any
6 |
7 | from robusta_krr.core.abstract import formatters
8 | from robusta_krr.core.models.allocations import NONE_LITERAL, format_diff, format_recommendation_value
9 | from robusta_krr.core.models.config import settings
10 | from robusta_krr.core.models.result import ResourceScan, ResourceType, Result
11 |
12 | logger = logging.getLogger("krr")
13 |
14 |
15 | NAMESPACE_HEADER = "Namespace"
16 | NAME_HEADER = "Name"
17 | PODS_HEADER = "Pods"
18 | OLD_PODS_HEADER = "Old Pods"
19 | TYPE_HEADER = "Type"
20 | CONTAINER_HEADER = "Container"
21 | CLUSTER_HEADER = "Cluster"
22 | SEVERITY_HEADER = "Severity"
23 |
24 | RESOURCE_DIFF_HEADER = "{resource_name} Diff"
25 | RESOURCE_REQUESTS_HEADER = "{resource_name} Requests"
26 | RESOURCE_LIMITS_HEADER = "{resource_name} Limits"
27 |
28 |
29 | def _format_request_str(item: ResourceScan, resource: ResourceType, selector: str) -> str:
30 | allocated = getattr(item.object.allocations, selector)[resource]
31 | recommended = getattr(item.recommended, selector)[resource]
32 |
33 | if allocated is None and recommended.value is None:
34 | return f"{NONE_LITERAL}"
35 |
36 | diff = format_diff(allocated, recommended, selector)
37 | if diff != "":
38 | diff = f"({diff}) "
39 |
40 | return diff + format_recommendation_value(allocated) + " -> " + format_recommendation_value(recommended.value)
41 |
42 |
43 | def _format_total_diff(item: ResourceScan, resource: ResourceType, pods_current: int) -> str:
44 | selector = "requests"
45 | allocated = getattr(item.object.allocations, selector)[resource]
46 | recommended = getattr(item.recommended, selector)[resource]
47 |
48 | return format_diff(allocated, recommended, selector, pods_current)
49 |
50 |
51 | @formatters.register("csv")
52 | def csv_exporter(result: Result) -> str:
53 | # We need to order the resource columns so that they are in the format of Namespace,Name,Pods,Old Pods,Type,Container,CPU Diff,CPU Requests,CPU Limits,Memory Diff,Memory Requests,Memory Limits
54 | csv_columns = ["Namespace", "Name", "Pods", "Old Pods", "Type", "Container"]
55 |
56 | if settings.show_cluster_name:
57 | csv_columns.insert(0, "Cluster")
58 |
59 | if settings.show_severity:
60 | csv_columns.append("Severity")
61 |
62 | for resource in ResourceType:
63 | csv_columns.append(RESOURCE_DIFF_HEADER.format(resource_name=resource.name))
64 | csv_columns.append(RESOURCE_REQUESTS_HEADER.format(resource_name=resource.name))
65 | csv_columns.append(RESOURCE_LIMITS_HEADER.format(resource_name=resource.name))
66 |
67 | output = io.StringIO()
68 | csv_writer = csv.DictWriter(output, csv_columns, extrasaction="ignore")
69 | csv_writer.writeheader()
70 |
71 | for _, group in itertools.groupby(
72 | enumerate(result.scans), key=lambda x: (x[1].object.cluster, x[1].object.namespace, x[1].object.name)
73 | ):
74 | group_items = list(group)
75 |
76 | for j, (_, item) in enumerate(group_items):
77 | full_info_row = j == 0
78 |
79 | row: dict[str, Any] = {
80 | NAMESPACE_HEADER: item.object.namespace if full_info_row else "",
81 | NAME_HEADER: item.object.name if full_info_row else "",
82 | PODS_HEADER: f"{item.object.current_pods_count}" if full_info_row else "",
83 | OLD_PODS_HEADER: f"{item.object.deleted_pods_count}" if full_info_row else "",
84 | TYPE_HEADER: item.object.kind if full_info_row else "",
85 | CONTAINER_HEADER: item.object.container,
86 | SEVERITY_HEADER: item.severity,
87 | CLUSTER_HEADER: item.object.cluster,
88 | }
89 |
90 | for resource in ResourceType:
91 | row[RESOURCE_DIFF_HEADER.format(resource_name=resource.name)] = _format_total_diff(
92 | item, resource, item.object.current_pods_count
93 | )
94 | row[RESOURCE_REQUESTS_HEADER.format(resource_name=resource.name)] = _format_request_str(
95 | item, resource, "requests"
96 | )
97 | row[RESOURCE_LIMITS_HEADER.format(resource_name=resource.name)] = _format_request_str(
98 | item, resource, "limits"
99 | )
100 |
101 | csv_writer.writerow(row)
102 |
103 | return output.getvalue()
104 |
--------------------------------------------------------------------------------
/robusta_krr/formatters/csv_raw.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import io
3 | import logging
4 | from typing import Any, Union
5 |
6 | from robusta_krr.core.abstract import formatters
7 | from robusta_krr.core.models.allocations import NAN_LITERAL, NONE_LITERAL
8 | from robusta_krr.core.models.config import settings
9 | from robusta_krr.core.models.result import ResourceScan, ResourceType, Result
10 |
11 | logger = logging.getLogger("krr")
12 |
13 |
14 | NAMESPACE_HEADER = "Namespace"
15 | NAME_HEADER = "Name"
16 | PODS_HEADER = "Pods"
17 | OLD_PODS_HEADER = "Old Pods"
18 | TYPE_HEADER = "Type"
19 | CONTAINER_HEADER = "Container"
20 | CLUSTER_HEADER = "Cluster"
21 | SEVERITY_HEADER = "Severity"
22 |
23 | RESOURCE_REQUESTS_CURRENT_HEADER = "{resource_name} Requests Current"
24 | RESOURCE_REQUESTS_RECOMMENDED_HEADER = '{resource_name} Requests Recommended'
25 |
26 | RESOURCE_LIMITS_CURRENT_HEADER = "{resource_name} Limits Current"
27 | RESOURCE_LIMITS_RECOMMENDED_HEADER = '{resource_name} Limits Recommended'
28 |
29 |
30 | def _format_value(val: Union[float, int]) -> str:
31 | if isinstance(val, int):
32 | return str(val)
33 | elif isinstance(val, float):
34 | return str(int(val)) if val.is_integer() else str(val)
35 | elif val is None:
36 | return NONE_LITERAL
37 | elif isinstance(val, str):
38 | return NAN_LITERAL
39 | else:
40 | raise ValueError(f'unknown value: {val}')
41 |
42 |
43 | def _format_request_current(item: ResourceScan, resource: ResourceType, selector: str) -> str:
44 | allocated = getattr(item.object.allocations, selector)[resource]
45 | if allocated is None:
46 | return NONE_LITERAL
47 | return _format_value(allocated)
48 |
49 |
50 | def _format_request_recommend(item: ResourceScan, resource: ResourceType, selector: str) -> str:
51 | recommended = getattr(item.recommended, selector)[resource]
52 | if recommended is None:
53 | return NONE_LITERAL
54 | return _format_value(recommended.value)
55 |
56 |
57 | @formatters.register("csv-raw")
58 | def csv_raw(result: Result) -> str:
59 | # We need to order the resource columns so that they are in the format of
60 | # Namespace, Name, Pods, Old Pods, Type, Container,
61 | # CPU Requests Current, CPU Requests Recommend, CPU Limits Current, CPU Limits Recommend,
62 | # Memory Requests Current, Memory Requests Recommend, Memory Limits Current, Memory Limits Recommend,
63 | csv_columns = ["Namespace", "Name", "Pods", "Old Pods", "Type", "Container"]
64 |
65 | if settings.show_cluster_name:
66 | csv_columns.insert(0, "Cluster")
67 |
68 | if settings.show_severity:
69 | csv_columns.append("Severity")
70 |
71 | for resource in ResourceType:
72 | csv_columns.append(RESOURCE_REQUESTS_CURRENT_HEADER.format(resource_name=resource.name))
73 | csv_columns.append(RESOURCE_REQUESTS_RECOMMENDED_HEADER.format(resource_name=resource.name))
74 | csv_columns.append(RESOURCE_LIMITS_CURRENT_HEADER.format(resource_name=resource.name))
75 | csv_columns.append(RESOURCE_LIMITS_RECOMMENDED_HEADER.format(resource_name=resource.name))
76 |
77 | output = io.StringIO()
78 | csv_writer = csv.DictWriter(output, csv_columns, extrasaction="ignore")
79 | csv_writer.writeheader()
80 |
81 | for item in result.scans:
82 | row: dict[str, Any] = {
83 | NAMESPACE_HEADER: item.object.namespace,
84 | NAME_HEADER: item.object.name,
85 | PODS_HEADER: f"{item.object.current_pods_count}",
86 | OLD_PODS_HEADER: f"{item.object.deleted_pods_count}",
87 | TYPE_HEADER: item.object.kind,
88 | CONTAINER_HEADER: item.object.container,
89 | SEVERITY_HEADER: item.severity,
90 | CLUSTER_HEADER: item.object.cluster,
91 | }
92 |
93 | for resource in ResourceType:
94 | resource: ResourceType
95 | row[RESOURCE_REQUESTS_CURRENT_HEADER.format(resource_name=resource.name)] = _format_request_current(
96 | item, resource, "requests"
97 | )
98 | row[RESOURCE_REQUESTS_RECOMMENDED_HEADER.format(resource_name=resource.name)] = _format_request_recommend(
99 | item, resource, "requests"
100 | )
101 | row[RESOURCE_LIMITS_CURRENT_HEADER.format(resource_name=resource.name)] = _format_request_current(
102 | item, resource, "limits"
103 | )
104 | row[RESOURCE_LIMITS_RECOMMENDED_HEADER.format(resource_name=resource.name)] = _format_request_recommend(
105 | item, resource, "limits"
106 | )
107 |
108 | csv_writer.writerow(row)
109 |
110 | return output.getvalue()
111 |
--------------------------------------------------------------------------------
/robusta_krr/formatters/html.py:
--------------------------------------------------------------------------------
1 | from rich.console import Console
2 |
3 | from robusta_krr.core.abstract import formatters
4 | from robusta_krr.core.models.result import Result
5 | from .table import table
6 |
7 | @formatters.register("html")
8 | def html(result: Result) -> str:
9 | console = Console(record=True)
10 | table_output = table(result)
11 | console.print(table_output)
12 | return console.export_html(inline_styles=True)
13 |
--------------------------------------------------------------------------------
/robusta_krr/formatters/json.py:
--------------------------------------------------------------------------------
1 | from robusta_krr.core.abstract import formatters
2 | from robusta_krr.core.models.result import Result
3 |
4 |
5 | @formatters.register()
6 | def json(result: Result) -> str:
7 | return result.json(indent=2)
8 |
--------------------------------------------------------------------------------
/robusta_krr/formatters/pprint.py:
--------------------------------------------------------------------------------
1 | from pprint import pformat
2 |
3 | from robusta_krr.core.abstract import formatters
4 | from robusta_krr.core.models.result import Result
5 |
6 |
7 | @formatters.register()
8 | def pprint(result: Result) -> str:
9 | return pformat(result.dict())
10 |
--------------------------------------------------------------------------------
/robusta_krr/formatters/table.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from typing import Any
3 |
4 | from rich.table import Table
5 |
6 | from robusta_krr.core.abstract import formatters
7 | from robusta_krr.core.models.allocations import RecommendationValue, format_recommendation_value, format_diff, NONE_LITERAL, NAN_LITERAL
8 | from robusta_krr.core.models.result import ResourceScan, ResourceType, Result
9 | from robusta_krr.core.models.config import settings
10 | from robusta_krr.utils import resource_units
11 |
12 |
13 | DEFAULT_INFO_COLOR = "grey27"
14 | INFO_COLORS: dict[str, str] = {
15 | "OOMKill detected": "dark_red",
16 | }
17 |
18 |
19 | def _format_request_str(item: ResourceScan, resource: ResourceType, selector: str) -> str:
20 | allocated = getattr(item.object.allocations, selector)[resource]
21 | info = item.recommended.info.get(resource)
22 | recommended = getattr(item.recommended, selector)[resource]
23 | severity = recommended.severity
24 |
25 | if allocated is None and recommended.value is None:
26 | return f"[{severity.color}]{NONE_LITERAL}[/{severity.color}]"
27 |
28 | diff = format_diff(allocated, recommended, selector, colored=True)
29 | if diff != "":
30 | diff = f"({diff}) "
31 |
32 | if info is None:
33 | info_formatted = ""
34 | else:
35 | color = INFO_COLORS.get(info, DEFAULT_INFO_COLOR)
36 | info_formatted = f"\n[{color}]({info})[/{color}]"
37 |
38 | return (
39 | diff
40 | + f"[{severity.color}]"
41 | + format_recommendation_value(allocated)
42 | + " -> "
43 | + format_recommendation_value(recommended.value)
44 | + f"[/{severity.color}]"
45 | + info_formatted
46 | )
47 |
48 |
49 | def _format_total_diff(item: ResourceScan, resource: ResourceType, pods_current: int) -> str:
50 | selector = "requests"
51 | allocated = getattr(item.object.allocations, selector)[resource]
52 | recommended = getattr(item.recommended, selector)[resource]
53 |
54 | # if we have more than one pod, say so (this explains to the user why the total is different than the recommendation)
55 | if pods_current == 1:
56 | pods_info = ""
57 | else:
58 | pods_info = f"\n({pods_current} pods)"
59 |
60 | return f"{format_diff(allocated, recommended, selector, pods_current, colored=True)}{pods_info}"
61 |
62 |
63 | @formatters.register(rich_console=True)
64 | def table(result: Result) -> Table:
65 | """Format the result as text.
66 |
67 | :param result: The result to format.
68 | :type result: :class:`core.result.Result`
69 | :returns: The formatted results.
70 | :rtype: str
71 | """
72 |
73 | table = Table(
74 | show_header=True,
75 | header_style="bold magenta",
76 | title=f"\n{result.description}\n" if result.description else None,
77 | title_justify="left",
78 | title_style="",
79 | caption=f"{result.score} points - {result.score_letter}",
80 | )
81 |
82 | cluster_count = len(set(item.object.cluster for item in result.scans))
83 |
84 | table.add_column("Number", justify="right", no_wrap=True)
85 | if cluster_count > 1 or settings.show_cluster_name:
86 | table.add_column("Cluster", style="cyan")
87 | table.add_column("Namespace", style="cyan")
88 | table.add_column("Name", style="cyan")
89 | table.add_column("Pods", style="cyan")
90 | table.add_column("Old Pods", style="cyan")
91 | table.add_column("Type", style="cyan")
92 | table.add_column("Container", style="cyan")
93 | for resource in ResourceType:
94 | table.add_column(f"{resource.name} Diff")
95 | table.add_column(f"{resource.name} Requests")
96 | table.add_column(f"{resource.name} Limits")
97 |
98 | for _, group in itertools.groupby(
99 | enumerate(result.scans), key=lambda x: (x[1].object.cluster, x[1].object.namespace, x[1].object.name)
100 | ):
101 | group_items = list(group)
102 |
103 | for j, (i, item) in enumerate(group_items):
104 | last_row = j == len(group_items) - 1
105 | full_info_row = j == 0
106 |
107 | cells: list[Any] = [f"[{item.severity.color}]{i + 1}.[/{item.severity.color}]"]
108 | if cluster_count > 1 or settings.show_cluster_name:
109 | cells.append(item.object.cluster if full_info_row else "")
110 | cells += [
111 | item.object.namespace if full_info_row else "",
112 | item.object.name if full_info_row else "",
113 | f"{item.object.current_pods_count}" if full_info_row else "",
114 | f"{item.object.deleted_pods_count}" if full_info_row else "",
115 | item.object.kind if full_info_row else "",
116 | item.object.container,
117 | ]
118 |
119 | for resource in ResourceType:
120 | cells.append(_format_total_diff(item, resource, item.object.current_pods_count))
121 | cells += [_format_request_str(item, resource, selector) for selector in ["requests", "limits"]]
122 |
123 | table.add_row(*cells, end_section=last_row)
124 |
125 | return table
126 |
--------------------------------------------------------------------------------
/robusta_krr/formatters/yaml.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import yaml as yaml_module
4 |
5 | from robusta_krr.core.abstract import formatters
6 | from robusta_krr.core.models.result import Result
7 |
8 |
9 | @formatters.register()
10 | def yaml(result: Result) -> str:
11 | return yaml_module.dump(json.loads(result.json()), sort_keys=False)
12 |
--------------------------------------------------------------------------------
/robusta_krr/strategies/__init__.py:
--------------------------------------------------------------------------------
1 | from .simple import SimpleStrategy
2 | from .simple_limit import SimpleLimitStrategy
--------------------------------------------------------------------------------
/robusta_krr/strategies/simple.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 | from datetime import timedelta
3 |
4 | import numpy as np
5 | import pydantic as pd
6 |
7 | from robusta_krr.core.abstract.strategies import (
8 | BaseStrategy,
9 | K8sObjectData,
10 | MetricsPodData,
11 | PodsTimeData,
12 | ResourceRecommendation,
13 | ResourceType,
14 | RunResult,
15 | StrategySettings,
16 | )
17 | from robusta_krr.core.integrations.prometheus.metrics import (
18 | CPUAmountLoader,
19 | MaxMemoryLoader,
20 | MemoryAmountLoader,
21 | PercentileCPULoader,
22 | PrometheusMetric,
23 | MaxOOMKilledMemoryLoader,
24 | )
25 |
26 |
27 | class SimpleStrategySettings(StrategySettings):
28 | cpu_percentile: float = pd.Field(95, gt=0, le=100, description="The percentile to use for the CPU recommendation.")
29 | memory_buffer_percentage: float = pd.Field(
30 | 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation."
31 | )
32 | points_required: int = pd.Field(
33 | 100, ge=1, description="The number of data points required to make a recommendation for a resource."
34 | )
35 | allow_hpa: bool = pd.Field(
36 | False,
37 | description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.",
38 | )
39 | use_oomkill_data: bool = pd.Field(
40 | False,
41 | description="Whether to bump the memory when OOMKills are detected (experimental).",
42 | )
43 | oom_memory_buffer_percentage: float = pd.Field(
44 | 25, ge=0, description="What percentage to increase the memory when there are OOMKill events."
45 | )
46 |
47 | def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float:
48 | data_ = [np.max(values[:, 1]) for values in data.values()]
49 | if len(data_) == 0:
50 | return float("NaN")
51 |
52 | return max(
53 | np.max(data_) * (1 + self.memory_buffer_percentage / 100),
54 | max_oomkill * (1 + self.oom_memory_buffer_percentage / 100),
55 | )
56 |
57 | def calculate_cpu_proposal(self, data: PodsTimeData) -> float:
58 | if len(data) == 0:
59 | return float("NaN")
60 |
61 | if len(data) > 1:
62 | data_ = np.concatenate([values[:, 1] for values in data.values()])
63 | else:
64 | data_ = list(data.values())[0][:, 1]
65 |
66 | return np.max(data_)
67 |
68 | def history_range_enough(self, history_range: tuple[timedelta, timedelta]) -> bool:
69 | start, end = history_range
70 | return (end - start) >= timedelta(hours=3)
71 |
72 |
73 | class SimpleStrategy(BaseStrategy[SimpleStrategySettings]):
74 |
75 | display_name = "simple"
76 | rich_console = True
77 |
78 | @property
79 | def metrics(self) -> list[type[PrometheusMetric]]:
80 | metrics = [
81 | PercentileCPULoader(self.settings.cpu_percentile),
82 | MaxMemoryLoader,
83 | CPUAmountLoader,
84 | MemoryAmountLoader,
85 | ]
86 |
87 | if self.settings.use_oomkill_data:
88 | metrics.append(MaxOOMKilledMemoryLoader)
89 |
90 | return metrics
91 |
92 | @property
93 | def description(self):
94 | s = textwrap.dedent(f"""\
95 | CPU request: {self.settings.cpu_percentile}% percentile, limit: unset
96 | Memory request: max + {self.settings.memory_buffer_percentage}%, limit: max + {self.settings.memory_buffer_percentage}%
97 | History: {self.settings.history_duration} hours
98 | Step: {self.settings.timeframe_duration} minutes
99 |
100 | All parameters can be customized. For example: `krr simple --cpu_percentile=90 --memory_buffer_percentage=15 --history_duration=24 --timeframe_duration=0.5`
101 | """)
102 |
103 | if not self.settings.allow_hpa:
104 | s += "\n" + textwrap.dedent(f"""\
105 | This strategy does not work with objects with HPA defined (Horizontal Pod Autoscaler).
106 | If HPA is defined for CPU or Memory, the strategy will return "?" for that resource.
107 | You can override this behaviour by passing the --allow-hpa flag
108 | """)
109 |
110 | s += "\nLearn more: [underline]https://github.com/robusta-dev/krr#algorithm[/underline]"
111 | return s
112 |
113 | def __calculate_cpu_proposal(
114 | self, history_data: MetricsPodData, object_data: K8sObjectData
115 | ) -> ResourceRecommendation:
116 | data = history_data["PercentileCPULoader"]
117 |
118 | if len(data) == 0:
119 | return ResourceRecommendation.undefined(info="No data")
120 |
121 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
122 | # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
123 | # So each pod is string with pod name, and values is numpy array of shape (N, 2)
124 | data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()}
125 | total_points_count = sum(data_count.values())
126 |
127 | if total_points_count < self.settings.points_required:
128 | return ResourceRecommendation.undefined(info="Not enough data")
129 |
130 | if (
131 | object_data.hpa is not None
132 | and object_data.hpa.target_cpu_utilization_percentage is not None
133 | and not self.settings.allow_hpa
134 | ):
135 | return ResourceRecommendation.undefined(info="HPA detected")
136 |
137 | cpu_usage = self.settings.calculate_cpu_proposal(data)
138 | return ResourceRecommendation(request=cpu_usage, limit=None)
139 |
140 | def __calculate_memory_proposal(
141 | self, history_data: MetricsPodData, object_data: K8sObjectData
142 | ) -> ResourceRecommendation:
143 | data = history_data["MaxMemoryLoader"]
144 |
145 | oomkill_detected = False
146 |
147 | if self.settings.use_oomkill_data:
148 | max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"]
149 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
150 | # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value
151 | # So each value is numpy array of shape (N, 2)
152 | max_oomkill_value = (
153 | np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0
154 | )
155 | if max_oomkill_value != 0:
156 | oomkill_detected = True
157 | else:
158 | max_oomkill_value = 0
159 |
160 | if len(data) == 0:
161 | return ResourceRecommendation.undefined(info="No data")
162 |
163 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
164 | # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
165 | # So each pod is string with pod name, and values is numpy array of shape (N, 2)
166 | data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()}
167 | total_points_count = sum(data_count.values())
168 |
169 | if total_points_count < self.settings.points_required:
170 | return ResourceRecommendation.undefined(info="Not enough data")
171 |
172 | if (
173 | object_data.hpa is not None
174 | and object_data.hpa.target_memory_utilization_percentage is not None
175 | and not self.settings.allow_hpa
176 | ):
177 | return ResourceRecommendation.undefined(info="HPA detected")
178 |
179 | memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value)
180 | return ResourceRecommendation(
181 | request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None
182 | )
183 |
184 | def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult:
185 | return {
186 | ResourceType.CPU: self.__calculate_cpu_proposal(history_data, object_data),
187 | ResourceType.Memory: self.__calculate_memory_proposal(history_data, object_data),
188 | }
189 |
--------------------------------------------------------------------------------
/robusta_krr/strategies/simple_limit.py:
--------------------------------------------------------------------------------
1 | import textwrap
2 | from datetime import timedelta
3 |
4 | import numpy as np
5 | import pydantic as pd
6 |
7 | from robusta_krr.core.abstract.strategies import (
8 | BaseStrategy,
9 | K8sObjectData,
10 | MetricsPodData,
11 | PodsTimeData,
12 | ResourceRecommendation,
13 | ResourceType,
14 | RunResult,
15 | StrategySettings,
16 | )
17 | from robusta_krr.core.integrations.prometheus.metrics import (
18 | CPUAmountLoader,
19 | MaxMemoryLoader,
20 | MemoryAmountLoader,
21 | CPULoader,
22 | PrometheusMetric,
23 | MaxOOMKilledMemoryLoader,
24 | )
25 |
26 |
27 | class SimpleLimitStrategySettings(StrategySettings):
28 | cpu_request: float = pd.Field(66, gt=0, le=100, description="The percentile to use for the CPU request.")
29 | cpu_limit: float = pd.Field(96, gt=0, le=100, description="The percentile to use for the CPU limit.")
30 | memory_buffer_percentage: float = pd.Field(
31 | 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation."
32 | )
33 | points_required: int = pd.Field(
34 | 100, ge=1, description="The number of data points required to make a recommendation for a resource."
35 | )
36 | allow_hpa: bool = pd.Field(
37 | False,
38 | description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.",
39 | )
40 | use_oomkill_data: bool = pd.Field(
41 | False,
42 | description="Whether to bump the memory when OOMKills are detected (experimental).",
43 | )
44 | oom_memory_buffer_percentage: float = pd.Field(
45 | 25, ge=0, description="What percentage to increase the memory when there are OOMKill events."
46 | )
47 |
48 | def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float:
49 | data_ = [np.max(values[:, 1]) for values in data.values()]
50 | if len(data_) == 0:
51 | return float("NaN")
52 |
53 | return max(
54 | np.max(data_) * (1 + self.memory_buffer_percentage / 100),
55 | max_oomkill * (1 + self.oom_memory_buffer_percentage / 100),
56 | )
57 |
58 | def calculate_cpu_percentile(self, data: PodsTimeData, percentile: float) -> float:
59 | if len(data) == 0:
60 | return float("NaN")
61 |
62 | if len(data) > 1:
63 | data_ = np.concatenate([values[:, 1] for values in data.values()])
64 | else:
65 | data_ = list(data.values())[0][:, 1]
66 |
67 | return np.percentile(data_, percentile)
68 |
69 | def history_range_enough(self, history_range: tuple[timedelta, timedelta]) -> bool:
70 | start, end = history_range
71 | return (end - start) >= timedelta(hours=3)
72 |
73 |
74 | class SimpleLimitStrategy(BaseStrategy[SimpleLimitStrategySettings]):
75 |
76 | display_name = "simple_limit"
77 | rich_console = True
78 |
79 | @property
80 | def metrics(self) -> list[type[PrometheusMetric]]:
81 | metrics = [
82 | CPULoader,
83 | MaxMemoryLoader,
84 | CPUAmountLoader,
85 | MemoryAmountLoader,
86 | ]
87 |
88 | if self.settings.use_oomkill_data:
89 | metrics.append(MaxOOMKilledMemoryLoader)
90 |
91 | return metrics
92 |
93 | @property
94 | def description(self):
95 | s = textwrap.dedent(f"""\
96 | CPU request: {self.settings.cpu_request}% percentile, limit: {self.settings.cpu_limit}% percentile
97 | Memory request: max + {self.settings.memory_buffer_percentage}%, limit: max + {self.settings.memory_buffer_percentage}%
98 | History: {self.settings.history_duration} hours
99 | Step: {self.settings.timeframe_duration} minutes
100 |
101 | All parameters can be customized. For example: `krr simple_limit --cpu_request=66 --cpu_limit=96 --memory_buffer_percentage=15 --history_duration=24 --timeframe_duration=0.5`
102 | """)
103 |
104 | if not self.settings.allow_hpa:
105 | s += "\n" + textwrap.dedent(f"""\
106 | This strategy does not work with objects with HPA defined (Horizontal Pod Autoscaler).
107 | If HPA is defined for CPU or Memory, the strategy will return "?" for that resource.
108 | You can override this behaviour by passing the --allow-hpa flag
109 | """)
110 |
111 | s += "\nLearn more: [underline]https://github.com/robusta-dev/krr#algorithm[/underline]"
112 | return s
113 |
114 | def __calculate_cpu_proposal(
115 | self, history_data: MetricsPodData, object_data: K8sObjectData
116 | ) -> ResourceRecommendation:
117 | data = history_data["CPULoader"]
118 |
119 | if len(data) == 0:
120 | return ResourceRecommendation.undefined(info="No data")
121 |
122 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
123 | # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
124 | # So each pod is string with pod name, and values is numpy array of shape (N, 2)
125 | data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()}
126 | total_points_count = sum(data_count.values())
127 |
128 | if total_points_count < self.settings.points_required:
129 | return ResourceRecommendation.undefined(info="Not enough data")
130 |
131 | if (
132 | object_data.hpa is not None
133 | and object_data.hpa.target_cpu_utilization_percentage is not None
134 | and not self.settings.allow_hpa
135 | ):
136 | return ResourceRecommendation.undefined(info="HPA detected")
137 |
138 | cpu_request = self.settings.calculate_cpu_percentile(data, self.settings.cpu_request)
139 | cpu_limit = self.settings.calculate_cpu_percentile(data, self.settings.cpu_limit)
140 | return ResourceRecommendation(request=cpu_request, limit=cpu_limit)
141 |
142 | def __calculate_memory_proposal(
143 | self, history_data: MetricsPodData, object_data: K8sObjectData
144 | ) -> ResourceRecommendation:
145 | data = history_data["MaxMemoryLoader"]
146 |
147 | oomkill_detected = False
148 |
149 | if self.settings.use_oomkill_data:
150 | max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"]
151 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
152 | # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value
153 | # So each value is numpy array of shape (N, 2)
154 | max_oomkill_value = (
155 | np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0
156 | )
157 | if max_oomkill_value != 0:
158 | oomkill_detected = True
159 | else:
160 | max_oomkill_value = 0
161 |
162 | if len(data) == 0:
163 | return ResourceRecommendation.undefined(info="No data")
164 |
165 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value]
166 | # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value
167 | # So each pod is string with pod name, and values is numpy array of shape (N, 2)
168 | data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()}
169 | total_points_count = sum(data_count.values())
170 |
171 | if total_points_count < self.settings.points_required:
172 | return ResourceRecommendation.undefined(info="Not enough data")
173 |
174 | if (
175 | object_data.hpa is not None
176 | and object_data.hpa.target_memory_utilization_percentage is not None
177 | and not self.settings.allow_hpa
178 | ):
179 | return ResourceRecommendation.undefined(info="HPA detected")
180 |
181 | memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value)
182 | return ResourceRecommendation(
183 | request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None
184 | )
185 |
186 | def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult:
187 | return {
188 | ResourceType.CPU: self.__calculate_cpu_proposal(history_data, object_data),
189 | ResourceType.Memory: self.__calculate_memory_proposal(history_data, object_data),
190 | }
191 |
--------------------------------------------------------------------------------
/robusta_krr/utils/batched.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | from typing import Iterable, TypeVar
3 |
4 | _T = TypeVar("_T")
5 |
6 |
7 | def batched(iterable: Iterable[_T], n: int) -> Iterable[list[_T]]:
8 | "Batch data into tuples of length n. The last batch may be shorter."
9 | # batched('ABCDEFG', 3) --> ABC DEF G
10 | if n < 1:
11 | raise ValueError("n must be at least one")
12 | it = iter(iterable)
13 | while batch := list(itertools.islice(it, n)):
14 | yield batch
15 |
--------------------------------------------------------------------------------
/robusta_krr/utils/intro.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import asyncio
3 | from concurrent.futures import ThreadPoolExecutor
4 |
5 | from .version import get_version
6 |
7 |
8 | ONLINE_LINK = 'https://api.robusta.dev/krr/intro'
9 | LOCAL_LINK = './intro.txt'
10 | TIMEOUT = 0.5
11 |
12 |
13 | # Synchronous function to fetch intro message
14 | def fetch_intro_message() -> str:
15 | try:
16 | # Attempt to get the message from the URL
17 | response = requests.get(ONLINE_LINK, params={"version": get_version()}, timeout=TIMEOUT)
18 | response.raise_for_status() # Raises an error for bad responses
19 | result = response.json()
20 | return result['message']
21 | except Exception as e1:
22 | # If there's any error, fallback to local file
23 | try:
24 | with open(LOCAL_LINK, 'r') as file:
25 | return file.read()
26 | except Exception as e2:
27 | return (
28 | "[red]Failed to load the intro message.\n"
29 | f"Both from the URL: {e1.__class__.__name__} {e1}\n"
30 | f"and the local file: {e2.__class__.__name__} {e2}\n"
31 | "But as that is not critical, KRR will continue to run without the intro message.[/red]"
32 | )
33 |
34 |
35 | async def load_intro_message() -> str:
36 | loop = asyncio.get_running_loop()
37 | # Use a ThreadPoolExecutor to run the synchronous function in a separate thread
38 | with ThreadPoolExecutor() as pool:
39 | return await loop.run_in_executor(pool, fetch_intro_message)
40 |
41 |
42 | __all__ = ['load_intro_message']
43 |
--------------------------------------------------------------------------------
/robusta_krr/utils/object_like_dict.py:
--------------------------------------------------------------------------------
1 | class ObjectLikeDict:
2 | def __init__(self, dictionary):
3 | for key, value in dictionary.items():
4 | if isinstance(value, dict):
5 | value = ObjectLikeDict(value) # Convert inner dict
6 | if isinstance(value, list):
7 | value = [ObjectLikeDict(item) if isinstance(item, dict) else item for item in value]
8 | self.__dict__[key] = value
9 |
10 | def __getattr__(self, name):
11 | return self.__dict__.get(name)
12 |
13 | def __setattr__(self, name, value):
14 | self.__dict__[name] = value
15 |
16 | def __str__(self):
17 | return str(self.__dict__)
18 |
19 | def __repr__(self):
20 | return repr(self.__dict__)
21 |
22 | def __len__(self):
23 | return len(self.__dict__)
24 |
25 | def get(self, key, default=None):
26 | return self.__dict__.get(key, default)
27 |
28 | def items(self):
29 | return self.__dict__.items()
30 |
--------------------------------------------------------------------------------
/robusta_krr/utils/patch.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from kubernetes.client.models.v1_pod_failure_policy_rule import V1PodFailurePolicyRule
4 |
5 | def create_monkey_patches():
6 | """
7 | The python kubernetes client will throw exceptions for specific fields that were not allowed to be None on older versions of kubernetes.
8 | """
9 | logger = logging.getLogger("krr")
10 | logger.debug("Creating kubernetes python cli monkey patches")
11 |
12 | def patched_setter_pod_failure_policy(self, on_pod_conditions):
13 | self._on_pod_conditions = on_pod_conditions
14 |
15 | V1PodFailurePolicyRule.on_pod_conditions = V1PodFailurePolicyRule.on_pod_conditions.setter(patched_setter_pod_failure_policy)
16 |
--------------------------------------------------------------------------------
/robusta_krr/utils/progress_bar.py:
--------------------------------------------------------------------------------
1 | from alive_progress import alive_bar
2 |
3 | # from robusta_krr.core.models.config import settings
4 |
5 |
6 | class ProgressBar:
7 | """
8 | Progress bar for displaying progress of gathering recommendations.
9 |
10 | Use `ProgressBar` as a context manager to automatically handle the progress bar.
11 | Use `progress` method to step the progress bar.
12 | """
13 |
14 | def __init__(self, **kwargs) -> None:
15 | # self.show_bar = not settings.quiet and not settings.log_to_stderr
16 | self.show_bar = False # FIXME: Progress bar is not working good with other logs
17 | if self.show_bar:
18 | self.alive_bar = alive_bar(**kwargs, enrich_print=False)
19 |
20 | def __enter__(self):
21 | if self.show_bar:
22 | self.bar = self.alive_bar.__enter__()
23 | return self
24 |
25 | def progress(self):
26 | if self.show_bar:
27 | self.bar()
28 |
29 | def __exit__(self, *args):
30 | if self.show_bar:
31 | self.alive_bar.__exit__(*args)
32 |
--------------------------------------------------------------------------------
/robusta_krr/utils/resource_units.py:
--------------------------------------------------------------------------------
1 | from typing import Literal, Union
2 |
3 | UNITS: dict[str, float] = {
4 | "m": 0.001,
5 | "Ki": 1024,
6 | "Mi": 1024**2,
7 | "Gi": 1024**3,
8 | "Ti": 1024**4,
9 | "Pi": 1024**5,
10 | "Ei": 1024**6,
11 | "k": 1e3,
12 | "M": 1e6,
13 | "G": 1e9,
14 | "T": 1e12,
15 | "P": 1e15,
16 | "E": 1e18,
17 | }
18 |
19 |
20 | def parse(x: str, /) -> Union[float, int]:
21 | """Converts a string to an integer with respect of units."""
22 |
23 | for unit, multiplier in UNITS.items():
24 | if x.endswith(unit):
25 | return float(x[: -len(unit)]) * multiplier
26 |
27 | return float(x)
28 |
29 |
30 | def get_base(x: str, /) -> Literal[1024, 1000]:
31 | """Returns the base of the unit."""
32 |
33 | for unit, _ in UNITS.items():
34 | if x.endswith(unit):
35 | return 1024 if unit in ["Ki", "Mi", "Gi", "Ti", "Pi", "Ei"] else 1000
36 | return 1000 if "." in x else 1024
37 |
38 |
39 | def format(x: Union[float, int], /, *, base: Literal[1024, 1000] = 1024) -> str:
40 | """Converts an integer to a string with respect of units."""
41 |
42 | if x < 1:
43 | return f"{int(x*1000)}m"
44 | if x < base:
45 | return str(x)
46 |
47 | units = ["", "K", "M", "G", "T", "P", "E"]
48 | binary_units = ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei"]
49 |
50 | x = int(x)
51 | for i, unit in enumerate(binary_units if base == 1024 else units):
52 | if x < base ** (i + 1) or i == len(units) - 1 or x / base ** (i + 1) < 10:
53 | return f"{x/base**i:.0f}{unit}"
54 | return f"{x/6**i:.0f}{unit}"
55 |
--------------------------------------------------------------------------------
/robusta_krr/utils/service_discovery.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from abc import ABC, abstractmethod
3 | from typing import Optional
4 |
5 | from cachetools import TTLCache
6 | from kubernetes import client
7 | from kubernetes.client import V1IngressList, V1ServiceList
8 | from kubernetes.client.api_client import ApiClient
9 | from kubernetes.client.models.v1_ingress import V1Ingress
10 | from kubernetes.client.models.v1_service import V1Service
11 |
12 | from robusta_krr.core.models.config import settings
13 |
14 | logger = logging.getLogger("krr")
15 |
16 |
17 | class ServiceDiscovery:
18 | SERVICE_CACHE_TTL_SEC = 900
19 | cache: TTLCache = TTLCache(maxsize=1, ttl=SERVICE_CACHE_TTL_SEC)
20 |
21 | def __init__(self, api_client: Optional[ApiClient] = None) -> None:
22 | self.api_client = api_client
23 |
24 | def find_service_url(self, label_selector: str) -> Optional[str]:
25 | """
26 | Get the url of an in-cluster service with a specific label
27 | """
28 | # we do it this way because there is a weird issue with hikaru's ServiceList.listServiceForAllNamespaces()
29 | v1 = client.CoreV1Api(api_client=self.api_client)
30 | svc_list: V1ServiceList = v1.list_service_for_all_namespaces(label_selector=label_selector)
31 | if not svc_list.items:
32 | return None
33 |
34 | svc: V1Service = svc_list.items[0]
35 | name = svc.metadata.name
36 | namespace = svc.metadata.namespace
37 | port = svc.spec.ports[0].port
38 |
39 | if settings.inside_cluster:
40 | return f"http://{name}.{namespace}.svc.cluster.local:{port}"
41 |
42 | elif self.api_client is not None:
43 | return f"{self.api_client.configuration.host}/api/v1/namespaces/{namespace}/services/{name}:{port}/proxy"
44 |
45 | return None
46 |
47 | def find_ingress_host(self, label_selector: str) -> Optional[str]:
48 | """
49 | Discover the ingress host of the Prometheus if krr is not running in cluster
50 | """
51 | if settings.inside_cluster:
52 | return None
53 |
54 | v1 = client.NetworkingV1Api(api_client=self.api_client)
55 | ingress_list: V1IngressList = v1.list_ingress_for_all_namespaces(label_selector=label_selector)
56 | if not ingress_list.items:
57 | return None
58 |
59 | ingress: V1Ingress = ingress_list.items[0]
60 | prometheus_host = ingress.spec.rules[0].host
61 | return f"http://{prometheus_host}"
62 |
63 | def find_url(self, selectors: list[str]) -> Optional[str]:
64 | """
65 | Try to autodiscover the url of an in-cluster service
66 | """
67 | cache_key = ",".join(selectors + [self.api_client.configuration.host if self.api_client else ""])
68 | cached_value = self.cache.get(cache_key)
69 | if cached_value:
70 | return cached_value
71 |
72 | for label_selector in selectors:
73 | logger.debug(f"Trying to find service with label selector {label_selector}")
74 | service_url = self.find_service_url(label_selector)
75 | if service_url:
76 | logger.debug(f"Found service with label selector {label_selector}")
77 | self.cache[cache_key] = service_url
78 | return service_url
79 |
80 | logger.debug(f"Trying to find ingress with label selector {label_selector}")
81 | self.find_ingress_host(label_selector)
82 | ingress_url = self.find_ingress_host(label_selector)
83 | if ingress_url:
84 | return ingress_url
85 |
86 | return None
87 |
88 |
89 | class MetricsServiceDiscovery(ServiceDiscovery, ABC):
90 | @abstractmethod
91 | def find_metrics_url(self, *, api_client: Optional[ApiClient] = None) -> Optional[str]:
92 | pass
93 |
--------------------------------------------------------------------------------
/robusta_krr/utils/version.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import os
3 | import subprocess
4 | import sys
5 | from concurrent.futures import ThreadPoolExecutor
6 | from typing import Optional
7 |
8 | import requests
9 |
10 | import robusta_krr
11 |
12 |
13 | def get_version() -> str:
14 | # the version string was patched by a release - return __version__ which will be correct
15 | if robusta_krr.__version__ != "dev":
16 | return robusta_krr.__version__
17 |
18 | # we are running from an unreleased dev version
19 | try:
20 | # Get the latest git tag
21 | tag = subprocess.check_output(["git", "describe", "--tags"]).decode().strip()
22 |
23 | # Get the current branch name
24 | branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode().strip()
25 |
26 | # Check if there are uncommitted changes
27 | status = subprocess.check_output(["git", "status", "--porcelain"]).decode().strip()
28 | dirty = "-dirty" if status else ""
29 |
30 | return f"{tag}-{branch}{dirty}"
31 |
32 | except Exception:
33 | return robusta_krr.__version__
34 |
35 |
36 | # Synchronous function to fetch the latest release version from GitHub API
37 | def fetch_latest_version() -> Optional[str]:
38 | url = "https://api.github.com/repos/robusta-dev/krr/releases/latest"
39 | try:
40 | response = requests.get(url, timeout=0.5) # 0.5 seconds timeout
41 | response.raise_for_status() # Raises an error for bad responses
42 | data = response.json()
43 | return data.get("tag_name") # Returns the tag name of the latest release
44 | except Exception:
45 | return None
46 |
47 |
48 | async def load_latest_version() -> Optional[str]:
49 | loop = asyncio.get_running_loop()
50 | # Run the synchronous function in a separate thread
51 | with ThreadPoolExecutor() as pool:
52 | return await loop.run_in_executor(pool, fetch_latest_version)
53 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import random
2 | from datetime import datetime, timedelta
3 | from unittest.mock import AsyncMock, patch
4 |
5 | import numpy as np
6 | import pytest
7 |
8 | from robusta_krr.api.models import K8sObjectData, PodData, ResourceAllocations
9 | from robusta_krr.strategies.simple import SimpleStrategy, SimpleStrategySettings
10 |
11 | TEST_OBJECT = K8sObjectData(
12 | cluster="mock-cluster",
13 | name="mock-object-1",
14 | container="mock-container-1",
15 | pods=[
16 | PodData(name="mock-pod-1", deleted=False),
17 | PodData(name="mock-pod-2", deleted=False),
18 | PodData(name="mock-pod-3", deleted=True),
19 | ],
20 | namespace="default",
21 | kind="Deployment",
22 | allocations=ResourceAllocations(
23 | requests={"cpu": 1, "memory": 1}, # type: ignore
24 | limits={"cpu": 2, "memory": 2}, # type: ignore
25 | ),
26 | )
27 |
28 |
29 | @pytest.fixture(autouse=True, scope="session")
30 | def mock_list_clusters():
31 | with patch(
32 | "robusta_krr.core.integrations.kubernetes.KubernetesLoader.list_clusters",
33 | new=AsyncMock(return_value=[TEST_OBJECT.cluster]),
34 | ):
35 | yield
36 |
37 |
38 | @pytest.fixture(autouse=True, scope="session")
39 | def mock_list_scannable_objects():
40 | with patch(
41 | "robusta_krr.core.integrations.kubernetes.KubernetesLoader.list_scannable_objects",
42 | new=AsyncMock(return_value=[TEST_OBJECT]),
43 | ):
44 | yield
45 |
46 |
47 | @pytest.fixture(autouse=True, scope="session")
48 | def mock_load_kubeconfig():
49 | with patch("robusta_krr.core.models.config.Config.load_kubeconfig", return_value=None):
50 | yield
51 |
52 |
53 | @pytest.fixture(autouse=True, scope="session")
54 | def mock_prometheus_loader():
55 | now = datetime.now()
56 | start = now - timedelta(hours=1)
57 | now_ts, start_ts = now.timestamp(), start.timestamp()
58 | metric_points_data = np.array([(t, random.randrange(0, 100)) for t in np.linspace(start_ts, now_ts, 3600)])
59 |
60 | settings = SimpleStrategySettings()
61 | strategy = SimpleStrategy(settings)
62 |
63 | with patch(
64 | "robusta_krr.core.integrations.prometheus.loader.PrometheusMetricsLoader.gather_data",
65 | new=AsyncMock(
66 | return_value={
67 | metric.__name__: {pod.name: metric_points_data for pod in TEST_OBJECT.pods}
68 | for metric in strategy.metrics
69 | },
70 | ),
71 | ) as mock_prometheus_loader:
72 | mock_prometheus_loader
73 | yield
74 |
75 |
76 | @pytest.fixture(autouse=True, scope="session")
77 | def mock_prometheus_load_pods():
78 | with patch(
79 | "robusta_krr.core.integrations.prometheus.loader.PrometheusMetricsLoader.load_pods",
80 | new=AsyncMock(
81 | return_value=TEST_OBJECT.pods,
82 | ),
83 | ) as mock_prometheus_loader:
84 | mock_prometheus_loader
85 | yield
86 |
87 |
88 | @pytest.fixture(autouse=True, scope="session")
89 | def mock_prometheus_get_history_range():
90 | async def get_history_range(self, history_duration: timedelta) -> tuple[datetime, datetime]:
91 | now = datetime.now()
92 | start = now - history_duration
93 | return start, now
94 |
95 | with patch(
96 | "robusta_krr.core.integrations.prometheus.loader.PrometheusMetricsLoader.get_history_range", get_history_range
97 | ):
98 | yield
99 |
100 |
101 | @pytest.fixture(autouse=True, scope="session")
102 | def mock_prometheus_init():
103 | with patch("robusta_krr.core.integrations.prometheus.loader.PrometheusMetricsLoader.__init__", return_value=None):
104 | yield
105 |
--------------------------------------------------------------------------------
/tests/models/test_resource_allocations.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import pytest
4 |
5 | from robusta_krr.core.models.allocations import ResourceAllocations, ResourceType
6 |
7 |
8 | @pytest.mark.parametrize(
9 | "cpu",
10 | [
11 | {"request": "5m", "limit": None},
12 | {"request": 0.005, "limit": None},
13 | ],
14 | )
15 | @pytest.mark.parametrize(
16 | "memory",
17 | [
18 | {"request": 128974848, "limit": 128974848},
19 | {"request": 128.974848e6, "limit": 128.974848e6},
20 | {"request": "128.9748480M", "limit": "128.9748480M"},
21 | {"request": "128974848000m", "limit": "128974848000m"},
22 | {"request": "123Mi", "limit": "123Mi"},
23 | {"request": "128974848e0", "limit": "128974848e0"},
24 | ],
25 | )
26 | def test_resource_allocation_supported_formats(
27 | cpu: dict[str, Union[str, int, float, None]], memory: dict[str, Union[str, int, float, None]]
28 | ):
29 | allocations = ResourceAllocations(
30 | requests={ResourceType.CPU: cpu["request"], ResourceType.Memory: memory["request"]},
31 | limits={ResourceType.CPU: cpu["limit"], ResourceType.Memory: memory["limit"]},
32 | )
33 | assert allocations.requests[ResourceType.CPU] == 0.005
34 | assert allocations.limits[ResourceType.CPU] == None
35 | assert (allocations.requests[ResourceType.Memory] // 1) == 128974848.0
36 | assert (allocations.limits[ResourceType.Memory] // 1) == 128974848.0
37 |
--------------------------------------------------------------------------------
/tests/single_namespace_as_group.yaml:
--------------------------------------------------------------------------------
1 | # Test environment for per-namespace scans using a group object ID (for e.g. Microsoft Entra)
2 | # The purpose of this setup is to verify that per-namespace features work without cluster level permissions
3 | # You can test this Group and KRR using:
4 | # A user named aksdev that's part of the appdev group.
5 | # krr simple --as aksdev --as-group -n kube-system
6 | apiVersion: rbac.authorization.k8s.io/v1
7 | kind: Role
8 | metadata:
9 | namespace: kube-system
10 | name: krr-role
11 | rules:
12 | - apiGroups: [""]
13 | resources: ["pods", "services"]
14 | verbs: ["get", "watch", "list"]
15 | - apiGroups: ["batch"]
16 | resources: ["jobs"]
17 | verbs: ["get", "watch", "list"]
18 | - apiGroups: ["apps"]
19 | resources: ["deployments", "replicasets", "daemonsets", "statefulsets"]
20 | verbs: ["get", "list", "watch"]
21 | - apiGroups: ["autoscaling"]
22 | resources: ["horizontalpodautoscalers"]
23 | verbs: ["get", "list", "watch"]
24 | ---
25 | apiVersion: rbac.authorization.k8s.io/v1
26 | kind: RoleBinding
27 | metadata:
28 | name: krr-role-binding
29 | namespace: kube-system
30 | subjects:
31 | - kind: Group
32 | # Replace with the actual Group Object ID
33 | name:
34 | apiGroup: rbac.authorization.k8s.io
35 | roleRef:
36 | kind: Role
37 | name: krr-role
38 | apiGroup: rbac.authorization.k8s.io
39 |
--------------------------------------------------------------------------------
/tests/single_namespace_permissions.yaml:
--------------------------------------------------------------------------------
1 | # Test environment for per-namespace scans
2 | # The purpose of this setup is to verify that per-namespace features work without cluster level permissions
3 | # You can test this ServiceAccount and KRR using:
4 | # krr simple --as system:serviceaccount:kube-system:krr-account -n kube-system
5 | apiVersion: v1
6 | kind: ServiceAccount
7 | metadata:
8 | name: krr-account
9 | namespace: kube-system
10 | ---
11 | apiVersion: rbac.authorization.k8s.io/v1
12 | kind: Role
13 | metadata:
14 | namespace: kube-system
15 | name: krr-role
16 | rules:
17 | - apiGroups: [""]
18 | resources: ["pods", "services"]
19 | verbs: ["get", "watch", "list"]
20 | - apiGroups: ["batch"]
21 | resources: ["jobs"]
22 | verbs: ["get", "watch", "list"]
23 | - apiGroups: ["apps"]
24 | resources: ["deployments", "replicasets", "daemonsets", "statefulsets"]
25 | verbs: ["get", "list", "watch"]
26 | - apiGroups: ["autoscaling"]
27 | resources: ["horizontalpodautoscalers"]
28 | verbs: ["get", "list", "watch"]
29 | ---
30 | apiVersion: rbac.authorization.k8s.io/v1
31 | kind: RoleBinding
32 | metadata:
33 | name: krr-role-binding
34 | namespace: kube-system
35 | subjects:
36 | - kind: ServiceAccount
37 | name: krr-account
38 | namespace: kube-system
39 | roleRef:
40 | kind: Role
41 | name: krr-role
42 | apiGroup: rbac.authorization.k8s.io
43 |
--------------------------------------------------------------------------------
/tests/test_krr.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from typing import Literal, Union
3 | from unittest.mock import patch, Mock, MagicMock
4 | from typer.testing import CliRunner
5 |
6 | from robusta_krr.main import app, load_commands
7 | from robusta_krr.core.integrations.kubernetes import ClusterLoader
8 | from robusta_krr.core.models.config import settings
9 |
10 | runner = CliRunner(mix_stderr=False)
11 | load_commands()
12 |
13 | STRATEGY_NAME = "simple"
14 |
15 |
16 | def test_help():
17 | result = runner.invoke(app, [STRATEGY_NAME, "--help"])
18 | try:
19 | assert result.exit_code == 0
20 | except AssertionError as e:
21 | raise e from result.exception
22 |
23 |
24 | @pytest.mark.parametrize("log_flag", ["-v", "-q"])
25 | def test_run(log_flag: str):
26 | result = runner.invoke(app, [STRATEGY_NAME, log_flag, "--namespace", "default"])
27 | try:
28 | assert result.exit_code == 0, result.stdout
29 | except AssertionError as e:
30 | raise e from result.exception
31 |
32 |
33 | @pytest.mark.parametrize("format", ["json", "yaml", "table", "pprint", "csv"])
34 | @pytest.mark.parametrize("output", ["--logtostderr", "-q"])
35 | def test_output_formats(format: str, output: str):
36 | result = runner.invoke(app, [STRATEGY_NAME, output, "-f", format])
37 | try:
38 | assert result.exit_code == 0, result.exc_info
39 | except AssertionError as e:
40 | raise e from result.exception
41 |
42 | @pytest.mark.parametrize(
43 | "setting_namespaces,cluster_all_ns,expected",[
44 | (
45 | # default settings
46 | "*",
47 | ["kube-system", "robusta-frontend", "robusta-backend", "infra-grafana"],
48 | "*"
49 | ),
50 | (
51 | # list of namespace provided from arguments without regex pattern
52 | ["robusta-krr", "kube-system"],
53 | ["kube-system", "robusta-frontend", "robusta-backend", "robusta-krr"],
54 | ["robusta-krr", "kube-system"]
55 | ),
56 | (
57 | # list of namespace provided from arguments with regex pattern and will not duplicating in final result
58 | ["robusta-.*", "robusta-frontend"],
59 | ["kube-system", "robusta-frontend", "robusta-backend", "robusta-krr"],
60 | ["robusta-frontend", "robusta-backend", "robusta-krr"]
61 | ),
62 | (
63 | # namespace provided with regex pattern and will match for some namespaces
64 | [".*end$"],
65 | ["kube-system", "robusta-frontend", "robusta-backend", "robusta-krr"],
66 | ["robusta-frontend", "robusta-backend"]
67 | )
68 | ]
69 | )
70 | def test_cluster_namespace_list(
71 | setting_namespaces: Union[Literal["*"], list[str]],
72 | cluster_all_ns: list[str],
73 | expected: Union[Literal["*"], list[str]],
74 | ):
75 | cluster = ClusterLoader()
76 | with patch("robusta_krr.core.models.config.settings.namespaces", setting_namespaces):
77 | with patch.object(cluster.core, "list_namespace", return_value=MagicMock(
78 | items=[MagicMock(**{"metadata.name": m}) for m in cluster_all_ns])):
79 | assert sorted(cluster.namespaces) == sorted(expected)
80 |
--------------------------------------------------------------------------------
/tests/test_runner.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from click.testing import Result
3 | from typer.testing import CliRunner
4 |
5 | from robusta_krr.main import app, load_commands
6 |
7 | runner = CliRunner(mix_stderr=False)
8 | load_commands()
9 |
10 |
11 | @pytest.mark.parametrize(
12 | "args, expected_exit_code",
13 | [
14 | (["--exclude-severity", "-f", "csv"], 0),
15 | (["--exclude-severity", "-f", "table"], 2),
16 | (["--exclude-severity"], 2),
17 | ],
18 | )
19 | def test_exclude_severity_option(args: list[str], expected_exit_code: int) -> None:
20 | result: Result = runner.invoke(app, ["simple", *args])
21 | assert result.exit_code == expected_exit_code
22 |
--------------------------------------------------------------------------------