├── .dockerignore ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── build-on-release.yml │ ├── docker-build-on-tag.yml │ └── pytest-on-push.yml ├── .gitignore ├── .pre-commit-config.yaml ├── ADOPTERS.md ├── Dockerfile ├── LICENSE ├── README.md ├── build_linux.sh ├── build_local.sh ├── build_release.sh ├── docker ├── README.md └── aws.Dockerfile ├── docs ├── google-cloud-managed-service-for-prometheus.md └── krr-in-cluster │ └── krr-in-cluster-job.yaml ├── enforcer ├── Dockerfile ├── README.md ├── dal │ ├── robusta_config.py │ └── supabase_dal.py ├── enforcer_main.py ├── env_vars.py ├── metrics.py ├── model.py ├── params_utils.py ├── patch_manager.py ├── requirements.txt ├── resources │ ├── kubernetes_resource_loader.py │ ├── owner_store.py │ └── recommendation_store.py └── utils.py ├── examples ├── custom_formatter.py ├── custom_severity_calculator.py └── custom_strategy.py ├── helm ├── krr-enforcer │ ├── .helmignore │ ├── Chart.yaml │ ├── templates │ │ ├── enforcer-cert-job.yaml │ │ ├── enforcer-service-account.yaml │ │ ├── enforcer.yaml │ │ └── service-monitor.yaml │ └── values.yaml └── upload_chart.sh ├── images ├── krr-datasources.png ├── krr-datasources.svg ├── krr-other-integrations.png ├── krr-other-integrations.svg ├── krr_slack_example.png ├── logo.png ├── screenshot.jpeg ├── ui_recommendation.png ├── ui_screenshot_new.png └── ui_video.gif ├── intro.txt ├── krr.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt ├── robusta_krr ├── __init__.py ├── api │ ├── formatters.py │ ├── models.py │ └── strategies.py ├── common │ └── ssl_utils.py ├── core │ ├── __init__.py │ ├── abstract │ │ ├── formatters.py │ │ ├── metrics.py │ │ └── strategies.py │ ├── integrations │ │ ├── kubernetes │ │ │ ├── __init__.py │ │ │ └── config_patch.py │ │ ├── openshift │ │ │ ├── __init__.py │ │ │ └── token.py │ │ └── prometheus │ │ │ ├── __init__.py │ │ │ ├── loader.py │ │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── cpu.py │ │ │ └── memory.py │ │ │ ├── metrics_service │ │ │ ├── base_metric_service.py │ │ │ ├── mimir_metrics_service.py │ │ │ ├── prometheus_metrics_service.py │ │ │ ├── thanos_metrics_service.py │ │ │ └── victoria_metrics_service.py │ │ │ └── prometheus_utils.py │ ├── models │ │ ├── allocations.py │ │ ├── config.py │ │ ├── objects.py │ │ ├── result.py │ │ └── severity.py │ └── runner.py ├── formatters │ ├── __init__.py │ ├── csv.py │ ├── csv_raw.py │ ├── html.py │ ├── json.py │ ├── pprint.py │ ├── table.py │ └── yaml.py ├── main.py ├── strategies │ ├── __init__.py │ ├── simple.py │ └── simple_limit.py └── utils │ ├── batched.py │ ├── intro.py │ ├── object_like_dict.py │ ├── patch.py │ ├── progress_bar.py │ ├── resource_units.py │ ├── service_discovery.py │ └── version.py └── tests ├── conftest.py ├── formatters └── test_csv_formatter.py ├── models └── test_resource_allocations.py ├── single_namespace_as_group.yaml ├── single_namespace_permissions.yaml ├── test_krr.py └── test_runner.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # .dockerignore 2 | __pycache__ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | 7 | # Exclude development files 8 | .git 9 | .gitignore 10 | Dockerfile 11 | *.md 12 | .vscode 13 | 14 | # Exclude logs and cache 15 | logs/ 16 | cache/ 17 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 
3 | exclude = .git, 4 | __pycache__, 5 | old, 6 | build, 7 | dist, 8 | .venv, 9 | .vscode, 10 | .pytest_cache, 11 | __init__.py, 12 | .mypy_cache, 13 | src/robusta/integrations/kubernetes/autogenerated, 14 | src/robusta/integrations/kubernetes/custom_models.py 15 | ignore = E501, W503, E203 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Are you interested in contributing a fix for this?** 27 | Yes/no. If yes, we will provide guidance what parts of the code to modify and help you. 28 | 29 | **Desktop (please complete the following information):** 30 | - OS: [e.g. iOS] 31 | - Browser [e.g. chrome, safari] 32 | - Version [e.g. 22] 33 | 34 | **Smartphone (please complete the following information):** 35 | - Device: [e.g. iPhone6] 36 | - OS: [e.g. iOS8.1] 37 | - Browser [e.g. stock browser, safari] 38 | - Version [e.g. 22] 39 | 40 | **Additional context** 41 | Add any other context about the problem here. 42 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Are you interested in contributing a PR for this?** 20 | Yes/no. If yes, we will provide guidance what parts of the code to modify and help you. 21 | 22 | **Additional context** 23 | Add any other context or screenshots about the feature request here. 
24 | -------------------------------------------------------------------------------- /.github/workflows/build-on-release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | build: 9 | strategy: 10 | matrix: 11 | # we build on macos-13 for x86 builds 12 | os: [ubuntu-latest, windows-latest, macos-latest, macos-13] 13 | 14 | runs-on: ${{ matrix.os }} 15 | 16 | steps: 17 | - uses: actions/checkout@v2 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: '3.11' 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements.txt 28 | pip install pyinstaller 29 | 30 | - name: Install dependancies (Linux) 31 | if: matrix.os == 'ubuntu-latest' 32 | run: | 33 | sudo apt-get install -y binutils 34 | 35 | - name: Install the Apple certificate and provisioning profile 36 | if: matrix.os == 'macos-latest' || matrix.os == 'macos-13' 37 | env: 38 | BUILD_CERTIFICATE_BASE64: ${{ secrets.BUILD_CERTIFICATE_BASE64 }} 39 | P12_PASSWORD: ${{ secrets.P12_PASSWORD }} 40 | BUILD_PROVISION_PROFILE_BASE64: ${{ secrets.BUILD_PROVISION_PROFILE_BASE64 }} 41 | KEYCHAIN_PASSWORD: ${{ secrets.KEYCHAIN_PASSWORD }} 42 | run: | 43 | # create variables 44 | CERTIFICATE_PATH=$RUNNER_TEMP/build_certificate.p12 45 | PP_PATH=$RUNNER_TEMP/build_pp.mobileprovision 46 | KEYCHAIN_PATH=$RUNNER_TEMP/app-signing.keychain-db 47 | 48 | # import certificate and provisioning profile from secrets 49 | echo -n "$BUILD_CERTIFICATE_BASE64" | base64 --decode -o $CERTIFICATE_PATH 50 | echo -n "$BUILD_PROVISION_PROFILE_BASE64" | base64 --decode -o $PP_PATH 51 | 52 | # create temporary keychain 53 | security create-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH 54 | security set-keychain-settings -lut 21600 $KEYCHAIN_PATH 55 | security unlock-keychain -p "$KEYCHAIN_PASSWORD" $KEYCHAIN_PATH 56 | 57 | # import certificate to keychain 58 | security import $CERTIFICATE_PATH -P "$P12_PASSWORD" -A -t cert -f pkcs12 -k $KEYCHAIN_PATH 59 | security list-keychain -d user -s $KEYCHAIN_PATH 60 | 61 | # apply provisioning profile 62 | mkdir -p ~/Library/MobileDevice/Provisioning\ Profiles 63 | cp $PP_PATH ~/Library/MobileDevice/Provisioning\ Profiles 64 | 65 | - name: Set version in code (Unix) 66 | if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest' || matrix.os == 'macos-13' 67 | run: | 68 | awk 'NR==3{$0="__version__ = \"'${{ github.ref_name }}'\""}1' ./robusta_krr/__init__.py > temp && mv temp ./robusta_krr/__init__.py 69 | cat ./robusta_krr/__init__.py 70 | 71 | - name: Set version in code (Windows) 72 | if: matrix.os == 'windows-latest' 73 | run: | 74 | $content = Get-Content -Path .\robusta_krr\__init__.py 75 | $content[2] = "__version__=`"$($env:GITHUB_REF_NAME)`"" 76 | $content | Out-File -FilePath .\robusta_krr\__init__.py -Encoding ascii 77 | Get-Content .\robusta_krr\__init__.py 78 | shell: pwsh 79 | env: 80 | GITHUB_REF_NAME: ${{ github.ref_name }} 81 | 82 | - name: Build with PyInstaller 83 | if: matrix.os == 'macos-latest' 84 | shell: bash 85 | run: | 86 | pyinstaller --target-architecture arm64 krr.py 87 | mkdir -p ./dist/krr/grapheme/data 88 | cp $(python -c "import grapheme; print(grapheme.__path__[0] + '/data/grapheme_break_property.json')") ./dist/krr/grapheme/data/grapheme_break_property.json 89 | cp ./intro.txt ./dist/krr/intro.txt 90 | 91 | - name: Build with PyInstaller 92 | if: matrix.os != 
'macos-latest' 93 | shell: bash 94 | run: | 95 | pyinstaller krr.py 96 | mkdir -p ./dist/krr/grapheme/data 97 | cp $(python -c "import grapheme; print(grapheme.__path__[0] + '/data/grapheme_break_property.json')") ./dist/krr/grapheme/data/grapheme_break_property.json 98 | cp ./intro.txt ./dist/krr/intro.txt 99 | 100 | - name: Zip the application (Unix) 101 | if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest' || matrix.os == 'macos-13' 102 | run: | 103 | cd dist 104 | zip -r krr-${{ matrix.os }}-${{ github.ref_name }}.zip krr 105 | mv krr-${{ matrix.os }}-${{ github.ref_name }}.zip ../ 106 | cd .. 107 | 108 | - name: Zip the application (Windows) 109 | if: matrix.os == 'windows-latest' 110 | run: | 111 | Set-Location -Path dist 112 | Compress-Archive -Path krr -DestinationPath krr-${{ matrix.os }}-${{ github.ref_name }}.zip -Force 113 | Move-Item -Path krr-${{ matrix.os }}-${{ github.ref_name }}.zip -Destination ..\ 114 | Set-Location -Path .. 115 | 116 | - name: Upload Release Asset 117 | uses: actions/upload-release-asset@v1.0.2 118 | env: 119 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 120 | with: 121 | upload_url: ${{ github.event.release.upload_url }} 122 | asset_path: ./krr-${{ matrix.os }}-${{ github.ref_name }}.zip 123 | asset_name: krr-${{ matrix.os }}-${{ github.ref_name }}.zip 124 | asset_content_type: application/octet-stream 125 | 126 | - name: Upload build as artifact 127 | uses: actions/upload-artifact@v4 128 | with: 129 | name: krr-${{ matrix.os }}-${{ github.ref_name }} 130 | path: ./krr-${{ matrix.os }}-${{ github.ref_name }}.zip 131 | 132 | - name: Clean up keychain and provisioning profile 133 | if: (matrix.os == 'macos-latest' || matrix.os == 'macos-13') && always() 134 | run: | 135 | security delete-keychain $RUNNER_TEMP/app-signing.keychain-db 136 | rm ~/Library/MobileDevice/Provisioning\ Profiles/build_pp.mobileprovision 137 | 138 | check-latest: 139 | needs: build 140 | runs-on: ubuntu-latest 141 | outputs: 142 | IS_LATEST: ${{ steps.check-latest.outputs.release == github.ref_name }} 143 | steps: 144 | - id: check-latest 145 | uses: pozetroninc/github-action-get-latest-release@v0.7.0 146 | with: 147 | token: ${{ secrets.GITHUB_TOKEN }} 148 | repository: ${{ github.repository }} 149 | excludes: prerelease, draft 150 | 151 | # Define MacOS hash job 152 | mac-hash: 153 | needs: check-latest 154 | runs-on: ubuntu-latest 155 | if: needs.check-latest.outputs.IS_LATEST 156 | outputs: 157 | MAC_BUILD_HASH: ${{ steps.calc-hash.outputs.MAC_BUILD_HASH }} 158 | steps: 159 | - name: Checkout Repository 160 | uses: actions/checkout@v2 161 | - name: Download MacOS artifact 162 | uses: actions/download-artifact@v4 163 | with: 164 | name: krr-macos-latest-${{ github.ref_name }} 165 | - name: Calculate hash 166 | id: calc-hash 167 | run: echo "::set-output name=MAC_BUILD_HASH::$(sha256sum krr-macos-latest-${{ github.ref_name }}.zip | awk '{print $1}')" 168 | 169 | # Define Linux hash job 170 | linux-hash: 171 | needs: check-latest 172 | runs-on: ubuntu-latest 173 | if: needs.check-latest.outputs.IS_LATEST 174 | outputs: 175 | LINUX_BUILD_HASH: ${{ steps.calc-hash.outputs.LINUX_BUILD_HASH }} 176 | steps: 177 | - name: Checkout Repository 178 | uses: actions/checkout@v2 179 | - name: Download Linux artifact 180 | uses: actions/download-artifact@v4 181 | with: 182 | name: krr-ubuntu-latest-${{ github.ref_name }} 183 | - name: Calculate hash 184 | id: calc-hash 185 | run: echo "::set-output name=LINUX_BUILD_HASH::$(sha256sum krr-ubuntu-latest-${{ github.ref_name }}.zip | awk 
'{print $1}')" 186 | 187 | # Define job to update homebrew formula 188 | update-formula: 189 | needs: [mac-hash, linux-hash] 190 | runs-on: ubuntu-latest 191 | steps: 192 | - name: Checkout homebrew-krr repository 193 | uses: actions/checkout@v2 194 | with: 195 | repository: robusta-dev/homebrew-krr 196 | token: ${{ secrets.MULTIREPO_GITHUB_TOKEN }} 197 | - name: Update krr.rb formula 198 | run: | 199 | MAC_BUILD_HASH=${{ needs.mac-hash.outputs.MAC_BUILD_HASH }} 200 | LINUX_BUILD_HASH=${{ needs.linux-hash.outputs.LINUX_BUILD_HASH }} 201 | TAG_NAME=${{ github.ref_name }} 202 | awk 'NR==6{$0=" url \"https://github.com/robusta-dev/krr/releases/download/'"$TAG_NAME"'/krr-macos-latest-'"$TAG_NAME"'.zip\""}1' ./Formula/krr.rb > temp && mv temp ./Formula/krr.rb 203 | awk 'NR==7{$0=" sha256 \"'$MAC_BUILD_HASH'\""}1' ./Formula/krr.rb > temp && mv temp ./Formula/krr.rb 204 | awk 'NR==9{$0=" url \"https://github.com/robusta-dev/krr/releases/download/'"$TAG_NAME"'/krr-ubuntu-latest-'"$TAG_NAME"'.zip\""}1' ./Formula/krr.rb > temp && mv temp ./Formula/krr.rb 205 | awk 'NR==10{$0=" sha256 \"'$LINUX_BUILD_HASH'\""}1' ./Formula/krr.rb > temp && mv temp ./Formula/krr.rb 206 | - name: Commit and push changes 207 | run: | 208 | git config --local user.email "action@github.com" 209 | git config --local user.name "GitHub Action" 210 | git commit -am "Update formula for release ${TAG_NAME}" 211 | git push 212 | -------------------------------------------------------------------------------- /.github/workflows/docker-build-on-tag.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Push 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | build: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | permissions: 14 | contents: 'read' 15 | id-token: 'write' 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - uses: 'google-github-actions/auth@v2' 21 | with: 22 | project_id: 'genuine-flight-317411' 23 | workload_identity_provider: 'projects/429189597230/locations/global/workloadIdentityPools/github/providers/robusta-repos' 24 | 25 | - name: Set up gcloud CLI 26 | uses: google-github-actions/setup-gcloud@v2 27 | with: 28 | project_id: genuine-flight-317411 29 | 30 | - name: Configure Docker Registry 31 | run: gcloud auth configure-docker us-central1-docker.pkg.dev 32 | 33 | - name: Login to Docker Hub 34 | uses: docker/login-action@v1 35 | with: 36 | username: ${{ secrets.DOCKER_USERNAME }} 37 | password: ${{ secrets.DOCKER_PASSWORD }} 38 | 39 | - name: Set up Docker Buildx 40 | uses: docker/setup-buildx-action@v1 41 | 42 | - name: Build and push Docker images 43 | uses: docker/build-push-action@v2 44 | with: 45 | context: . 46 | platforms: linux/arm64,linux/amd64 47 | push: true 48 | tags: | 49 | robustadev/krr:${{ github.ref_name }} 50 | us-central1-docker.pkg.dev/genuine-flight-317411/devel/krr:${{ github.ref_name }} 51 | build-args: | 52 | BUILDKIT_INLINE_CACHE=1 -------------------------------------------------------------------------------- /.github/workflows/pytest-on-push.yml: -------------------------------------------------------------------------------- 1 | name: Pytest 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | 10 | steps: 11 | - uses: actions/checkout@v2 12 | 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: '3.9' 17 | 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install -r requirements.txt 22 | pip install -e . 
23 | pip install pytest 24 | 25 | - name: Test with pytest 26 | run: | 27 | pytest 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | .idea/ 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | 133 | .DS_Store 134 | robusta_lib 135 | .idea 136 | .vscode 137 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/ambv/black 3 | rev: 23.1.0 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | args: [--config=pyproject.toml] 8 | 9 | - repo: https://github.com/pre-commit/pre-commit-hooks 10 | rev: v3.3.0 11 | hooks: 12 | - id: trailing-whitespace 13 | - id: end-of-file-fixer 14 | 15 | - repo: https://github.com/pycqa/flake8 16 | rev: 6.0.0 17 | hooks: 18 | - id: flake8 19 | args: [--config=.flake8] 20 | 21 | - repo: https://github.com/pycqa/isort 22 | rev: 5.12.0 23 | hooks: 24 | - id: isort 25 | args: [--settings-path=pyproject.toml] 26 | 27 | - repo: https://github.com/pre-commit/mirrors-mypy 28 | rev: v1.0.1 29 | hooks: 30 | - id: mypy 31 | language: system 32 | -------------------------------------------------------------------------------- /ADOPTERS.md: -------------------------------------------------------------------------------- 1 | # KRR Adopters 2 | 3 | This is a list of adopters of Robusta KRR operator: 4 | 5 | Everton Arakaki - WAES Platform Consultant for ASML (Semiconductor Industry) 6 | 7 | > I used Robusta KRR in my production clusters, and it took me less than 5 minutes to get very well detailed cpu/memory recommendations. Our applications and platform tooling were discovered automatically; our kubecontext was discovered automatically; and our kube-prometheus-stack was discovered automatically. -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Python 3.9 slim image as the base image 2 | FROM python:3.12-slim as builder 3 | ENV LANG=C.UTF-8 4 | ENV PYTHONDONTWRITEBYTECODE=1 5 | ENV PYTHONUNBUFFERED=1 6 | ENV PATH="/app/venv/bin:$PATH" 7 | 8 | # Install system dependencies required for Poetry 9 | RUN apt-get update && \ 10 | dpkg --add-architecture arm64 11 | 12 | # We're installing here libexpat1, to upgrade the package to include a fix to 3 high CVEs. 
CVE-2024-45491,CVE-2024-45490,CVE-2024-45492 13 | RUN apt-get update \ 14 | && apt-get install -y --no-install-recommends libexpat1 \ 15 | && rm -rf /var/lib/apt/lists/* 16 | 17 | # Set the working directory 18 | WORKDIR /app 19 | 20 | COPY ./requirements.txt requirements.txt 21 | 22 | RUN pip install --no-cache-dir --upgrade pip 23 | # Install the project dependencies 24 | RUN python -m ensurepip --upgrade 25 | RUN pip install --no-cache-dir -r requirements.txt 26 | 27 | # Copy the rest of the application code 28 | COPY ./krr.py krr.py 29 | COPY ./robusta_krr/ robusta_krr/ 30 | COPY ./intro.txt intro.txt 31 | 32 | # Run the application using 'poetry run krr simple' 33 | CMD ["python", "krr.py", "simple"] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Robusta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /build_linux.sh: -------------------------------------------------------------------------------- 1 | # Remove old build 2 | rm -rf build 3 | rm -rf dist 4 | 5 | # MacOS Build first 6 | 7 | # Active venv 8 | # python -m pip install -r requirements.txt 9 | pip install pyinstaller 10 | apt-get install binutils 11 | 12 | # source .venv/bin/activate 13 | 14 | # Build 15 | pyinstaller krr.py 16 | cd dist 17 | # zip -r "krr-linux-v1.1.0.zip" krr 18 | 19 | # Deactivate venv 20 | # deactivate -------------------------------------------------------------------------------- /build_local.sh: -------------------------------------------------------------------------------- 1 | # Remove old build 2 | rm -rf build 3 | rm -rf dist 4 | 5 | # Active venv 6 | source .venv/bin/activate 7 | pip install -r requirements.txt 8 | pip install pyinstaller 9 | 10 | # Build 11 | pyinstaller krr.py 12 | cd dist 13 | zip -r "krr-macos-v1.1.0.zip" krr -------------------------------------------------------------------------------- /build_release.sh: -------------------------------------------------------------------------------- 1 | docker buildx build \ 2 | --build-arg BUILDKIT_INLINE_CACHE=1 \ 3 | --platform linux/arm64,linux/amd64 \ 4 | --tag us-central1-docker.pkg.dev/genuine-flight-317411/devel/krr:${TAG} \ 5 | --push \ 6 | . 
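# Usage sketch (assumption: TAG is supplied by the caller and matches the release tag), e.g.
#   TAG=v1.2.3 ./build_release.sh
# Requires a docker buildx builder with linux/arm64 and linux/amd64 support and push
# access to the target registry.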
--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
 1 | # Dockerfiles for specific clouds
 2 |
 3 | This directory contains Dockerfiles for various cloud providers.
 4 |
 5 | ## AWS
 6 |
 7 | To use the `krr` container on AWS, the image needs the `awscli` installed in it.
 8 | The `aws.Dockerfile` is a modified `krr` Dockerfile that includes:
 9 | - installation of curl & unzip
10 | - installation of awscli
11 |
12 |
13 |
--------------------------------------------------------------------------------
/docker/aws.Dockerfile:
--------------------------------------------------------------------------------
 1 | # Use the official Python 3.9 slim image as the base image
 2 | FROM python:3.9-slim as builder
 3 |
 4 | # Set the working directory
 5 | WORKDIR /app
 6 |
 7 | # Update package lists and add arm64 architecture support
 8 | RUN apt-get update && \
 9 |     dpkg --add-architecture arm64
10 |
11 | COPY ./requirements.txt requirements.txt
12 |
13 | # Install the project dependencies
14 | RUN pip install --no-cache-dir -r requirements.txt
15 |
16 | # Install curl and unzip for awscli
17 | RUN apt-get -y update; apt-get -y install curl; apt-get -y install unzip
18 |
19 | # Download awscli and unzip it
20 | RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" && \
21 |     unzip awscliv2.zip && \
22 |     ./aws/install
23 |
24 | # Copy the rest of the application code
25 | COPY . .
26 |
27 | # Run the application with the 'simple' strategy
28 | ENTRYPOINT ["python", "krr.py", "simple"]
29 |
--------------------------------------------------------------------------------
/docs/google-cloud-managed-service-for-prometheus.md:
--------------------------------------------------------------------------------
 1 | ## Installation instructions for [Google Managed Service for Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus)
 2 |
 3 | The following instructions assume that you are running [Google Managed Service for Prometheus (GMP)](https://cloud.google.com/stackdriver/docs/managed-prometheus) in its [managed collection](https://cloud.google.com/stackdriver/docs/managed-prometheus/setup-managed) mode and that you have installed krr.
 4 |
 5 | krr depends upon 2 [cAdvisor](https://github.com/google/cadvisor) [metrics](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md#prometheus-container-metrics):
 6 |
 7 | 1. `container_cpu_usage_seconds_total`
 8 | 1. `container_memory_working_set_bytes`
 9 |
10 |
11 | In order for krr to work with GMP, we need to ensure that cAdvisor is enabled and that the GMP Operator is configured to collect these 2 metrics. This can be combined into a single step that involves revising the GMP Operator configuration file `operatorconfig/config` in Namespace `gmp-public`.
12 |
13 | Google provides instructions for enabling [Kubelet/cAdvisor](https://cloud.google.com/stackdriver/docs/managed-prometheus/exporters/kubelet-cadvisor). This requires adding a `kubeletScraping` section to the configuration file.
14 |
15 | We must also add a `filter` section to the configuration file. The `filter` matches the 2 metrics that krr uses.
16 | 17 | `operatorconfig.krr.patch.yaml`: 18 | ```YAML 19 | collection: 20 | filter: 21 | matchOneOf: 22 | - '{__name__="container_cpu_usage_seconds_total"}' 23 | - '{__name__="container_memory_working_set_bytes"}' 24 | kubeletScraping: 25 | interval: 30s 26 | ``` 27 | 28 | There are various ways to make this Resource change to the cluster. 29 | 30 | You can `kubectl edit` the file and manually add the changes: 31 | 32 | ```bash 33 | KUBE_EDITOR="nano" \ 34 | kubectl edit operatorconfig/config \ 35 | --namespace=gmp-public 36 | ``` 37 | 38 | Or you can `kubectl patch` the file: 39 | 40 | ```bash 41 | kubectl patch operatorconfig/config \ 42 | --namespace=gmp-public \ 43 | --type=merge \ 44 | --patch-file=/path/to/operatorconfig.krr.patch.yaml 45 | ``` 46 | 47 | ### Test 48 | 49 | There are multiple ways to confirm that GMP is collecting the metrics needed by krr. 50 | 51 | The simplest is to access Google Cloud Console "Metric Diagnostics" and confirm that the "Metrics" section includes the 2 metrics with (recent) "Metric Data Ingested": 52 | 53 | `https://console.cloud.google.com/monitoring/metrics-diagnostics?project={project}` 54 | 55 | > **NOTE** Replace `{project}` with your Google Cloud Project ID. 56 | 57 | Another way is to deploy the [Frontend UI for GMP](https://cloud.google.com/stackdriver/docs/managed-prometheus/query#promui-deploy) and use the UI to browse the metrics. 58 | 59 | GMP implements the [Prometheus HTTP API](https://prometheus.io/docs/prometheus/latest/querying/api/) and, like krr, we can use this to query the metrics: 60 | 61 | ```bash 62 | PROJECT="..." # Google Cloud Project ID 63 | MONITORING="https://monitoring.googleapis.com/v1" 64 | ENDPOINT="${MONITORING}/projects/${PROJECT}/location/global/prometheus" 65 | 66 | TOKEN=$(gcloud auth print-access-token) 67 | 68 | # Either 69 | QUERY="count({__name__=\"container_cpu_usage_seconds_total\"})" 70 | # Or 71 | QUERY="count({__name__=\"container_memory_working_set_bytes\"})" 72 | 73 | curl \ 74 | --silent \ 75 | --get \ 76 | --header "Authorization: Bearer ${TOKEN}" \ 77 | --data-urlencode "query=${QUERY}" \ 78 | ${ENDPOINT}/api/v1/query 79 | ``` 80 | If you have [jq]() installed, you can filter the results to output only the latest value: 81 | ```bash 82 | | jq -r .data.result[0].value[1] 83 | ``` 84 | 85 | ### Run krr 86 | 87 | krr leverages Google [Application Default Credentials (ADC)](https://cloud.google.com/docs/authentication/application-default-credentials). Ensure that ADC credentials are accessible (per Google's documentation) before running krr so that krr can authenticate to GMP. 88 | 89 | ```bash 90 | PROJECT="..." 
# Google Cloud Project ID 91 | MONITORING="https://monitoring.googleapis.com/v1" 92 | ENDPOINT="${MONITORING}/projects/${PROJECT}/location/global/prometheus" 93 | 94 | python krr.py simple \ 95 | --prometheus-url=${ENDPOINT} 96 | ``` 97 | -------------------------------------------------------------------------------- /docs/krr-in-cluster/krr-in-cluster-job.yaml: -------------------------------------------------------------------------------- 1 | kind: ClusterRole 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: krr-cluster-role 5 | namespace: default 6 | rules: 7 | - apiGroups: 8 | - "" 9 | resources: 10 | - configmaps 11 | - daemonsets 12 | - deployments 13 | - namespaces 14 | - pods 15 | - replicasets 16 | - replicationcontrollers 17 | - services 18 | verbs: 19 | - get 20 | - list 21 | - watch 22 | 23 | - apiGroups: 24 | - "" 25 | resources: 26 | - nodes 27 | verbs: 28 | - get 29 | - list 30 | - watch 31 | 32 | - apiGroups: 33 | - apps 34 | resources: 35 | - daemonsets 36 | - deployments 37 | - deployments/scale 38 | - replicasets 39 | - replicasets/scale 40 | - statefulsets 41 | verbs: 42 | - get 43 | - list 44 | - watch 45 | 46 | - apiGroups: 47 | - extensions 48 | resources: 49 | - daemonsets 50 | - deployments 51 | - deployments/scale 52 | - ingresses 53 | - replicasets 54 | - replicasets/scale 55 | - replicationcontrollers/scale 56 | verbs: 57 | - get 58 | - list 59 | - watch 60 | - apiGroups: 61 | - batch 62 | resources: 63 | - cronjobs 64 | - jobs 65 | verbs: 66 | - get 67 | - list 68 | - watch 69 | - apiGroups: 70 | - "autoscaling" 71 | resources: 72 | - horizontalpodautoscalers 73 | verbs: 74 | - get 75 | - list 76 | - watch 77 | 78 | --- 79 | apiVersion: v1 80 | kind: ServiceAccount 81 | metadata: 82 | name: krr-service-account 83 | namespace: default 84 | --- 85 | apiVersion: rbac.authorization.k8s.io/v1 86 | kind: ClusterRoleBinding 87 | metadata: 88 | name: krr-cluster-role-binding 89 | roleRef: 90 | apiGroup: rbac.authorization.k8s.io 91 | kind: ClusterRole 92 | name: krr-cluster-role 93 | subjects: 94 | - kind: ServiceAccount 95 | name: krr-service-account 96 | namespace: default 97 | 98 | --- 99 | apiVersion: batch/v1 100 | kind: Job 101 | metadata: 102 | name: krr 103 | namespace: default 104 | spec: 105 | template: 106 | spec: 107 | containers: 108 | - command: 109 | - /bin/sh 110 | - -c 111 | - "python krr.py simple --max-workers 3 --width 2048 " 112 | image: robustadev/krr:v1.17.0 113 | imagePullPolicy: Always 114 | name: krr 115 | resources: 116 | limits: 117 | memory: 2Gi 118 | requests: 119 | memory: 1Gi 120 | restartPolicy: Never 121 | serviceAccount: krr-service-account 122 | serviceAccountName: krr-service-account 123 | -------------------------------------------------------------------------------- /enforcer/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official Python 3.9 slim image as the base image 2 | FROM python:3.12-slim 3 | ENV LANG=C.UTF-8 4 | ENV PYTHONDONTWRITEBYTECODE=1 5 | ENV PYTHONUNBUFFERED=1 6 | ENV PATH="/app/venv/bin:$PATH" 7 | 8 | # We're installing here libexpat1, to upgrade the package to include a fix to 3 high CVEs. CVE-2024-45491,CVE-2024-45490,CVE-2024-45492 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends libexpat1 \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | # Set the working directory 14 | WORKDIR /app/enforcer 15 | 16 | COPY ./*.py . 
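# dal/ provides the Supabase data-access layer; resources/ provides the ReplicaSet owner and recommendation stores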
17 | COPY ./dal/ dal/ 18 | COPY ./resources/ resources/ 19 | COPY ./requirements.txt requirements.txt 20 | 21 | 22 | RUN pip install --no-cache-dir --upgrade pip 23 | # Install the project dependencies 24 | RUN python -m ensurepip --upgrade 25 | RUN pip install --no-cache-dir -r requirements.txt 26 | 27 | CMD ["python", "enforcer_main.py"] 28 | -------------------------------------------------------------------------------- /enforcer/README.md: -------------------------------------------------------------------------------- 1 | # KRR Enforcer - Kubernetes Resource Recommendation Mutation Webhook 2 | 3 | A mutating webhook server that automatically enforces [KRR (Kubernetes Resource Recommender)](https://github.com/robusta-dev/krr) recommendations by patching pod resource requests and limits in real-time. 4 | 5 | ## Features 6 | 7 | - **Automatic Resource Enforcement**: Applies KRR recommendations to pods during pod creation 8 | - **Flexible Enforcement Modes**: Support for enforce/ignore modes per workload 9 | - **REST API**: Query recommendations via HTTP endpoints 10 | 11 | ## Enforcement Modes 12 | 13 | Enforcement can be configured globally or on a per-workload basis. 14 | 15 | ### Global Enforcement Mode 16 | The global default mode is configured via the `KRR_MUTATION_MODE_DEFAULT` environment variable: 17 | - `enforce` - Apply recommendations to all pods by default 18 | - `ignore` - Skip enforcement for all pods by default 19 | 20 | ### Per-Workload Mode 21 | You can override the default mode for specific workloads using the annotation: 22 | 23 | ```yaml 24 | apiVersion: apps/v1 25 | kind: Deployment 26 | metadata: 27 | name: my-app 28 | spec: 29 | template: 30 | metadata: 31 | annotations: 32 | admission.robusta.dev/krr-mutation-mode: enforce # or "ignore" 33 | ``` 34 | 35 | **Mode Priority**: Pod annotation > Global default 36 | 37 | ## Webhook Failure Mode 38 | 39 | The webhook uses `failurePolicy: Ignore` by default, meaning if the webhook fails, pods are created without resource optimization rather than being blocked. 40 | 41 | 42 | ## Installation with Helm 43 | 44 | ### Prerequisites 45 | - Helm 3.x 46 | - Prometheus Operator (optional, for metrics collection) 47 | - Robusta UI account - used to store KRR scan results 48 | 49 | ### Certificate 50 | 51 | - Each helm install/upgrade, a new certificate is created and deployed for the admission webhook. 52 | - The certificate is set to expire after 1 year. 53 | - In order to avoid certificate expiration, you must upgrade the enforcer helm release, at least once a year. 54 | 55 | ### Quick Start 56 | 57 | 1. **Add the helm repository** (if available): 58 | ```bash 59 | helm repo add robusta https://robusta-charts.storage.googleapis.com && helm repo update 60 | ``` 61 | 62 | 2. **Add cluster configuration**: 63 | 64 | If the enforcer is installed in the same namespace as Robusta, it will automatically detect the Robusta account settings. 65 | 66 | If your Robusta UI sink token, is pulled from a secret (as described [here](https://docs.robusta.dev/master/setup-robusta/configuration-secrets.html#pulling-values-from-kubernetes-secrets)), you should add the same environement variable to the `Enforcer` pod as well. 
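For example, a minimal sketch of wiring a secret-backed token into the enforcer (the secret name and key below are placeholders; point them at whatever secret your Robusta installation uses):

```yaml
additionalEnvVars:
  - name: ROBUSTA_UI_TOKEN
    valueFrom:
      secretKeyRef:
        name: robusta-secrets    # placeholder secret name
        key: robustaSinkToken    # placeholder key
```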
67 | 68 | If the `Enforcer` is installed on a different namespace, you can provide your Robusta account credentials using env variables: 69 | 70 | Add your robusta credentials and cluster name: (`enforcer-values.yaml`) 71 | 72 | ```yaml 73 | additionalEnvVars: 74 | - name: CLUSTER_NAME 75 | value: my-cluster-name # should be the same as the robusta installation on this cluster 76 | - name: ROBUSTA_UI_TOKEN 77 | value: "MY ROBUSTA UI TOKEN" 78 | # - name: ROBUSTA_UI_TOKEN # or pulled from a secret 79 | # valueFrom: 80 | # secretKeyRef: 81 | # name: robusta-secrets 82 | # key: robustaSinkToken 83 | ``` 84 | 85 | 2. **Install with default settings**: 86 | ```bash 87 | helm install krr-enforcer robusta/krr-enforcer -f enforcer-values.yaml 88 | ``` 89 | 90 | ### Helm values 91 | 92 | | Parameter | Description | Default | 93 | |-----------|---------------------------------------------------------------------|---------| 94 | | `logLevel` | Log level (DEBUG, INFO, WARN, ERROR) | `INFO` | 95 | | `certificate` | Base64-encoded custom CA certificate - for self signed certificates | `""` | 96 | | `serviceMonitor.enabled` | Enable Prometheus ServiceMonitor | `true` | 97 | | `resources.requests.cpu` | CPU request for the enforcer pod | `100m` | 98 | | `resources.requests.memory` | Memory request for the enforcer pod | `256Mi` | 99 | 100 | 101 | ## Running Locally 102 | 103 | ### Prerequisites 104 | - Python 3.9+ 105 | - Access to a Kubernetes cluster 106 | - KRR recommendations data from Robusta UI 107 | 108 | ### Setup 109 | 110 | 1. **Install dependencies**: 111 | ```bash 112 | pip install -r requirements.txt 113 | ``` 114 | 115 | 2. **Set environment variables**: 116 | ```bash 117 | export ENFORCER_SSL_KEY_FILE="path/to/tls.key" 118 | export ENFORCER_SSL_CERT_FILE="path/to/tls.crt" 119 | export LOG_LEVEL="DEBUG" 120 | export KRR_MUTATION_MODE_DEFAULT="enforce" 121 | ``` 122 | 123 | 3. **Generate TLS certificates**: 124 | ```bash 125 | # Generate private key 126 | openssl genrsa -out tls.key 2048 127 | 128 | # Generate certificate signing request 129 | openssl req -new -key tls.key -out tls.csr \ 130 | -subj "/CN=krr-enforcer.krr-system.svc" 131 | 132 | # Generate self-signed certificate 133 | openssl x509 -req -in tls.csr -signkey tls.key -out tls.crt -days 365 134 | ``` 135 | 136 | 4. 
**Run the server**: 137 | ```bash 138 | python enforcer_main.py 139 | ``` 140 | 141 | The server will start on `https://localhost:8443` with the following endpoints: 142 | 143 | - `POST /mutate` - Webhook endpoint for Kubernetes admission control 144 | - `GET /health` - Health check endpoint 145 | - `GET /metrics` - Prometheus metrics 146 | - `GET /recommendations/{namespace}/{kind}/{name}` - Query recommendations 147 | 148 | ### Local Development Tips 149 | 150 | - Use `LOG_LEVEL=DEBUG` for detailed request/response logging 151 | - Test webhook locally using tools like `curl` or `httpie` 152 | - Monitor metrics at `https://localhost:8443/metrics` 153 | - Query recommendations: `GET https://localhost:8443/recommendations/default/Deployment/my-app` 154 | 155 | ### Testing the Webhook 156 | 157 | ```bash 158 | # Test health endpoint 159 | curl -k https://localhost:8443/health 160 | 161 | # Test metrics endpoint 162 | curl -k https://localhost:8443/metrics 163 | 164 | # Test recommendations endpoint 165 | curl -k https://localhost:8443/recommendations/default/Deployment/my-app 166 | ``` 167 | 168 | ## Metrics 169 | 170 | The enforcer exposes Prometheus metrics at `/metrics`: 171 | 172 | - `krr_pod_admission_mutations_total` - Total pod mutations (with `mutated` label) 173 | - `krr_replicaset_admissions_total` - Total ReplicaSet admissions (with `operation` label) 174 | - `krr_rs_owners_map_size` - Current size of the ReplicaSet owners map 175 | - `krr_admission_duration_seconds` - Duration of admission operations (with `kind` label) 176 | 177 | ## API Endpoints 178 | 179 | ### GET /recommendations/{namespace}/{kind}/{name} 180 | 181 | Retrieve recommendations for a specific workload: 182 | 183 | ```bash 184 | curl -k https://krr-enforcer.krr-system.svc.cluster.local/recommendations/default/Deployment/my-app 185 | ``` 186 | 187 | Response: 188 | ```json 189 | { 190 | "namespace": "default", 191 | "kind": "Deployment", 192 | "name": "my-app", 193 | "containers": { 194 | "web": { 195 | "cpu": { 196 | "request": "100m", 197 | "limit": "200m" 198 | }, 199 | "memory": { 200 | "request": "128Mi", 201 | "limit": "256Mi" 202 | } 203 | } 204 | } 205 | } 206 | ``` 207 | 208 | ## Troubleshooting 209 | 210 | ### Common Issues 211 | 212 | 1. **Certificate Errors**: Ensure TLS certificates are properly configured and valid 213 | 2. **Permission Denied**: Verify the ServiceAccount has proper RBAC permissions 214 | 3. **No Recommendations**: Check that KRR has generated recommendations and they're accessible 215 | 4. 
**Webhook Timeout**: Increase `timeoutSeconds` in MutatingWebhookConfiguration 216 | 217 | ### Debug Mode 218 | 219 | Enable debug logging to troubleshoot issues: 220 | 221 | ```bash 222 | helm upgrade krr-enforcer ./helm/krr-enforcer --set logLevel=DEBUG 223 | ``` 224 | 225 | ### Logs 226 | 227 | Check enforcer logs: 228 | ```bash 229 | kubectl logs -n krr-system deployment/krr-enforcer-krr-enforcer -f 230 | ``` -------------------------------------------------------------------------------- /enforcer/dal/robusta_config.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | from pydantic import BaseModel 3 | 4 | 5 | class RobustaConfig(BaseModel): 6 | sinks_config: List[Dict[str, Dict]] 7 | global_config: dict 8 | 9 | class RobustaToken(BaseModel): 10 | store_url: str 11 | api_key: str 12 | account_id: str 13 | email: str 14 | password: str -------------------------------------------------------------------------------- /enforcer/env_vars.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ROBUSTA_CONFIG_PATH = os.environ.get( 4 | "ROBUSTA_CONFIG_PATH", "/etc/robusta/config/active_playbooks.yaml" 5 | ) 6 | ROBUSTA_ACCOUNT_ID = os.environ.get("ROBUSTA_ACCOUNT_ID", "") 7 | STORE_URL = os.environ.get("STORE_URL", "") 8 | STORE_API_KEY = os.environ.get("STORE_API_KEY", "") 9 | STORE_EMAIL = os.environ.get("STORE_EMAIL", "") 10 | STORE_PASSWORD = os.environ.get("STORE_PASSWORD", "") 11 | 12 | DISCOVERY_MAX_BATCHES = int(os.environ.get("DISCOVERY_MAX_BATCHES", 50)) 13 | DISCOVERY_BATCH_SIZE = int(os.environ.get("DISCOVERY_BATCH_SIZE", 30000)) 14 | 15 | UPDATE_THRESHOLD = float(os.environ.get("UPDATE_THRESHOLD", 20.0)) 16 | 17 | SCAN_RELOAD_INTERVAL = int(os.environ.get("SCAN_RELOAD_INTERVAL", 3600)) 18 | KRR_MUTATION_MODE_DEFAULT = os.environ.get("KRR_MUTATION_MODE_DEFAULT", "enforce") 19 | REPLICA_SET_CLEANUP_INTERVAL = int(os.environ.get("REPLICA_SET_CLEANUP_INTERVAL", 600)) 20 | REPLICA_SET_DELETION_WAIT = int(os.environ.get("REPLICA_SET_DELETION_WAIT", 600)) 21 | SCAN_AGE_HOURS_THRESHOLD = int(os.environ.get("SCAN_AGE_HOURS_THRESHOLD", 360)) # 15 days 22 | 23 | ENFORCER_SSL_KEY_FILE = os.environ.get("ENFORCER_SSL_KEY_FILE", "") 24 | ENFORCER_SSL_CERT_FILE = os.environ.get("ENFORCER_SSL_CERT_FILE", "") -------------------------------------------------------------------------------- /enforcer/metrics.py: -------------------------------------------------------------------------------- 1 | from prometheus_client import Counter, Histogram, Gauge 2 | 3 | # Prometheus metrics 4 | pod_admission_mutations = Counter( 5 | 'krr_pod_admission_mutations_total', 6 | 'Total pod admission mutations', 7 | ['mutated', 'reason'] # labels: 'true' or 'false', reason for success/failure 8 | ) 9 | 10 | replicaset_admissions = Counter( 11 | 'krr_replicaset_admissions_total', 12 | 'Total replicaset admissions', 13 | ['operation'] # labels: CREATE, DELETE, etc. 
14 | ) 15 | 16 | rs_owners_size = Gauge( 17 | 'krr_rs_owners_map_size', 18 | 'Current size of the rs_owners map' 19 | ) 20 | 21 | admission_duration = Histogram( 22 | 'krr_admission_duration_seconds', 23 | 'Duration of admission operations', 24 | ['kind'] # labels: Pod, ReplicaSet 25 | ) -------------------------------------------------------------------------------- /enforcer/model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional, Dict, Any, List 3 | 4 | from pydantic import BaseModel 5 | 6 | 7 | class PodOwner(BaseModel): 8 | kind: str 9 | name: str 10 | namespace: str 11 | 12 | class RsOwner(BaseModel): 13 | rs_name: str 14 | namespace: str 15 | owner_name: str 16 | owner_kind: str 17 | deletion_ts: Optional[float] = None 18 | 19 | class Resources(BaseModel): 20 | request: float 21 | limit: Optional[float] 22 | 23 | 24 | class ContainerRecommendation(BaseModel): 25 | cpu: Optional[Resources] = None 26 | memory: Optional[Resources] = None 27 | 28 | @staticmethod 29 | def build(recommendation: Dict[str, Any]) -> Optional["ContainerRecommendation"]: 30 | resource_recommendation = ContainerRecommendation() 31 | content: List[Dict] = recommendation["content"] 32 | for container_resource in content: 33 | resource = container_resource["resource"] 34 | if resource not in ["memory", "cpu"]: 35 | continue 36 | 37 | recommended: Dict[str, Any] = container_resource["recommended"] 38 | request = recommended.get("request", 0.0) 39 | limit = recommended.get("limit", None) 40 | 41 | if request == 0.0: 42 | logging.debug("skipping container recommendations without request, %s", recommendation) 43 | return None 44 | 45 | if request == "?" or limit == "?": 46 | logging.debug("skipping container recommendations with '?', %s", recommendation) 47 | return None 48 | 49 | resources = Resources(request=request, limit=limit) 50 | if resource == "memory": 51 | resource_recommendation.memory = resources 52 | elif resource == "cpu": 53 | resource_recommendation.cpu = resources 54 | 55 | return resource_recommendation 56 | 57 | 58 | class WorkloadRecommendation(BaseModel): 59 | workload_key: str 60 | container_recommendations: Dict[str, ContainerRecommendation] = {} 61 | 62 | def get(self, container: str) -> Optional[ContainerRecommendation]: 63 | return self.container_recommendations.get(container, None) 64 | 65 | 66 | def add(self, container: str, recommendation: ContainerRecommendation): 67 | self.container_recommendations[container] = recommendation -------------------------------------------------------------------------------- /enforcer/params_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | from typing import Dict, Optional 5 | 6 | from pydantic.types import SecretStr 7 | 8 | def get_env_replacement(value: str) -> Optional[str]: 9 | env_values = re.findall(r"{{[ ]*env\.(.*)[ ]*}}", value) 10 | if env_values: 11 | env_var_value = os.environ.get(env_values[0].strip(), None) 12 | if not env_var_value: 13 | msg = f"ENV var replacement {env_values[0]} does not exist for param: {value}" 14 | logging.error(msg) 15 | raise Exception(msg) 16 | return env_var_value 17 | return None 18 | 19 | 20 | def replace_env_vars_values(values: Dict) -> Dict: 21 | for key, value in values.items(): 22 | if isinstance(value, str): 23 | env_var_value = get_env_replacement(value) 24 | if env_var_value: 25 | values[key] = env_var_value 26 | elif 
isinstance(value, SecretStr): 27 | env_var_value = get_env_replacement(value.get_secret_value()) 28 | if env_var_value: 29 | values[key] = SecretStr(env_var_value) 30 | elif isinstance(value, dict): 31 | env_var_value = replace_env_vars_values(value) 32 | if env_var_value: 33 | values[key] = env_var_value 34 | 35 | return values 36 | -------------------------------------------------------------------------------- /enforcer/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.109.2 2 | uvicorn==0.27.1 3 | pydantic==2.6.1 4 | supabase==2.5 5 | PyYAML==6.0.1 6 | cachetools==5.3.3 7 | prometheus-client==0.20.0 8 | kubernetes==26.1.0 9 | -------------------------------------------------------------------------------- /enforcer/resources/kubernetes_resource_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from typing import List 4 | 5 | from enforcer.env_vars import DISCOVERY_MAX_BATCHES, DISCOVERY_BATCH_SIZE 6 | from kubernetes import client 7 | from kubernetes.client import V1ReplicaSetList 8 | from kubernetes import config 9 | 10 | from enforcer.model import RsOwner 11 | 12 | if os.getenv("KUBERNETES_SERVICE_HOST"): 13 | config.load_incluster_config() 14 | else: 15 | config.load_kube_config() 16 | 17 | 18 | class KubernetesResourceLoader: 19 | 20 | @staticmethod 21 | def load_replicasets() -> List[RsOwner]: 22 | cluster_rs: List[RsOwner] = [] 23 | continue_ref = None 24 | for batch_num in range(DISCOVERY_MAX_BATCHES): 25 | replicasets: V1ReplicaSetList = client.AppsV1Api().list_replica_set_for_all_namespaces( 26 | limit=DISCOVERY_BATCH_SIZE, _continue=continue_ref 27 | ) 28 | 29 | for replicaset in replicasets.items: 30 | owner_references = replicaset.metadata.owner_references 31 | if owner_references: 32 | rs_owner = owner_references[0] 33 | if len(owner_references) > 1: 34 | logging.warning(f"ReplicasSet with multiple owner_references: {owner_references}") 35 | controllers = [owner for owner in owner_references if owner.get("controller", False)] 36 | if controllers: 37 | rs_owner = controllers[0] 38 | 39 | cluster_rs.append(RsOwner( 40 | rs_name=replicaset.metadata.name, 41 | namespace=replicaset.metadata.namespace, 42 | owner_name=rs_owner.name, 43 | owner_kind=rs_owner.kind, 44 | )) 45 | 46 | continue_ref = replicasets.metadata._continue 47 | if not continue_ref: 48 | break 49 | 50 | if batch_num == DISCOVERY_MAX_BATCHES - 1: 51 | replicas_limit = DISCOVERY_MAX_BATCHES * DISCOVERY_BATCH_SIZE 52 | logging.warning(f"Reached replicas loading limit: {replicas_limit}.") 53 | 54 | return cluster_rs 55 | -------------------------------------------------------------------------------- /enforcer/resources/owner_store.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | import time 4 | from typing import Dict, Any, Optional, List 5 | 6 | from enforcer.env_vars import REPLICA_SET_CLEANUP_INTERVAL, REPLICA_SET_DELETION_WAIT 7 | from enforcer.metrics import rs_owners_size 8 | from enforcer.model import PodOwner, RsOwner 9 | from enforcer.resources.kubernetes_resource_loader import KubernetesResourceLoader 10 | 11 | 12 | class OwnerStore: 13 | 14 | def __init__(self): 15 | self.rs_owners: Dict[str, RsOwner] = {} 16 | self._rs_owners_lock = threading.Lock() 17 | self._owners_loaded = threading.Event() 18 | self._loading_in_progress = threading.Lock() 19 | self.cleanup_interval = 
REPLICA_SET_CLEANUP_INTERVAL 20 | self._stop_event = threading.Event() 21 | self._cleanup_thread = threading.Thread(target=self._periodic_cleanup, daemon=True) 22 | self._cleanup_thread.start() 23 | 24 | def _rs_key(self, rs_name: str, namespace: str) -> str: 25 | return f"{namespace}/{rs_name}" 26 | 27 | def finalize_owner_initialization(self): 28 | """Initialize rs_owners on-demand, thread-safe, only once.""" 29 | if self._owners_loaded.is_set(): 30 | return # Already loaded 31 | 32 | # Try to acquire the loading lock without blocking 33 | if not self._loading_in_progress.acquire(blocking=False): 34 | # Another thread is loading, just return 35 | return 36 | 37 | try: 38 | if self._owners_loaded.is_set(): 39 | return 40 | 41 | replica_sets_owners: List[RsOwner] = KubernetesResourceLoader.load_replicasets() 42 | loaded_owners: Dict[str, RsOwner] = {} 43 | for owner in replica_sets_owners: 44 | loaded_owners[self._rs_key(owner.rs_name, owner.namespace)] = owner 45 | 46 | with self._rs_owners_lock: 47 | self.rs_owners.update(loaded_owners) 48 | rs_owners_size.set(len(self.rs_owners)) 49 | 50 | self._owners_loaded.set() 51 | logging.info(f"Loaded {len(loaded_owners)} ReplicaSet owners") 52 | 53 | except Exception: 54 | logging.exception(f"Failed to load ReplicaSet owners") 55 | finally: 56 | self._loading_in_progress.release() 57 | 58 | @staticmethod 59 | def get_pod_name(metadata: Dict[str, Any]) -> str: 60 | # if the pod's name is randomized, the name is under generateName 61 | return metadata.get("name") or metadata.get("generateName") 62 | 63 | def get_pod_owner(self, pod: Dict[str, Any]) -> Optional[PodOwner]: 64 | metadata = pod.get("metadata", {}) 65 | owner_references = metadata.get("ownerReferences", []) 66 | namespace: str = metadata.get("namespace") 67 | 68 | try: 69 | if not owner_references: # pod has no owner, standalone pod. 
Return the pod 70 | return PodOwner( 71 | kind="Pod", namespace=namespace, name=self.get_pod_name(pod) 72 | ) 73 | 74 | # get only owners with controller == true 75 | controllers = [owner for owner in owner_references if owner.get("controller", False)] 76 | if controllers: 77 | if len(controllers) > 1: 78 | logging.warning(f"Multiple controllers found for {pod}") 79 | 80 | controller = controllers[0] 81 | controller_kind: str = controller.get("kind") 82 | if controller_kind == "ReplicaSet": 83 | with self._rs_owners_lock: 84 | rs_owner = self.rs_owners.get(self._rs_key(controller.get("name"), namespace), None) 85 | return PodOwner( 86 | name=rs_owner.owner_name, 87 | namespace=rs_owner.namespace, 88 | kind=rs_owner.owner_kind, 89 | ) if rs_owner else None 90 | else: # Pod owner is a k8s workload: Job, StatefulSet, DaemonSet 91 | return PodOwner(kind=controller_kind, name=controller.get("name"), namespace=namespace) 92 | except Exception: 93 | logging.exception(f"Failed to get pod owner for {pod}") 94 | 95 | return None 96 | 97 | def handle_rs_admission(self, request: Dict[str, Any]): 98 | logging.debug(f"handle_rs_admission %s", request) 99 | operation = request.get("operation") 100 | if operation == "DELETE": 101 | old_object = request.get("oldObject") or {} # delete has old object 102 | metadata = old_object.get("metadata", {}) 103 | rs_name = metadata.get("name") 104 | namespace = metadata.get("namespace") 105 | if rs_name and namespace: 106 | with self._rs_owners_lock: 107 | rs_owner = self.rs_owners.get(self._rs_key(rs_name, namespace), None) 108 | if rs_owner: 109 | rs_owner.deletion_ts = time.time() 110 | elif operation == "CREATE": 111 | self._add_rs_owner(request) 112 | 113 | def _add_rs_owner(self, rs_create_request: Dict[str, Any]): 114 | metadata = rs_create_request.get("object", {}).get("metadata", {}) 115 | owner_references = metadata.get("ownerReferences", []) 116 | if len(owner_references): 117 | rs_owner = RsOwner( 118 | rs_name=metadata.get("name"), 119 | namespace=metadata.get("namespace"), 120 | owner_name=owner_references[0].get("name"), 121 | owner_kind=owner_references[0].get("kind"), 122 | ) 123 | with self._rs_owners_lock: 124 | self.rs_owners[self._rs_key(rs_owner.rs_name, rs_owner.namespace)] = rs_owner 125 | else: 126 | logging.warning(f"No owner references for {rs_create_request}") 127 | 128 | 129 | def _cleanup_deleted_replica_sets(self): 130 | current_time = time.time() 131 | 132 | with self._rs_owners_lock: 133 | # Delete rs owners that were deleted more than REPLICA_SET_DELETION_WAIT seconds ago 134 | keys_to_delete = [ 135 | key for key, rs_owner in self.rs_owners.items() 136 | if rs_owner.deletion_ts is not None and (current_time - rs_owner.deletion_ts) >= REPLICA_SET_DELETION_WAIT 137 | ] 138 | 139 | for key in keys_to_delete: 140 | del self.rs_owners[key] 141 | 142 | def _periodic_cleanup(self): 143 | while not self._stop_event.wait(self.cleanup_interval): 144 | try: 145 | self._cleanup_deleted_replica_sets() 146 | logging.debug("Deleted replicasets cleanup completed") 147 | except Exception as e: 148 | logging.exception(f"Failed to cleanup deleted replicasets") 149 | 150 | def get_rs_owners_count(self) -> int: 151 | with self._rs_owners_lock: 152 | return len(self.rs_owners) 153 | 154 | def stop(self): 155 | self._stop_event.set() 156 | self._cleanup_thread.join() -------------------------------------------------------------------------------- /enforcer/resources/recommendation_store.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | from typing import Dict, Optional, Tuple 4 | 5 | from enforcer.dal.supabase_dal import SupabaseDal 6 | from enforcer.env_vars import SCAN_RELOAD_INTERVAL 7 | from enforcer.model import WorkloadRecommendation, ContainerRecommendation 8 | 9 | 10 | class RecommendationStore: 11 | 12 | def __init__(self, dal: SupabaseDal): 13 | self.dal = dal 14 | self.recommendations: Dict[str, WorkloadRecommendation] = {} 15 | self.scan_id: Optional[str] = None 16 | self._recommendations_lock = threading.Lock() 17 | self._reload_recommendations() 18 | 19 | self.reload_interval = SCAN_RELOAD_INTERVAL 20 | self._stop_event = threading.Event() 21 | self._reload_thread = threading.Thread(target=self._periodic_reload, daemon=True) 22 | self._reload_thread.start() 23 | 24 | 25 | def _load_recommendations(self, current_stored_scan: Optional[str]) -> Tuple[Optional[str], Optional[Dict[str, WorkloadRecommendation]]]: 26 | latest_scan_id, latest_scan = self.dal.get_latest_krr_scan(current_stored_scan) 27 | 28 | if not latest_scan: 29 | return None, None 30 | 31 | # group workload containers recommendations, into WorkloadRecommendation object 32 | scan_recommendations: Dict[str, WorkloadRecommendation] = {} 33 | for container_recommendation in latest_scan: 34 | try: 35 | store_key = self._store_key( 36 | name=container_recommendation["name"], 37 | namespace=container_recommendation["namespace"], 38 | kind=container_recommendation["kind"], 39 | ) 40 | 41 | recommendation = ContainerRecommendation.build(container_recommendation) 42 | if recommendation: # if a valid recommendation was created, connect it to the workload 43 | workload_recommendation: WorkloadRecommendation = scan_recommendations.get(store_key, None) 44 | if not workload_recommendation: 45 | workload_recommendation = WorkloadRecommendation(workload_key=store_key) 46 | scan_recommendations[store_key] = workload_recommendation 47 | 48 | workload_recommendation.add(container_recommendation["container"], recommendation) 49 | except Exception: 50 | logging.exception(f"Failed to load container recommendation: {container_recommendation}") 51 | 52 | return latest_scan_id, scan_recommendations 53 | 54 | def _store_key(self, name: str, namespace: str, kind: str) -> str: 55 | return f"{namespace}/{name}/{kind}" 56 | 57 | def _reload_recommendations(self): 58 | scan_id, new_recommendations = self._load_recommendations(self.scan_id) 59 | if new_recommendations is not None: 60 | with self._recommendations_lock: 61 | self.recommendations = new_recommendations 62 | self.scan_id = scan_id 63 | logging.info("Recommendations reloaded successfully") 64 | logging.debug("Loaded recommendations: %s", new_recommendations) 65 | 66 | def _periodic_reload(self): 67 | while not self._stop_event.wait(self.reload_interval): 68 | try: 69 | self._reload_recommendations() 70 | except Exception as e: 71 | logging.error(f"Failed to reload recommendations: {e}") 72 | 73 | def stop(self): 74 | self._stop_event.set() 75 | self._reload_thread.join() 76 | 77 | def get_recommendations(self, name: str, namespace: str, kind: str) -> Optional[WorkloadRecommendation]: 78 | with self._recommendations_lock: 79 | return self.recommendations.get(self._store_key(name, namespace, kind)) 80 | 81 | -------------------------------------------------------------------------------- /enforcer/utils.py: -------------------------------------------------------------------------------- 1 | 
import base64 2 | import os 3 | 4 | import certifi 5 | 6 | CUSTOM_CERTIFICATE_PATH = "/tmp/custom_ca.pem" 7 | 8 | 9 | def append_custom_certificate(custom_ca: str) -> None: 10 | with open(certifi.where(), "ab") as outfile: 11 | outfile.write(base64.b64decode(custom_ca)) 12 | 13 | os.environ["WEBSOCKET_CLIENT_CA_BUNDLE"] = certifi.where() 14 | 15 | 16 | def create_temporary_certificate(custom_ca: str) -> None: 17 | with open(certifi.where(), "rb") as base_cert: 18 | base_cert_content = base_cert.read() 19 | 20 | with open(CUSTOM_CERTIFICATE_PATH, "wb") as outfile: 21 | outfile.write(base_cert_content) 22 | outfile.write(base64.b64decode(custom_ca)) 23 | 24 | os.environ["REQUESTS_CA_BUNDLE"] = CUSTOM_CERTIFICATE_PATH 25 | os.environ["WEBSOCKET_CLIENT_CA_BUNDLE"] = CUSTOM_CERTIFICATE_PATH 26 | certifi.where = lambda: CUSTOM_CERTIFICATE_PATH 27 | 28 | 29 | def add_custom_certificate(custom_ca: str) -> bool: 30 | if not custom_ca: 31 | return False 32 | 33 | # NOTE: Sometimes (Openshift) the certifi.where() is not writable, so we need to 34 | # use a temporary file in case of PermissionError. 35 | try: 36 | append_custom_certificate(custom_ca) 37 | except PermissionError: 38 | create_temporary_certificate(custom_ca) 39 | 40 | return True 41 | -------------------------------------------------------------------------------- /examples/custom_formatter.py: -------------------------------------------------------------------------------- 1 | # This is an example of how to create your own custom formatter 2 | 3 | from __future__ import annotations 4 | 5 | import robusta_krr 6 | from robusta_krr.api import formatters 7 | from robusta_krr.api.models import Result 8 | 9 | 10 | # This is a custom formatter 11 | # It will be available to the CLI as `my_formatter` 12 | # Rich console will be enabled in this case, so the output will be colored and formatted 13 | @formatters.register(rich_console=True) 14 | def my_formatter(result: Result) -> str: 15 | # Return the custom-formatted output 16 | return "Custom formatter" 17 | 18 | 19 | # Running this file will register the formatter and make it available to the CLI 20 | # Run it as `python ./custom_formatter.py simple --formatter my_formatter` 21 | if __name__ == "__main__": 22 | robusta_krr.run() 23 | -------------------------------------------------------------------------------- /examples/custom_severity_calculator.py: -------------------------------------------------------------------------------- 1 | # This is an example of how to create your own custom severity calculator 2 | 3 | from __future__ import annotations 4 | 5 | from typing import Optional 6 | 7 | import robusta_krr 8 | from robusta_krr.api.models import ResourceType, Severity, register_severity_calculator 9 | 10 | 11 | @register_severity_calculator(ResourceType.CPU) 12 | def percentage_severity_calculator( 13 | current: Optional[float], recommended: Optional[float], resource_type: ResourceType 14 | ) -> Severity: 15 | """ 16 | This is an example of how to create your own custom severity calculator. 17 | You can use this decorator to bind a severity calculator function to a resource type. 18 | The function will be called with the current value, the recommended value and the resource type. 19 | The function should return a Severity enum value. 20 | 21 | If you have the same calculation for multiple resource types, you can use the `register_severity_calculator` decorator multiple times. 22 | Then, the function will be called for each resource type and you can use the resource type to distinguish between them.
23 | 24 | Keep in mind that you cannot choose the severity calculator for a resource type from the CLI - the last one registered for the resource type will be used. 25 | """ 26 | 27 | if current is None and recommended is None: 28 | return Severity.GOOD 29 | if current is None or recommended is None: 30 | return Severity.WARNING 31 | 32 | diff = abs(current - recommended) / current 33 | if diff >= 0.5: 34 | return Severity.CRITICAL 35 | elif diff >= 0.25: 36 | return Severity.WARNING 37 | elif diff >= 0.1: 38 | return Severity.OK 39 | else: 40 | return Severity.GOOD 41 | 42 | 43 | # Running this file will register the severity calculator and make it available to the CLI 44 | # Run it as `python ./custom_severity_calculator.py simple` 45 | if __name__ == "__main__": 46 | robusta_krr.run() 47 | -------------------------------------------------------------------------------- /examples/custom_strategy.py: -------------------------------------------------------------------------------- 1 | # This is an example of how to create your own custom strategy 2 | 3 | import pydantic as pd 4 | 5 | import robusta_krr 6 | from robusta_krr.api.models import K8sObjectData, MetricsPodData, ResourceRecommendation, ResourceType, RunResult 7 | from robusta_krr.api.strategies import BaseStrategy, StrategySettings 8 | from robusta_krr.core.integrations.prometheus.metrics import MaxMemoryLoader, PercentileCPULoader 9 | 10 | 11 | # Providing description to the settings will make it available in the CLI help 12 | class CustomStrategySettings(StrategySettings): 13 | param_1: float = pd.Field(99, gt=0, description="First example parameter") 14 | param_2: float = pd.Field(105_000, gt=0, description="Second example parameter") 15 | 16 | 17 | class CustomStrategy(BaseStrategy[CustomStrategySettings]): 18 | """ 19 | A custom strategy that uses the provided parameters for CPU and memory. 20 | Made only to demonstrate how to create a custom strategy. 21 | """ 22 | 23 | display_name = "custom" # The name of the strategy 24 | rich_console = True # Whether to use rich console for the CLI 25 | metrics = [PercentileCPULoader(90), MaxMemoryLoader] # The metrics to use for the strategy 26 | 27 | def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult: 28 | return { 29 | ResourceType.CPU: ResourceRecommendation(request=self.settings.param_1, limit=None), 30 | ResourceType.Memory: ResourceRecommendation(request=self.settings.param_2, limit=self.settings.param_2), 31 | } 32 | 33 | 34 | # Running this file will register the strategy and make it available to the CLI 35 | # Run it as `python ./custom_strategy.py custom` (the command name comes from `display_name`) 36 | if __name__ == "__main__": 37 | robusta_krr.run() 38 | -------------------------------------------------------------------------------- /helm/krr-enforcer/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line.
4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /helm/krr-enforcer/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: krr-enforcer 3 | description: KRR enforcer - auto apply KRR recommendations 4 | type: application 5 | 6 | version: 0.3.1 7 | appVersion: 0.3.1 8 | -------------------------------------------------------------------------------- /helm/krr-enforcer/templates/enforcer-cert-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: {{ .Release.Name }}-krr-enforcer-cert-job 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | app.kubernetes.io/component: krr-enforcer-cert-job 8 | annotations: 9 | helm.sh/hook: pre-install,pre-upgrade 10 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded 11 | helm.sh/hook-weight: "-5" 12 | spec: 13 | backoffLimit: 3 14 | template: 15 | metadata: 16 | labels: 17 | app.kubernetes.io/name: krr-enforcer 18 | app.kubernetes.io/instance: {{ .Release.Name }} 19 | app.kubernetes.io/component: krr-enforcer-cert-job 20 | spec: 21 | serviceAccountName: {{ .Release.Name }}-krr-enforcer-cert-job 22 | restartPolicy: OnFailure 23 | volumes: 24 | - name: workdir 25 | emptyDir: {} 26 | containers: 27 | - name: cert-job 28 | image: "bitnami/kubectl:1.30" 29 | workingDir: /tmp/certs 30 | volumeMounts: 31 | - name: workdir 32 | mountPath: /tmp/certs 33 | command: 34 | - /bin/bash 35 | - -c 36 | - | 37 | set -e 38 | 39 | # Generate a CA key and certificate 40 | echo "Generating CA certificate..." 41 | openssl genrsa -out ca.key 2048 42 | openssl req -x509 -new -nodes -key ca.key -subj "/CN=robusta-krr-enforcer-ca" -days 365 -out ca.crt 43 | 44 | # Generate a server key and certificate signing request (CSR) 45 | echo "Generating server certificate..." 46 | SERVICE_NAME={{ .Release.Name }}-krr-enforcer 47 | NAMESPACE={{ .Release.Namespace }} 48 | DNS_NAME=${SERVICE_NAME}.${NAMESPACE}.svc 49 | 50 | openssl genrsa -out server.key 2048 51 | cat > server.conf < server-ext.conf </dev/null 2>&1; then 145 | echo "Restarting enforcer deployment..." 146 | kubectl rollout restart deployment ${SERVICE_NAME} -n ${NAMESPACE} 147 | else 148 | echo "Deployment ${SERVICE_NAME} does not exist yet, skipping restart" 149 | fi 150 | 151 | echo "Job completed successfully!" 
152 | --- 153 | apiVersion: v1 154 | kind: ServiceAccount 155 | metadata: 156 | name: {{ .Release.Name }}-krr-enforcer-cert-job 157 | namespace: {{ .Release.Namespace }} 158 | labels: 159 | app.kubernetes.io/component: krr-enforcer-cert-job 160 | annotations: 161 | helm.sh/hook: pre-install,pre-upgrade 162 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded 163 | helm.sh/hook-weight: "-6" 164 | --- 165 | apiVersion: rbac.authorization.k8s.io/v1 166 | kind: ClusterRole 167 | metadata: 168 | name: {{ .Release.Name }}-krr-enforcer-cert-job 169 | labels: 170 | app.kubernetes.io/component: krr-enforcer-cert-job 171 | annotations: 172 | helm.sh/hook: pre-install,pre-upgrade 173 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded 174 | helm.sh/hook-weight: "-6" 175 | rules: 176 | - apiGroups: [""] 177 | resources: ["secrets"] 178 | verbs: ["create", "get", "update", "patch"] 179 | - apiGroups: ["admissionregistration.k8s.io"] 180 | resources: ["mutatingwebhookconfigurations"] 181 | verbs: ["create", "get", "update", "patch"] 182 | - apiGroups: ["apps"] 183 | resources: ["deployments"] 184 | verbs: ["get", "patch"] 185 | --- 186 | apiVersion: rbac.authorization.k8s.io/v1 187 | kind: ClusterRoleBinding 188 | metadata: 189 | name: {{ .Release.Name }}-krr-enforcer-cert-job 190 | labels: 191 | app.kubernetes.io/component: krr-enforcer-cert-job 192 | annotations: 193 | helm.sh/hook: pre-install,pre-upgrade 194 | helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded 195 | helm.sh/hook-weight: "-6" 196 | roleRef: 197 | apiGroup: rbac.authorization.k8s.io 198 | kind: ClusterRole 199 | name: {{ .Release.Name }}-krr-enforcer-cert-job 200 | subjects: 201 | - kind: ServiceAccount 202 | name: {{ .Release.Name }}-krr-enforcer-cert-job 203 | namespace: {{ .Release.Namespace }} 204 | -------------------------------------------------------------------------------- /helm/krr-enforcer/templates/enforcer-service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: {{ .Release.Name }}-krr-enforcer 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | app.kubernetes.io/component: krr-enforcer 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . 
| nindent 4 }} 11 | {{- end }} 12 | --- 13 | apiVersion: rbac.authorization.k8s.io/v1 14 | kind: ClusterRole 15 | metadata: 16 | name: {{ .Release.Name }}-krr-enforcer 17 | labels: 18 | app.kubernetes.io/component: krr-enforcer 19 | rules: 20 | - apiGroups: [""] 21 | resources: ["pods"] 22 | verbs: ["get", "list", "watch"] 23 | - apiGroups: ["apps"] 24 | resources: ["replicasets"] 25 | verbs: ["get", "list", "watch"] 26 | --- 27 | apiVersion: rbac.authorization.k8s.io/v1 28 | kind: ClusterRoleBinding 29 | metadata: 30 | name: {{ .Release.Name }}-krr-enforcer 31 | labels: 32 | app.kubernetes.io/component: krr-enforcer 33 | roleRef: 34 | apiGroup: rbac.authorization.k8s.io 35 | kind: ClusterRole 36 | name: {{ .Release.Name }}-krr-enforcer 37 | subjects: 38 | - kind: ServiceAccount 39 | name: {{ .Release.Name }}-krr-enforcer 40 | namespace: {{ .Release.Namespace }} 41 | -------------------------------------------------------------------------------- /helm/krr-enforcer/templates/enforcer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ .Release.Name }}-krr-enforcer 5 | namespace: {{ .Release.Namespace }} 6 | labels: 7 | app.kubernetes.io/component: krr-enforcer 8 | {{- with .Values.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | spec: 13 | replicas: 1 14 | selector: 15 | matchLabels: 16 | app.kubernetes.io/name: krr-enforcer 17 | app.kubernetes.io/instance: {{ .Release.Name }} 18 | app.kubernetes.io/component: krr-enforcer 19 | template: 20 | metadata: 21 | labels: 22 | app.kubernetes.io/name: krr-enforcer 23 | app.kubernetes.io/instance: {{ .Release.Name }} 24 | app.kubernetes.io/component: krr-enforcer 25 | annotations: 26 | admission.robusta.dev/krr-mutation-mode: ignore 27 | {{- with .Values.annotations }} 28 | {{- toYaml . | nindent 8 }} 29 | {{- end }} 30 | spec: 31 | serviceAccountName: {{ .Release.Name }}-krr-enforcer 32 | {{- with .Values.imagePullSecrets }} 33 | imagePullSecrets: 34 | {{- toYaml . | nindent 8 }} 35 | {{- end }} 36 | {{- with .Values.nodeSelector }} 37 | nodeSelector: 38 | {{- toYaml . | nindent 8 }} 39 | {{- end }} 40 | {{- with .Values.tolerations }} 41 | tolerations: 42 | {{- toYaml . | nindent 8 }} 43 | {{- end }} 44 | {{- with .Values.priorityClassName }} 45 | priorityClassName: {{ . }} 46 | {{- end }} 47 | {{- with .Values.securityContext.pod }} 48 | securityContext: 49 | {{- toYaml . | nindent 8 }} 50 | {{- end }} 51 | volumes: 52 | - name: playbooks-config-secret 53 | secret: 54 | secretName: robusta-playbooks-config-secret 55 | optional: true 56 | - name: certs 57 | secret: 58 | secretName: {{ .Release.Name }}-krr-enforcer-certs 59 | containers: 60 | - name: enforcer 61 | {{- if .Values.fullImage }} 62 | image: "{{ .Values.fullImage }}" 63 | {{- else }} 64 | image: "{{ .Values.image.repository }}/{{ .Values.image.name }}:{{ .Values.image.tag }}" 65 | {{- end }} 66 | imagePullPolicy: {{ .Values.imagePullPolicy }} 67 | {{- with .Values.securityContext.container }} 68 | securityContext: 69 | {{- toYaml . 
| nindent 12 }} 70 | {{- end }} 71 | ports: 72 | - name: https 73 | containerPort: 8443 74 | protocol: TCP 75 | volumeMounts: 76 | - name: certs 77 | mountPath: /etc/webhook/certs 78 | readOnly: true 79 | - name: playbooks-config-secret 80 | mountPath: /etc/robusta/config 81 | livenessProbe: 82 | httpGet: 83 | path: /health 84 | port: https 85 | scheme: HTTPS 86 | initialDelaySeconds: 30 87 | periodSeconds: 10 88 | readinessProbe: 89 | httpGet: 90 | path: /health 91 | port: https 92 | scheme: HTTPS 93 | initialDelaySeconds: 5 94 | periodSeconds: 5 95 | resources: 96 | {{- if .Values.resources.requests }} 97 | requests: 98 | {{- if .Values.resources.requests.cpu }} 99 | cpu: {{ .Values.resources.requests.cpu }} 100 | {{- end }} 101 | {{- if .Values.resources.requests.memory }} 102 | memory: {{ .Values.resources.requests.memory }} 103 | {{- end }} 104 | {{- end }} 105 | {{- if .Values.resources.limits }} 106 | limits: 107 | {{- if .Values.resources.limits.cpu }} 108 | cpu: {{ .Values.resources.limits.cpu }} 109 | {{- end }} 110 | {{- if .Values.resources.limits.memory }} 111 | memory: {{ .Values.resources.limits.memory }} 112 | {{- end }} 113 | {{- end }} 114 | env: 115 | - name: ENFORCER_SSL_KEY_FILE 116 | value: "/etc/webhook/certs/tls.key" 117 | - name: ENFORCER_SSL_CERT_FILE 118 | value: "/etc/webhook/certs/tls.crt" 119 | - name: LOG_LEVEL 120 | value: {{ .Values.logLevel | quote }} 121 | {{- if .Values.certificate }} 122 | - name: CERTIFICATE 123 | value: {{ .Values.certificate | quote }} 124 | {{- end }} 125 | {{- if .Values.additionalEnvVars }} 126 | {{- toYaml .Values.additionalEnvVars | nindent 12 }} 127 | {{- end }} 128 | 129 | --- 130 | apiVersion: v1 131 | kind: Service 132 | metadata: 133 | name: {{ .Release.Name }}-krr-enforcer 134 | namespace: {{ .Release.Namespace }} 135 | labels: 136 | app.kubernetes.io/name: krr-enforcer 137 | app.kubernetes.io/instance: {{ .Release.Name }} 138 | app.kubernetes.io/component: krr-enforcer 139 | {{- with .Values.service.annotations }} 140 | annotations: 141 | {{- toYaml . | nindent 4 }} 142 | {{- end }} 143 | spec: 144 | type: ClusterIP 145 | ports: 146 | - port: 443 147 | targetPort: https 148 | protocol: TCP 149 | name: https 150 | selector: 151 | app.kubernetes.io/name: krr-enforcer 152 | app.kubernetes.io/instance: {{ .Release.Name }} 153 | app.kubernetes.io/component: krr-enforcer 154 | -------------------------------------------------------------------------------- /helm/krr-enforcer/templates/service-monitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceMonitor.enabled }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | name: {{ .Release.Name }}-krr-enforcer 6 | namespace: {{ .Release.Namespace }} 7 | labels: 8 | app.kubernetes.io/name: krr-enforcer 9 | app.kubernetes.io/instance: {{ .Release.Name }} 10 | app.kubernetes.io/component: krr-enforcer 11 | {{- with .Values.serviceMonitor.labels }} 12 | {{- toYaml . | nindent 4 }} 13 | {{- end }} 14 | {{- with .Values.serviceMonitor.annotations }} 15 | annotations: 16 | {{- toYaml . | nindent 4 }} 17 | {{- end }} 18 | spec: 19 | selector: 20 | matchLabels: 21 | app.kubernetes.io/name: krr-enforcer 22 | app.kubernetes.io/instance: {{ .Release.Name }} 23 | app.kubernetes.io/component: krr-enforcer 24 | endpoints: 25 | - port: https 26 | path: /metrics 27 | scheme: https 28 | tlsConfig: 29 | insecureSkipVerify: true 30 | {{- with .Values.serviceMonitor.interval }} 31 | interval: {{ . 
}} 32 | {{- end }} 33 | {{- with .Values.serviceMonitor.scrapeTimeout }} 34 | scrapeTimeout: {{ . }} 35 | {{- end }} 36 | {{- end }} -------------------------------------------------------------------------------- /helm/krr-enforcer/values.yaml: -------------------------------------------------------------------------------- 1 | certificate: "" # base64 encoded 2 | logLevel: INFO 3 | 4 | # fullImage: ~ # full image path can be used to override image.repository/image.name:image.tag 5 | 6 | image: 7 | repository: us-central1-docker.pkg.dev/genuine-flight-317411/devel 8 | name: krr-enforcer 9 | tag: 0.3.1 10 | imagePullPolicy: IfNotPresent 11 | resources: 12 | requests: 13 | cpu: 100m 14 | memory: 256Mi 15 | limits: 16 | cpu: ~ 17 | additionalEnvVars: [] 18 | priorityClassName: "" 19 | tolerations: [] 20 | annotations: {} 21 | nodeSelector: ~ 22 | imagePullSecrets: [] 23 | securityContext: 24 | container: 25 | allowPrivilegeEscalation: false 26 | capabilities: {} 27 | privileged: false 28 | readOnlyRootFilesystem: false 29 | runAsUser: 1000 30 | pod: {} 31 | service: 32 | annotations: {} 33 | serviceAccount: 34 | annotations: {} 35 | serviceMonitor: 36 | enabled: true 37 | interval: 30s 38 | scrapeTimeout: 10s 39 | labels: {} 40 | annotations: {} 41 | -------------------------------------------------------------------------------- /helm/upload_chart.sh: -------------------------------------------------------------------------------- 1 | rm -rf ./tmp 2 | mkdir ./tmp 3 | cd ./tmp 4 | helm package ../krr-enforcer 5 | mkdir krr-enforcer 6 | mv *.tgz ./krr-enforcer 7 | curl https://robusta-charts.storage.googleapis.com/index.yaml > index.yaml 8 | helm repo index --merge index.yaml --url https://robusta-charts.storage.googleapis.com ./krr-enforcer 9 | gsutil rsync -r krr-enforcer gs://robusta-charts 10 | gsutil setmeta -h "Cache-Control:max-age=0" gs://robusta-charts/index.yaml 11 | cd ../ 12 | rm -rf ./tmp 13 | -------------------------------------------------------------------------------- /images/krr-datasources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/krr-datasources.png -------------------------------------------------------------------------------- /images/krr-other-integrations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/krr-other-integrations.png -------------------------------------------------------------------------------- /images/krr_slack_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/krr_slack_example.png -------------------------------------------------------------------------------- /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/logo.png -------------------------------------------------------------------------------- /images/screenshot.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/screenshot.jpeg -------------------------------------------------------------------------------- 
/images/ui_recommendation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/ui_recommendation.png -------------------------------------------------------------------------------- /images/ui_screenshot_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/ui_screenshot_new.png -------------------------------------------------------------------------------- /images/ui_video.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/images/ui_video.gif -------------------------------------------------------------------------------- /intro.txt: -------------------------------------------------------------------------------- 1 | [bold magenta] 2 | _____ _ _ _ _______ _____ 3 | | __ \ | | | | | |/ / __ \| __ \ 4 | | |__) |___ | |__ _ _ ___| |_ __ _ | ' /| |__) | |__) | 5 | | _ // _ \| '_ \| | | / __| __/ _` | | < | _ /| _ / 6 | | | \ \ (_) | |_) | |_| \__ \ || (_| | | . \| | \ \| | \ \ 7 | |_| \_\___/|_.__/ \__,_|___/\__\__,_| |_|\_\_| \_\_| \_\ 8 | 9 | 10 | Thanks for using Robusta KRR. If you have any questions or feedback, please feel free to reach out to us at 11 | https://github.com/robusta-dev/krr/issues 12 | 13 | Watch our latest video to optimize your workloads and save costs: https://www.youtube.com/watch?v=TYRA2QcDIuI 14 | 15 | [/bold magenta] -------------------------------------------------------------------------------- /krr.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from robusta_krr.common.ssl_utils import add_custom_certificate 4 | 5 | ADDITIONAL_CERTIFICATE: str = os.environ.get("CERTIFICATE", "") 6 | 7 | if add_custom_certificate(ADDITIONAL_CERTIFICATE): 8 | print("added custom certificate") 9 | 10 | # DO NOT ADD ANY CODE ABOVE THIS 11 | # ADDING IMPORTS BEFORE ADDING THE CUSTOM CERTS MIGHT INIT HTTP CLIENTS THAT DOESN'T RESPECT THE CUSTOM CERT 12 | 13 | from robusta_krr import run 14 | 15 | if __name__ == "__main__": 16 | run() 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "robusta-krr" 3 | version = "1.8.2-dev" 4 | description = "Robusta's Resource Recommendation engine for Kubernetes" 5 | authors = ["Pavel Zhukov <33721692+LeaveMyYard@users.noreply.github.com>"] 6 | license = "MIT" 7 | readme = "README.md" 8 | packages = [{ include = "robusta_krr" }] 9 | 10 | [tool.black] 11 | line-length = 120 12 | target-version = ['py39'] 13 | 14 | [tool.isort] 15 | line_length = 120 16 | multi_line_output = 3 17 | include_trailing_comma = true 18 | 19 | [tool.mypy] 20 | plugins = "numpy.typing.mypy_plugin,pydantic.mypy" 21 | 22 | [tool.poetry.scripts] 23 | krr = "robusta_krr.main:run" 24 | 25 | [tool.poetry.dependencies] 26 | python = ">=3.9,<=3.12.3" 27 | typer = { extras = ["all"], version = "^0.7.0" } 28 | pydantic = "^1.10.7" 29 | kubernetes = "^26.1.0" 30 | prometheus-api-client = "0.5.3" 31 | numpy = ">=1.26.4,<1.27.0" 32 | alive-progress = "^3.1.2" 33 | prometrix = "0.2.0" 34 | slack-sdk = "^3.21.3" 35 | pandas = "2.2.2" 36 | requests = "2.32.0" 37 | pyyaml = "6.0.1" 38 | 
typing-extensions = "4.6.0" 39 | idna = "3.7" 40 | urllib3 = "^1.26.20" 41 | setuptools = "^70.0.0" 42 | zipp = "^3.19.1" 43 | tenacity = "^9.0.0" 44 | 45 | 46 | 47 | [tool.poetry.group.dev.dependencies] 48 | mypy = "^1.0.1" 49 | black = "^23.1.0" 50 | isort = "^5.12.0" 51 | flake8 = "^6.0.0" 52 | types-pyyaml = "^6.0.12.8" 53 | types-cachetools = "^5.3.0.4" 54 | types-requests = "^2.28.11.15" 55 | pyinstaller = "^5.9.0" 56 | pytest = "^7.2.2" 57 | 58 | [build-system] 59 | requires = ["poetry-core"] 60 | build-backend = "poetry.core.masonry.api" 61 | 62 | 63 | [project] 64 | name = "robusta_krr" 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | about-time==4.2.1 ; python_version >= "3.9" and python_full_version < "3.13" 2 | alive-progress==3.1.5 ; python_version >= "3.9" and python_full_version < "3.13" 3 | boto3==1.34.62 ; python_version >= "3.9" and python_full_version < "3.13" 4 | botocore==1.34.62 ; python_version >= "3.9" and python_full_version < "3.13" 5 | cachetools==5.3.3 ; python_version >= "3.9" and python_full_version < "3.13" 6 | certifi==2024.2.2 ; python_version >= "3.9" and python_full_version < "3.13" 7 | charset-normalizer==3.3.2 ; python_version >= "3.9" and python_full_version < "3.13" 8 | click==8.1.7 ; python_version >= "3.9" and python_full_version < "3.13" 9 | colorama==0.4.6 ; python_version >= "3.9" and python_full_version < "3.13" 10 | commonmark==0.9.1 ; python_version >= "3.9" and python_full_version < "3.13" 11 | contourpy==1.2.0 ; python_version >= "3.9" and python_full_version < "3.13" 12 | cycler==0.12.1 ; python_version >= "3.9" and python_full_version < "3.13" 13 | dateparser==1.2.0 ; python_version >= "3.9" and python_full_version < "3.13" 14 | fonttools==4.49.0 ; python_version >= "3.9" and python_full_version < "3.13" 15 | google-auth==2.28.2 ; python_version >= "3.9" and python_full_version < "3.13" 16 | grapheme==0.6.0 ; python_version >= "3.9" and python_full_version < "3.13" 17 | httmock==1.4.0 ; python_version >= "3.9" and python_full_version < "3.13" 18 | idna==3.7 ; python_version >= "3.9" and python_full_version < "3.13" 19 | importlib-resources==6.3.0 ; python_version >= "3.9" and python_version < "3.10" 20 | jmespath==1.0.1 ; python_version >= "3.9" and python_full_version < "3.13" 21 | kiwisolver==1.4.5 ; python_version >= "3.9" and python_full_version < "3.13" 22 | kubernetes==26.1.0 ; python_version >= "3.9" and python_full_version < "3.13" 23 | matplotlib==3.8.3 ; python_version >= "3.9" and python_full_version < "3.13" 24 | numpy==1.26.4 ; python_version >= "3.9" and python_full_version < "3.13" 25 | oauthlib==3.2.2 ; python_version >= "3.9" and python_full_version < "3.13" 26 | packaging==24.0 ; python_version >= "3.9" and python_full_version < "3.13" 27 | pandas==2.2.2 ; python_version >= "3.9" and python_full_version < "3.13" 28 | pillow==10.3.0 ; python_version >= "3.9" and python_full_version < "3.13" 29 | prometheus-api-client==0.5.3 ; python_version >= "3.9" and python_full_version < "3.13" 30 | prometrix==0.1.17 ; python_version >= "3.9" and python_full_version < "3.13" 31 | pyasn1-modules==0.3.0 ; python_version >= "3.9" and python_full_version < "3.13" 32 | pyasn1==0.5.1 ; python_version >= "3.9" and python_full_version < "3.13" 33 | pydantic==1.10.15 ; python_version >= "3.9" and python_full_version < "3.13" 34 | pygments==2.17.2 ; python_version >= "3.9" and python_full_version < "3.13" 
35 | pyparsing==3.1.2 ; python_version >= "3.9" and python_full_version < "3.13" 36 | python-dateutil==2.9.0.post0 ; python_version >= "3.9" and python_full_version < "3.13" 37 | pytz==2024.1 ; python_version >= "3.9" and python_full_version < "3.13" 38 | pyyaml==6.0.1 ; python_version >= "3.9" and python_full_version < "3.13" 39 | regex==2023.12.25 ; python_version >= "3.9" and python_full_version < "3.13" 40 | requests-oauthlib==1.4.0 ; python_version >= "3.9" and python_full_version < "3.13" 41 | requests==2.32.0 ; python_version >= "3.9" and python_full_version < "3.13" 42 | rich==12.6.0 ; python_version >= "3.9" and python_full_version < "3.13" 43 | rsa==4.9 ; python_version >= "3.9" and python_full_version < "3.13" 44 | s3transfer==0.10.0 ; python_version >= "3.9" and python_full_version < "3.13" 45 | setuptools==70.3.0 ; python_version >= "3.9" and python_full_version < "3.13" 46 | shellingham==1.5.4 ; python_version >= "3.9" and python_full_version < "3.13" 47 | six==1.16.0 ; python_version >= "3.9" and python_full_version < "3.13" 48 | slack-sdk==3.27.1 ; python_version >= "3.9" and python_full_version < "3.13" 49 | typer[all]==0.7.0 ; python_version >= "3.9" and python_full_version < "3.13" 50 | typing-extensions==4.6.0 ; python_version >= "3.9" and python_full_version < "3.13" 51 | tzdata==2024.1 ; python_version >= "3.9" and python_full_version < "3.13" 52 | tzlocal==5.2 ; python_version >= "3.9" and python_full_version < "3.13" 53 | urllib3==1.26.19 ; python_version >= "3.9" and python_full_version < "3.13" 54 | websocket-client==1.7.0 ; python_version >= "3.9" and python_full_version < "3.13" 55 | zipp==3.19.2 ; python_version >= "3.9" and python_version < "3.13" 56 | tenacity==9.0.0 ; python_version >= "3.9" and python_version < "3.13" -------------------------------------------------------------------------------- /robusta_krr/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import run 2 | 3 | __version__ = "dev" 4 | __all__ = ["run", "__version__"] 5 | -------------------------------------------------------------------------------- /robusta_krr/api/formatters.py: -------------------------------------------------------------------------------- 1 | from robusta_krr.core.abstract.formatters import find, list_available, register 2 | 3 | __all__ = ["register", "find", "list_available"] 4 | -------------------------------------------------------------------------------- /robusta_krr/api/models.py: -------------------------------------------------------------------------------- 1 | from robusta_krr.core.abstract.strategies import MetricsPodData, PodsTimeData, ResourceRecommendation, RunResult 2 | from robusta_krr.core.models.allocations import RecommendationValue, ResourceAllocations, ResourceType 3 | from robusta_krr.core.models.objects import K8sObjectData, PodData 4 | from robusta_krr.core.models.result import ResourceScan, Result 5 | from robusta_krr.core.models.severity import Severity, register_severity_calculator 6 | 7 | __all__ = [ 8 | "ResourceType", 9 | "ResourceAllocations", 10 | "RecommendationValue", 11 | "K8sObjectData", 12 | "PodData", 13 | "Result", 14 | "Severity", 15 | "register_severity_calculator", 16 | "ResourceScan", 17 | "ResourceRecommendation", 18 | "PodsTimeData", 19 | "MetricsPodData", 20 | "RunResult", 21 | ] 22 | -------------------------------------------------------------------------------- /robusta_krr/api/strategies.py: 
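The `register`/`find`/`list_available` trio re-exported in `robusta_krr/api/formatters.py` above is the whole public surface of the formatter registry. A small hedged sketch of how they fit together follows; the formatter body and name are made up, and constructing a `Result` is omitted because its schema is not shown here.

from robusta_krr.api import formatters
from robusta_krr.api.models import Result


@formatters.register(rich_console=False)
def one_liner(result: Result) -> str:
    # A made-up formatter: a real one would walk the fields of the Result model
    return "krr finished"


print(formatters.list_available())  # includes 'one_liner' (registered under the function name) plus any built-ins already imported
fmt = formatters.find("one_liner")  # lookup by the same name; raises ValueError if missing
# fmt(result) is what the runner calls with the computed Result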
-------------------------------------------------------------------------------- 1 | from robusta_krr.core.abstract.strategies import BaseStrategy, StrategySettings 2 | 3 | __all__ = ["BaseStrategy", "StrategySettings"] 4 | -------------------------------------------------------------------------------- /robusta_krr/common/ssl_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import os 3 | 4 | import certifi 5 | 6 | CUSTOM_CERTIFICATE_PATH = "/tmp/custom_ca.pem" 7 | 8 | 9 | def append_custom_certificate(custom_ca: str) -> None: 10 | with open(certifi.where(), "ab") as outfile: 11 | outfile.write(base64.b64decode(custom_ca)) 12 | 13 | os.environ["WEBSOCKET_CLIENT_CA_BUNDLE"] = certifi.where() 14 | 15 | 16 | def create_temporary_certificate(custom_ca: str) -> None: 17 | with open(certifi.where(), "rb") as base_cert: 18 | base_cert_content = base_cert.read() 19 | 20 | with open(CUSTOM_CERTIFICATE_PATH, "wb") as outfile: 21 | outfile.write(base_cert_content) 22 | outfile.write(base64.b64decode(custom_ca)) 23 | 24 | os.environ["REQUESTS_CA_BUNDLE"] = CUSTOM_CERTIFICATE_PATH 25 | os.environ["WEBSOCKET_CLIENT_CA_BUNDLE"] = CUSTOM_CERTIFICATE_PATH 26 | certifi.where = lambda: CUSTOM_CERTIFICATE_PATH 27 | 28 | 29 | def add_custom_certificate(custom_ca: str) -> bool: 30 | if not custom_ca: 31 | return False 32 | 33 | # NOTE: Sometimes (Openshift) the certifi.where() is not writable, so we need to 34 | # use a temporary file in case of PermissionError. 35 | try: 36 | append_custom_certificate(custom_ca) 37 | except PermissionError: 38 | create_temporary_certificate(custom_ca) 39 | 40 | return True 41 | -------------------------------------------------------------------------------- /robusta_krr/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/robusta-dev/krr/f220c3156fd3d71ab3bfa7b75b9c31338cc21ce2/robusta_krr/core/__init__.py -------------------------------------------------------------------------------- /robusta_krr/core/abstract/formatters.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Callable, Optional 4 | 5 | from robusta_krr.core.models.result import Result 6 | 7 | FormatterFunc = Callable[[Result], Any] 8 | 9 | FORMATTERS_REGISTRY: dict[str, FormatterFunc] = {} 10 | 11 | 12 | # NOTE: Here asterisk is used to make the argument `rich_console` keyword-only 13 | # This is done to avoid the following usage, where it is unclear what the boolean value is for: 14 | # @register("My Formatter", True) 15 | # def my_formatter(result: Result) -> str: 16 | # return "My formatter" 17 | # 18 | # Instead, the following usage is enforced: 19 | # @register("My Formatter", rich_console=True) 20 | # def my_formatter(result: Result) -> str: 21 | # return "My formatter" 22 | 23 | 24 | def register( 25 | display_name: Optional[str] = None, *, rich_console: bool = False 26 | ) -> Callable[[FormatterFunc], FormatterFunc]: 27 | """ 28 | A decorator to register a formatter function. 29 | 30 | Args: 31 | display_name (str, optional): The name to use for the formatter in the registry. 32 | rich_console (bool): Whether or not the formatter is for a rich console. Defaults to False. 33 | 34 | Returns: 35 | Callable[[FormatterFunc], FormatterFunc]: The decorator function. 
36 | """ 37 | 38 | def decorator(func: FormatterFunc) -> FormatterFunc: 39 | name = display_name or func.__name__ 40 | 41 | FORMATTERS_REGISTRY[name] = func 42 | 43 | func.__display_name__ = name # type: ignore 44 | func.__rich_console__ = rich_console # type: ignore 45 | 46 | return func 47 | 48 | return decorator 49 | 50 | 51 | def find(name: str) -> FormatterFunc: 52 | """ 53 | Find a formatter by name in the registry. 54 | 55 | Args: 56 | name (str): The name of the formatter. 57 | 58 | Returns: 59 | FormatterFunc: The formatter function. 60 | 61 | Raises: 62 | ValueError: If a formatter with the given name does not exist. 63 | """ 64 | 65 | try: 66 | return FORMATTERS_REGISTRY[name] 67 | except KeyError as e: 68 | raise ValueError(f"Formatter '{name}' not found") from e 69 | 70 | 71 | def list_available() -> list[str]: 72 | """ 73 | List available formatters in the registry. 74 | 75 | Returns: 76 | list[str]: A list of the names of the available formatters. 77 | """ 78 | 79 | return list(FORMATTERS_REGISTRY) 80 | 81 | 82 | __all__ = ["register", "find"] 83 | -------------------------------------------------------------------------------- /robusta_krr/core/abstract/metrics.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from abc import ABC, abstractmethod 3 | 4 | from robusta_krr.core.abstract.strategies import PodsTimeData 5 | from robusta_krr.core.models.objects import K8sObjectData 6 | 7 | 8 | class BaseMetric(ABC): 9 | """ 10 | This abstraction is done for a future use. 11 | Currently we only scrape metrics from Prometheus, 12 | but in the future we may want to support other metric sources like Datadog, etc. 13 | 14 | TODO: When we want to support other metric sources, we should maybe rethink an interface here. 15 | """ 16 | 17 | @abstractmethod 18 | async def load_data( 19 | self, object: K8sObjectData, period: datetime.timedelta, step: datetime.timedelta 20 | ) -> PodsTimeData: 21 | ... 22 | -------------------------------------------------------------------------------- /robusta_krr/core/abstract/strategies.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | import datetime 5 | from textwrap import dedent 6 | from typing import TYPE_CHECKING, Annotated, Generic, Literal, Optional, Sequence, TypeVar, get_args 7 | 8 | import numpy as np 9 | import pydantic as pd 10 | from numpy.typing import NDArray 11 | 12 | from robusta_krr.core.models.result import K8sObjectData, ResourceType 13 | 14 | if TYPE_CHECKING: 15 | from robusta_krr.core.abstract.metrics import BaseMetric # noqa: F401 16 | from robusta_krr.core.integrations.prometheus.metrics import PrometheusMetric 17 | 18 | SelfRR = TypeVar("SelfRR", bound="ResourceRecommendation") 19 | 20 | 21 | class ResourceRecommendation(pd.BaseModel): 22 | """A class to represent resource recommendation with optional request and limit values. 23 | 24 | The NaN values are used to represent undefined values: the strategy did not provide a recommendation for the resource. 25 | None values are used to represent the strategy says that value should not be set. 26 | """ 27 | 28 | request: Optional[float] 29 | limit: Optional[float] 30 | info: Optional[str] = pd.Field( 31 | None, description="Additional information about the recommendation." 
32 | ) 33 | 34 | @classmethod 35 | def undefined(cls: type[SelfRR], info: Optional[str] = None) -> SelfRR: 36 | return cls(request=float("NaN"), limit=float("NaN"), info=info) 37 | 38 | 39 | class StrategySettings(pd.BaseModel): 40 | """A class to represent strategy settings with configurable history and timeframe duration. 41 | 42 | It is used in CLI to generate the help, parameters and validate values. 43 | Description is used to generate the help. 44 | Other pydantic features can be used to validate the values. 45 | 46 | Nested classes are not supported here. 47 | """ 48 | 49 | history_duration: float = pd.Field( 50 | 24 * 7 * 2, ge=1, description="The duration of the history data to use (in hours)." 51 | ) 52 | timeframe_duration: float = pd.Field(1.25, gt=0, description="The step for the history data (in minutes).") 53 | 54 | @property 55 | def history_timedelta(self) -> datetime.timedelta: 56 | return datetime.timedelta(hours=self.history_duration) 57 | 58 | @property 59 | def timeframe_timedelta(self) -> datetime.timedelta: 60 | return datetime.timedelta(minutes=self.timeframe_duration) 61 | 62 | def history_range_enough(self, history_range: tuple[datetime.timedelta, datetime.timedelta]) -> bool: 63 | """Override this function to check if the history range is enough for the strategy.""" 64 | 65 | return True 66 | 67 | 68 | # A type alias for a numpy array of shape (N, 2). 69 | ArrayNx2 = Annotated[NDArray[np.float64], Literal["N", 2]] 70 | 71 | 72 | PodsTimeData = dict[str, ArrayNx2] # Mapping: pod -> [(time, value)] 73 | MetricsPodData = dict[str, PodsTimeData] 74 | 75 | RunResult = dict[ResourceType, ResourceRecommendation] 76 | 77 | SelfBS = TypeVar("SelfBS", bound="BaseStrategy") 78 | _StrategySettings = TypeVar("_StrategySettings", bound=StrategySettings) 79 | 80 | 81 | # An abstract base class for strategy implementation. 82 | # This class requires implementation of a 'run' method for calculating recommendation. 83 | # Make a subclass if you want to create a concrete strategy. 84 | class BaseStrategy(abc.ABC, Generic[_StrategySettings]): 85 | """An abstract base class for strategy implementation. 86 | 87 | This class is generic, and requires a type for the settings. 88 | This settings type will be used for the settings property of the strategy. 89 | It will be used to generate CLI parameters for this strategy, validated by pydantic. 90 | 91 | This class requires implementation of a 'run' method for calculating recommendation. 92 | Additionally, it provides a 'description' property for generating a description for the strategy. 93 | Description property uses the docstring of the strategy class and the settings of the strategy. 94 | 95 | The name of the strategy is the name of the class in lowercase, without the 'Strategy' suffix, if exists. 96 | If you want to change the name of the strategy, you can change the display_name class attribute. 97 | 98 | The strategy will automatically be registered in the strategy registry using __subclasses__ mechanism. 
99 | """ 100 | 101 | display_name: str 102 | rich_console: bool = False 103 | 104 | # TODO: this should be BaseMetric, but currently we only support Prometheus 105 | @property 106 | @abc.abstractmethod 107 | def metrics(self) -> Sequence[type[PrometheusMetric]]: 108 | pass 109 | 110 | def __init__(self, settings: _StrategySettings): 111 | self.settings = settings 112 | 113 | def __str__(self) -> str: 114 | return self.display_name.title() 115 | 116 | @property 117 | def description(self) -> Optional[str]: 118 | """ 119 | Generate a description for the strategy. 120 | You can use Rich's markdown syntax to format the description. 121 | """ 122 | raise NotImplementedError() 123 | 124 | # Abstract method that needs to be implemented by subclass. 125 | # This method is intended to calculate resource recommendation based on history data and kubernetes object data. 126 | @abc.abstractmethod 127 | def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult: 128 | pass 129 | 130 | # This method is intended to return a strategy by its name. 131 | @classmethod 132 | def find(cls: type[SelfBS], name: str) -> type[SelfBS]: 133 | strategies = cls.get_all() 134 | if name.lower() in strategies: 135 | return strategies[name.lower()] 136 | 137 | raise ValueError(f"Unknown strategy name: {name}. Available strategies: {', '.join(strategies)}") 138 | 139 | # This method is intended to return all the available strategies. 140 | @classmethod 141 | def get_all(cls: type[SelfBS]) -> dict[str, type[SelfBS]]: 142 | from robusta_krr import strategies as _ # noqa: F401 143 | 144 | return {sub_cls.display_name.lower(): sub_cls for sub_cls in cls.__subclasses__()} 145 | 146 | # This method is intended to return the type of settings used in strategy. 147 | @classmethod 148 | def get_settings_type(cls) -> type[StrategySettings]: 149 | return get_args(cls.__orig_bases__[0])[0] # type: ignore 150 | 151 | 152 | AnyStrategy = BaseStrategy[StrategySettings] 153 | 154 | 155 | __all__ = [ 156 | "AnyStrategy", 157 | "BaseStrategy", 158 | "StrategySettings", 159 | "PodsTimeData", 160 | "MetricsPodData", 161 | "K8sObjectData", 162 | "ResourceType", 163 | ] 164 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/kubernetes/config_patch.py: -------------------------------------------------------------------------------- 1 | # NOTE: This is a workaround for the issue described here: 2 | # https://github.com/kubernetes-client/python/pull/1863 3 | 4 | from __future__ import annotations 5 | 6 | from typing import Optional 7 | 8 | from kubernetes.client import configuration 9 | from kubernetes.config import kube_config 10 | 11 | 12 | class KubeConfigLoader(kube_config.KubeConfigLoader): 13 | def _load_cluster_info(self): 14 | super()._load_cluster_info() 15 | 16 | if "proxy-url" in self._cluster: 17 | self.proxy = self._cluster["proxy-url"] 18 | 19 | def _set_config(self, client_configuration: Configuration): 20 | super()._set_config(client_configuration) 21 | 22 | key = "proxy" 23 | if key in self.__dict__: 24 | setattr(client_configuration, key, getattr(self, key)) 25 | 26 | 27 | class Configuration(configuration.Configuration): 28 | def __init__( 29 | self, 30 | proxy: Optional[str] = None, 31 | **kwargs, 32 | ): 33 | super().__init__(**kwargs) 34 | 35 | self.proxy = proxy 36 | 37 | 38 | configuration.Configuration = Configuration 39 | kube_config.KubeConfigLoader = KubeConfigLoader 40 | 
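The effect of the `config_patch` workaround just shown, sketched under the assumption that the active kubeconfig's cluster entry carries a `proxy-url` key (the proxy address below is made up): importing the module swaps in the patched loader classes, so the proxy value ends up on the client `Configuration` object that `load_kube_config` fills in.

import robusta_krr.core.integrations.kubernetes.config_patch  # noqa: F401 (importing applies the monkey-patch)

from kubernetes import config as k8s_config
from kubernetes.client import Configuration

cfg = Configuration()
# Assumes the cluster entry of the default kubeconfig contains e.g. `proxy-url: http://proxy.internal:3128`
k8s_config.load_kube_config(client_configuration=cfg)
print(getattr(cfg, "proxy", None))  # "http://proxy.internal:3128" if proxy-url was set, otherwise None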
-------------------------------------------------------------------------------- /robusta_krr/core/integrations/openshift/__init__.py: -------------------------------------------------------------------------------- 1 | from .token import TOKEN_LOCATION, load_token 2 | 3 | __all__ = ["TOKEN_LOCATION", "load_token"] 4 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/openshift/token.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from robusta_krr.core.models.config import settings 4 | 5 | # NOTE: This one should be mounted if openshift is enabled (done by Robusta Runner) 6 | TOKEN_LOCATION = '/var/run/secrets/kubernetes.io/serviceaccount/token' 7 | 8 | 9 | def load_token() -> Optional[str]: 10 | if not settings.openshift: 11 | return None 12 | 13 | try: 14 | with open(TOKEN_LOCATION, 'r') as file: 15 | return file.read() 16 | except FileNotFoundError: 17 | return None 18 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import PrometheusMetricsLoader 2 | from .metrics_service.prometheus_metrics_service import PrometheusDiscovery, PrometheusNotFound 3 | from .prometheus_utils import ClusterNotSpecifiedException 4 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/loader.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import datetime 4 | import logging 5 | from concurrent.futures import ThreadPoolExecutor 6 | from typing import TYPE_CHECKING, Optional, Dict, Any 7 | 8 | from kubernetes import config as k8s_config 9 | from kubernetes.client.api_client import ApiClient 10 | from kubernetes.client.exceptions import ApiException 11 | from prometrix import MetricsNotFound, PrometheusNotFound 12 | 13 | from robusta_krr.core.models.config import settings 14 | from robusta_krr.core.models.objects import K8sObjectData, PodData 15 | 16 | from .metrics_service.prometheus_metrics_service import PrometheusMetricsService 17 | from .metrics_service.thanos_metrics_service import ThanosMetricsService 18 | from .metrics_service.victoria_metrics_service import VictoriaMetricsService 19 | from .metrics_service.mimir_metrics_service import MimirMetricsService 20 | 21 | if TYPE_CHECKING: 22 | from robusta_krr.core.abstract.strategies import BaseStrategy, MetricsPodData 23 | 24 | logger = logging.getLogger("krr") 25 | 26 | class PrometheusMetricsLoader: 27 | def __init__(self, *, cluster: Optional[str] = None) -> None: 28 | """ 29 | Initializes the Prometheus Loader. 30 | 31 | Args: 32 | cluster (Optional[str]): The name of the cluster. Defaults to None. 
33 | """ 34 | 35 | self.executor = ThreadPoolExecutor(settings.max_workers) 36 | self.api_client = settings.get_kube_client(context=cluster) 37 | loader = self.get_metrics_service(api_client=self.api_client, cluster=cluster) 38 | if loader is None: 39 | raise PrometheusNotFound( 40 | f"Wasn't able to connect to any Prometheus service in {cluster or 'inner'} cluster\n" 41 | "Try using port-forwarding and/or setting the url manually (using the -p flag.).\n" 42 | "For more information, see 'Giving the Explicit Prometheus URL' at https://github.com/robusta-dev/krr?tab=readme-ov-file#usage" 43 | ) 44 | 45 | self.loader = loader 46 | 47 | logger.info(f"{self.loader.name()} connected successfully for {cluster or 'default'} cluster") 48 | 49 | def get_metrics_service( 50 | self, 51 | api_client: Optional[ApiClient] = None, 52 | cluster: Optional[str] = None, 53 | ) -> Optional[PrometheusMetricsService]: 54 | if settings.prometheus_url is not None: 55 | logger.info("Prometheus URL is specified, will not auto-detect a metrics service") 56 | metrics_to_check = [PrometheusMetricsService] 57 | else: 58 | logger.info("No Prometheus URL is specified, trying to auto-detect a metrics service") 59 | metrics_to_check = [VictoriaMetricsService, ThanosMetricsService, MimirMetricsService, PrometheusMetricsService] 60 | 61 | for metric_service_class in metrics_to_check: 62 | service_name = metric_service_class.name() 63 | try: 64 | loader = metric_service_class(api_client=api_client, cluster=cluster, executor=self.executor) 65 | loader.check_connection() 66 | except MetricsNotFound as e: 67 | logger.info(f"{service_name} not found: {e}") 68 | except ApiException as e: 69 | logger.warning( 70 | f"Unable to automatically discover a {service_name} in the cluster ({e}). " 71 | "Try specifying how to connect to Prometheus via cli options" 72 | ) 73 | else: 74 | logger.info(f"{service_name} found") 75 | loader.validate_cluster_name() 76 | return loader 77 | 78 | return None 79 | 80 | async def get_history_range( 81 | self, history_duration: datetime.timedelta 82 | ) -> Optional[tuple[datetime.datetime, datetime.datetime]]: 83 | return await self.loader.get_history_range(history_duration) 84 | 85 | async def load_pods(self, object: K8sObjectData, period: datetime.timedelta) -> list[PodData]: 86 | try: 87 | return await self.loader.load_pods(object, period) 88 | except Exception as e: 89 | logger.exception(f"Failed to load pods for {object}: {e}") 90 | return [] 91 | 92 | async def get_cluster_summary(self) -> Dict[str, Any]: 93 | try: 94 | return await self.loader.get_cluster_summary() 95 | except Exception as e: 96 | logger.exception(f"Failed to get cluster summary: {e}") 97 | return {} 98 | 99 | async def gather_data( 100 | self, 101 | object: K8sObjectData, 102 | strategy: BaseStrategy, 103 | period: datetime.timedelta, 104 | *, 105 | step: datetime.timedelta = datetime.timedelta(minutes=30), 106 | ) -> MetricsPodData: 107 | """ 108 | Gathers data from Prometheus for a specified object and resource. 109 | 110 | Args: 111 | object (K8sObjectData): The Kubernetes object. 112 | resource (ResourceType): The resource type. 113 | period (datetime.timedelta): The time period for which to gather data. 114 | step (datetime.timedelta, optional): The time step between data points. Defaults to 30 minutes. 115 | 116 | Returns: 117 | ResourceHistoryData: The gathered resource history data. 
118 | """ 119 | 120 | return { 121 | MetricLoader.__name__: await self.loader.gather_data(object, MetricLoader, period, step) 122 | for MetricLoader in strategy.metrics 123 | } 124 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import PrometheusMetric 2 | from .cpu import CPUAmountLoader, CPULoader, PercentileCPULoader 3 | from .memory import MaxMemoryLoader, MemoryAmountLoader, MemoryLoader, MaxOOMKilledMemoryLoader 4 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/metrics/cpu.py: -------------------------------------------------------------------------------- 1 | from robusta_krr.core.models.objects import K8sObjectData 2 | 3 | from .base import PrometheusMetric, QueryType 4 | 5 | 6 | class CPULoader(PrometheusMetric): 7 | """ 8 | A metric loader for loading CPU usage metrics. 9 | """ 10 | 11 | query_type: QueryType = QueryType.QueryRange 12 | 13 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: 14 | pods_selector = "|".join(pod.name for pod in object.pods) 15 | cluster_label = self.get_prometheus_cluster_label() 16 | return f""" 17 | max( 18 | rate( 19 | container_cpu_usage_seconds_total{{ 20 | namespace="{object.namespace}", 21 | pod=~"{pods_selector}", 22 | container="{object.container}" 23 | {cluster_label} 24 | }}[{step}] 25 | ) 26 | ) by (container, pod, job) 27 | """ 28 | 29 | 30 | def PercentileCPULoader(percentile: float) -> type[PrometheusMetric]: 31 | """ 32 | A factory for creating percentile CPU usage metric loaders. 33 | """ 34 | 35 | if not 0 <= percentile <= 100: 36 | raise ValueError("percentile must be between 0 and 100") 37 | 38 | class PercentileCPULoader(PrometheusMetric): 39 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: 40 | pods_selector = "|".join(pod.name for pod in object.pods) 41 | cluster_label = self.get_prometheus_cluster_label() 42 | return f""" 43 | quantile_over_time( 44 | {round(percentile / 100, 2)}, 45 | max( 46 | rate( 47 | container_cpu_usage_seconds_total{{ 48 | namespace="{object.namespace}", 49 | pod=~"{pods_selector}", 50 | container="{object.container}" 51 | {cluster_label} 52 | }}[{step}] 53 | ) 54 | ) by (container, pod, job) 55 | [{duration}:{step}] 56 | ) 57 | """ 58 | 59 | return PercentileCPULoader 60 | 61 | 62 | class CPUAmountLoader(PrometheusMetric): 63 | """ 64 | A metric loader for loading CPU points count. 
65 | """ 66 | 67 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: 68 | pods_selector = "|".join(pod.name for pod in object.pods) 69 | cluster_label = self.get_prometheus_cluster_label() 70 | return f""" 71 | count_over_time( 72 | max( 73 | container_cpu_usage_seconds_total{{ 74 | namespace="{object.namespace}", 75 | pod=~"{pods_selector}", 76 | container="{object.container}" 77 | {cluster_label} 78 | }} 79 | ) by (container, pod, job) 80 | [{duration}:{step}] 81 | ) 82 | """ 83 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/metrics/memory.py: -------------------------------------------------------------------------------- 1 | from robusta_krr.core.models.objects import K8sObjectData 2 | 3 | from .base import PrometheusMetric, QueryType 4 | 5 | 6 | class MemoryLoader(PrometheusMetric): 7 | """ 8 | A metric loader for loading memory usage metrics. 9 | """ 10 | 11 | query_type: QueryType = QueryType.QueryRange 12 | 13 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: 14 | pods_selector = "|".join(pod.name for pod in object.pods) 15 | cluster_label = self.get_prometheus_cluster_label() 16 | return f""" 17 | max( 18 | container_memory_working_set_bytes{{ 19 | namespace="{object.namespace}", 20 | pod=~"{pods_selector}", 21 | container="{object.container}" 22 | {cluster_label} 23 | }} 24 | ) by (container, pod, job) 25 | """ 26 | 27 | 28 | class MaxMemoryLoader(PrometheusMetric): 29 | """ 30 | A metric loader for loading max memory usage metrics. 31 | """ 32 | 33 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: 34 | pods_selector = "|".join(pod.name for pod in object.pods) 35 | cluster_label = self.get_prometheus_cluster_label() 36 | return f""" 37 | max_over_time( 38 | max( 39 | container_memory_working_set_bytes{{ 40 | namespace="{object.namespace}", 41 | pod=~"{pods_selector}", 42 | container="{object.container}" 43 | {cluster_label} 44 | }} 45 | ) by (container, pod, job) 46 | [{duration}:{step}] 47 | ) 48 | """ 49 | 50 | 51 | class MemoryAmountLoader(PrometheusMetric): 52 | """ 53 | A metric loader for loading memory points count. 54 | """ 55 | 56 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: 57 | pods_selector = "|".join(pod.name for pod in object.pods) 58 | cluster_label = self.get_prometheus_cluster_label() 59 | return f""" 60 | count_over_time( 61 | max( 62 | container_memory_working_set_bytes{{ 63 | namespace="{object.namespace}", 64 | pod=~"{pods_selector}", 65 | container="{object.container}" 66 | {cluster_label} 67 | }} 68 | ) by (container, pod, job) 69 | [{duration}:{step}] 70 | ) 71 | """ 72 | 73 | # TODO: Need to battle test if this one is correct. 74 | class MaxOOMKilledMemoryLoader(PrometheusMetric): 75 | """ 76 | A metric loader for loading the maximum memory limits that were surpassed by the OOMKilled event. 
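    The query joins each container's configured memory limit with containers whose last
    termination reason was OOMKilled, so the reported value is (approximately) the memory
    limit that was in effect when the OOMKill happened. The simple strategy can then raise its
    memory recommendation above this value by `oom_memory_buffer_percentage`.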
77 | """ 78 | 79 | warning_on_no_data = False 80 | 81 | def get_query(self, object: K8sObjectData, duration: str, step: str) -> str: 82 | pods_selector = "|".join(pod.name for pod in object.pods) 83 | cluster_label = self.get_prometheus_cluster_label() 84 | return f""" 85 | max_over_time( 86 | max( 87 | max( 88 | kube_pod_container_resource_limits{{ 89 | resource="memory", 90 | namespace="{object.namespace}", 91 | pod=~"{pods_selector}", 92 | container="{object.container}" 93 | {cluster_label} 94 | }} 95 | ) by (pod, container, job) 96 | * on(pod, container, job) group_left(reason) 97 | max( 98 | kube_pod_container_status_last_terminated_reason{{ 99 | reason="OOMKilled", 100 | namespace="{object.namespace}", 101 | pod=~"{pods_selector}", 102 | container="{object.container}" 103 | {cluster_label} 104 | }} 105 | ) by (pod, container, job, reason) 106 | ) by (container, pod, job) 107 | [{duration}:{step}] 108 | ) 109 | """ 110 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/metrics_service/base_metric_service.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import datetime 3 | from concurrent.futures import ThreadPoolExecutor 4 | from typing import List, Optional, Dict, Any 5 | 6 | from kubernetes.client.api_client import ApiClient 7 | 8 | from robusta_krr.core.abstract.strategies import PodsTimeData 9 | from robusta_krr.core.models.config import settings 10 | from robusta_krr.core.models.objects import K8sObjectData 11 | 12 | from ..metrics import PrometheusMetric 13 | 14 | 15 | class MetricsService(abc.ABC): 16 | def __init__( 17 | self, 18 | api_client: Optional[ApiClient] = None, 19 | cluster: Optional[str] = None, 20 | executor: Optional[ThreadPoolExecutor] = None, 21 | ) -> None: 22 | self.api_client = api_client 23 | self.cluster = cluster or "default" 24 | self.executor = executor 25 | 26 | @abc.abstractmethod 27 | def check_connection(self): 28 | ... 29 | 30 | @classmethod 31 | def name(cls) -> str: 32 | classname = cls.__name__ 33 | return classname.replace("MetricsService", "") if classname != MetricsService.__name__ else classname 34 | 35 | @abc.abstractmethod 36 | def get_cluster_names(self) -> Optional[List[str]]: 37 | ... 38 | 39 | @abc.abstractmethod 40 | async def get_cluster_summary(self) -> Dict[str, Any]: 41 | ... 42 | 43 | @abc.abstractmethod 44 | async def gather_data( 45 | self, 46 | object: K8sObjectData, 47 | LoaderClass: type[PrometheusMetric], 48 | period: datetime.timedelta, 49 | step: datetime.timedelta = datetime.timedelta(minutes=30), 50 | ) -> PodsTimeData: 51 | ... 52 | 53 | def get_prometheus_cluster_label(self) -> str: 54 | """ 55 | Generates the cluster label for querying a centralized Prometheus 56 | 57 | Returns: 58 | str: a promql safe label string for querying the cluster. 
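        Example (illustrative): with settings.prometheus_label set to "cluster" and
        settings.prometheus_cluster_label set to "prod", the returned string is ', cluster="prod"',
        ready to be appended inside a metric's label selector. When prometheus_cluster_label
        is unset, an empty string is returned.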
59 | """ 60 | if settings.prometheus_cluster_label is None: 61 | return "" 62 | return f', {settings.prometheus_label}="{settings.prometheus_cluster_label}"' 63 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/metrics_service/mimir_metrics_service.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from kubernetes.client import ApiClient 4 | from prometrix import MetricsNotFound 5 | 6 | from robusta_krr.utils.service_discovery import MetricsServiceDiscovery 7 | 8 | from .prometheus_metrics_service import PrometheusMetricsService 9 | 10 | class MimirMetricsDiscovery(MetricsServiceDiscovery): 11 | def find_metrics_url(self, *, api_client: Optional[ApiClient] = None) -> Optional[str]: 12 | """ 13 | Finds the Mimir Metrics URL using selectors. 14 | Args: 15 | api_client (Optional[ApiClient]): A Kubernetes API client. Defaults to None. 16 | Returns: 17 | Optional[str]: The discovered Mimir Metrics URL, or None if not found. 18 | """ 19 | return super().find_url( 20 | selectors=[ 21 | "app.kubernetes.io/name=mimir,app.kubernetes.io/component=query-frontend", 22 | ] 23 | ) 24 | 25 | 26 | class MimirMetricsService(PrometheusMetricsService): 27 | """ 28 | A class for fetching metrics from Mimir Metrics. 29 | """ 30 | 31 | service_discovery = MimirMetricsDiscovery 32 | url_postfix = "/prometheus" 33 | additional_headers = {"X-Scope-OrgID": "anonymous"} 34 | 35 | def check_connection(self): 36 | """ 37 | Checks the connection to Prometheus. 38 | Raises: 39 | MimirMetricsNotFound: If the connection to Mimir Metrics cannot be established. 40 | """ 41 | try: 42 | super().check_connection() 43 | except MetricsNotFound as e: 44 | # This is to clarify which metrics service had the issue and not say its a prometheus issue 45 | raise MetricsNotFound( 46 | f"Couldn't connect to Mimir Metrics found under {self.prometheus.url}\nCaused by {e.__class__.__name__}: {e})" 47 | ) from e 48 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/metrics_service/thanos_metrics_service.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from kubernetes.client import ApiClient 4 | from prometrix import MetricsNotFound, ThanosMetricsNotFound 5 | 6 | from robusta_krr.utils.service_discovery import MetricsServiceDiscovery 7 | 8 | from .prometheus_metrics_service import PrometheusMetricsService 9 | 10 | 11 | class ThanosMetricsDiscovery(MetricsServiceDiscovery): 12 | def find_metrics_url(self, *, api_client: Optional[ApiClient] = None) -> Optional[str]: 13 | """ 14 | Finds the Thanos URL using selectors. 15 | Args: 16 | api_client (Optional[ApiClient]): A Kubernetes API client. Defaults to None. 17 | Returns: 18 | Optional[str]: The discovered Thanos URL, or None if not found. 19 | """ 20 | 21 | return super().find_url( 22 | selectors=[ 23 | "app.kubernetes.io/component=query,app.kubernetes.io/name=thanos", 24 | "app.kubernetes.io/name=thanos-query", 25 | "app=thanos-query", 26 | "app=thanos-querier", 27 | ] 28 | ) 29 | 30 | 31 | class ThanosMetricsService(PrometheusMetricsService): 32 | """ 33 | A class for fetching metrics from Thanos. 34 | """ 35 | 36 | service_discovery = ThanosMetricsDiscovery 37 | 38 | def check_connection(self): 39 | """ 40 | Checks the connection to Prometheus. 
41 | Raises: 42 | ThanosMetricsNotFound: If the connection to Thanos cannot be established. 43 | """ 44 | try: 45 | super().check_connection() 46 | except MetricsNotFound as e: 47 | # This is to clarify which metrics service had the issue and not say its a prometheus issue 48 | raise ThanosMetricsNotFound( 49 | f"Couldn't connect to Thanos found under {self.prometheus.url}\nCaused by {e.__class__.__name__}: {e})" 50 | ) from e 51 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/metrics_service/victoria_metrics_service.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from kubernetes.client import ApiClient 4 | from prometrix import MetricsNotFound, VictoriaMetricsNotFound 5 | 6 | from robusta_krr.utils.service_discovery import MetricsServiceDiscovery 7 | 8 | from .prometheus_metrics_service import PrometheusMetricsService 9 | 10 | 11 | class VictoriaMetricsDiscovery(MetricsServiceDiscovery): 12 | def find_metrics_url(self, *, api_client: Optional[ApiClient] = None) -> Optional[str]: 13 | """ 14 | Finds the Victoria Metrics URL using selectors. 15 | Args: 16 | api_client (Optional[ApiClient]): A Kubernetes API client. Defaults to None. 17 | Returns: 18 | Optional[str]: The discovered Victoria Metrics URL, or None if not found. 19 | """ 20 | url = super().find_url( 21 | selectors=[ 22 | "app.kubernetes.io/name=vmsingle", 23 | "app.kubernetes.io/name=victoria-metrics-single", 24 | ] 25 | ) 26 | if url is None: 27 | url = super().find_url( 28 | selectors=[ 29 | "app.kubernetes.io/name=vmselect", 30 | "app=vmselect", 31 | ] 32 | ) 33 | if url is not None: 34 | url = f"{url}/select/0/prometheus/" 35 | return url 36 | 37 | 38 | class VictoriaMetricsService(PrometheusMetricsService): 39 | """ 40 | A class for fetching metrics from Victoria Metrics. 41 | """ 42 | 43 | service_discovery = VictoriaMetricsDiscovery 44 | 45 | @classmethod 46 | def name(cls) -> str: 47 | return "Victoria Metrics" 48 | 49 | def check_connection(self): 50 | """ 51 | Checks the connection to Prometheus. 52 | Raises: 53 | VictoriaMetricsNotFound: If the connection to Victoria Metrics cannot be established. 54 | """ 55 | try: 56 | super().check_connection() 57 | except MetricsNotFound as e: 58 | # This is to clarify which metrics service had the issue and not say its a prometheus issue 59 | raise VictoriaMetricsNotFound( 60 | f"Couldn't connect to Victoria Metrics found under {self.prometheus.url}\nCaused by {e.__class__.__name__}: {e})" 61 | ) from e 62 | -------------------------------------------------------------------------------- /robusta_krr/core/integrations/prometheus/prometheus_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | import boto3 6 | from prometrix import AWSPrometheusConfig, CoralogixPrometheusConfig, PrometheusConfig, VictoriaMetricsPrometheusConfig 7 | 8 | from robusta_krr.core.models.config import settings 9 | 10 | if TYPE_CHECKING: 11 | from robusta_krr.core.integrations.prometheus.metrics_service.prometheus_metrics_service import ( 12 | PrometheusMetricsService, 13 | ) 14 | 15 | 16 | class ClusterNotSpecifiedException(Exception): 17 | """ 18 | An exception raised when a prometheus requires a cluster label but an invalid one is provided. 
19 | """ 20 | 21 | pass 22 | 23 | 24 | def generate_prometheus_config( 25 | url: str, headers: dict[str, str], metrics_service: PrometheusMetricsService 26 | ) -> PrometheusConfig: 27 | from .metrics_service.victoria_metrics_service import VictoriaMetricsService 28 | 29 | baseconfig = { 30 | "url": url, 31 | "disable_ssl": not settings.prometheus_ssl_enabled, 32 | "headers": headers, 33 | } 34 | 35 | # aws config 36 | if settings.eks_managed_prom: 37 | session = boto3.Session(profile_name=settings.eks_managed_prom_profile_name) 38 | credentials = session.get_credentials() 39 | region = settings.eks_managed_prom_region if settings.eks_managed_prom_region else session.region_name 40 | 41 | if settings.eks_access_key and settings.eks_secret_key: 42 | # when we have both access key and secret key, don't try to read credentials which can fail 43 | access_key = settings.eks_access_key 44 | secret_key = settings.eks_secret_key.get_secret_value() 45 | else: 46 | # we need at least one parameter from credentials, but we should use whatever we can from settings (this has higher precedence) 47 | credentials = credentials.get_frozen_credentials() 48 | access_key = settings.eks_access_key if settings.eks_access_key else credentials.access_key 49 | secret_key = settings.eks_secret_key.get_secret_value() if settings.eks_secret_key else credentials.secret_key 50 | 51 | service_name = settings.eks_service_name if settings.eks_secret_key else "aps" 52 | if not region: 53 | raise Exception("No eks region specified") 54 | 55 | return AWSPrometheusConfig( 56 | access_key=access_key, 57 | secret_access_key=secret_key, 58 | aws_region=region, 59 | service_name=service_name, 60 | **baseconfig, 61 | ) 62 | # coralogix config 63 | if settings.coralogix_token: 64 | return CoralogixPrometheusConfig(**baseconfig, prometheus_token=settings.coralogix_token.get_secret_value()) 65 | if isinstance(metrics_service, VictoriaMetricsService): 66 | return VictoriaMetricsPrometheusConfig(**baseconfig) 67 | return PrometheusConfig(**baseconfig) 68 | -------------------------------------------------------------------------------- /robusta_krr/core/models/allocations.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import enum 4 | import math 5 | from typing import Literal, Optional, TypeVar, Union 6 | 7 | import pydantic as pd 8 | from kubernetes.client.models import V1Container 9 | 10 | from robusta_krr.utils import resource_units 11 | 12 | 13 | class ResourceType(str, enum.Enum): 14 | """The type of resource. 15 | 16 | Just add new types here and they will be automatically supported. 17 | """ 18 | 19 | CPU = "cpu" 20 | Memory = "memory" 21 | 22 | 23 | RecommendationValue = Union[float, Literal["?"], None] 24 | RecommendationValueRaw = Union[float, str, None] 25 | 26 | Self = TypeVar("Self", bound="ResourceAllocations") 27 | 28 | NONE_LITERAL = "unset" 29 | NAN_LITERAL = "?" 
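# Rendering conventions (see format_recommendation_value below): None is shown as NONE_LITERAL
# ("unset"), the NaN marker is shown as NAN_LITERAL ("?"), and numeric values are formatted
# via robusta_krr.utils.resource_units.format.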
30 | 31 | def format_recommendation_value(value: RecommendationValue) -> str: 32 | if value is None: 33 | return NONE_LITERAL 34 | elif isinstance(value, str): 35 | return NAN_LITERAL 36 | else: 37 | return resource_units.format(value) 38 | 39 | def format_diff(allocated, recommended, selector, multiplier=1, colored=False) -> str: 40 | if recommended is None or isinstance(recommended.value, str) or selector != "requests": 41 | return "" 42 | else: 43 | reccomended_val = recommended.value if isinstance(recommended.value, (int, float)) else 0 44 | allocated_val = allocated if isinstance(allocated, (int, float)) else 0 45 | diff_val = reccomended_val - allocated_val 46 | if colored: 47 | diff_sign = "[green]+[/green]" if diff_val >= 0 else "[red]-[/red]" 48 | else: 49 | diff_sign = "+" if diff_val >= 0 else "-" 50 | return f"{diff_sign}{format_recommendation_value(abs(diff_val) * multiplier)}" 51 | 52 | class ResourceAllocations(pd.BaseModel): 53 | requests: dict[ResourceType, RecommendationValue] 54 | limits: dict[ResourceType, RecommendationValue] 55 | info: dict[ResourceType, Optional[str]] = {} 56 | 57 | @staticmethod 58 | def __parse_resource_value(value: RecommendationValueRaw) -> RecommendationValue: 59 | if value is None: 60 | return None 61 | 62 | if isinstance(value, str): 63 | return float(resource_units.parse(value)) 64 | 65 | if math.isnan(value): 66 | return "?" 67 | 68 | return float(value) 69 | 70 | @pd.validator("requests", "limits", pre=True) 71 | def validate_requests( 72 | cls, value: dict[ResourceType, RecommendationValueRaw] 73 | ) -> dict[ResourceType, RecommendationValue]: 74 | return { 75 | resource_type: cls.__parse_resource_value(resource_value) for resource_type, resource_value in value.items() 76 | } 77 | 78 | @classmethod 79 | def from_container(cls: type[Self], container: V1Container) -> Self: 80 | """Get the resource allocations from a Kubernetes container. 81 | 82 | Args: 83 | container: The Kubernetes container. 84 | 85 | Returns: 86 | The resource allocations. 
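        Example (illustrative sketch; `container` would typically come from a workload's pod
        template, e.g. `deployment.spec.template.spec.containers[0]` in the Kubernetes client models):

            allocations = ResourceAllocations.from_container(container)
            cpu_request = allocations.requests[ResourceType.CPU]  # parsed numeric value or None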
87 | """ 88 | 89 | return cls( 90 | requests={ 91 | ResourceType.CPU: container.resources.requests.get("cpu") 92 | if container.resources and container.resources.requests 93 | else None, 94 | ResourceType.Memory: container.resources.requests.get("memory") 95 | if container.resources and container.resources.requests 96 | else None, 97 | }, 98 | limits={ 99 | ResourceType.CPU: container.resources.limits.get("cpu") 100 | if container.resources and container.resources.limits 101 | else None, 102 | ResourceType.Memory: container.resources.limits.get("memory") 103 | if container.resources and container.resources.limits 104 | else None, 105 | }, 106 | ) 107 | -------------------------------------------------------------------------------- /robusta_krr/core/models/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import logging 4 | import sys 5 | from typing import Any, Literal, Optional, Union 6 | 7 | import pydantic as pd 8 | from kubernetes import config 9 | from kubernetes.config.config_exception import ConfigException 10 | from rich.console import Console 11 | from rich.logging import RichHandler 12 | 13 | from robusta_krr.core.abstract import formatters 14 | from robusta_krr.core.abstract.strategies import AnyStrategy, BaseStrategy 15 | from robusta_krr.core.models.objects import KindLiteral 16 | 17 | logger = logging.getLogger("krr") 18 | 19 | 20 | class Config(pd.BaseSettings): 21 | quiet: bool = pd.Field(False) 22 | verbose: bool = pd.Field(False) 23 | 24 | clusters: Union[list[str], Literal["*"], None] = None 25 | kubeconfig: Optional[str] = None 26 | impersonate_user: Optional[str] = None 27 | impersonate_group: Optional[str] = None 28 | namespaces: Union[list[str], Literal["*"]] = pd.Field("*") 29 | resources: Union[list[KindLiteral], Literal["*"]] = pd.Field("*") 30 | selector: Optional[str] = None 31 | 32 | # Value settings 33 | cpu_min_value: int = pd.Field(10, ge=0) # in millicores 34 | memory_min_value: int = pd.Field(100, ge=0) # in megabytes 35 | 36 | # Prometheus Settings 37 | prometheus_url: Optional[str] = pd.Field(None) 38 | prometheus_auth_header: Optional[pd.SecretStr] = pd.Field(None) 39 | prometheus_other_headers: dict[str, pd.SecretStr] = pd.Field(default_factory=dict) 40 | prometheus_ssl_enabled: bool = pd.Field(False) 41 | prometheus_cluster_label: Optional[str] = pd.Field(None) 42 | prometheus_label: Optional[str] = pd.Field(None) 43 | eks_managed_prom: bool = pd.Field(False) 44 | eks_managed_prom_profile_name: Optional[str] = pd.Field(None) 45 | eks_access_key: Optional[str] = pd.Field(None) 46 | eks_secret_key: Optional[pd.SecretStr] = pd.Field(None) 47 | eks_service_name: Optional[str] = pd.Field(None) 48 | eks_managed_prom_region: Optional[str] = pd.Field(None) 49 | coralogix_token: Optional[pd.SecretStr] = pd.Field(None) 50 | openshift: bool = pd.Field(False) 51 | 52 | # Threading settings 53 | max_workers: int = pd.Field(6, ge=1) 54 | 55 | # Logging Settings 56 | format: str 57 | show_cluster_name: bool 58 | strategy: str 59 | log_to_stderr: bool 60 | width: Optional[int] = pd.Field(None, ge=1) 61 | show_severity: bool = True 62 | 63 | # Output Settings 64 | file_output: Optional[str] = pd.Field(None) 65 | file_output_dynamic: bool = pd.Field(False) 66 | slack_output: Optional[str] = pd.Field(None) 67 | 68 | other_args: dict[str, Any] 69 | 70 | # Internal 71 | inside_cluster: bool = False 72 | _logging_console: Optional[Console] = pd.PrivateAttr(None) 73 | 74 | def 
__init__(self, **kwargs: Any) -> None: 75 | super().__init__(**kwargs) 76 | 77 | @property 78 | def Formatter(self) -> formatters.FormatterFunc: 79 | return formatters.find(self.format) 80 | 81 | @pd.validator("prometheus_url") 82 | def validate_prometheus_url(cls, v: Optional[str]): 83 | if v is None: 84 | return None 85 | 86 | if not v.startswith("https://") and not v.startswith("http://"): 87 | raise Exception("--prometheus-url must start with https:// or http://") 88 | 89 | v = v.removesuffix("/") 90 | 91 | return v 92 | 93 | @pd.validator("prometheus_other_headers", pre=True) 94 | def validate_prometheus_other_headers(cls, headers: Union[list[str], dict[str, str]]) -> dict[str, str]: 95 | if isinstance(headers, dict): 96 | return headers 97 | 98 | return {k.strip().lower(): v.strip() for k, v in [header.split(":") for header in headers]} 99 | 100 | @pd.validator("namespaces") 101 | def validate_namespaces(cls, v: Union[list[str], Literal["*"]]) -> Union[list[str], Literal["*"]]: 102 | if v == []: 103 | return "*" 104 | 105 | if isinstance(v, list): 106 | for val in v: 107 | if val.startswith("*"): 108 | raise ValueError("Namespace's values cannot start with an asterisk (*)") 109 | 110 | return [val.lower() for val in v] 111 | 112 | @pd.validator("resources", pre=True) 113 | def validate_resources(cls, v: Union[list[str], Literal["*"]]) -> Union[list[str], Literal["*"]]: 114 | if v == []: 115 | return "*" 116 | 117 | # NOTE: KindLiteral.__args__ is a tuple of all possible values of KindLiteral 118 | # So this will preserve the big and small letters of the resource 119 | return [next(r for r in KindLiteral.__args__ if r.lower() == val.lower()) for val in v] 120 | 121 | def create_strategy(self) -> AnyStrategy: 122 | StrategyType = AnyStrategy.find(self.strategy) 123 | StrategySettingsType = StrategyType.get_settings_type() 124 | return StrategyType(StrategySettingsType(**self.other_args)) # type: ignore 125 | 126 | @pd.validator("strategy") 127 | def validate_strategy(cls, v: str) -> str: 128 | BaseStrategy.find(v) # NOTE: raises if strategy is not found 129 | return v 130 | 131 | @pd.validator("format") 132 | def validate_format(cls, v: str) -> str: 133 | formatters.find(v) # NOTE: raises if strategy is not found 134 | return v 135 | 136 | @property 137 | def context(self) -> Optional[str]: 138 | return self.clusters[0] if self.clusters != "*" and self.clusters else None 139 | 140 | @property 141 | def logging_console(self) -> Console: 142 | if getattr(self, "_logging_console") is None: 143 | self._logging_console = Console(file=sys.stderr if self.log_to_stderr else sys.stdout, width=self.width) 144 | return self._logging_console 145 | 146 | def load_kubeconfig(self) -> None: 147 | try: 148 | config.load_kube_config(config_file=self.kubeconfig, context=self.context) 149 | self.inside_cluster = False 150 | except ConfigException: 151 | config.load_incluster_config() 152 | self.inside_cluster = True 153 | 154 | def get_kube_client(self, context: Optional[str] = None): 155 | if context is None: 156 | return None 157 | 158 | api_client = config.new_client_from_config(context=context, config_file=self.kubeconfig) 159 | if self.impersonate_user is not None: 160 | # trick copied from https://github.com/kubernetes-client/python/issues/362 161 | api_client.set_default_header("Impersonate-User", self.impersonate_user) 162 | if self.impersonate_group is not None: 163 | api_client.set_default_header("Impersonate-Group", self.impersonate_group) 164 | return api_client 165 | 166 | @staticmethod 167 
| def set_config(config: Config) -> None: 168 | global _config 169 | 170 | _config = config 171 | logging.basicConfig( 172 | level="NOTSET", 173 | format="%(message)s", 174 | datefmt="[%X]", 175 | handlers=[RichHandler(console=config.logging_console)], 176 | ) 177 | logging.getLogger("").setLevel(logging.CRITICAL) 178 | logger.setLevel(logging.DEBUG if config.verbose else logging.CRITICAL if config.quiet else logging.INFO) 179 | 180 | @staticmethod 181 | def get_config() -> Optional[Config]: 182 | return _config 183 | 184 | 185 | # NOTE: This class is just a proxy for _config. 186 | # Import settings from this module and use it like it is just a config object. 187 | class _Settings(Config): # Config here is used for type checking 188 | def __init__(self) -> None: 189 | pass 190 | 191 | def __getattr__(self, name: str): 192 | if _config is None: 193 | raise AttributeError("Config is not set") 194 | 195 | return getattr(_config, name) 196 | 197 | 198 | _config: Optional[Config] = None 199 | settings = _Settings() 200 | -------------------------------------------------------------------------------- /robusta_krr/core/models/objects.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Literal, Optional 4 | 5 | import pydantic as pd 6 | 7 | from robusta_krr.core.models.allocations import ResourceAllocations 8 | from robusta_krr.utils.batched import batched 9 | from kubernetes.client.models import V1LabelSelector 10 | 11 | KindLiteral = Literal["Deployment", "DaemonSet", "StatefulSet", "Job", "CronJob", "Rollout", "DeploymentConfig", "StrimziPodSet"] 12 | 13 | 14 | class PodData(pd.BaseModel): 15 | name: str 16 | deleted: bool 17 | 18 | def __hash__(self) -> int: 19 | return hash(self.name) 20 | 21 | 22 | class HPAData(pd.BaseModel): 23 | min_replicas: Optional[int] 24 | max_replicas: int 25 | current_replicas: Optional[int] 26 | desired_replicas: int 27 | target_cpu_utilization_percentage: Optional[float] 28 | target_memory_utilization_percentage: Optional[float] 29 | 30 | 31 | PodWarning = Literal[ 32 | "NoPrometheusPods", 33 | "NoPrometheusCPUMetrics", 34 | "NoPrometheusMemoryMetrics", 35 | ] 36 | 37 | 38 | class K8sObjectData(pd.BaseModel): 39 | # NOTE: Here None means that we are running inside the cluster 40 | cluster: Optional[str] 41 | name: str 42 | container: str 43 | pods: list[PodData] = [] 44 | hpa: Optional[HPAData] 45 | namespace: str 46 | kind: KindLiteral 47 | allocations: ResourceAllocations 48 | warnings: set[PodWarning] = set() 49 | labels: Optional[dict[str, str]] 50 | annotations: Optional[dict[str, str]] 51 | 52 | _api_resource = pd.PrivateAttr(None) 53 | 54 | def __str__(self) -> str: 55 | return f"{self.kind} {self.namespace}/{self.name}/{self.container}" 56 | 57 | def __hash__(self) -> int: 58 | return hash(str(self)) 59 | 60 | def add_warning(self, warning: PodWarning) -> None: 61 | self.warnings.add(warning) 62 | 63 | @property 64 | def current_pods_count(self) -> int: 65 | return len([pod for pod in self.pods if not pod.deleted]) 66 | 67 | @property 68 | def deleted_pods_count(self) -> int: 69 | return len([pod for pod in self.pods if pod.deleted]) 70 | 71 | @property 72 | def pods_count(self) -> int: 73 | return len(self.pods) 74 | 75 | @property 76 | def selector(self) -> V1LabelSelector: 77 | if self._api_resource is None: 78 | raise ValueError("api_resource is not set") 79 | 80 | if self.kind == 'CronJob': 81 | return 
self._api_resource.spec.job_template.spec.selector 82 | else: 83 | return self._api_resource.spec.selector 84 | 85 | def split_into_batches(self, n: int) -> list[K8sObjectData]: 86 | """ 87 | Batch this object into n objects, splitting the pods into batches of size n. 88 | """ 89 | 90 | if self.pods_count <= n: 91 | return [self] 92 | 93 | return [ 94 | K8sObjectData( 95 | cluster=self.cluster, 96 | name=self.name, 97 | container=self.container, 98 | pods=batch, 99 | hpa=self.hpa, 100 | namespace=self.namespace, 101 | kind=self.kind, 102 | allocations=self.allocations, 103 | labels=self.labels, 104 | annotations=self.annotations, 105 | ) 106 | for batch in batched(self.pods, n) 107 | ] 108 | -------------------------------------------------------------------------------- /robusta_krr/core/models/result.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from typing import Any, Optional, Union 4 | 5 | import pydantic as pd 6 | 7 | from robusta_krr.core.abstract import formatters 8 | from robusta_krr.core.models.allocations import RecommendationValue, ResourceAllocations, ResourceType 9 | from robusta_krr.core.models.objects import K8sObjectData 10 | from robusta_krr.core.models.severity import Severity 11 | from robusta_krr.core.models.config import Config 12 | 13 | 14 | class Recommendation(pd.BaseModel): 15 | value: RecommendationValue 16 | severity: Severity 17 | 18 | 19 | class ResourceRecommendation(pd.BaseModel): 20 | requests: dict[ResourceType, Union[RecommendationValue, Recommendation]] 21 | limits: dict[ResourceType, Union[RecommendationValue, Recommendation]] 22 | info: dict[ResourceType, Optional[str]] 23 | 24 | 25 | class ResourceScan(pd.BaseModel): 26 | object: K8sObjectData 27 | recommended: ResourceRecommendation 28 | severity: Severity 29 | 30 | @classmethod 31 | def calculate(cls, object: K8sObjectData, recommendation: ResourceAllocations) -> ResourceScan: 32 | recommendation_processed = ResourceRecommendation(requests={}, limits={}, info={}) 33 | 34 | for resource_type in ResourceType: 35 | recommendation_processed.info[resource_type] = recommendation.info.get(resource_type) 36 | 37 | for selector in ["requests", "limits"]: 38 | current = getattr(object.allocations, selector).get(resource_type) 39 | recommended = getattr(recommendation, selector).get(resource_type) 40 | 41 | current_severity = Severity.calculate(current, recommended, resource_type) 42 | 43 | #TODO: consider... changing field after model created doesn't validate it. 
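                # (With pydantic v1, mutating a dict field on an already-created model bypasses
                # validation: validators only re-run on attribute assignment when
                # `validate_assignment` is enabled, and never for in-place dict mutation.)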
44 | getattr(recommendation_processed, selector)[resource_type] = Recommendation( 45 | value=recommended, severity=current_severity 46 | ) 47 | 48 | for severity in [Severity.CRITICAL, Severity.WARNING, Severity.OK, Severity.GOOD, Severity.UNKNOWN]: 49 | for selector in ["requests", "limits"]: 50 | for recommendation_request in getattr(recommendation_processed, selector).values(): 51 | if recommendation_request.severity == severity: 52 | return cls(object=object, recommended=recommendation_processed, severity=severity) 53 | 54 | return cls(object=object, recommended=recommendation_processed, severity=Severity.UNKNOWN) 55 | 56 | 57 | class StrategyData(pd.BaseModel): 58 | name: str 59 | settings: dict[str, Any] 60 | 61 | 62 | class Result(pd.BaseModel): 63 | scans: list[ResourceScan] 64 | score: int = 0 65 | resources: list[str] = ["cpu", "memory"] 66 | description: Optional[str] = None 67 | strategy: StrategyData 68 | errors: list[dict[str, Any]] = pd.Field(default_factory=list) 69 | clusterSummary: dict[str, Any] = {} 70 | config: Optional[Config] = pd.Field(default_factory=Config.get_config) 71 | 72 | def __init__(self, *args, **kwargs) -> None: 73 | super().__init__(*args, **kwargs) 74 | self.score = self.__calculate_score() 75 | 76 | def format(self, formatter: Union[formatters.FormatterFunc, str]) -> Any: 77 | """Format the result. 78 | 79 | Args: 80 | formatter: The formatter to use. 81 | 82 | Returns: 83 | The formatted result. 84 | """ 85 | 86 | formatter = formatters.find(formatter) if isinstance(formatter, str) else formatter 87 | return formatter(self) 88 | 89 | @staticmethod 90 | def __scan_cost(scan: ResourceScan) -> float: 91 | return 0.7 if scan.severity == Severity.WARNING else 1 if scan.severity == Severity.CRITICAL else 0 92 | 93 | def __calculate_score(self) -> int: 94 | """Get the score of the result. 95 | 96 | Returns: 97 | The score of the result. 98 | """ 99 | 100 | score = sum(self.__scan_cost(scan) for scan in self.scans) 101 | return int((len(self.scans) - score) / len(self.scans) * 100) if self.scans else 0 102 | 103 | @property 104 | def score_letter(self) -> str: 105 | return ( 106 | "F" 107 | if self.score < 30 108 | else "D" 109 | if self.score < 55 110 | else "C" 111 | if self.score < 70 112 | else "B" 113 | if self.score < 90 114 | else "A" 115 | ) 116 | -------------------------------------------------------------------------------- /robusta_krr/core/models/severity.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import enum 4 | from typing import Callable, Optional 5 | 6 | from robusta_krr.core.models.allocations import RecommendationValue, ResourceType 7 | 8 | 9 | class Severity(str, enum.Enum): 10 | """ 11 | The severity of the scan. 12 | 13 | The severity is calculated based on the difference between the current value and the recommended value. 14 | You can override the severity calculation function by using the `bind_calculator` decorator from the same module. 
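    (In this module the decorator is defined below as `register_severity_calculator`.)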
15 | """ 16 | 17 | UNKNOWN = "UNKNOWN" 18 | GOOD = "GOOD" 19 | OK = "OK" 20 | WARNING = "WARNING" 21 | CRITICAL = "CRITICAL" 22 | 23 | @property 24 | def color(self) -> str: 25 | return { 26 | self.UNKNOWN: "dim", 27 | self.GOOD: "green", 28 | self.OK: "gray", 29 | self.WARNING: "yellow", 30 | self.CRITICAL: "red", 31 | }[self] 32 | 33 | @classmethod 34 | def calculate( 35 | cls, current: RecommendationValue, recommended: RecommendationValue, resource_type: ResourceType 36 | ) -> Severity: 37 | if isinstance(recommended, str) or isinstance(current, str): 38 | return cls.UNKNOWN 39 | 40 | return calculate_severity(current, recommended, resource_type) 41 | 42 | 43 | def register_severity_calculator(resource_type: ResourceType) -> Callable[[SeverityCalculator], SeverityCalculator]: 44 | """ 45 | Bind a severity calculator function to a resource type. 46 | The bound function overrides how severity is computed for that resource type. 47 | 48 | Example: 49 | >>> @register_severity_calculator(ResourceType.CPU) 50 | >>> def cpu_severity_calculator(current: Optional[float], recommended: Optional[float], resource_type: ResourceType) -> Severity: 51 | >>> if current is None and recommended is None: 52 | >>> return Severity.GOOD 53 | >>> if current is None or recommended is None: 54 | >>> return Severity.WARNING 55 | >>> 56 | >>> return Severity.CRITICAL if abs(current - recommended) >= 0.5 else Severity.GOOD 57 | """ 58 | 59 | def decorator(func: SeverityCalculator) -> SeverityCalculator: 60 | SEVERITY_CALCULATORS_REGISTRY[resource_type] = func 61 | return func 62 | 63 | return decorator 64 | 65 | 66 | SeverityCalculator = Callable[[Optional[float], Optional[float], ResourceType], Severity] 67 | SEVERITY_CALCULATORS_REGISTRY: dict[ResourceType, SeverityCalculator] = {} 68 | 69 | 70 | def calculate_severity(current: Optional[float], recommended: Optional[float], resource_type: ResourceType) -> Severity: 71 | """ 72 | Calculate the severity of the scan based on the current value and the recommended value. 73 | 74 | This function will use the severity calculator function that is bound to the resource type. 75 | If no calculator is bound to the resource type, the default severity calculator is used.
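    Example (illustrative; CPU values are in cores):
        calculate_severity(1.0, 0.2, ResourceType.CPU) returns Severity.CRITICAL, since the
        CPU calculator below treats an absolute difference of 0.5 cores or more as critical.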
76 | """ 77 | 78 | return SEVERITY_CALCULATORS_REGISTRY.get(resource_type, default_severity_calculator)( 79 | current, recommended, resource_type 80 | ) 81 | 82 | 83 | def default_severity_calculator( 84 | current: Optional[float], recommended: Optional[float], resource_type: ResourceType 85 | ) -> Severity: 86 | return Severity.UNKNOWN 87 | 88 | 89 | @register_severity_calculator(ResourceType.CPU) 90 | def cpu_severity_calculator( 91 | current: Optional[float], recommended: Optional[float], resource_type: ResourceType 92 | ) -> Severity: 93 | if current is None and recommended is None: 94 | return Severity.GOOD 95 | if current is None or recommended is None: 96 | return Severity.WARNING 97 | 98 | diff = abs(current - recommended) 99 | 100 | if diff >= 0.5: 101 | return Severity.CRITICAL 102 | elif diff >= 0.25: 103 | return Severity.WARNING 104 | elif diff >= 0.1: 105 | return Severity.OK 106 | else: 107 | return Severity.GOOD 108 | 109 | 110 | @register_severity_calculator(ResourceType.Memory) 111 | def memory_severity_calculator( 112 | current: Optional[float], recommended: Optional[float], resource_type: ResourceType 113 | ) -> Severity: 114 | if current is None and recommended is None: 115 | return Severity.GOOD 116 | if current is None or recommended is None: 117 | return Severity.WARNING 118 | 119 | diff = abs(current - recommended) / 1024 / 1024 120 | 121 | if diff >= 500: 122 | return Severity.CRITICAL 123 | elif diff >= 250: 124 | return Severity.WARNING 125 | elif diff >= 100: 126 | return Severity.OK 127 | else: 128 | return Severity.GOOD 129 | -------------------------------------------------------------------------------- /robusta_krr/formatters/__init__.py: -------------------------------------------------------------------------------- 1 | from .json import json 2 | from .pprint import pprint 3 | from .table import table 4 | from .yaml import yaml 5 | from .csv import csv 6 | from .csv_raw import csv_raw 7 | from .html import html 8 | -------------------------------------------------------------------------------- /robusta_krr/formatters/csv.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import io 3 | import itertools 4 | import logging 5 | from typing import Any 6 | 7 | from robusta_krr.core.abstract import formatters 8 | from robusta_krr.core.models.allocations import NONE_LITERAL, format_diff, format_recommendation_value 9 | from robusta_krr.core.models.config import settings 10 | from robusta_krr.core.models.result import ResourceScan, ResourceType, Result 11 | 12 | logger = logging.getLogger("krr") 13 | 14 | 15 | NAMESPACE_HEADER = "Namespace" 16 | NAME_HEADER = "Name" 17 | PODS_HEADER = "Pods" 18 | OLD_PODS_HEADER = "Old Pods" 19 | TYPE_HEADER = "Type" 20 | CONTAINER_HEADER = "Container" 21 | CLUSTER_HEADER = "Cluster" 22 | SEVERITY_HEADER = "Severity" 23 | 24 | RESOURCE_DIFF_HEADER = "{resource_name} Diff" 25 | RESOURCE_REQUESTS_HEADER = "{resource_name} Requests" 26 | RESOURCE_LIMITS_HEADER = "{resource_name} Limits" 27 | 28 | 29 | def _format_request_str(item: ResourceScan, resource: ResourceType, selector: str) -> str: 30 | allocated = getattr(item.object.allocations, selector)[resource] 31 | recommended = getattr(item.recommended, selector)[resource] 32 | 33 | if allocated is None and recommended.value is None: 34 | return f"{NONE_LITERAL}" 35 | 36 | diff = format_diff(allocated, recommended, selector) 37 | if diff != "": 38 | diff = f"({diff}) " 39 | 40 | return diff + 
format_recommendation_value(allocated) + " -> " + format_recommendation_value(recommended.value) 41 | 42 | 43 | def _format_total_diff(item: ResourceScan, resource: ResourceType, pods_current: int) -> str: 44 | selector = "requests" 45 | allocated = getattr(item.object.allocations, selector)[resource] 46 | recommended = getattr(item.recommended, selector)[resource] 47 | 48 | return format_diff(allocated, recommended, selector, pods_current) 49 | 50 | 51 | @formatters.register("csv") 52 | def csv_exporter(result: Result) -> str: 53 | # We need to order the resource columns so that they are in the format of Namespace,Name,Pods,Old Pods,Type,Container,CPU Diff,CPU Requests,CPU Limits,Memory Diff,Memory Requests,Memory Limits 54 | csv_columns = ["Namespace", "Name", "Pods", "Old Pods", "Type", "Container"] 55 | 56 | if settings.show_cluster_name: 57 | csv_columns.insert(0, "Cluster") 58 | 59 | if settings.show_severity: 60 | csv_columns.append("Severity") 61 | 62 | for resource in ResourceType: 63 | csv_columns.append(RESOURCE_DIFF_HEADER.format(resource_name=resource.name)) 64 | csv_columns.append(RESOURCE_REQUESTS_HEADER.format(resource_name=resource.name)) 65 | csv_columns.append(RESOURCE_LIMITS_HEADER.format(resource_name=resource.name)) 66 | 67 | output = io.StringIO() 68 | csv_writer = csv.DictWriter(output, csv_columns, extrasaction="ignore") 69 | csv_writer.writeheader() 70 | 71 | for _, group in itertools.groupby( 72 | enumerate(result.scans), key=lambda x: (x[1].object.cluster, x[1].object.namespace, x[1].object.name) 73 | ): 74 | group_items = list(group) 75 | 76 | for j, (_, item) in enumerate(group_items): 77 | full_info_row = j == 0 78 | 79 | row: dict[str, Any] = { 80 | NAMESPACE_HEADER: item.object.namespace if full_info_row else "", 81 | NAME_HEADER: item.object.name if full_info_row else "", 82 | PODS_HEADER: f"{item.object.current_pods_count}" if full_info_row else "", 83 | OLD_PODS_HEADER: f"{item.object.deleted_pods_count}" if full_info_row else "", 84 | TYPE_HEADER: item.object.kind if full_info_row else "", 85 | CONTAINER_HEADER: item.object.container, 86 | SEVERITY_HEADER: item.severity, 87 | CLUSTER_HEADER: item.object.cluster, 88 | } 89 | 90 | for resource in ResourceType: 91 | row[RESOURCE_DIFF_HEADER.format(resource_name=resource.name)] = _format_total_diff( 92 | item, resource, item.object.current_pods_count 93 | ) 94 | row[RESOURCE_REQUESTS_HEADER.format(resource_name=resource.name)] = _format_request_str( 95 | item, resource, "requests" 96 | ) 97 | row[RESOURCE_LIMITS_HEADER.format(resource_name=resource.name)] = _format_request_str( 98 | item, resource, "limits" 99 | ) 100 | 101 | csv_writer.writerow(row) 102 | 103 | return output.getvalue() 104 | -------------------------------------------------------------------------------- /robusta_krr/formatters/csv_raw.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import io 3 | import logging 4 | from typing import Any, Union 5 | 6 | from robusta_krr.core.abstract import formatters 7 | from robusta_krr.core.models.allocations import NAN_LITERAL, NONE_LITERAL 8 | from robusta_krr.core.models.config import settings 9 | from robusta_krr.core.models.result import ResourceScan, ResourceType, Result 10 | 11 | logger = logging.getLogger("krr") 12 | 13 | 14 | NAMESPACE_HEADER = "Namespace" 15 | NAME_HEADER = "Name" 16 | PODS_HEADER = "Pods" 17 | OLD_PODS_HEADER = "Old Pods" 18 | TYPE_HEADER = "Type" 19 | CONTAINER_HEADER = "Container" 20 | CLUSTER_HEADER = "Cluster" 21 | 
SEVERITY_HEADER = "Severity" 22 | 23 | RESOURCE_REQUESTS_CURRENT_HEADER = "{resource_name} Requests Current" 24 | RESOURCE_REQUESTS_RECOMMENDED_HEADER = '{resource_name} Requests Recommended' 25 | 26 | RESOURCE_LIMITS_CURRENT_HEADER = "{resource_name} Limits Current" 27 | RESOURCE_LIMITS_RECOMMENDED_HEADER = '{resource_name} Limits Recommended' 28 | 29 | 30 | def _format_value(val: Union[float, int]) -> str: 31 | if isinstance(val, int): 32 | return str(val) 33 | elif isinstance(val, float): 34 | return str(int(val)) if val.is_integer() else str(val) 35 | elif val is None: 36 | return NONE_LITERAL 37 | elif isinstance(val, str): 38 | return NAN_LITERAL 39 | else: 40 | raise ValueError(f'unknown value: {val}') 41 | 42 | 43 | def _format_request_current(item: ResourceScan, resource: ResourceType, selector: str) -> str: 44 | allocated = getattr(item.object.allocations, selector)[resource] 45 | if allocated is None: 46 | return NONE_LITERAL 47 | return _format_value(allocated) 48 | 49 | 50 | def _format_request_recommend(item: ResourceScan, resource: ResourceType, selector: str) -> str: 51 | recommended = getattr(item.recommended, selector)[resource] 52 | if recommended is None: 53 | return NONE_LITERAL 54 | return _format_value(recommended.value) 55 | 56 | 57 | @formatters.register("csv-raw") 58 | def csv_raw(result: Result) -> str: 59 | # We need to order the resource columns so that they are in the format of 60 | # Namespace, Name, Pods, Old Pods, Type, Container, 61 | # CPU Requests Current, CPU Requests Recommend, CPU Limits Current, CPU Limits Recommend, 62 | # Memory Requests Current, Memory Requests Recommend, Memory Limits Current, Memory Limits Recommend, 63 | csv_columns = ["Namespace", "Name", "Pods", "Old Pods", "Type", "Container"] 64 | 65 | if settings.show_cluster_name: 66 | csv_columns.insert(0, "Cluster") 67 | 68 | if settings.show_severity: 69 | csv_columns.append("Severity") 70 | 71 | for resource in ResourceType: 72 | csv_columns.append(RESOURCE_REQUESTS_CURRENT_HEADER.format(resource_name=resource.name)) 73 | csv_columns.append(RESOURCE_REQUESTS_RECOMMENDED_HEADER.format(resource_name=resource.name)) 74 | csv_columns.append(RESOURCE_LIMITS_CURRENT_HEADER.format(resource_name=resource.name)) 75 | csv_columns.append(RESOURCE_LIMITS_RECOMMENDED_HEADER.format(resource_name=resource.name)) 76 | 77 | output = io.StringIO() 78 | csv_writer = csv.DictWriter(output, csv_columns, extrasaction="ignore") 79 | csv_writer.writeheader() 80 | 81 | for item in result.scans: 82 | row: dict[str, Any] = { 83 | NAMESPACE_HEADER: item.object.namespace, 84 | NAME_HEADER: item.object.name, 85 | PODS_HEADER: f"{item.object.current_pods_count}", 86 | OLD_PODS_HEADER: f"{item.object.deleted_pods_count}", 87 | TYPE_HEADER: item.object.kind, 88 | CONTAINER_HEADER: item.object.container, 89 | SEVERITY_HEADER: item.severity, 90 | CLUSTER_HEADER: item.object.cluster, 91 | } 92 | 93 | for resource in ResourceType: 94 | resource: ResourceType 95 | row[RESOURCE_REQUESTS_CURRENT_HEADER.format(resource_name=resource.name)] = _format_request_current( 96 | item, resource, "requests" 97 | ) 98 | row[RESOURCE_REQUESTS_RECOMMENDED_HEADER.format(resource_name=resource.name)] = _format_request_recommend( 99 | item, resource, "requests" 100 | ) 101 | row[RESOURCE_LIMITS_CURRENT_HEADER.format(resource_name=resource.name)] = _format_request_current( 102 | item, resource, "limits" 103 | ) 104 | row[RESOURCE_LIMITS_RECOMMENDED_HEADER.format(resource_name=resource.name)] = _format_request_recommend( 105 | item, resource, 
"limits" 106 | ) 107 | 108 | csv_writer.writerow(row) 109 | 110 | return output.getvalue() 111 | -------------------------------------------------------------------------------- /robusta_krr/formatters/html.py: -------------------------------------------------------------------------------- 1 | from rich.console import Console 2 | 3 | from robusta_krr.core.abstract import formatters 4 | from robusta_krr.core.models.result import Result 5 | from .table import table 6 | 7 | @formatters.register("html") 8 | def html(result: Result) -> str: 9 | console = Console(record=True) 10 | table_output = table(result) 11 | console.print(table_output) 12 | return console.export_html(inline_styles=True) 13 | -------------------------------------------------------------------------------- /robusta_krr/formatters/json.py: -------------------------------------------------------------------------------- 1 | from robusta_krr.core.abstract import formatters 2 | from robusta_krr.core.models.result import Result 3 | 4 | 5 | @formatters.register() 6 | def json(result: Result) -> str: 7 | return result.json(indent=2) 8 | -------------------------------------------------------------------------------- /robusta_krr/formatters/pprint.py: -------------------------------------------------------------------------------- 1 | from pprint import pformat 2 | 3 | from robusta_krr.core.abstract import formatters 4 | from robusta_krr.core.models.result import Result 5 | 6 | 7 | @formatters.register() 8 | def pprint(result: Result) -> str: 9 | return pformat(result.dict()) 10 | -------------------------------------------------------------------------------- /robusta_krr/formatters/table.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Any 3 | 4 | from rich.table import Table 5 | 6 | from robusta_krr.core.abstract import formatters 7 | from robusta_krr.core.models.allocations import RecommendationValue, format_recommendation_value, format_diff, NONE_LITERAL, NAN_LITERAL 8 | from robusta_krr.core.models.result import ResourceScan, ResourceType, Result 9 | from robusta_krr.core.models.config import settings 10 | from robusta_krr.utils import resource_units 11 | 12 | 13 | DEFAULT_INFO_COLOR = "grey27" 14 | INFO_COLORS: dict[str, str] = { 15 | "OOMKill detected": "dark_red", 16 | } 17 | 18 | 19 | def _format_request_str(item: ResourceScan, resource: ResourceType, selector: str) -> str: 20 | allocated = getattr(item.object.allocations, selector)[resource] 21 | info = item.recommended.info.get(resource) 22 | recommended = getattr(item.recommended, selector)[resource] 23 | severity = recommended.severity 24 | 25 | if allocated is None and recommended.value is None: 26 | return f"[{severity.color}]{NONE_LITERAL}[/{severity.color}]" 27 | 28 | diff = format_diff(allocated, recommended, selector, colored=True) 29 | if diff != "": 30 | diff = f"({diff}) " 31 | 32 | if info is None: 33 | info_formatted = "" 34 | else: 35 | color = INFO_COLORS.get(info, DEFAULT_INFO_COLOR) 36 | info_formatted = f"\n[{color}]({info})[/{color}]" 37 | 38 | return ( 39 | diff 40 | + f"[{severity.color}]" 41 | + format_recommendation_value(allocated) 42 | + " -> " 43 | + format_recommendation_value(recommended.value) 44 | + f"[/{severity.color}]" 45 | + info_formatted 46 | ) 47 | 48 | 49 | def _format_total_diff(item: ResourceScan, resource: ResourceType, pods_current: int) -> str: 50 | selector = "requests" 51 | allocated = getattr(item.object.allocations, selector)[resource] 52 | 
recommended = getattr(item.recommended, selector)[resource] 53 | 54 | # if we have more than one pod, say so (this explains to the user why the total is different than the recommendation) 55 | if pods_current == 1: 56 | pods_info = "" 57 | else: 58 | pods_info = f"\n({pods_current} pods)" 59 | 60 | return f"{format_diff(allocated, recommended, selector, pods_current, colored=True)}{pods_info}" 61 | 62 | 63 | @formatters.register(rich_console=True) 64 | def table(result: Result) -> Table: 65 | """Format the result as text. 66 | 67 | :param result: The result to format. 68 | :type result: :class:`core.result.Result` 69 | :returns: The formatted results. 70 | :rtype: str 71 | """ 72 | 73 | table = Table( 74 | show_header=True, 75 | header_style="bold magenta", 76 | title=f"\n{result.description}\n" if result.description else None, 77 | title_justify="left", 78 | title_style="", 79 | caption=f"{result.score} points - {result.score_letter}", 80 | ) 81 | 82 | cluster_count = len(set(item.object.cluster for item in result.scans)) 83 | 84 | table.add_column("Number", justify="right", no_wrap=True) 85 | if cluster_count > 1 or settings.show_cluster_name: 86 | table.add_column("Cluster", style="cyan") 87 | table.add_column("Namespace", style="cyan") 88 | table.add_column("Name", style="cyan") 89 | table.add_column("Pods", style="cyan") 90 | table.add_column("Old Pods", style="cyan") 91 | table.add_column("Type", style="cyan") 92 | table.add_column("Container", style="cyan") 93 | for resource in ResourceType: 94 | table.add_column(f"{resource.name} Diff") 95 | table.add_column(f"{resource.name} Requests") 96 | table.add_column(f"{resource.name} Limits") 97 | 98 | for _, group in itertools.groupby( 99 | enumerate(result.scans), key=lambda x: (x[1].object.cluster, x[1].object.namespace, x[1].object.name) 100 | ): 101 | group_items = list(group) 102 | 103 | for j, (i, item) in enumerate(group_items): 104 | last_row = j == len(group_items) - 1 105 | full_info_row = j == 0 106 | 107 | cells: list[Any] = [f"[{item.severity.color}]{i + 1}.[/{item.severity.color}]"] 108 | if cluster_count > 1 or settings.show_cluster_name: 109 | cells.append(item.object.cluster if full_info_row else "") 110 | cells += [ 111 | item.object.namespace if full_info_row else "", 112 | item.object.name if full_info_row else "", 113 | f"{item.object.current_pods_count}" if full_info_row else "", 114 | f"{item.object.deleted_pods_count}" if full_info_row else "", 115 | item.object.kind if full_info_row else "", 116 | item.object.container, 117 | ] 118 | 119 | for resource in ResourceType: 120 | cells.append(_format_total_diff(item, resource, item.object.current_pods_count)) 121 | cells += [_format_request_str(item, resource, selector) for selector in ["requests", "limits"]] 122 | 123 | table.add_row(*cells, end_section=last_row) 124 | 125 | return table 126 | -------------------------------------------------------------------------------- /robusta_krr/formatters/yaml.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import yaml as yaml_module 4 | 5 | from robusta_krr.core.abstract import formatters 6 | from robusta_krr.core.models.result import Result 7 | 8 | 9 | @formatters.register() 10 | def yaml(result: Result) -> str: 11 | return yaml_module.dump(json.loads(result.json()), sort_keys=False) 12 | -------------------------------------------------------------------------------- /robusta_krr/strategies/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .simple import SimpleStrategy 2 | from .simple_limit import SimpleLimitStrategy -------------------------------------------------------------------------------- /robusta_krr/strategies/simple.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | from datetime import timedelta 3 | 4 | import numpy as np 5 | import pydantic as pd 6 | 7 | from robusta_krr.core.abstract.strategies import ( 8 | BaseStrategy, 9 | K8sObjectData, 10 | MetricsPodData, 11 | PodsTimeData, 12 | ResourceRecommendation, 13 | ResourceType, 14 | RunResult, 15 | StrategySettings, 16 | ) 17 | from robusta_krr.core.integrations.prometheus.metrics import ( 18 | CPUAmountLoader, 19 | MaxMemoryLoader, 20 | MemoryAmountLoader, 21 | PercentileCPULoader, 22 | PrometheusMetric, 23 | MaxOOMKilledMemoryLoader, 24 | ) 25 | 26 | 27 | class SimpleStrategySettings(StrategySettings): 28 | cpu_percentile: float = pd.Field(95, gt=0, le=100, description="The percentile to use for the CPU recommendation.") 29 | memory_buffer_percentage: float = pd.Field( 30 | 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation." 31 | ) 32 | points_required: int = pd.Field( 33 | 100, ge=1, description="The number of data points required to make a recommendation for a resource." 34 | ) 35 | allow_hpa: bool = pd.Field( 36 | False, 37 | description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.", 38 | ) 39 | use_oomkill_data: bool = pd.Field( 40 | False, 41 | description="Whether to bump the memory when OOMKills are detected (experimental).", 42 | ) 43 | oom_memory_buffer_percentage: float = pd.Field( 44 | 25, ge=0, description="What percentage to increase the memory when there are OOMKill events." 
45 | ) 46 | 47 | def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float: 48 | data_ = [np.max(values[:, 1]) for values in data.values()] 49 | if len(data_) == 0: 50 | return float("NaN") 51 | 52 | return max( 53 | np.max(data_) * (1 + self.memory_buffer_percentage / 100), 54 | max_oomkill * (1 + self.oom_memory_buffer_percentage / 100), 55 | ) 56 | 57 | def calculate_cpu_proposal(self, data: PodsTimeData) -> float: 58 | if len(data) == 0: 59 | return float("NaN") 60 | 61 | if len(data) > 1: 62 | data_ = np.concatenate([values[:, 1] for values in data.values()]) 63 | else: 64 | data_ = list(data.values())[0][:, 1] 65 | 66 | return np.max(data_) 67 | 68 | def history_range_enough(self, history_range: tuple[timedelta, timedelta]) -> bool: 69 | start, end = history_range 70 | return (end - start) >= timedelta(hours=3) 71 | 72 | 73 | class SimpleStrategy(BaseStrategy[SimpleStrategySettings]): 74 | 75 | display_name = "simple" 76 | rich_console = True 77 | 78 | @property 79 | def metrics(self) -> list[type[PrometheusMetric]]: 80 | metrics = [ 81 | PercentileCPULoader(self.settings.cpu_percentile), 82 | MaxMemoryLoader, 83 | CPUAmountLoader, 84 | MemoryAmountLoader, 85 | ] 86 | 87 | if self.settings.use_oomkill_data: 88 | metrics.append(MaxOOMKilledMemoryLoader) 89 | 90 | return metrics 91 | 92 | @property 93 | def description(self): 94 | s = textwrap.dedent(f"""\ 95 | CPU request: {self.settings.cpu_percentile}% percentile, limit: unset 96 | Memory request: max + {self.settings.memory_buffer_percentage}%, limit: max + {self.settings.memory_buffer_percentage}% 97 | History: {self.settings.history_duration} hours 98 | Step: {self.settings.timeframe_duration} minutes 99 | 100 | All parameters can be customized. For example: `krr simple --cpu_percentile=90 --memory_buffer_percentage=15 --history_duration=24 --timeframe_duration=0.5` 101 | """) 102 | 103 | if not self.settings.allow_hpa: 104 | s += "\n" + textwrap.dedent(f"""\ 105 | This strategy does not work with objects with HPA defined (Horizontal Pod Autoscaler). 106 | If HPA is defined for CPU or Memory, the strategy will return "?" for that resource. 
107 | You can override this behaviour by passing the --allow-hpa flag 108 | """) 109 | 110 | s += "\nLearn more: [underline]https://github.com/robusta-dev/krr#algorithm[/underline]" 111 | return s 112 | 113 | def __calculate_cpu_proposal( 114 | self, history_data: MetricsPodData, object_data: K8sObjectData 115 | ) -> ResourceRecommendation: 116 | data = history_data["PercentileCPULoader"] 117 | 118 | if len(data) == 0: 119 | return ResourceRecommendation.undefined(info="No data") 120 | 121 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] 122 | # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get the value 123 | # So each pod is string with pod name, and values is numpy array of shape (N, 2) 124 | data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()} 125 | total_points_count = sum(data_count.values()) 126 | 127 | if total_points_count < self.settings.points_required: 128 | return ResourceRecommendation.undefined(info="Not enough data") 129 | 130 | if ( 131 | object_data.hpa is not None 132 | and object_data.hpa.target_cpu_utilization_percentage is not None 133 | and not self.settings.allow_hpa 134 | ): 135 | return ResourceRecommendation.undefined(info="HPA detected") 136 | 137 | cpu_usage = self.settings.calculate_cpu_proposal(data) 138 | return ResourceRecommendation(request=cpu_usage, limit=None) 139 | 140 | def __calculate_memory_proposal( 141 | self, history_data: MetricsPodData, object_data: K8sObjectData 142 | ) -> ResourceRecommendation: 143 | data = history_data["MaxMemoryLoader"] 144 | 145 | oomkill_detected = False 146 | 147 | if self.settings.use_oomkill_data: 148 | max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"] 149 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] 150 | # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value 151 | # So each value is numpy array of shape (N, 2) 152 | max_oomkill_value = ( 153 | np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0 154 | ) 155 | if max_oomkill_value != 0: 156 | oomkill_detected = True 157 | else: 158 | max_oomkill_value = 0 159 | 160 | if len(data) == 0: 161 | return ResourceRecommendation.undefined(info="No data") 162 | 163 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] 164 | # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value 165 | # So each pod is string with pod name, and values is numpy array of shape (N, 2) 166 | data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} 167 | total_points_count = sum(data_count.values()) 168 | 169 | if total_points_count < self.settings.points_required: 170 | return ResourceRecommendation.undefined(info="Not enough data") 171 | 172 | if ( 173 | object_data.hpa is not None 174 | and object_data.hpa.target_memory_utilization_percentage is not None 175 | and not self.settings.allow_hpa 176 | ): 177 | return ResourceRecommendation.undefined(info="HPA detected") 178 | 179 | memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value) 180 | return ResourceRecommendation( 181 | request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None 182 | ) 183 | 184 | def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult: 185 | return { 186 | 
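            # Editorial comment (not upstream code): the returned RunResult maps each ResourceType
            # (CPU, Memory) to the ResourceRecommendation computed above, i.e. a request/limit pair
            # plus an optional info string such as "Not enough data" or "HPA detected".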
ResourceType.CPU: self.__calculate_cpu_proposal(history_data, object_data), 187 | ResourceType.Memory: self.__calculate_memory_proposal(history_data, object_data), 188 | } 189 | -------------------------------------------------------------------------------- /robusta_krr/strategies/simple_limit.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | from datetime import timedelta 3 | 4 | import numpy as np 5 | import pydantic as pd 6 | 7 | from robusta_krr.core.abstract.strategies import ( 8 | BaseStrategy, 9 | K8sObjectData, 10 | MetricsPodData, 11 | PodsTimeData, 12 | ResourceRecommendation, 13 | ResourceType, 14 | RunResult, 15 | StrategySettings, 16 | ) 17 | from robusta_krr.core.integrations.prometheus.metrics import ( 18 | CPUAmountLoader, 19 | MaxMemoryLoader, 20 | MemoryAmountLoader, 21 | CPULoader, 22 | PrometheusMetric, 23 | MaxOOMKilledMemoryLoader, 24 | ) 25 | 26 | 27 | class SimpleLimitStrategySettings(StrategySettings): 28 | cpu_request: float = pd.Field(66, gt=0, le=100, description="The percentile to use for the CPU request.") 29 | cpu_limit: float = pd.Field(96, gt=0, le=100, description="The percentile to use for the CPU limit.") 30 | memory_buffer_percentage: float = pd.Field( 31 | 15, gt=0, description="The percentage of added buffer to the peak memory usage for memory recommendation." 32 | ) 33 | points_required: int = pd.Field( 34 | 100, ge=1, description="The number of data points required to make a recommendation for a resource." 35 | ) 36 | allow_hpa: bool = pd.Field( 37 | False, 38 | description="Whether to calculate recommendations even when there is an HPA scaler defined on that resource.", 39 | ) 40 | use_oomkill_data: bool = pd.Field( 41 | False, 42 | description="Whether to bump the memory when OOMKills are detected (experimental).", 43 | ) 44 | oom_memory_buffer_percentage: float = pd.Field( 45 | 25, ge=0, description="What percentage to increase the memory when there are OOMKill events." 
46 | ) 47 | 48 | def calculate_memory_proposal(self, data: PodsTimeData, max_oomkill: float = 0) -> float: 49 | data_ = [np.max(values[:, 1]) for values in data.values()] 50 | if len(data_) == 0: 51 | return float("NaN") 52 | 53 | return max( 54 | np.max(data_) * (1 + self.memory_buffer_percentage / 100), 55 | max_oomkill * (1 + self.oom_memory_buffer_percentage / 100), 56 | ) 57 | 58 | def calculate_cpu_percentile(self, data: PodsTimeData, percentile: float) -> float: 59 | if len(data) == 0: 60 | return float("NaN") 61 | 62 | if len(data) > 1: 63 | data_ = np.concatenate([values[:, 1] for values in data.values()]) 64 | else: 65 | data_ = list(data.values())[0][:, 1] 66 | 67 | return np.percentile(data_, percentile) 68 | 69 | def history_range_enough(self, history_range: tuple[timedelta, timedelta]) -> bool: 70 | start, end = history_range 71 | return (end - start) >= timedelta(hours=3) 72 | 73 | 74 | class SimpleLimitStrategy(BaseStrategy[SimpleLimitStrategySettings]): 75 | 76 | display_name = "simple_limit" 77 | rich_console = True 78 | 79 | @property 80 | def metrics(self) -> list[type[PrometheusMetric]]: 81 | metrics = [ 82 | CPULoader, 83 | MaxMemoryLoader, 84 | CPUAmountLoader, 85 | MemoryAmountLoader, 86 | ] 87 | 88 | if self.settings.use_oomkill_data: 89 | metrics.append(MaxOOMKilledMemoryLoader) 90 | 91 | return metrics 92 | 93 | @property 94 | def description(self): 95 | s = textwrap.dedent(f"""\ 96 | CPU request: {self.settings.cpu_request}% percentile, limit: {self.settings.cpu_limit}% percentile 97 | Memory request: max + {self.settings.memory_buffer_percentage}%, limit: max + {self.settings.memory_buffer_percentage}% 98 | History: {self.settings.history_duration} hours 99 | Step: {self.settings.timeframe_duration} minutes 100 | 101 | All parameters can be customized. For example: `krr simple_limit --cpu_request=66 --cpu_limit=96 --memory_buffer_percentage=15 --history_duration=24 --timeframe_duration=0.5` 102 | """) 103 | 104 | if not self.settings.allow_hpa: 105 | s += "\n" + textwrap.dedent(f"""\ 106 | This strategy does not work with objects with HPA defined (Horizontal Pod Autoscaler). 107 | If HPA is defined for CPU or Memory, the strategy will return "?" for that resource. 
108 | You can override this behaviour by passing the --allow-hpa flag 109 | """) 110 | 111 | s += "\nLearn more: [underline]https://github.com/robusta-dev/krr#algorithm[/underline]" 112 | return s 113 | 114 | def __calculate_cpu_proposal( 115 | self, history_data: MetricsPodData, object_data: K8sObjectData 116 | ) -> ResourceRecommendation: 117 | data = history_data["CPULoader"] 118 | 119 | if len(data) == 0: 120 | return ResourceRecommendation.undefined(info="No data") 121 | 122 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] 123 | # As CPUAmountLoader returns only the last value (1 point), [0, 1] is used to get the value 124 | # So each pod is string with pod name, and values is numpy array of shape (N, 2) 125 | data_count = {pod: values[0, 1] for pod, values in history_data["CPUAmountLoader"].items()} 126 | total_points_count = sum(data_count.values()) 127 | 128 | if total_points_count < self.settings.points_required: 129 | return ResourceRecommendation.undefined(info="Not enough data") 130 | 131 | if ( 132 | object_data.hpa is not None 133 | and object_data.hpa.target_cpu_utilization_percentage is not None 134 | and not self.settings.allow_hpa 135 | ): 136 | return ResourceRecommendation.undefined(info="HPA detected") 137 | 138 | cpu_request = self.settings.calculate_cpu_percentile(data, self.settings.cpu_request) 139 | cpu_limit = self.settings.calculate_cpu_percentile(data, self.settings.cpu_limit) 140 | return ResourceRecommendation(request=cpu_request, limit=cpu_limit) 141 | 142 | def __calculate_memory_proposal( 143 | self, history_data: MetricsPodData, object_data: K8sObjectData 144 | ) -> ResourceRecommendation: 145 | data = history_data["MaxMemoryLoader"] 146 | 147 | oomkill_detected = False 148 | 149 | if self.settings.use_oomkill_data: 150 | max_oomkill_data = history_data["MaxOOMKilledMemoryLoader"] 151 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] 152 | # As MaxOOMKilledMemoryLoader returns only the last value (1 point), [0, 1] is used to get the value 153 | # So each value is numpy array of shape (N, 2) 154 | max_oomkill_value = ( 155 | np.max([values[0, 1] for values in max_oomkill_data.values()]) if len(max_oomkill_data) > 0 else 0 156 | ) 157 | if max_oomkill_value != 0: 158 | oomkill_detected = True 159 | else: 160 | max_oomkill_value = 0 161 | 162 | if len(data) == 0: 163 | return ResourceRecommendation.undefined(info="No data") 164 | 165 | # NOTE: metrics for each pod are returned as list[values] where values is [timestamp, value] 166 | # As MemoryAmountLoader returns only the last value (1 point), [0, 1] is used to get the value 167 | # So each pod is string with pod name, and values is numpy array of shape (N, 2) 168 | data_count = {pod: values[0, 1] for pod, values in history_data["MemoryAmountLoader"].items()} 169 | total_points_count = sum(data_count.values()) 170 | 171 | if total_points_count < self.settings.points_required: 172 | return ResourceRecommendation.undefined(info="Not enough data") 173 | 174 | if ( 175 | object_data.hpa is not None 176 | and object_data.hpa.target_memory_utilization_percentage is not None 177 | and not self.settings.allow_hpa 178 | ): 179 | return ResourceRecommendation.undefined(info="HPA detected") 180 | 181 | memory_usage = self.settings.calculate_memory_proposal(data, max_oomkill_value) 182 | return ResourceRecommendation( 183 | request=memory_usage, limit=memory_usage, info="OOMKill detected" if oomkill_detected else None 184 | ) 185 
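    # Editorial illustration (not upstream code): with the defaults above, the CPU request is the
    # 66th percentile and the CPU limit the 96th percentile of all pods' CPU samples combined.
    # For example, if the concatenated samples were [0.1, 0.2, ..., 1.0] cores, then
    # np.percentile(data, 66) ~= 0.69 cores would become the request and
    # np.percentile(data, 96) ~= 0.96 cores the limit, while memory is still recommended as the
    # observed peak plus memory_buffer_percentage (bumped further if OOMKill data is enabled).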
| 186 | def run(self, history_data: MetricsPodData, object_data: K8sObjectData) -> RunResult: 187 | return { 188 | ResourceType.CPU: self.__calculate_cpu_proposal(history_data, object_data), 189 | ResourceType.Memory: self.__calculate_memory_proposal(history_data, object_data), 190 | } 191 | -------------------------------------------------------------------------------- /robusta_krr/utils/batched.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | from typing import Iterable, TypeVar 3 | 4 | _T = TypeVar("_T") 5 | 6 | 7 | def batched(iterable: Iterable[_T], n: int) -> Iterable[list[_T]]: 8 | "Batch data into tuples of length n. The last batch may be shorter." 9 | # batched('ABCDEFG', 3) --> ABC DEF G 10 | if n < 1: 11 | raise ValueError("n must be at least one") 12 | it = iter(iterable) 13 | while batch := list(itertools.islice(it, n)): 14 | yield batch 15 | -------------------------------------------------------------------------------- /robusta_krr/utils/intro.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import asyncio 3 | from concurrent.futures import ThreadPoolExecutor 4 | 5 | from .version import get_version 6 | 7 | 8 | ONLINE_LINK = 'https://api.robusta.dev/krr/intro' 9 | LOCAL_LINK = './intro.txt' 10 | TIMEOUT = 0.5 11 | 12 | 13 | # Synchronous function to fetch intro message 14 | def fetch_intro_message() -> str: 15 | try: 16 | # Attempt to get the message from the URL 17 | response = requests.get(ONLINE_LINK, params={"version": get_version()}, timeout=TIMEOUT) 18 | response.raise_for_status() # Raises an error for bad responses 19 | result = response.json() 20 | return result['message'] 21 | except Exception as e1: 22 | # If there's any error, fallback to local file 23 | try: 24 | with open(LOCAL_LINK, 'r') as file: 25 | return file.read() 26 | except Exception as e2: 27 | return ( 28 | "[red]Failed to load the intro message.\n" 29 | f"Both from the URL: {e1.__class__.__name__} {e1}\n" 30 | f"and the local file: {e2.__class__.__name__} {e2}\n" 31 | "But as that is not critical, KRR will continue to run without the intro message.[/red]" 32 | ) 33 | 34 | 35 | async def load_intro_message() -> str: 36 | loop = asyncio.get_running_loop() 37 | # Use a ThreadPoolExecutor to run the synchronous function in a separate thread 38 | with ThreadPoolExecutor() as pool: 39 | return await loop.run_in_executor(pool, fetch_intro_message) 40 | 41 | 42 | __all__ = ['load_intro_message'] 43 | -------------------------------------------------------------------------------- /robusta_krr/utils/object_like_dict.py: -------------------------------------------------------------------------------- 1 | class ObjectLikeDict: 2 | def __init__(self, dictionary): 3 | for key, value in dictionary.items(): 4 | if isinstance(value, dict): 5 | value = ObjectLikeDict(value) # Convert inner dict 6 | if isinstance(value, list): 7 | value = [ObjectLikeDict(item) if isinstance(item, dict) else item for item in value] 8 | self.__dict__[key] = value 9 | 10 | def __getattr__(self, name): 11 | return self.__dict__.get(name) 12 | 13 | def __setattr__(self, name, value): 14 | self.__dict__[name] = value 15 | 16 | def __str__(self): 17 | return str(self.__dict__) 18 | 19 | def __repr__(self): 20 | return repr(self.__dict__) 21 | 22 | def __len__(self): 23 | return len(self.__dict__) 24 | 25 | def get(self, key, default=None): 26 | return self.__dict__.get(key, default) 27 | 28 | def items(self): 29 | 
return self.__dict__.items() 30 | -------------------------------------------------------------------------------- /robusta_krr/utils/patch.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from kubernetes.client.models.v1_pod_failure_policy_rule import V1PodFailurePolicyRule 4 | 5 | def create_monkey_patches(): 6 | """ 7 | The python kubernetes client will throw exceptions for specific fields that were not allowed to be None on older versions of kubernetes. 8 | """ 9 | logger = logging.getLogger("krr") 10 | logger.debug("Creating kubernetes python cli monkey patches") 11 | 12 | def patched_setter_pod_failure_policy(self, on_pod_conditions): 13 | self._on_pod_conditions = on_pod_conditions 14 | 15 | V1PodFailurePolicyRule.on_pod_conditions = V1PodFailurePolicyRule.on_pod_conditions.setter(patched_setter_pod_failure_policy) 16 | -------------------------------------------------------------------------------- /robusta_krr/utils/progress_bar.py: -------------------------------------------------------------------------------- 1 | from alive_progress import alive_bar 2 | 3 | # from robusta_krr.core.models.config import settings 4 | 5 | 6 | class ProgressBar: 7 | """ 8 | Progress bar for displaying progress of gathering recommendations. 9 | 10 | Use `ProgressBar` as a context manager to automatically handle the progress bar. 11 | Use `progress` method to step the progress bar. 12 | """ 13 | 14 | def __init__(self, **kwargs) -> None: 15 | # self.show_bar = not settings.quiet and not settings.log_to_stderr 16 | self.show_bar = False # FIXME: Progress bar is not working good with other logs 17 | if self.show_bar: 18 | self.alive_bar = alive_bar(**kwargs, enrich_print=False) 19 | 20 | def __enter__(self): 21 | if self.show_bar: 22 | self.bar = self.alive_bar.__enter__() 23 | return self 24 | 25 | def progress(self): 26 | if self.show_bar: 27 | self.bar() 28 | 29 | def __exit__(self, *args): 30 | if self.show_bar: 31 | self.alive_bar.__exit__(*args) 32 | -------------------------------------------------------------------------------- /robusta_krr/utils/resource_units.py: -------------------------------------------------------------------------------- 1 | from typing import Literal, Union 2 | 3 | UNITS: dict[str, float] = { 4 | "m": 0.001, 5 | "Ki": 1024, 6 | "Mi": 1024**2, 7 | "Gi": 1024**3, 8 | "Ti": 1024**4, 9 | "Pi": 1024**5, 10 | "Ei": 1024**6, 11 | "k": 1e3, 12 | "M": 1e6, 13 | "G": 1e9, 14 | "T": 1e12, 15 | "P": 1e15, 16 | "E": 1e18, 17 | } 18 | 19 | 20 | def parse(x: str, /) -> Union[float, int]: 21 | """Converts a string to an integer with respect of units.""" 22 | 23 | for unit, multiplier in UNITS.items(): 24 | if x.endswith(unit): 25 | return float(x[: -len(unit)]) * multiplier 26 | 27 | return float(x) 28 | 29 | 30 | def get_base(x: str, /) -> Literal[1024, 1000]: 31 | """Returns the base of the unit.""" 32 | 33 | for unit, _ in UNITS.items(): 34 | if x.endswith(unit): 35 | return 1024 if unit in ["Ki", "Mi", "Gi", "Ti", "Pi", "Ei"] else 1000 36 | return 1000 if "." 
in x else 1024 37 | 38 | 39 | def format(x: Union[float, int], /, *, base: Literal[1024, 1000] = 1024) -> str: 40 | """Converts an integer to a string with respect of units.""" 41 | 42 | if x < 1: 43 | return f"{int(x*1000)}m" 44 | if x < base: 45 | return str(x) 46 | 47 | units = ["", "K", "M", "G", "T", "P", "E"] 48 | binary_units = ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei"] 49 | 50 | x = int(x) 51 | for i, unit in enumerate(binary_units if base == 1024 else units): 52 | if x < base ** (i + 1) or i == len(units) - 1 or x / base ** (i + 1) < 10: 53 | return f"{x/base**i:.0f}{unit}" 54 | return f"{x/base**i:.0f}{unit}" 55 | -------------------------------------------------------------------------------- /robusta_krr/utils/service_discovery.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC, abstractmethod 3 | from typing import Optional 4 | 5 | from cachetools import TTLCache 6 | from kubernetes import client 7 | from kubernetes.client import V1IngressList, V1ServiceList 8 | from kubernetes.client.api_client import ApiClient 9 | from kubernetes.client.models.v1_ingress import V1Ingress 10 | from kubernetes.client.models.v1_service import V1Service 11 | 12 | from robusta_krr.core.models.config import settings 13 | 14 | logger = logging.getLogger("krr") 15 | 16 | 17 | class ServiceDiscovery: 18 | SERVICE_CACHE_TTL_SEC = 900 19 | cache: TTLCache = TTLCache(maxsize=1, ttl=SERVICE_CACHE_TTL_SEC) 20 | 21 | def __init__(self, api_client: Optional[ApiClient] = None) -> None: 22 | self.api_client = api_client 23 | 24 | def find_service_url(self, label_selector: str) -> Optional[str]: 25 | """ 26 | Get the url of an in-cluster service with a specific label 27 | """ 28 | # we do it this way because there is a weird issue with hikaru's ServiceList.listServiceForAllNamespaces() 29 | v1 = client.CoreV1Api(api_client=self.api_client) 30 | svc_list: V1ServiceList = v1.list_service_for_all_namespaces(label_selector=label_selector) 31 | if not svc_list.items: 32 | return None 33 | 34 | svc: V1Service = svc_list.items[0] 35 | name = svc.metadata.name 36 | namespace = svc.metadata.namespace 37 | port = svc.spec.ports[0].port 38 | 39 | if settings.inside_cluster: 40 | return f"http://{name}.{namespace}.svc.cluster.local:{port}" 41 | 42 | elif self.api_client is not None: 43 | return f"{self.api_client.configuration.host}/api/v1/namespaces/{namespace}/services/{name}:{port}/proxy" 44 | 45 | return None 46 | 47 | def find_ingress_host(self, label_selector: str) -> Optional[str]: 48 | """ 49 | Discover the ingress host of the Prometheus if krr is not running in cluster 50 | """ 51 | if settings.inside_cluster: 52 | return None 53 | 54 | v1 = client.NetworkingV1Api(api_client=self.api_client) 55 | ingress_list: V1IngressList = v1.list_ingress_for_all_namespaces(label_selector=label_selector) 56 | if not ingress_list.items: 57 | return None 58 | 59 | ingress: V1Ingress = ingress_list.items[0] 60 | prometheus_host = ingress.spec.rules[0].host 61 | return f"http://{prometheus_host}" 62 | 63 | def find_url(self, selectors: list[str]) -> Optional[str]: 64 | """ 65 | Try to autodiscover the url of an in-cluster service 66 | """ 67 | cache_key = ",".join(selectors + [self.api_client.configuration.host if self.api_client else ""]) 68 | cached_value = self.cache.get(cache_key) 69 | if cached_value: 70 | return cached_value 71 | 72 | for label_selector in selectors: 73 | logger.debug(f"Trying to find service with label selector {label_selector}") 74 | 
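            # Editorial comment (not upstream code): for each selector the in-cluster Service is
            # tried first; a hit is cached under cache_key for SERVICE_CACHE_TTL_SEC (900 s).
            # The Ingress lookup below is only a fallback for out-of-cluster runs and its result
            # is returned without being cached.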
service_url = self.find_service_url(label_selector) 75 | if service_url: 76 | logger.debug(f"Found service with label selector {label_selector}") 77 | self.cache[cache_key] = service_url 78 | return service_url 79 | 80 | logger.debug(f"Trying to find ingress with label selector {label_selector}") 81 | self.find_ingress_host(label_selector) 82 | ingress_url = self.find_ingress_host(label_selector) 83 | if ingress_url: 84 | return ingress_url 85 | 86 | return None 87 | 88 | 89 | class MetricsServiceDiscovery(ServiceDiscovery, ABC): 90 | @abstractmethod 91 | def find_metrics_url(self, *, api_client: Optional[ApiClient] = None) -> Optional[str]: 92 | pass 93 | -------------------------------------------------------------------------------- /robusta_krr/utils/version.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import subprocess 4 | import sys 5 | from concurrent.futures import ThreadPoolExecutor 6 | from typing import Optional 7 | 8 | import requests 9 | 10 | import robusta_krr 11 | 12 | 13 | def get_version() -> str: 14 | # the version string was patched by a release - return __version__ which will be correct 15 | if robusta_krr.__version__ != "dev": 16 | return robusta_krr.__version__ 17 | 18 | # we are running from an unreleased dev version 19 | try: 20 | # Get the latest git tag 21 | tag = subprocess.check_output(["git", "describe", "--tags"]).decode().strip() 22 | 23 | # Get the current branch name 24 | branch = subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]).decode().strip() 25 | 26 | # Check if there are uncommitted changes 27 | status = subprocess.check_output(["git", "status", "--porcelain"]).decode().strip() 28 | dirty = "-dirty" if status else "" 29 | 30 | return f"{tag}-{branch}{dirty}" 31 | 32 | except Exception: 33 | return robusta_krr.__version__ 34 | 35 | 36 | # Synchronous function to fetch the latest release version from GitHub API 37 | def fetch_latest_version() -> Optional[str]: 38 | url = "https://api.github.com/repos/robusta-dev/krr/releases/latest" 39 | try: 40 | response = requests.get(url, timeout=0.5) # 0.5 seconds timeout 41 | response.raise_for_status() # Raises an error for bad responses 42 | data = response.json() 43 | return data.get("tag_name") # Returns the tag name of the latest release 44 | except Exception: 45 | return None 46 | 47 | 48 | async def load_latest_version() -> Optional[str]: 49 | loop = asyncio.get_running_loop() 50 | # Run the synchronous function in a separate thread 51 | with ThreadPoolExecutor() as pool: 52 | return await loop.run_in_executor(pool, fetch_latest_version) 53 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import random 2 | from datetime import datetime, timedelta 3 | from unittest.mock import AsyncMock, patch 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | from robusta_krr.api.models import K8sObjectData, PodData, ResourceAllocations 9 | from robusta_krr.strategies.simple import SimpleStrategy, SimpleStrategySettings 10 | 11 | TEST_OBJECT = K8sObjectData( 12 | cluster="mock-cluster", 13 | name="mock-object-1", 14 | container="mock-container-1", 15 | pods=[ 16 | PodData(name="mock-pod-1", deleted=False), 17 | PodData(name="mock-pod-2", deleted=False), 18 | PodData(name="mock-pod-3", deleted=True), 19 | ], 20 | namespace="default", 21 | kind="Deployment", 22 | 
allocations=ResourceAllocations( 23 | requests={"cpu": 1, "memory": 1}, # type: ignore 24 | limits={"cpu": 2, "memory": 2}, # type: ignore 25 | ), 26 | ) 27 | 28 | 29 | @pytest.fixture(autouse=True, scope="session") 30 | def mock_list_clusters(): 31 | with patch( 32 | "robusta_krr.core.integrations.kubernetes.KubernetesLoader.list_clusters", 33 | new=AsyncMock(return_value=[TEST_OBJECT.cluster]), 34 | ): 35 | yield 36 | 37 | 38 | @pytest.fixture(autouse=True, scope="session") 39 | def mock_list_scannable_objects(): 40 | with patch( 41 | "robusta_krr.core.integrations.kubernetes.KubernetesLoader.list_scannable_objects", 42 | new=AsyncMock(return_value=[TEST_OBJECT]), 43 | ): 44 | yield 45 | 46 | 47 | @pytest.fixture(autouse=True, scope="session") 48 | def mock_load_kubeconfig(): 49 | with patch("robusta_krr.core.models.config.Config.load_kubeconfig", return_value=None): 50 | yield 51 | 52 | 53 | @pytest.fixture(autouse=True, scope="session") 54 | def mock_prometheus_loader(): 55 | now = datetime.now() 56 | start = now - timedelta(hours=1) 57 | now_ts, start_ts = now.timestamp(), start.timestamp() 58 | metric_points_data = np.array([(t, random.randrange(0, 100)) for t in np.linspace(start_ts, now_ts, 3600)]) 59 | 60 | settings = SimpleStrategySettings() 61 | strategy = SimpleStrategy(settings) 62 | 63 | with patch( 64 | "robusta_krr.core.integrations.prometheus.loader.PrometheusMetricsLoader.gather_data", 65 | new=AsyncMock( 66 | return_value={ 67 | metric.__name__: {pod.name: metric_points_data for pod in TEST_OBJECT.pods} 68 | for metric in strategy.metrics 69 | }, 70 | ), 71 | ) as mock_prometheus_loader: 72 | mock_prometheus_loader 73 | yield 74 | 75 | 76 | @pytest.fixture(autouse=True, scope="session") 77 | def mock_prometheus_load_pods(): 78 | with patch( 79 | "robusta_krr.core.integrations.prometheus.loader.PrometheusMetricsLoader.load_pods", 80 | new=AsyncMock( 81 | return_value=TEST_OBJECT.pods, 82 | ), 83 | ) as mock_prometheus_loader: 84 | mock_prometheus_loader 85 | yield 86 | 87 | 88 | @pytest.fixture(autouse=True, scope="session") 89 | def mock_prometheus_get_history_range(): 90 | async def get_history_range(self, history_duration: timedelta) -> tuple[datetime, datetime]: 91 | now = datetime.now() 92 | start = now - history_duration 93 | return start, now 94 | 95 | with patch( 96 | "robusta_krr.core.integrations.prometheus.loader.PrometheusMetricsLoader.get_history_range", get_history_range 97 | ): 98 | yield 99 | 100 | 101 | @pytest.fixture(autouse=True, scope="session") 102 | def mock_prometheus_init(): 103 | with patch("robusta_krr.core.integrations.prometheus.loader.PrometheusMetricsLoader.__init__", return_value=None): 104 | yield 105 | -------------------------------------------------------------------------------- /tests/models/test_resource_allocations.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import pytest 4 | 5 | from robusta_krr.core.models.allocations import ResourceAllocations, ResourceType 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "cpu", 10 | [ 11 | {"request": "5m", "limit": None}, 12 | {"request": 0.005, "limit": None}, 13 | ], 14 | ) 15 | @pytest.mark.parametrize( 16 | "memory", 17 | [ 18 | {"request": 128974848, "limit": 128974848}, 19 | {"request": 128.974848e6, "limit": 128.974848e6}, 20 | {"request": "128.9748480M", "limit": "128.9748480M"}, 21 | {"request": "128974848000m", "limit": "128974848000m"}, 22 | {"request": "123Mi", "limit": "123Mi"}, 23 | {"request": "128974848e0", 
"limit": "128974848e0"}, 24 | ], 25 | ) 26 | def test_resource_allocation_supported_formats( 27 | cpu: dict[str, Union[str, int, float, None]], memory: dict[str, Union[str, int, float, None]] 28 | ): 29 | allocations = ResourceAllocations( 30 | requests={ResourceType.CPU: cpu["request"], ResourceType.Memory: memory["request"]}, 31 | limits={ResourceType.CPU: cpu["limit"], ResourceType.Memory: memory["limit"]}, 32 | ) 33 | assert allocations.requests[ResourceType.CPU] == 0.005 34 | assert allocations.limits[ResourceType.CPU] == None 35 | assert (allocations.requests[ResourceType.Memory] // 1) == 128974848.0 36 | assert (allocations.limits[ResourceType.Memory] // 1) == 128974848.0 37 | -------------------------------------------------------------------------------- /tests/single_namespace_as_group.yaml: -------------------------------------------------------------------------------- 1 | # Test environment for per-namespace scans using a group object ID (for e.g. Microsoft Entra) 2 | # The purpose of this setup is to verify that per-namespace features work without cluster level permissions 3 | # You can test this Group and KRR using: 4 | # A user named aksdev that's part of the appdev group. 5 | # krr simple --as aksdev --as-group -n kube-system 6 | apiVersion: rbac.authorization.k8s.io/v1 7 | kind: Role 8 | metadata: 9 | namespace: kube-system 10 | name: krr-role 11 | rules: 12 | - apiGroups: [""] 13 | resources: ["pods", "services"] 14 | verbs: ["get", "watch", "list"] 15 | - apiGroups: ["batch"] 16 | resources: ["jobs"] 17 | verbs: ["get", "watch", "list"] 18 | - apiGroups: ["apps"] 19 | resources: ["deployments", "replicasets", "daemonsets", "statefulsets"] 20 | verbs: ["get", "list", "watch"] 21 | - apiGroups: ["autoscaling"] 22 | resources: ["horizontalpodautoscalers"] 23 | verbs: ["get", "list", "watch"] 24 | --- 25 | apiVersion: rbac.authorization.k8s.io/v1 26 | kind: RoleBinding 27 | metadata: 28 | name: krr-role-binding 29 | namespace: kube-system 30 | subjects: 31 | - kind: Group 32 | # Replace with the actual Group Object ID 33 | name: 34 | apiGroup: rbac.authorization.k8s.io 35 | roleRef: 36 | kind: Role 37 | name: krr-role 38 | apiGroup: rbac.authorization.k8s.io 39 | -------------------------------------------------------------------------------- /tests/single_namespace_permissions.yaml: -------------------------------------------------------------------------------- 1 | # Test environment for per-namespace scans 2 | # The purpose of this setup is to verify that per-namespace features work without cluster level permissions 3 | # You can test this ServiceAccount and KRR using: 4 | # krr simple --as system:serviceaccount:kube-system:krr-account -n kube-system 5 | apiVersion: v1 6 | kind: ServiceAccount 7 | metadata: 8 | name: krr-account 9 | namespace: kube-system 10 | --- 11 | apiVersion: rbac.authorization.k8s.io/v1 12 | kind: Role 13 | metadata: 14 | namespace: kube-system 15 | name: krr-role 16 | rules: 17 | - apiGroups: [""] 18 | resources: ["pods", "services"] 19 | verbs: ["get", "watch", "list"] 20 | - apiGroups: ["batch"] 21 | resources: ["jobs"] 22 | verbs: ["get", "watch", "list"] 23 | - apiGroups: ["apps"] 24 | resources: ["deployments", "replicasets", "daemonsets", "statefulsets"] 25 | verbs: ["get", "list", "watch"] 26 | - apiGroups: ["autoscaling"] 27 | resources: ["horizontalpodautoscalers"] 28 | verbs: ["get", "list", "watch"] 29 | --- 30 | apiVersion: rbac.authorization.k8s.io/v1 31 | kind: RoleBinding 32 | metadata: 33 | name: krr-role-binding 34 | namespace: 
kube-system 35 | subjects: 36 | - kind: ServiceAccount 37 | name: krr-account 38 | namespace: kube-system 39 | roleRef: 40 | kind: Role 41 | name: krr-role 42 | apiGroup: rbac.authorization.k8s.io 43 | -------------------------------------------------------------------------------- /tests/test_krr.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from typing import Literal, Union 3 | from unittest.mock import patch, Mock, MagicMock 4 | from typer.testing import CliRunner 5 | 6 | from robusta_krr.main import app, load_commands 7 | from robusta_krr.core.integrations.kubernetes import ClusterLoader 8 | from robusta_krr.core.models.config import settings 9 | 10 | runner = CliRunner(mix_stderr=False) 11 | load_commands() 12 | 13 | STRATEGY_NAME = "simple" 14 | 15 | 16 | def test_help(): 17 | result = runner.invoke(app, [STRATEGY_NAME, "--help"]) 18 | try: 19 | assert result.exit_code == 0 20 | except AssertionError as e: 21 | raise e from result.exception 22 | 23 | 24 | @pytest.mark.parametrize("log_flag", ["-v", "-q"]) 25 | def test_run(log_flag: str): 26 | result = runner.invoke(app, [STRATEGY_NAME, log_flag, "--namespace", "default"]) 27 | try: 28 | assert result.exit_code == 0, result.stdout 29 | except AssertionError as e: 30 | raise e from result.exception 31 | 32 | 33 | @pytest.mark.parametrize("format", ["json", "yaml", "table", "pprint", "csv"]) 34 | @pytest.mark.parametrize("output", ["--logtostderr", "-q"]) 35 | def test_output_formats(format: str, output: str): 36 | result = runner.invoke(app, [STRATEGY_NAME, output, "-f", format]) 37 | try: 38 | assert result.exit_code == 0, result.exc_info 39 | except AssertionError as e: 40 | raise e from result.exception 41 | 42 | @pytest.mark.parametrize( 43 | "setting_namespaces,cluster_all_ns,expected",[ 44 | ( 45 | # default settings 46 | "*", 47 | ["kube-system", "robusta-frontend", "robusta-backend", "infra-grafana"], 48 | "*" 49 | ), 50 | ( 51 | # list of namespace provided from arguments without regex pattern 52 | ["robusta-krr", "kube-system"], 53 | ["kube-system", "robusta-frontend", "robusta-backend", "robusta-krr"], 54 | ["robusta-krr", "kube-system"] 55 | ), 56 | ( 57 | # list of namespace provided from arguments with regex pattern and will not duplicating in final result 58 | ["robusta-.*", "robusta-frontend"], 59 | ["kube-system", "robusta-frontend", "robusta-backend", "robusta-krr"], 60 | ["robusta-frontend", "robusta-backend", "robusta-krr"] 61 | ), 62 | ( 63 | # namespace provided with regex pattern and will match for some namespaces 64 | [".*end$"], 65 | ["kube-system", "robusta-frontend", "robusta-backend", "robusta-krr"], 66 | ["robusta-frontend", "robusta-backend"] 67 | ) 68 | ] 69 | ) 70 | def test_cluster_namespace_list( 71 | setting_namespaces: Union[Literal["*"], list[str]], 72 | cluster_all_ns: list[str], 73 | expected: Union[Literal["*"], list[str]], 74 | ): 75 | cluster = ClusterLoader() 76 | with patch("robusta_krr.core.models.config.settings.namespaces", setting_namespaces): 77 | with patch.object(cluster.core, "list_namespace", return_value=MagicMock( 78 | items=[MagicMock(**{"metadata.name": m}) for m in cluster_all_ns])): 79 | assert sorted(cluster.namespaces) == sorted(expected) 80 | -------------------------------------------------------------------------------- /tests/test_runner.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from click.testing import Result 3 | from typer.testing import 
CliRunner 4 | 5 | from robusta_krr.main import app, load_commands 6 | 7 | runner = CliRunner(mix_stderr=False) 8 | load_commands() 9 | 10 | 11 | @pytest.mark.parametrize( 12 | "args, expected_exit_code", 13 | [ 14 | (["--exclude-severity", "-f", "csv"], 0), 15 | (["--exclude-severity", "-f", "table"], 2), 16 | (["--exclude-severity"], 2), 17 | ], 18 | ) 19 | def test_exclude_severity_option(args: list[str], expected_exit_code: int) -> None: 20 | result: Result = runner.invoke(app, ["simple", *args]) 21 | assert result.exit_code == expected_exit_code 22 | --------------------------------------------------------------------------------
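Editorial addition (illustration only, not a file in the repository): to show how the strategy settings above turn per-pod samples into a recommendation, the sketch below feeds two fabricated pods' memory series into SimpleStrategySettings.calculate_memory_proposal. The import path and method names come from the sources above; the pod names, timestamps and byte values are invented, and the snippet assumes the robusta_krr package and numpy are importable.

import numpy as np

from robusta_krr.strategies.simple import SimpleStrategySettings

# Each pod maps to an array of (timestamp, value) rows, the same shape the Prometheus loaders return.
pods = {
    "pod-a": np.array([[0.0, 100e6], [60.0, 180e6]]),  # peak ~180 MB
    "pod-b": np.array([[0.0, 150e6], [60.0, 120e6]]),  # peak ~150 MB
}

settings = SimpleStrategySettings()  # memory_buffer_percentage defaults to 15
proposal = settings.calculate_memory_proposal(pods)
print(proposal)  # max(180e6, 150e6) * 1.15 ~= 2.07e8 bytes, used as both request and limit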