├── .coveragerc ├── .dockerignore ├── .envrc ├── .github ├── dependabot.yml └── workflows │ ├── ci.yml │ ├── helm-release.yml │ ├── helm-test.yml │ └── push-image-ghcr.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .python-version ├── Dockerfile ├── Dockerfile.pyinstaller ├── LICENSE.md ├── README.md ├── Taskfile.yml ├── buildInstaller.sh ├── celery-mixin ├── .gitignore ├── Makefile ├── README.md ├── alerts.jsonnet ├── alerts │ └── alerts.libsonnet ├── config.libsonnet ├── dashboards.jsonnet ├── dashboards │ ├── celery-tasks-by-task.libsonnet │ ├── celery-tasks-overview.libsonnet │ └── dashboards.libsonnet ├── dashboards_out │ ├── celery-tasks-by-task.json │ ├── celery-tasks-overview.json │ └── celery-tasks.json ├── jsonnetfile.json ├── mixin.libsonnet ├── prometheus-alerts.yaml └── tests.yaml ├── charts └── celery-exporter │ ├── .helmignore │ ├── Chart.yaml │ ├── README.md │ ├── ci │ └── test-values.yaml │ ├── templates │ ├── NOTES.txt │ ├── _helpers.tpl │ ├── deployment.yaml │ ├── ingress.yaml │ ├── service.yaml │ ├── serviceaccount.yaml │ ├── servicemonitor.yaml │ └── tests │ │ └── test-connection.yaml │ └── values.yaml ├── cli.py ├── conftest.py ├── docker-compose.yml ├── images ├── celery-tasks-by-task.png └── celery-tasks-overview.png ├── jsonnetfile.json ├── jsonnetfile.lock.json ├── poetry.lock ├── pyproject.toml ├── pytest.ini ├── src ├── __init__.py ├── cli.py ├── exporter.py ├── help.py ├── http_server.py ├── test_cli.py ├── test_exporter.py ├── test_http_server.py └── test_metrics.py └── vendor ├── github.com └── honeylogic-io │ └── utils-libsonnet │ └── lib │ ├── celery.libsonnet │ ├── django.libsonnet │ ├── drone.libsonnet │ └── ingress.libsonnet └── lib /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = . 3 | 4 | omit = 5 | .venv/* 6 | .virtualenv/* 7 | 8 | [report] 9 | fail_under = 80 10 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .virtualenv 2 | .venv 3 | .mypy_cache 4 | .pytest_cache 5 | .git 6 | build 7 | dist 8 | images 9 | __pycache__ 10 | vendor 11 | -------------------------------------------------------------------------------- /.envrc: -------------------------------------------------------------------------------- 1 | layout pyenv $(cat .python-version) 2 | layout python 3 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: "pip" 5 | directory: "/" 6 | schedule: 7 | interval: "monthly" 8 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | lint: 12 | name: Lint 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - uses: actions/setup-python@v5 18 | id: setup-python 19 | with: 20 | python-version: 3.13 21 | 22 | - name: Install Poetry 23 | uses: snok/install-poetry@v1 24 | with: 25 | virtualenvs-create: true 26 | virtualenvs-in-project: true 27 | installer-parallel: true 28 | 29 | - name: Load cached venv 30 | id: cached-poetry-dependencies 31 | uses: actions/cache@v4 32 | with: 33 | path: .venv 34 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 35 | 36 | - name: Install dependencies 37 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 38 | run: | 39 | poetry install --no-interaction --no-root 40 | 41 | - name: Format 42 | run: | 43 | source .venv/bin/activate 44 | black . --check 45 | 46 | - name: Type Check 47 | run: | 48 | source .venv/bin/activate 49 | mypy . 50 | 51 | - name: Lint 52 | run: | 53 | source .venv/bin/activate 54 | pylint $(git ls-files -- '*.py' ':!:**/migrations/*.py') 55 | 56 | test: 57 | name: Test 58 | runs-on: ubuntu-latest 59 | services: 60 | redis: 61 | image: redis:6 62 | ports: ['6379:6379'] 63 | rabbitmq: 64 | image: rabbitmq:3 65 | ports: ['5672:5672'] 66 | strategy: 67 | matrix: 68 | broker: [memory, redis, rabbitmq] 69 | steps: 70 | - uses: actions/checkout@v4 71 | 72 | - uses: actions/setup-python@v5 73 | id: setup-python 74 | with: 75 | python-version: 3.13 76 | 77 | - name: Install Poetry 78 | uses: snok/install-poetry@v1 79 | with: 80 | virtualenvs-create: true 81 | virtualenvs-in-project: true 82 | installer-parallel: true 83 | 84 | - name: Load cached venv 85 | id: cached-poetry-dependencies 86 | uses: actions/cache@v4 87 | with: 88 | path: .venv 89 | key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 90 | 91 | - name: Install dependencies 92 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 93 | run: | 94 | poetry install --no-interaction --no-root 95 | source .venv/bin/activate 96 | 97 | - name: Test 98 | run: | 99 | source .venv/bin/activate 100 | pytest --broker=${{ matrix.broker }} --ignore .poetry --cov 101 | -------------------------------------------------------------------------------- /.github/workflows/helm-release.yml: -------------------------------------------------------------------------------- 1 | name: Release Charts 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | release: 10 | # depending on default permission settings for your org (contents being read-only or read-write for workloads), you will have to add permissions 11 | # see: https://docs.github.com/en/actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token 12 | permissions: 13 | contents: write 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v4 18 | with: 19 | fetch-depth: 0 20 | 21 | - name: Configure Git 22 | run: | 23 | git config user.name "$GITHUB_ACTOR" 24 | git config user.email "$GITHUB_ACTOR@users.noreply.github.com" 25 | 26 | - name: Set up Helm 27 | uses: azure/setup-helm@v4.2.0 28 | with: 29 | version: v3.14.4 30 | 31 | - name: Run chart-releaser 32 | uses: helm/chart-releaser-action@v1.6.0 33 | env: 34 | CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}" 35 | CR_RELEASE_NAME_TEMPLATE: "{{ .Name }}-chart-{{ .Version }}" 36 | -------------------------------------------------------------------------------- /.github/workflows/helm-test.yml: -------------------------------------------------------------------------------- 1 | name: Lint and Test Charts 2 | 3 | on: pull_request 4 | 5 | jobs: 6 | lint-test: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v4 11 | with: 12 | fetch-depth: 0 13 | 14 | - name: Set up Helm 15 | uses: azure/setup-helm@v4.2.0 16 | with: 17 | version: v3.14.4 18 | 19 | - uses: actions/setup-python@v5 20 | with: 21 | python-version: "3.x" 22 | check-latest: true 23 | 24 | - name: Set up chart-testing 25 | uses: helm/chart-testing-action@v2.6.1 26 | 27 | - name: Run chart-testing (list-changed) 28 | id: list-changed 29 | run: | 30 | changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }}) 31 | if [[ -n "$changed" ]]; then 32 | echo "changed=true" >> "$GITHUB_OUTPUT" 33 | fi 34 | 35 | - name: Run chart-testing (lint) 36 | if: steps.list-changed.outputs.changed == 'true' 37 | run: ct lint --target-branch ${{ github.event.repository.default_branch }} 38 | 39 | - name: Create kind cluster 40 | if: steps.list-changed.outputs.changed == 'true' 41 | uses: helm/kind-action@v1.10.0 42 | 43 | - name: Run chart-testing (install) 44 | if: steps.list-changed.outputs.changed == 'true' 45 | run: ct install --target-branch ${{ github.event.repository.default_branch }} 46 | -------------------------------------------------------------------------------- /.github/workflows/push-image-ghcr.yml: -------------------------------------------------------------------------------- 1 | name: Create and publish a Docker image to ghcr.io 2 | 3 | on: 4 | push: 5 | tags: 6 | - "v[0-9]+.[0-9]+.[0-9]+" 7 | 8 | env: 9 | REGISTRY: ghcr.io 10 | IMAGE_NAME: ${{ github.repository }} 11 | 12 | jobs: 13 | build-and-push-image: 14 | runs-on: ubuntu-latest 15 | permissions: 16 | contents: write 17 | packages: write 18 | 19 | steps: 20 | - name: Checkout repository 21 | uses: actions/checkout@v4 22 | 23 | - name: Set up QEMU 24 | uses: docker/setup-qemu-action@v3 25 | 26 | - name: Set up Docker Buildx 27 | uses: docker/setup-buildx-action@v3 28 | 29 | - name: Log in to the Container registry 30 | uses: docker/login-action@v3 31 | with: 32 | registry: ${{ env.REGISTRY }} 33 | username: ${{ github.actor }} 34 | password: ${{ secrets.GITHUB_TOKEN }} 35 | 36 | - name: Extract metadata (tags, labels) for Docker 37 | id: meta 38 | uses: docker/metadata-action@v5 39 | with: 40 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 41 | tags: | 42 | type=sha,enable=true,priority=100,prefix=,suffix=,format=short 43 | type=semver,pattern={{version}},value=${{ github.ref_name }} 44 | 45 | - name: Build and push 46 | uses: docker/build-push-action@v6 47 | with: 48 | context: . 49 | platforms: linux/amd64,linux/arm64 50 | provenance: false 51 | push: true 52 | tags: ${{ steps.meta.outputs.tags }} 53 | cache-from: type=gha 54 | cache-to: type=gha,mode=max 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | pytestdebug.log 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | doc/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | pythonenv* 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | # pytype static type analyzer 137 | .pytype/ 138 | 139 | # profiling data 140 | .prof 141 | 142 | # End of https://www.toptal.com/developers/gitignore/api/python 143 | 144 | .virtualenv 145 | .direnv 146 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: black 5 | name: black 6 | entry: poetry 7 | language: system 8 | types: [python] 9 | args: 10 | - run 11 | - black 12 | 13 | - repo: local 14 | hooks: 15 | - id: mypy 16 | name: mypy 17 | entry: poetry 18 | language: system 19 | pass_filenames: false 20 | args: 21 | - run 22 | - mypy 23 | - . 24 | 25 | - repo: local 26 | hooks: 27 | - id: pylint 28 | name: pylint 29 | entry: poetry 30 | language: system 31 | types: [python] 32 | args: 33 | - run 34 | - pylint 35 | - "-rn" # Only display messages 36 | - "-sn" # Don't display the score 37 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Specify a score threshold to be exceeded before program exits with error. 9 | fail-under=10.0 10 | 11 | # Add files or directories to the blacklist. They should be base names, not 12 | # paths. 13 | ignore=CVS 14 | 15 | # Add files or directories matching the regex patterns to the blacklist. The 16 | # regex matches against base names, not paths. 17 | ignore-patterns= 18 | 19 | # Python code to execute, usually for sys.path manipulation such as 20 | # pygtk.require(). 21 | #init-hook= 22 | 23 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 24 | # number of processors available to use. 25 | jobs=1 26 | 27 | # Control the amount of potential inferred values when inferring a single 28 | # object. This can help the performance when dealing with large functions or 29 | # complex, nested conditions. 30 | limit-inference-results=100 31 | 32 | # List of plugins (as comma separated values of python module names) to load, 33 | # usually to register additional checkers. 34 | load-plugins= 35 | 36 | # Pickle collected data for later comparisons. 37 | persistent=yes 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 63 | disable= 64 | missing-function-docstring, 65 | missing-module-docstring, 66 | invalid-name, 67 | redefined-outer-name, 68 | missing-class-docstring, 69 | fixme, 70 | unnecessary-lambda-assignment, 71 | use-dict-literal 72 | 73 | # Enable the message, report, category or checker with the given id(s). You can 74 | # either give multiple identifier separated by comma (,) or put this option 75 | # multiple time (only on the command line, not in the configuration file where 76 | # it should appear only once). See also the "--disable" option for examples. 77 | enable=c-extension-no-member 78 | 79 | 80 | [REPORTS] 81 | 82 | # Python expression which should return a score less than or equal to 10. You 83 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 84 | # which contain the number of messages in each category, as well as 'statement' 85 | # which is the total number of statements analyzed. This score is used by the 86 | # global evaluation report (RP0004). 87 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 88 | 89 | # Template used to display messages. This is a python new-style format string 90 | # used to format the message information. See doc for all details. 91 | #msg-template= 92 | 93 | # Set the output format. Available formats are text, parseable, colorized, json 94 | # and msvs (visual studio). You can also give a reporter class, e.g. 95 | # mypackage.mymodule.MyReporterClass. 96 | output-format=text 97 | 98 | # Tells whether to display a full report or only the messages. 99 | reports=no 100 | 101 | # Activate the evaluation score. 102 | score=yes 103 | 104 | 105 | [REFACTORING] 106 | 107 | # Maximum number of nested blocks for function / method body 108 | max-nested-blocks=5 109 | 110 | # Complete name of functions that never returns. When checking for 111 | # inconsistent-return-statements if a never returning function is called then 112 | # it will be considered as an explicit return statement and no message will be 113 | # printed. 114 | never-returning-functions=sys.exit 115 | 116 | 117 | [TYPECHECK] 118 | 119 | # List of decorators that produce context managers, such as 120 | # contextlib.contextmanager. Add to this list to register other decorators that 121 | # produce valid context managers. 122 | contextmanager-decorators=contextlib.contextmanager 123 | 124 | # List of members which are set dynamically and missed by pylint inference 125 | # system, and so shouldn't trigger E1101 when accessed. Python regular 126 | # expressions are accepted. 127 | generated-members= 128 | 129 | # Tells whether missing members accessed in mixin class should be ignored. A 130 | # mixin class is detected if its name ends with "mixin" (case insensitive). 131 | ignore-mixin-members=yes 132 | 133 | # Tells whether to warn about missing members when the owner of the attribute 134 | # is inferred to be None. 135 | ignore-none=yes 136 | 137 | # This flag controls whether pylint should warn about no-member and similar 138 | # checks whenever an opaque object is returned when inferring. The inference 139 | # can return multiple potential results while evaluating a Python object, but 140 | # some branches might not be evaluated, which results in partial inference. In 141 | # that case, it might be useful to still emit no-member and other checks for 142 | # the rest of the inferred objects. 143 | ignore-on-opaque-inference=yes 144 | 145 | # List of class names for which member attributes should not be checked (useful 146 | # for classes with dynamically set attributes). This supports the use of 147 | # qualified names. 148 | ignored-classes=optparse.Values,thread._local,_thread._local 149 | 150 | # List of module names for which member attributes should not be checked 151 | # (useful for modules/projects where namespaces are manipulated during runtime 152 | # and thus existing member attributes cannot be deduced by static analysis). It 153 | # supports qualified module names, as well as Unix pattern matching. 154 | ignored-modules= 155 | 156 | # Show a hint with possible names when a member name was not found. The aspect 157 | # of finding the hint is based on edit distance. 158 | missing-member-hint=yes 159 | 160 | # The minimum edit distance a name should have in order to be considered a 161 | # similar match for a missing member name. 162 | missing-member-hint-distance=1 163 | 164 | # The total number of similar names that should be taken in consideration when 165 | # showing a hint for a missing member. 166 | missing-member-max-choices=1 167 | 168 | # List of decorators that change the signature of a decorated function. 169 | signature-mutators= 170 | 171 | 172 | [VARIABLES] 173 | 174 | # List of additional names supposed to be defined in builtins. Remember that 175 | # you should avoid defining new builtins when possible. 176 | additional-builtins= 177 | 178 | # Tells whether unused global variables should be treated as a violation. 179 | allow-global-unused-variables=yes 180 | 181 | # List of strings which can identify a callback function by name. A callback 182 | # name must start or end with one of those strings. 183 | callbacks=cb_, 184 | _cb 185 | 186 | # A regular expression matching the name of dummy variables (i.e. expected to 187 | # not be used). 188 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 189 | 190 | # Argument names that match this expression will be ignored. Default to name 191 | # with leading underscore. 192 | ignored-argument-names=_.*|^ignored_|^unused_ 193 | 194 | # Tells whether we should check for unused import in __init__ files. 195 | init-import=no 196 | 197 | # List of qualified module names which can have objects that can redefine 198 | # builtins. 199 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 200 | 201 | 202 | [BASIC] 203 | 204 | # Naming style matching correct argument names. 205 | argument-naming-style=snake_case 206 | 207 | # Regular expression matching correct argument names. Overrides argument- 208 | # naming-style. 209 | #argument-rgx= 210 | 211 | # Naming style matching correct attribute names. 212 | attr-naming-style=snake_case 213 | 214 | # Regular expression matching correct attribute names. Overrides attr-naming- 215 | # style. 216 | #attr-rgx= 217 | 218 | # Bad variable names which should always be refused, separated by a comma. 219 | bad-names=foo, 220 | bar, 221 | baz, 222 | toto, 223 | tutu, 224 | tata 225 | 226 | # Bad variable names regexes, separated by a comma. If names match any regex, 227 | # they will always be refused 228 | bad-names-rgxs= 229 | 230 | # Naming style matching correct class attribute names. 231 | class-attribute-naming-style=any 232 | 233 | # Regular expression matching correct class attribute names. Overrides class- 234 | # attribute-naming-style. 235 | #class-attribute-rgx= 236 | 237 | # Naming style matching correct class names. 238 | class-naming-style=PascalCase 239 | 240 | # Regular expression matching correct class names. Overrides class-naming- 241 | # style. 242 | #class-rgx= 243 | 244 | # Naming style matching correct constant names. 245 | const-naming-style=UPPER_CASE 246 | 247 | # Regular expression matching correct constant names. Overrides const-naming- 248 | # style. 249 | #const-rgx= 250 | 251 | # Minimum line length for functions/classes that require docstrings, shorter 252 | # ones are exempt. 253 | docstring-min-length=-1 254 | 255 | # Naming style matching correct function names. 256 | function-naming-style=snake_case 257 | 258 | # Regular expression matching correct function names. Overrides function- 259 | # naming-style. 260 | #function-rgx= 261 | 262 | # Good variable names which should always be accepted, separated by a comma. 263 | good-names=i, 264 | j, 265 | k, 266 | ex, 267 | Run, 268 | _ 269 | 270 | # Good variable names regexes, separated by a comma. If names match any regex, 271 | # they will always be accepted 272 | good-names-rgxs= 273 | 274 | # Include a hint for the correct naming format with invalid-name. 275 | include-naming-hint=no 276 | 277 | # Naming style matching correct inline iteration names. 278 | inlinevar-naming-style=any 279 | 280 | # Regular expression matching correct inline iteration names. Overrides 281 | # inlinevar-naming-style. 282 | #inlinevar-rgx= 283 | 284 | # Naming style matching correct method names. 285 | method-naming-style=snake_case 286 | 287 | # Regular expression matching correct method names. Overrides method-naming- 288 | # style. 289 | #method-rgx= 290 | 291 | # Naming style matching correct module names. 292 | module-naming-style=snake_case 293 | 294 | # Regular expression matching correct module names. Overrides module-naming- 295 | # style. 296 | #module-rgx= 297 | 298 | # Colon-delimited sets of names that determine each other's naming style when 299 | # the name regexes allow several styles. 300 | name-group= 301 | 302 | # Regular expression which should only match function or class names that do 303 | # not require a docstring. 304 | no-docstring-rgx=^_ 305 | 306 | # List of decorators that produce properties, such as abc.abstractproperty. Add 307 | # to this list to register other decorators that produce valid properties. 308 | # These decorators are taken in consideration only for invalid-name. 309 | property-classes=abc.abstractproperty 310 | 311 | # Naming style matching correct variable names. 312 | variable-naming-style=snake_case 313 | 314 | # Regular expression matching correct variable names. Overrides variable- 315 | # naming-style. 316 | #variable-rgx= 317 | 318 | 319 | [SPELLING] 320 | 321 | # Limits count of emitted suggestions for spelling mistakes. 322 | max-spelling-suggestions=4 323 | 324 | # Spelling dictionary name. Available dictionaries: none. To make it work, 325 | # install the python-enchant package. 326 | spelling-dict= 327 | 328 | # List of comma separated words that should not be checked. 329 | spelling-ignore-words= 330 | 331 | # A path to a file that contains the private dictionary; one word per line. 332 | spelling-private-dict-file= 333 | 334 | # Tells whether to store unknown words to the private dictionary (see the 335 | # --spelling-private-dict-file option) instead of raising a message. 336 | spelling-store-unknown-words=no 337 | 338 | 339 | [MISCELLANEOUS] 340 | 341 | # List of note tags to take in consideration, separated by a comma. 342 | notes=FIXME, 343 | XXX, 344 | TODO 345 | 346 | # Regular expression of note tags to take in consideration. 347 | #notes-rgx= 348 | 349 | 350 | [STRING] 351 | 352 | # This flag controls whether inconsistent-quotes generates a warning when the 353 | # character used as a quote delimiter is used inconsistently within a module. 354 | check-quote-consistency=no 355 | 356 | # This flag controls whether the implicit-str-concat should generate a warning 357 | # on implicit string concatenation in sequences defined over several lines. 358 | check-str-concat-over-line-jumps=no 359 | 360 | 361 | [LOGGING] 362 | 363 | # The type of string formatting that logging methods do. `old` means using % 364 | # formatting, `new` is for `{}` formatting. 365 | logging-format-style=old 366 | 367 | # Logging modules to check that the string format arguments are in logging 368 | # function parameter format. 369 | logging-modules=logging 370 | 371 | 372 | [FORMAT] 373 | 374 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 375 | expected-line-ending-format= 376 | 377 | # Regexp for a line that is allowed to be longer than the limit. 378 | ignore-long-lines=^\s*(# )??$ 379 | 380 | # Number of spaces of indent required inside a hanging or continued line. 381 | indent-after-paren=4 382 | 383 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 384 | # tab). 385 | indent-string=' ' 386 | 387 | # Maximum number of characters on a single line. 388 | max-line-length=100 389 | 390 | # Maximum number of lines in a module. 391 | max-module-lines=1000 392 | 393 | # Allow the body of a class to be on the same line as the declaration if body 394 | # contains single statement. 395 | single-line-class-stmt=no 396 | 397 | # Allow the body of an if to be on the same line as the test if there is no 398 | # else. 399 | single-line-if-stmt=no 400 | 401 | 402 | [SIMILARITIES] 403 | 404 | # Ignore comments when computing similarities. 405 | ignore-comments=yes 406 | 407 | # Ignore docstrings when computing similarities. 408 | ignore-docstrings=yes 409 | 410 | # Ignore imports when computing similarities. 411 | ignore-imports=no 412 | 413 | # Minimum lines number of a similarity. 414 | min-similarity-lines=5 415 | 416 | 417 | [DESIGN] 418 | 419 | # Maximum number of arguments for function / method. 420 | max-args=5 421 | 422 | # Maximum number of attributes for a class (see R0902). 423 | max-attributes=7 424 | 425 | # Maximum number of boolean expressions in an if statement (see R0916). 426 | max-bool-expr=5 427 | 428 | # Maximum number of branch for function / method body. 429 | max-branches=12 430 | 431 | # Maximum number of locals for function / method body. 432 | max-locals=15 433 | 434 | # Maximum number of parents for a class (see R0901). 435 | max-parents=7 436 | 437 | # Maximum number of public methods for a class (see R0904). 438 | max-public-methods=20 439 | 440 | # Maximum number of return / yield for function / method body. 441 | max-returns=6 442 | 443 | # Maximum number of statements in function / method body. 444 | max-statements=50 445 | 446 | # Minimum number of public methods for a class (see R0903). 447 | min-public-methods=2 448 | 449 | 450 | [IMPORTS] 451 | 452 | # List of modules that can be imported at any level, not just the top level 453 | # one. 454 | allow-any-import-level= 455 | 456 | # Allow wildcard imports from modules that define __all__. 457 | allow-wildcard-with-all=no 458 | 459 | # Analyse import fallback blocks. This can be used to support both Python 2 and 460 | # 3 compatible code, which means that the block might have code that exists 461 | # only in one or another interpreter, leading to false positives when analysed. 462 | analyse-fallback-blocks=no 463 | 464 | # Deprecated modules which should not be used, separated by a comma. 465 | deprecated-modules=optparse,tkinter.tix 466 | 467 | # Create a graph of external dependencies in the given file (report RP0402 must 468 | # not be disabled). 469 | ext-import-graph= 470 | 471 | # Create a graph of every (i.e. internal and external) dependencies in the 472 | # given file (report RP0402 must not be disabled). 473 | import-graph= 474 | 475 | # Create a graph of internal dependencies in the given file (report RP0402 must 476 | # not be disabled). 477 | int-import-graph= 478 | 479 | # Force import order to recognize a module as part of the standard 480 | # compatibility libraries. 481 | known-standard-library= 482 | 483 | # Force import order to recognize a module as part of a third party library. 484 | known-third-party=enchant 485 | 486 | # Couples of modules and preferred modules, separated by a comma. 487 | preferred-modules= 488 | 489 | 490 | [CLASSES] 491 | 492 | # List of method names used to declare (i.e. assign) instance attributes. 493 | defining-attr-methods=__init__, 494 | __new__, 495 | setUp, 496 | __post_init__ 497 | 498 | # List of member names, which should be excluded from the protected access 499 | # warning. 500 | exclude-protected=_asdict, 501 | _fields, 502 | _replace, 503 | _source, 504 | _make 505 | 506 | # List of valid names for the first argument in a class method. 507 | valid-classmethod-first-arg=cls 508 | 509 | # List of valid names for the first argument in a metaclass class method. 510 | valid-metaclass-classmethod-first-arg=cls 511 | 512 | 513 | [EXCEPTIONS] 514 | 515 | # Exceptions that will emit a warning when being caught. Defaults to 516 | # "BaseException, Exception". 517 | overgeneral-exceptions=builtins.BaseException, 518 | builtins.Exception 519 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.13.3 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Stage 1: Build 2 | FROM python:3.13-slim-bookworm as builder 3 | 4 | ENV PYTHONUNBUFFERED=1 \ 5 | POETRY_NO_INTERACTION=1 \ 6 | POETRY_VIRTUALENVS_IN_PROJECT=1 \ 7 | POETRY_VIRTUALENVS_CREATE=1 \ 8 | POETRY_CACHE_DIR=/tmp/poetry_cache 9 | 10 | WORKDIR /app/ 11 | COPY pyproject.toml poetry.lock /app/ 12 | RUN apt-get update && \ 13 | apt-get -y dist-upgrade && \ 14 | apt install -y locales libcurl4-openssl-dev libssl-dev build-essential && \ 15 | apt-get clean && \ 16 | rm -rf /var/lib/apt/lists/* && \ 17 | pip install -U pip poetry && \ 18 | poetry install --without dev --no-root && \ 19 | rm -rf $POETRY_CACHE_DIR 20 | 21 | # Stage 2: Runtime environment 22 | FROM python:3.13-slim-bookworm 23 | 24 | ENV PYTHONUNBUFFERED=1 \ 25 | VIRTUAL_ENV=/app/.venv \ 26 | PATH="/app/.venv/bin:$PATH" 27 | 28 | COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} 29 | COPY . /app/ 30 | 31 | EXPOSE 9808 32 | 33 | RUN adduser --disabled-login exporter 34 | 35 | USER exporter 36 | 37 | ENTRYPOINT ["python", "/app/cli.py"] 38 | -------------------------------------------------------------------------------- /Dockerfile.pyinstaller: -------------------------------------------------------------------------------- 1 | FROM danihodovic/pyinstaller-builder:latest 2 | 3 | ARG PYTHON_VERSION=3.12 4 | 5 | RUN pyenv install $PYTHON_VERSION && pyenv global $PYTHON_VERSION 6 | 7 | RUN pip install poetry 8 | 9 | WORKDIR /app/ 10 | 11 | COPY ./pyproject.toml ./poetry.lock /app/ 12 | 13 | RUN poetry install 14 | 15 | COPY . /app/ 16 | 17 | RUN eval "$(pyenv init -)" && pyinstaller cli.py -y --onefile --name celery-exporter \ 18 | --hidden-import=celery.fixups \ 19 | --hidden-import=celery.fixups.django \ 20 | --hidden-import=celery.app.events \ 21 | --hidden-import=celery.loaders.app \ 22 | --hidden-import=celery.app.amqp \ 23 | --hidden-import=celery.app.control \ 24 | --hidden-import=kombu.transport.redis \ 25 | --hidden-import=kombu.transport.pyamqp 26 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Dani Hodovic 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # celery-exporter ![Build Status](https://github.com/danihodovic/celery-exporter/actions/workflows/.github/workflows/ci.yml/badge.svg) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 2 | 3 | ![celery-tasks-by-task](images/celery-tasks-by-task.png) 4 | 5 | ##### Table of Contents 6 | 7 | * [Why another exporter?](#why-another-exporter) 8 | * [Features](#features) 9 | * [Usage](#usage) 10 | * [Enable events using the CLI](#enable-events-using-the-cli) 11 | * [Running the exporter](#running-the-exporter) 12 | * [Metrics](#metrics) 13 | * [Development](#development) 14 | * [Contributors](#contributors) 15 | 16 | ### Why another exporter? 17 | 18 | While I was adding Celery monitoring to a client site I realized that the 19 | existing brokers either didn't work, exposed incorrect metric values or didn't 20 | expose the metrics I needed. So I wrote this exporter which essentially wraps 21 | the built-in Celery monitoring API and exposes all of the event metrics to 22 | Prometheus in real-time. 23 | 24 | ## Features 25 | 26 | - Tested for both Redis and RabbitMQ 27 | - Uses the built in [real-time monitoring component in Celery](https://docs.celeryproject.org/en/latest/userguide/monitoring.html#real-time-processing) to expose Prometheus metrics 28 | - Tracks task status (task-started, task-succeeded, task-failed etc) 29 | - Tracks which workers are running and the number of active tasks 30 | - Follows the Prometheus exporter [best practises](https://prometheus.io/docs/instrumenting/writing_exporters/) 31 | - Deployed as a Docker image or Python single-file binary (via PyInstaller) 32 | - Exposes a health check endpoint at /health 33 | - Grafana dashboards provided by the Celery-mixin 34 | - Prometheus alerts provided by the Celery-mixin 35 | 36 | ## Dashboards and alerts 37 | 38 | Alerting rules can be found [here](./celery-mixin/prometheus-alerts.yaml). By 39 | default we alert if: 40 | 41 | - A task failed in the last 10 minutes. 42 | - No Celery workers are online. 43 | 44 | Tweak these to suit your use-case. 45 | 46 | The Grafana dashboard (seen in the image above) is 47 | [here](https://grafana.com/grafana/dashboards/17508). You can import it 48 | directly into your Grafana instance. 49 | 50 | There's another Grafana dashboards that shows an overview of Celery tasks. An image can be found in `./images/celery-tasks-overview.png`. It can also be found 51 | [here](https://grafana.com/grafana/dashboards/17509). 52 | 53 | ## Usage 54 | 55 | Celery needs to be configured to send events to the broker which the exporter 56 | will collect. You can either enable this via Celery configuration or via the 57 | Celery CLI. 58 | 59 | ##### Enable events using the CLI 60 | 61 | To enable events in the CLI run the below command. Note that by default it 62 | doesn't send the `task-sent` event which needs to be [configured](https://docs.celeryproject.org/en/latest/userguide/configuration.html#std-setting-task_send_sent_event) in the 63 | configuration. The other events work out of the box. 64 | 65 | ```sh 66 | $ celery -A control enable_events 67 | ``` 68 | 69 | **Enable events using the configuration:** 70 | 71 | ```python 72 | # In celeryconfig.py 73 | worker_send_task_events = True 74 | task_send_sent_event = True 75 | ``` 76 | 77 | **Configuration in Django:** 78 | ```python 79 | # In settings.py 80 | CELERY_WORKER_SEND_TASK_EVENTS = True 81 | CELERY_TASK_SEND_SENT_EVENT = True 82 | ``` 83 | 84 | ##### Running the exporter 85 | 86 | Using Docker: 87 | 88 | ```sh 89 | docker run -p 9808:9808 danihodovic/celery-exporter --broker-url=redis://redis.service.consul/1 90 | ``` 91 | 92 | Using the Python binary (for-non Docker environments): 93 | ```sh 94 | curl -L https://github.com/danihodovic/celery-exporter/releases/download/latest/celery-exporter -o ./celery-exporter 95 | chmod +x ./celery-exporter 96 | ./celery-exporter --broker-url=redis://redis.service.consul/1 97 | ``` 98 | 99 | ###### Kubernetes 100 | 101 | There's a Helm in the directory `charts/celery-exporter` for deploying the Celery-exporter to Kubernetes using Helm. 102 | 103 | ###### Environment variables 104 | 105 | All arguments can be specified using environment variables with a `CE_` prefix: 106 | 107 | ```sh 108 | docker run -p 9808:9808 -e CE_BROKER_URL=redis://redis danihodovic/celery-exporter 109 | ``` 110 | 111 | ###### Specifying optional broker transport options 112 | 113 | While the default options may be fine for most cases, 114 | there may be a need to specify optional broker transport options. This can be done by specifying 115 | one or more --broker-transport-option parameters as follows: 116 | 117 | ```sh 118 | docker run -p 9808:9808 danihodovic/celery-exporter --broker-url=redis://redis.service.consul/1 \ 119 | --broker-transport-option global_keyprefix=danihodovic \ 120 | --broker-transport-option visibility_timeout=7200 121 | ``` 122 | 123 | In case of extended transport options, such as `sentinel_kwargs` you can pass JSON string:, 124 | for example: 125 | 126 | ```sh 127 | docker run -p 9808:9808 danihodovic/celery-exporter --broker-url=sentinel://sentinel.service.consul/1 \ 128 | --broker-transport-option master_name=my_master \ 129 | --broker-transport-option sentinel_kwargs="{\"password\": \"sentinelpass\"}" 130 | ``` 131 | 132 | The list of available broker transport options can be found here: 133 | https://docs.celeryq.dev/projects/kombu/en/stable/reference/kombu.transport.redis.html 134 | 135 | ###### Specifying an optional retry interval 136 | 137 | By default, celery-exporter will raise an exception and exit if there 138 | are any errors communicating with the broker. If preferred, one can 139 | have the celery-exporter retry connecting to the broker after a certain 140 | period of time in seconds via the `--retry-interval` parameter as follows: 141 | 142 | ```sh 143 | docker run -p 9808:9808 danihodovic/celery-exporter --broker-url=redis://redis.service.consul/1 \ 144 | --retry-interval=5 145 | ``` 146 | 147 | ##### Test for prometheus scrape target 148 | ```sh 149 | curl 127.0.0.1:9808/metrics 150 | ``` 151 | 152 | ##### Grafana Dashboards & Prometheus Alerts 153 | 154 | Head over to the [Celery-mixin in this subdirectory](https://github.com/danihodovic/celery-exporter/tree/master/celery-mixin) to generate rules and dashboards suited to your Prometheus setup. 155 | 156 | ### Metrics 157 | Name | Description | Type 158 | ---------|-------------|---- 159 | celery_task_sent_total | Sent when a task message is published. | Counter 160 | celery_task_received_total | Sent when the worker receives a task. | Counter 161 | celery_task_started_total | Sent just before the worker executes the task. | Counter 162 | celery_task_succeeded_total | Sent if the task executed successfully. | Counter 163 | celery_task_failed_total | Sent if the execution of the task failed. | Counter 164 | celery_task_rejected_total | The task was rejected by the worker, possibly to be re-queued or moved to a dead letter queue. | Counter 165 | celery_task_revoked_total | Sent if the task has been revoked. | Counter 166 | celery_task_retried_total | Sent if the task failed, but will be retried in the future. | Counter 167 | celery_worker_up | Indicates if a worker has recently sent a heartbeat. | Gauge 168 | celery_worker_tasks_active | The number of tasks the worker is currently processing | Gauge 169 | celery_task_runtime_bucket | Histogram of runtime measurements for each task | Histogram 170 | celery_queue_length | The number of message in broker queue | Gauge 171 | celery_active_consumer_count | The number of active consumer in broker queue **(Only work for [RabbitMQ and Qpid](https://qpid.apache.org/) broker, more details at [here](https://github.com/danihodovic/celery-exporter/pull/118#issuecomment-1169870481))** | Gauge 172 | celery_active_worker_count | The number of active workers in broker queue | Gauge 173 | celery_active_process_count | The number of active process in broker queue. Each worker may have more than one process. | Gauge 174 | 175 | Used in production at [https://findwork.dev](https://findwork.dev) and [https://django.wtf](https://django.wtf). 176 | 177 | 178 | ## Development 179 | Pull requests are welcome here! 180 | 181 | To start developing run commands below to prepare your environment after the `git clone` command: 182 | ```shell 183 | # Install dependencies and pre-commit hooks 184 | poetry install 185 | pre-commit install 186 | 187 | # Test everything works fine 188 | pre-commit run --all-files 189 | docker-compose up -d 190 | pytest --broker=memory --log-level=DEBUG 191 | pytest --broker=redis --log-level=DEBUG 192 | pytest --broker=rabbitmq --log-level=DEBUG 193 | ``` 194 | 195 | ## Contributors 196 | 197 | 198 | 199 | 200 | 201 | Made with [contrib.rocks](https://contrib.rocks). 202 | -------------------------------------------------------------------------------- /Taskfile.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # yamllint disable rule:line-length 3 | version: '3' 4 | 5 | tasks: 6 | build-image: 7 | desc: Builds a docker image 8 | cmds: 9 | - docker build . -t danihodovic/celery-exporter 10 | 11 | trivy-scan: 12 | desc: Scans the docker image for vulnerabilities 13 | cmds: 14 | - trivy image --severity CRITICAL,HIGH --ignore-unfixed danihodovic/celery-exporter:latest 15 | 16 | build-binary: 17 | desc: Creates a binary 18 | cmds: 19 | - docker build . -t celery-exporter-builder -f Dockerfile.pyinstaller --build-arg PYTHON_VERSION=$(cat .python-version) 20 | - > 21 | container=$(docker run --rm -d celery-exporter-builder sleep 5) && 22 | docker cp $container:/app/dist/celery-exporter celery-exporter 23 | 24 | release: 25 | desc: Creates a Github release 26 | deps: [build-binary] 27 | cmds: 28 | - git tag --delete latest 29 | - git tag -a latest -m 'Latest build' 30 | - > 31 | github-release delete 32 | --user danihodovic 33 | --repo celery-exporter 34 | --tag latest 35 | - > 36 | github-release release 37 | --user danihodovic 38 | --repo celery-exporter 39 | --tag latest 40 | --name celery-exporter 41 | --description "Celery exporter for Prometheus" 42 | - > 43 | github-release upload 44 | --user danihodovic 45 | --repo celery-exporter 46 | --tag latest 47 | --name celery-exporter 48 | --file ./celery-exporter 49 | -------------------------------------------------------------------------------- /buildInstaller.sh: -------------------------------------------------------------------------------- 1 | docker build -f Dockerfile.pyinstaller . -t cel-ex-builder 2 | docker rm celex -f 3 | docker run --name celex -d cel-ex-builder 4 | rm celery-exporter 5 | docker cp celex:/app/dist/celery-exporter . -------------------------------------------------------------------------------- /celery-mixin/.gitignore: -------------------------------------------------------------------------------- 1 | vendor 2 | jsonnetfile.lock.json 3 | -------------------------------------------------------------------------------- /celery-mixin/Makefile: -------------------------------------------------------------------------------- 1 | JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s 2 | 3 | all: fmt prometheus-alerts.yaml dashboards_out lint 4 | 5 | fmt: 6 | find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ 7 | xargs -n 1 -- $(JSONNET_FMT) -i 8 | 9 | prometheus-alerts.yaml: mixin.libsonnet config.libsonnet $(wildcard alerts/*) 10 | jsonnet -S alerts.jsonnet > $@ 11 | 12 | dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*) 13 | @mkdir -p dashboards_out 14 | jsonnet -J vendor -m dashboards_out dashboards.jsonnet 15 | 16 | lint: prometheus-alerts.yaml 17 | find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ 18 | while read f; do \ 19 | $(JSONNET_FMT) "$$f" | diff -u "$$f" -; \ 20 | done 21 | 22 | promtool check rules prometheus-alerts.yaml 23 | 24 | test: prometheus-alerts.yaml 25 | promtool test rules tests.yaml 26 | 27 | clean: 28 | rm -rf dashboards_out prometheus-alerts.yaml 29 | -------------------------------------------------------------------------------- /celery-mixin/README.md: -------------------------------------------------------------------------------- 1 | # Prometheus Monitoring Mixin for Celery 2 | 3 | A set of Grafana dashboards and Prometheus alerts for Celery. 4 | 5 | ## How to use 6 | 7 | This mixin is designed to be vendored into the repo with your infrastructure config. 8 | To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler): 9 | 10 | You then have three options for deploying your dashboards 11 | 12 | 1. Generate the config files and deploy them yourself 13 | 2. Use jsonnet to deploy this mixin along with Prometheus and Grafana 14 | 3. Use prometheus-operator to deploy this mixin 15 | 16 | ## Generate config files 17 | 18 | You can manually generate the alerts, dashboards and rules files, but first you 19 | must install some tools: 20 | 21 | ```sh 22 | go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb 23 | brew install jsonnet 24 | ``` 25 | 26 | Then, grab the mixin and its dependencies: 27 | 28 | ```sh 29 | git clone https://github.com/danihodovic/celery-exporter 30 | cd celery-exporter/celery-mixin 31 | jb install 32 | ``` 33 | 34 | Finally, build the mixin: 35 | 36 | ```sh 37 | make prometheus-alerts.yaml 38 | make dashboards_out 39 | ``` 40 | 41 | The `prometheus-alerts.yaml` file then need to passed 42 | to your Prometheus server, and the files in `dashboards_out` need to be imported 43 | into you Grafana server. The exact details will depending on how you deploy your 44 | monitoring stack. 45 | 46 | ## Alerts 47 | 48 | The mixin follows the [monitoring-mixins guidelines](https://github.com/monitoring-mixins/docs#guidelines-for-alert-names-labels-and-annotations) for alerts. 49 | -------------------------------------------------------------------------------- /celery-mixin/alerts.jsonnet: -------------------------------------------------------------------------------- 1 | std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts) 2 | -------------------------------------------------------------------------------- /celery-mixin/alerts/alerts.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | prometheusAlerts+:: { 3 | groups+: [ 4 | { 5 | name: 'celery', 6 | rules: std.prune([ 7 | { 8 | alert: 'CeleryTaskHighFailRate', 9 | expr: ||| 10 | sum( 11 | increase( 12 | celery_task_failed_total{ 13 | %(celerySelector)s, 14 | queue_name!~"%(celeryIgnoredQueues)s", 15 | name!~"%(celeryIgnoredTasks)s" 16 | }[%(celeryTaskFailedInterval)s] 17 | ) 18 | ) by (job, namespace, queue_name, name) 19 | / 20 | ( 21 | sum( 22 | increase( 23 | celery_task_failed_total{ 24 | %(celerySelector)s, 25 | queue_name!~"%(celeryIgnoredQueues)s", 26 | name!~"%(celeryIgnoredTasks)s" 27 | }[%(celeryTaskFailedInterval)s] 28 | ) 29 | ) by (job, namespace, queue_name, name) 30 | + 31 | sum( 32 | increase( 33 | celery_task_succeeded_total{ 34 | %(celerySelector)s, 35 | queue_name!~"%(celeryIgnoredQueues)s", 36 | name!~"%(celeryIgnoredTasks)s" 37 | }[%(celeryTaskFailedInterval)s] 38 | ) 39 | ) by (job, namespace, queue_name, name) 40 | ) 41 | * 100 > %(celeryTaskFailedThreshold)s 42 | ||| % $._config, 43 | annotations: { 44 | summary: 'Celery high task fail rate.', 45 | description: 'More than %(celeryTaskFailedThreshold)s%% tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name }}/{{ $labels.name }} the past %(celeryTaskFailedInterval)s.' % $._config, 46 | dashboard_url: $._config.celeryTasksByTaskUrl + '?var-job={{ $labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name }}', 47 | }, 48 | 'for': '1m', 49 | labels: { 50 | severity: 'warning', 51 | }, 52 | }, 53 | if $._config.celeryCeleryHighQueueLengthAlertEnabled then { 54 | alert: 'CeleryHighQueueLength', 55 | expr: ||| 56 | sum( 57 | celery_queue_length{ 58 | %(celerySelector)s, 59 | queue_name!~"%(celeryIgnoredQueues)s" 60 | } 61 | ) by (job, namespace, queue_name) 62 | > %(celeryHighQueueLengthThreshold)s 63 | ||| % $._config, 64 | 'for': $._config.celeryHighQueueLengthInterval, 65 | labels: { 66 | severity: 'warning', 67 | }, 68 | annotations: { 69 | summary: 'Celery high queue length.', 70 | description: 'More than %(celeryHighQueueLengthThreshold)s tasks in the queue {{ $labels.job }}/{{ $labels.queue_name }} the past %(celeryHighQueueLengthInterval)s.' % $._config, 71 | dashboard_url: $._config.celeryTasksOverviewUrl + '?&var-job={{ $labels.job }}&var-queue_name={{ $labels.queue_name }}', 72 | }, 73 | }, 74 | if $._config.celeryWorkerDownAlertEnabled then { 75 | alert: 'CeleryWorkerDown', 76 | expr: ||| 77 | celery_worker_up{%(celerySelector)s} == 0 78 | ||| % $._config, 79 | 'for': $._config.celeryWorkerDownInterval, 80 | labels: { 81 | severity: 'warning', 82 | }, 83 | annotations: { 84 | summary: 'A Celery worker is offline.', 85 | description: 'The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline.', 86 | dashboard_url: $._config.celeryTasksOverviewUrl + '?&var-job={{ $labels.job }}', 87 | }, 88 | }, 89 | ]), 90 | }, 91 | ], 92 | }, 93 | } 94 | -------------------------------------------------------------------------------- /celery-mixin/config.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local annotation = g.dashboard.annotation; 3 | 4 | { 5 | _config+:: { 6 | // Selectors are inserted between {} in Prometheus queries. 7 | celerySelector: 'job=~".*celery.*"', 8 | 9 | grafanaUrl: 'https://grafana.com', 10 | 11 | celeryIgnoredTasks: 'None', 12 | celeryIgnoredQueues: 'None', 13 | 14 | celeryTasksOverviewUid: 'celery-tasks-overview-32s3', 15 | celeryTasksByTaskUid: 'celery-tasks-by-task-32s3', 16 | 17 | celeryTasksOverviewUrl: '%s/d/%s/celery-tasks-overview' % [self.grafanaUrl, self.celeryTasksOverviewUid], 18 | celeryTasksByTaskUrl: '%s/d/%s/celery-tasks-by-task' % [self.grafanaUrl, self.celeryTasksByTaskUid], 19 | 20 | tags: ['celery', 'celery-mixin'], 21 | 22 | // If you have autoscaling workers then you maybe do not want to alert on workers that are down. 23 | celeryWorkerDownAlertEnabled: true, 24 | celeryCeleryHighQueueLengthAlertEnabled: true, 25 | // The task interval is used as the interval for Prometheus alerts of failed tasks. 26 | celeryTaskFailedInterval: '10m', 27 | celeryTaskFailedThreshold: '5', // percent 28 | celeryHighQueueLengthInterval: '20m', 29 | celeryHighQueueLengthThreshold: '100', 30 | celeryWorkerDownInterval: '15m', 31 | 32 | // Custom annotations to display in graphs 33 | annotation: { 34 | enabled: false, 35 | name: 'Custom Annotation', 36 | datasource: '-- Grafana --', 37 | iconColor: 'green', 38 | tags: [], 39 | }, 40 | 41 | customAnnotation:: if $._config.annotation.enabled then 42 | annotation.withName($._config.annotation.name) + 43 | annotation.withIconColor($._config.annotation.iconColor) + 44 | annotation.withHide(false) + 45 | annotation.datasource.withUid($._config.annotation.datasource) + 46 | annotation.target.withMatchAny(true) + 47 | annotation.target.withTags($._config.annotation.tags) + 48 | annotation.target.withType('tags') 49 | else {}, 50 | }, 51 | } 52 | -------------------------------------------------------------------------------- /celery-mixin/dashboards.jsonnet: -------------------------------------------------------------------------------- 1 | local dashboards = (import 'mixin.libsonnet').grafanaDashboards; 2 | 3 | { 4 | [name]: dashboards[name] 5 | for name in std.objectFields(dashboards) 6 | } 7 | -------------------------------------------------------------------------------- /celery-mixin/dashboards/celery-tasks-by-task.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | local row = g.panel.row; 5 | local grid = g.util.grid; 6 | 7 | local variable = dashboard.variable; 8 | local datasource = variable.datasource; 9 | local query = variable.query; 10 | local prometheus = g.query.prometheus; 11 | 12 | local timeSeriesPanel = g.panel.timeSeries; 13 | local tablePanel = g.panel.table; 14 | 15 | // Timeseries 16 | local tsOptions = timeSeriesPanel.options; 17 | local tsStandardOptions = timeSeriesPanel.standardOptions; 18 | local tsQueryOptions = timeSeriesPanel.queryOptions; 19 | local tsFieldConfig = timeSeriesPanel.fieldConfig; 20 | local tsCustom = tsFieldConfig.defaults.custom; 21 | local tsLegend = tsOptions.legend; 22 | local tsOverride = tsStandardOptions.override; 23 | 24 | // Table 25 | local tbOptions = tablePanel.options; 26 | local tbStandardOptions = tablePanel.standardOptions; 27 | local tbQueryOptions = tablePanel.queryOptions; 28 | local tbOverride = tbStandardOptions.override; 29 | 30 | { 31 | grafanaDashboards+:: { 32 | 33 | local datasourceVariable = 34 | datasource.new( 35 | 'datasource', 36 | 'prometheus', 37 | ) + 38 | datasource.generalOptions.withLabel('Data source'), 39 | 40 | local namespaceVariable = 41 | query.new( 42 | 'namespace', 43 | 'label_values(celery_worker_up{}, namespace)' 44 | ) + 45 | query.withDatasourceFromVariable(datasourceVariable) + 46 | query.withSort(1) + 47 | query.generalOptions.withLabel('Namespace') + 48 | query.selectionOptions.withMulti(false) + 49 | query.selectionOptions.withIncludeAll(false) + 50 | query.refresh.onLoad() + 51 | query.refresh.onTime(), 52 | 53 | 54 | local jobVariable = 55 | query.new( 56 | 'job', 57 | 'label_values(celery_worker_up{namespace="$namespace"}, job)' 58 | ) + 59 | query.withDatasourceFromVariable(datasourceVariable) + 60 | query.withSort(1) + 61 | query.generalOptions.withLabel('Job') + 62 | query.selectionOptions.withMulti(false) + 63 | query.selectionOptions.withIncludeAll(false) + 64 | query.refresh.onLoad() + 65 | query.refresh.onTime(), 66 | 67 | local queueNameVariable = 68 | query.new( 69 | 'queue_name', 70 | 'label_values(celery_task_received_total{namespace="$namespace", job="$job", name!~"%(celeryIgnoredQueues)s"}, queue_name)' % $._config 71 | ) + 72 | query.withDatasourceFromVariable(datasourceVariable) + 73 | query.withSort(1) + 74 | query.generalOptions.withLabel('Queue Name') + 75 | query.selectionOptions.withMulti(false) + 76 | query.selectionOptions.withIncludeAll(false) + 77 | query.refresh.onLoad() + 78 | query.refresh.onTime(), 79 | 80 | local taskVariable = 81 | query.new( 82 | 'task', 83 | 'label_values(celery_task_received_total{namespace="$namespace", job="$job", queue_name=~"$queue_name", name!~"%(celeryIgnoredTasks)s"}, name)' % $._config 84 | ) + 85 | query.withDatasourceFromVariable(datasourceVariable) + 86 | query.withSort(1) + 87 | query.generalOptions.withLabel('Task') + 88 | query.selectionOptions.withMulti(true) + 89 | query.selectionOptions.withIncludeAll(false) + 90 | query.refresh.onLoad() + 91 | query.refresh.onTime(), 92 | 93 | local variables = [ 94 | datasourceVariable, 95 | namespaceVariable, 96 | jobVariable, 97 | queueNameVariable, 98 | taskVariable, 99 | ], 100 | 101 | local taskExceptionsQuery = ||| 102 | round( 103 | sum ( 104 | increase( 105 | celery_task_failed_total{ 106 | job="$job", 107 | name=~"$task", 108 | queue_name=~"$queue_name" 109 | }[$__range] 110 | ) 111 | ) by (name, exception) > 0 112 | ) 113 | |||, 114 | local taskExceptionsTable = 115 | tablePanel.new( 116 | 'Task Exceptions', 117 | ) + 118 | tbStandardOptions.withUnit('short') + 119 | tbOptions.withSortBy( 120 | tbOptions.sortBy.withDisplayName('Value') + 121 | tbOptions.sortBy.withDesc(true) 122 | ) + 123 | tbOptions.footer.withEnablePagination(true) + 124 | tbQueryOptions.withTargets( 125 | prometheus.new( 126 | '$datasource', 127 | taskExceptionsQuery, 128 | ) + 129 | prometheus.withFormat('table') + 130 | prometheus.withInstant(true) 131 | ) + 132 | tbQueryOptions.withTransformations([ 133 | tbQueryOptions.transformation.withId( 134 | 'organize' 135 | ) + 136 | tbQueryOptions.transformation.withOptions( 137 | { 138 | renameByName: { 139 | name: 'Task', 140 | exception: 'Exception', 141 | }, 142 | indexByName: { 143 | name: 0, 144 | exception: 1, 145 | Value: 2, 146 | }, 147 | excludeByName: { 148 | Time: true, 149 | job: true, 150 | }, 151 | } 152 | ), 153 | ]), 154 | 155 | local taskFailedQuery = ||| 156 | sum ( 157 | round( 158 | increase( 159 | celery_task_failed_total{ 160 | job="$job", 161 | name=~"$task", 162 | queue_name=~"$queue_name" 163 | }[$__range] 164 | ) 165 | ) 166 | ) by (name) > 0 167 | |||, 168 | local taskSucceededQuery = std.strReplace(taskFailedQuery, 'failed', 'succeeded'), 169 | local taskSentQuery = std.strReplace(taskFailedQuery, 'failed', 'sent'), 170 | local taskReceivedQuery = std.strReplace(taskFailedQuery, 'failed', 'received'), 171 | local taskRetriedQuery = std.strReplace(taskFailedQuery, 'failed', 'retried'), 172 | local taskRevokedQuery = std.strReplace(taskFailedQuery, 'failed', 'revoked'), 173 | local taskRejectedQuery = std.strReplace(taskFailedQuery, 'failed', 'rejected'), 174 | local taskSuccessRateQuery = ||| 175 | %s/(%s+%s) > -1 176 | ||| % [ 177 | // Strip out > 0 from the end of the success query 178 | std.strReplace(taskSucceededQuery, ' > 0', ''), 179 | std.strReplace(taskSucceededQuery, ' > 0', ''), 180 | std.strReplace(taskFailedQuery, ' > 0', ''), 181 | ], // Add > -1 to remove NaN results 182 | 183 | local tasksStatsTable = 184 | tablePanel.new( 185 | 'Task Stats', 186 | ) + 187 | tbStandardOptions.withUnit('short') + 188 | tbStandardOptions.withNoValue(0) + 189 | tbOptions.withSortBy( 190 | tbOptions.sortBy.withDisplayName('Succeeded') + 191 | tbOptions.sortBy.withDesc(true) 192 | ) + 193 | tbOptions.footer.withEnablePagination(true) + 194 | tbQueryOptions.withTargets( 195 | [ 196 | prometheus.new( 197 | '$datasource', 198 | taskSuccessRateQuery, 199 | ) + 200 | prometheus.withFormat('table') + 201 | prometheus.withInstant(true), 202 | prometheus.new( 203 | '$datasource', 204 | taskSucceededQuery, 205 | ) + 206 | prometheus.withFormat('table') + 207 | prometheus.withInstant(true), 208 | prometheus.new( 209 | '$datasource', 210 | taskFailedQuery, 211 | ) + 212 | prometheus.withFormat('table') + 213 | prometheus.withInstant(true), 214 | prometheus.new( 215 | '$datasource', 216 | taskSentQuery, 217 | ) + 218 | prometheus.withFormat('table') + 219 | prometheus.withInstant(true), 220 | prometheus.new( 221 | '$datasource', 222 | taskReceivedQuery, 223 | ) + 224 | prometheus.withFormat('table') + 225 | prometheus.withInstant(true), 226 | prometheus.new( 227 | '$datasource', 228 | taskRejectedQuery, 229 | ) + 230 | prometheus.withFormat('table') + 231 | prometheus.withInstant(true), 232 | prometheus.new( 233 | '$datasource', 234 | taskRetriedQuery, 235 | ) + 236 | prometheus.withFormat('table') + 237 | prometheus.withInstant(true), 238 | prometheus.new( 239 | '$datasource', 240 | taskRevokedQuery, 241 | ) + 242 | prometheus.withFormat('table') + 243 | prometheus.withInstant(true), 244 | ] 245 | ) + 246 | tbQueryOptions.withTransformations([ 247 | tbQueryOptions.transformation.withId( 248 | 'merge' 249 | ), 250 | tbQueryOptions.transformation.withId( 251 | 'organize' 252 | ) + 253 | tbQueryOptions.transformation.withOptions( 254 | { 255 | renameByName: { 256 | name: 'Name', 257 | 'Value #A': 'Success Rate', 258 | 'Value #B': 'Succeeded', 259 | 'Value #C': 'Failed', 260 | 'Value #D': 'Sent', 261 | 'Value #E': 'Received', 262 | 'Value #F': 'Rejected', 263 | 'Value #G': 'Retried', 264 | 'Value #H': 'Revoked', 265 | }, 266 | indexByName: { 267 | name: 0, 268 | 'Value #A': 1, 269 | 'Value #B': 2, 270 | 'Value #C': 3, 271 | 'Value #D': 4, 272 | 'Value #E': 5, 273 | 'Value #F': 6, 274 | 'Value #G': 7, 275 | 'Value #H': 8, 276 | }, 277 | excludeByName: { 278 | Time: true, 279 | }, 280 | } 281 | ), 282 | ]) + 283 | tbStandardOptions.withOverrides([ 284 | tbOverride.byName.new('Success Rate') + 285 | tbOverride.byName.withPropertiesFromOptions( 286 | tbStandardOptions.withUnit('percentunit') 287 | ), 288 | ]), 289 | 290 | local taskFailedByExceptionIntervalQuery = ||| 291 | sum ( 292 | round( 293 | increase( 294 | celery_task_failed_total{ 295 | job="$job", 296 | name=~"$task", 297 | queue_name=~"$queue_name" 298 | }[$__rate_interval] 299 | ) 300 | ) 301 | ) by (name, exception) > 0 302 | |||, 303 | 304 | local tasksFailedByExceptionTimeSeriesPanel = 305 | timeSeriesPanel.new( 306 | 'Task Exceptions', 307 | ) + 308 | tsQueryOptions.withTargets( 309 | [ 310 | prometheus.new( 311 | '$datasource', 312 | taskFailedByExceptionIntervalQuery, 313 | ) + 314 | prometheus.withLegendFormat( 315 | '{{ name }}/{{ exception }}' 316 | ), 317 | ] 318 | ) + 319 | tsStandardOptions.withUnit('short') + 320 | tsOptions.tooltip.withMode('multi') + 321 | tsOptions.tooltip.withSort('desc') + 322 | tsLegend.withShowLegend(true) + 323 | tsLegend.withDisplayMode('table') + 324 | tsLegend.withPlacement('right') + 325 | tsLegend.withCalcs(['mean', 'max']) + 326 | tsLegend.withSortBy('Mean') + 327 | tsLegend.withSortDesc(true) + 328 | tsCustom.withSpanNulls(false), 329 | 330 | local taskFailedIntervalQuery = ||| 331 | sum ( 332 | round( 333 | increase( 334 | celery_task_failed_total{ 335 | job="$job", 336 | name=~"$task", 337 | queue_name=~"$queue_name" 338 | }[$__rate_interval] 339 | ) 340 | ) 341 | ) by (name) > 0 342 | |||, 343 | local taskSucceededIntervalQuery = std.strReplace(taskFailedIntervalQuery, 'failed', 'succeeded'), 344 | local taskSentIntervalQuery = std.strReplace(taskFailedIntervalQuery, 'failed', 'sent'), 345 | local taskReceivedIntervalQuery = std.strReplace(taskFailedIntervalQuery, 'failed', 'received'), 346 | local taskRetriedIntervalQuery = std.strReplace(taskFailedIntervalQuery, 'failed', 'retried'), 347 | local taskRevokedIntervalQuery = std.strReplace(taskFailedIntervalQuery, 'failed', 'revoked'), 348 | local taskRejectedIntervalQuery = std.strReplace(taskFailedIntervalQuery, 'failed', 'rejected'), 349 | 350 | 351 | local tasksCompletedTimeSeriesPanel = 352 | timeSeriesPanel.new( 353 | 'Tasks Completed', 354 | ) + 355 | tsQueryOptions.withTargets( 356 | [ 357 | prometheus.new( 358 | '$datasource', 359 | taskSucceededIntervalQuery, 360 | ) + 361 | prometheus.withLegendFormat( 362 | 'Succeeded - {{ name }}' 363 | ), 364 | prometheus.new( 365 | '$datasource', 366 | taskFailedIntervalQuery, 367 | ) + 368 | prometheus.withLegendFormat( 369 | 'Failed - {{ name }}' 370 | ), 371 | prometheus.new( 372 | '$datasource', 373 | taskSentIntervalQuery, 374 | ) + 375 | prometheus.withLegendFormat( 376 | 'Sent - {{ name }}' 377 | ), 378 | prometheus.new( 379 | '$datasource', 380 | taskReceivedIntervalQuery, 381 | ) + 382 | prometheus.withLegendFormat( 383 | 'Received - {{ name }}' 384 | ), 385 | prometheus.new( 386 | '$datasource', 387 | taskRetriedIntervalQuery, 388 | ) + 389 | prometheus.withLegendFormat( 390 | 'Retried - {{ name }}' 391 | ), 392 | prometheus.new( 393 | '$datasource', 394 | taskRevokedIntervalQuery, 395 | ) + 396 | prometheus.withLegendFormat( 397 | 'Revoked - {{ name }}' 398 | ), 399 | prometheus.new( 400 | '$datasource', 401 | taskRejectedIntervalQuery, 402 | ) + 403 | prometheus.withLegendFormat( 404 | 'Rejected - {{ name }}' 405 | ), 406 | ] 407 | ) + 408 | tsStandardOptions.withUnit('short') + 409 | tsOptions.tooltip.withMode('multi') + 410 | tsOptions.tooltip.withSort('desc') + 411 | tsLegend.withShowLegend(true) + 412 | tsLegend.withDisplayMode('table') + 413 | tsLegend.withPlacement('right') + 414 | tsLegend.withCalcs(['mean', 'max']) + 415 | tsLegend.withSortBy('Mean') + 416 | tsLegend.withSortDesc(true) + 417 | tsCustom.withSpanNulls(false), 418 | 419 | local tasksRuntimeP50Query = ||| 420 | histogram_quantile(0.50, 421 | sum( 422 | irate( 423 | celery_task_runtime_bucket{ 424 | job="$job", 425 | name=~"$task", 426 | queue_name=~"$queue_name" 427 | }[$__rate_interval] 428 | ) > 0 429 | ) by (name, job, le) 430 | ) 431 | |||, 432 | local tasksRuntimeP95Query = std.strReplace(tasksRuntimeP50Query, '0.50', '0.95'), 433 | local tasksRuntimeP99Query = std.strReplace(tasksRuntimeP50Query, '0.50', '0.99'), 434 | 435 | local tasksRuntimeTimeSeriesPanel = 436 | timeSeriesPanel.new( 437 | 'Tasks Runtime', 438 | ) + 439 | tsQueryOptions.withTargets( 440 | [ 441 | prometheus.new( 442 | '$datasource', 443 | tasksRuntimeP50Query, 444 | ) + 445 | prometheus.withLegendFormat( 446 | 'P50 - {{ name }}' 447 | ), 448 | prometheus.new( 449 | '$datasource', 450 | tasksRuntimeP95Query, 451 | ) + 452 | prometheus.withLegendFormat( 453 | 'P95 - {{ name }}' 454 | ), 455 | prometheus.new( 456 | '$datasource', 457 | tasksRuntimeP99Query, 458 | ) + 459 | prometheus.withLegendFormat( 460 | 'P99 - {{ name }}' 461 | ), 462 | ] 463 | ) + 464 | tsStandardOptions.withUnit('s') + 465 | tsOptions.tooltip.withMode('multi') + 466 | tsOptions.tooltip.withSort('desc') + 467 | tsStandardOptions.withOverrides([ 468 | tsOverride.byName.new('P50') + 469 | tsOverride.byName.withPropertiesFromOptions( 470 | tsStandardOptions.color.withMode('fixed') + 471 | tsStandardOptions.color.withFixedColor('green') 472 | ), 473 | tsOverride.byName.new('P95') + 474 | tsOverride.byName.withPropertiesFromOptions( 475 | tsStandardOptions.color.withMode('fixed') + 476 | tsStandardOptions.color.withFixedColor('yellow') 477 | ), 478 | tsOverride.byName.new('P99') + 479 | tsOverride.byName.withPropertiesFromOptions( 480 | tsStandardOptions.color.withMode('fixed') + 481 | tsStandardOptions.color.withFixedColor('red') 482 | ), 483 | ]) + 484 | tsLegend.withShowLegend(true) + 485 | tsLegend.withDisplayMode('table') + 486 | tsLegend.withPlacement('right') + 487 | tsLegend.withCalcs(['mean', 'max']) + 488 | tsLegend.withSortBy('Mean') + 489 | tsLegend.withSortDesc(true) + 490 | tsCustom.withSpanNulls(false), 491 | 492 | local tasksRow = 493 | row.new( 494 | title='Tasks' 495 | ), 496 | 497 | 498 | 'celery-tasks-by-task.json': 499 | dashboard.new( 500 | 'Celery / Tasks / By Task', 501 | ) + 502 | dashboard.withDescription( 503 | 'A dashboard that monitors Celery. It is created using the Celery-mixin for the the (Celery-exporter)[https://github.com/danihodovic/celery-exporter]' 504 | ) + 505 | dashboard.withUid($._config.celeryTasksByTaskUid) + 506 | dashboard.withTags($._config.tags) + 507 | dashboard.withTimezone('utc') + 508 | dashboard.withEditable(true) + 509 | dashboard.time.withFrom('now-2d') + 510 | dashboard.time.withTo('now') + 511 | dashboard.withVariables(variables) + 512 | dashboard.withLinks( 513 | [ 514 | dashboard.link.dashboards.new('Celery Dashboards', $._config.tags) + 515 | dashboard.link.link.options.withTargetBlank(true), 516 | ] 517 | ) + 518 | dashboard.withPanels( 519 | [ 520 | tasksRow + 521 | row.gridPos.withX(0) + 522 | row.gridPos.withY(0) + 523 | row.gridPos.withW(24) + 524 | row.gridPos.withH(1), 525 | tasksStatsTable + 526 | timeSeriesPanel.gridPos.withX(0) + 527 | timeSeriesPanel.gridPos.withY(1) + 528 | timeSeriesPanel.gridPos.withW(16) + 529 | timeSeriesPanel.gridPos.withH(8), 530 | taskExceptionsTable + 531 | timeSeriesPanel.gridPos.withX(16) + 532 | timeSeriesPanel.gridPos.withY(1) + 533 | timeSeriesPanel.gridPos.withW(8) + 534 | timeSeriesPanel.gridPos.withH(8), 535 | tasksCompletedTimeSeriesPanel + 536 | timeSeriesPanel.gridPos.withX(0) + 537 | timeSeriesPanel.gridPos.withY(9) + 538 | timeSeriesPanel.gridPos.withW(24) + 539 | timeSeriesPanel.gridPos.withH(8), 540 | tasksFailedByExceptionTimeSeriesPanel + 541 | timeSeriesPanel.gridPos.withX(0) + 542 | timeSeriesPanel.gridPos.withY(17) + 543 | timeSeriesPanel.gridPos.withW(24) + 544 | timeSeriesPanel.gridPos.withH(8), 545 | tasksRuntimeTimeSeriesPanel + 546 | timeSeriesPanel.gridPos.withX(0) + 547 | timeSeriesPanel.gridPos.withY(25) + 548 | timeSeriesPanel.gridPos.withW(24) + 549 | timeSeriesPanel.gridPos.withH(8), 550 | ] 551 | ) + 552 | if $._config.annotation.enabled then 553 | dashboard.withAnnotations($._config.customAnnotation) 554 | else {}, 555 | }, 556 | } 557 | -------------------------------------------------------------------------------- /celery-mixin/dashboards/dashboards.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'celery-tasks-overview.libsonnet') + 2 | (import 'celery-tasks-by-task.libsonnet') + 3 | {} 4 | -------------------------------------------------------------------------------- /celery-mixin/dashboards_out/celery-tasks-by-task.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "A dashboard that monitors Celery. It is created using the Celery-mixin for the the (Celery-exporter)[https://github.com/danihodovic/celery-exporter]", 3 | "editable": true, 4 | "links": [ 5 | { 6 | "tags": [ 7 | "celery", 8 | "celery-mixin" 9 | ], 10 | "targetBlank": true, 11 | "title": "Celery Dashboards", 12 | "type": "dashboards" 13 | } 14 | ], 15 | "panels": [ 16 | { 17 | "collapsed": false, 18 | "gridPos": { 19 | "h": 1, 20 | "w": 24, 21 | "x": 0, 22 | "y": 0 23 | }, 24 | "id": 1, 25 | "title": "Tasks", 26 | "type": "row" 27 | }, 28 | { 29 | "datasource": { 30 | "type": "datasource", 31 | "uid": "-- Mixed --" 32 | }, 33 | "fieldConfig": { 34 | "defaults": { 35 | "noValue": 0, 36 | "unit": "short" 37 | }, 38 | "overrides": [ 39 | { 40 | "matcher": { 41 | "id": "byName", 42 | "options": "Success Rate" 43 | }, 44 | "properties": [ 45 | { 46 | "id": "unit", 47 | "value": "percentunit" 48 | } 49 | ] 50 | } 51 | ] 52 | }, 53 | "gridPos": { 54 | "h": 8, 55 | "w": 16, 56 | "x": 0, 57 | "y": 1 58 | }, 59 | "id": 2, 60 | "options": { 61 | "footer": { 62 | "enablePagination": true 63 | }, 64 | "sortBy": [ 65 | { 66 | "desc": true, 67 | "displayName": "Succeeded" 68 | } 69 | ] 70 | }, 71 | "pluginVersion": "v11.1.0", 72 | "targets": [ 73 | { 74 | "datasource": { 75 | "type": "prometheus", 76 | "uid": "$datasource" 77 | }, 78 | "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n/(sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n+sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name)\n) > -1\n", 79 | "format": "table", 80 | "instant": true 81 | }, 82 | { 83 | "datasource": { 84 | "type": "prometheus", 85 | "uid": "$datasource" 86 | }, 87 | "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", 88 | "format": "table", 89 | "instant": true 90 | }, 91 | { 92 | "datasource": { 93 | "type": "prometheus", 94 | "uid": "$datasource" 95 | }, 96 | "expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", 97 | "format": "table", 98 | "instant": true 99 | }, 100 | { 101 | "datasource": { 102 | "type": "prometheus", 103 | "uid": "$datasource" 104 | }, 105 | "expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", 106 | "format": "table", 107 | "instant": true 108 | }, 109 | { 110 | "datasource": { 111 | "type": "prometheus", 112 | "uid": "$datasource" 113 | }, 114 | "expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", 115 | "format": "table", 116 | "instant": true 117 | }, 118 | { 119 | "datasource": { 120 | "type": "prometheus", 121 | "uid": "$datasource" 122 | }, 123 | "expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", 124 | "format": "table", 125 | "instant": true 126 | }, 127 | { 128 | "datasource": { 129 | "type": "prometheus", 130 | "uid": "$datasource" 131 | }, 132 | "expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", 133 | "format": "table", 134 | "instant": true 135 | }, 136 | { 137 | "datasource": { 138 | "type": "prometheus", 139 | "uid": "$datasource" 140 | }, 141 | "expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n )\n) by (name) > 0\n", 142 | "format": "table", 143 | "instant": true 144 | } 145 | ], 146 | "title": "Task Stats", 147 | "transformations": [ 148 | { 149 | "id": "merge" 150 | }, 151 | { 152 | "id": "organize", 153 | "options": { 154 | "excludeByName": { 155 | "Time": true 156 | }, 157 | "indexByName": { 158 | "Value #A": 1, 159 | "Value #B": 2, 160 | "Value #C": 3, 161 | "Value #D": 4, 162 | "Value #E": 5, 163 | "Value #F": 6, 164 | "Value #G": 7, 165 | "Value #H": 8, 166 | "name": 0 167 | }, 168 | "renameByName": { 169 | "Value #A": "Success Rate", 170 | "Value #B": "Succeeded", 171 | "Value #C": "Failed", 172 | "Value #D": "Sent", 173 | "Value #E": "Received", 174 | "Value #F": "Rejected", 175 | "Value #G": "Retried", 176 | "Value #H": "Revoked", 177 | "name": "Name" 178 | } 179 | } 180 | } 181 | ], 182 | "type": "table" 183 | }, 184 | { 185 | "datasource": { 186 | "type": "datasource", 187 | "uid": "-- Mixed --" 188 | }, 189 | "fieldConfig": { 190 | "defaults": { 191 | "unit": "short" 192 | } 193 | }, 194 | "gridPos": { 195 | "h": 8, 196 | "w": 8, 197 | "x": 16, 198 | "y": 1 199 | }, 200 | "id": 3, 201 | "options": { 202 | "footer": { 203 | "enablePagination": true 204 | }, 205 | "sortBy": [ 206 | { 207 | "desc": true, 208 | "displayName": "Value" 209 | } 210 | ] 211 | }, 212 | "pluginVersion": "v11.1.0", 213 | "targets": [ 214 | { 215 | "datasource": { 216 | "type": "prometheus", 217 | "uid": "$datasource" 218 | }, 219 | "expr": "round(\n sum (\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__range]\n )\n ) by (name, exception) > 0\n)\n", 220 | "format": "table", 221 | "instant": true 222 | } 223 | ], 224 | "title": "Task Exceptions", 225 | "transformations": [ 226 | { 227 | "id": "organize", 228 | "options": { 229 | "excludeByName": { 230 | "Time": true, 231 | "job": true 232 | }, 233 | "indexByName": { 234 | "Value": 2, 235 | "exception": 1, 236 | "name": 0 237 | }, 238 | "renameByName": { 239 | "exception": "Exception", 240 | "name": "Task" 241 | } 242 | } 243 | } 244 | ], 245 | "type": "table" 246 | }, 247 | { 248 | "datasource": { 249 | "type": "datasource", 250 | "uid": "-- Mixed --" 251 | }, 252 | "fieldConfig": { 253 | "defaults": { 254 | "custom": { 255 | "spanNulls": false 256 | }, 257 | "unit": "short" 258 | } 259 | }, 260 | "gridPos": { 261 | "h": 8, 262 | "w": 24, 263 | "x": 0, 264 | "y": 9 265 | }, 266 | "id": 4, 267 | "options": { 268 | "legend": { 269 | "calcs": [ 270 | "mean", 271 | "max" 272 | ], 273 | "displayMode": "table", 274 | "placement": "right", 275 | "showLegend": true, 276 | "sortBy": "Mean", 277 | "sortDesc": true 278 | }, 279 | "tooltip": { 280 | "mode": "multi", 281 | "sort": "desc" 282 | } 283 | }, 284 | "pluginVersion": "v11.1.0", 285 | "targets": [ 286 | { 287 | "datasource": { 288 | "type": "prometheus", 289 | "uid": "$datasource" 290 | }, 291 | "expr": "sum (\n round(\n increase(\n celery_task_succeeded_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", 292 | "legendFormat": "Succeeded - {{ name }}" 293 | }, 294 | { 295 | "datasource": { 296 | "type": "prometheus", 297 | "uid": "$datasource" 298 | }, 299 | "expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", 300 | "legendFormat": "Failed - {{ name }}" 301 | }, 302 | { 303 | "datasource": { 304 | "type": "prometheus", 305 | "uid": "$datasource" 306 | }, 307 | "expr": "sum (\n round(\n increase(\n celery_task_sent_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", 308 | "legendFormat": "Sent - {{ name }}" 309 | }, 310 | { 311 | "datasource": { 312 | "type": "prometheus", 313 | "uid": "$datasource" 314 | }, 315 | "expr": "sum (\n round(\n increase(\n celery_task_received_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", 316 | "legendFormat": "Received - {{ name }}" 317 | }, 318 | { 319 | "datasource": { 320 | "type": "prometheus", 321 | "uid": "$datasource" 322 | }, 323 | "expr": "sum (\n round(\n increase(\n celery_task_retried_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", 324 | "legendFormat": "Retried - {{ name }}" 325 | }, 326 | { 327 | "datasource": { 328 | "type": "prometheus", 329 | "uid": "$datasource" 330 | }, 331 | "expr": "sum (\n round(\n increase(\n celery_task_revoked_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", 332 | "legendFormat": "Revoked - {{ name }}" 333 | }, 334 | { 335 | "datasource": { 336 | "type": "prometheus", 337 | "uid": "$datasource" 338 | }, 339 | "expr": "sum (\n round(\n increase(\n celery_task_rejected_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name) > 0\n", 340 | "legendFormat": "Rejected - {{ name }}" 341 | } 342 | ], 343 | "title": "Tasks Completed", 344 | "type": "timeseries" 345 | }, 346 | { 347 | "datasource": { 348 | "type": "datasource", 349 | "uid": "-- Mixed --" 350 | }, 351 | "fieldConfig": { 352 | "defaults": { 353 | "custom": { 354 | "spanNulls": false 355 | }, 356 | "unit": "short" 357 | } 358 | }, 359 | "gridPos": { 360 | "h": 8, 361 | "w": 24, 362 | "x": 0, 363 | "y": 17 364 | }, 365 | "id": 5, 366 | "options": { 367 | "legend": { 368 | "calcs": [ 369 | "mean", 370 | "max" 371 | ], 372 | "displayMode": "table", 373 | "placement": "right", 374 | "showLegend": true, 375 | "sortBy": "Mean", 376 | "sortDesc": true 377 | }, 378 | "tooltip": { 379 | "mode": "multi", 380 | "sort": "desc" 381 | } 382 | }, 383 | "pluginVersion": "v11.1.0", 384 | "targets": [ 385 | { 386 | "datasource": { 387 | "type": "prometheus", 388 | "uid": "$datasource" 389 | }, 390 | "expr": "sum (\n round(\n increase(\n celery_task_failed_total{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n )\n )\n) by (name, exception) > 0\n", 391 | "legendFormat": "{{ name }}/{{ exception }}" 392 | } 393 | ], 394 | "title": "Task Exceptions", 395 | "type": "timeseries" 396 | }, 397 | { 398 | "datasource": { 399 | "type": "datasource", 400 | "uid": "-- Mixed --" 401 | }, 402 | "fieldConfig": { 403 | "defaults": { 404 | "custom": { 405 | "spanNulls": false 406 | }, 407 | "unit": "s" 408 | }, 409 | "overrides": [ 410 | { 411 | "matcher": { 412 | "id": "byName", 413 | "options": "P50" 414 | }, 415 | "properties": [ 416 | { 417 | "id": "color", 418 | "value": { 419 | "fixedColor": "green", 420 | "mode": "fixed" 421 | } 422 | } 423 | ] 424 | }, 425 | { 426 | "matcher": { 427 | "id": "byName", 428 | "options": "P95" 429 | }, 430 | "properties": [ 431 | { 432 | "id": "color", 433 | "value": { 434 | "fixedColor": "yellow", 435 | "mode": "fixed" 436 | } 437 | } 438 | ] 439 | }, 440 | { 441 | "matcher": { 442 | "id": "byName", 443 | "options": "P99" 444 | }, 445 | "properties": [ 446 | { 447 | "id": "color", 448 | "value": { 449 | "fixedColor": "red", 450 | "mode": "fixed" 451 | } 452 | } 453 | ] 454 | } 455 | ] 456 | }, 457 | "gridPos": { 458 | "h": 8, 459 | "w": 24, 460 | "x": 0, 461 | "y": 25 462 | }, 463 | "id": 6, 464 | "options": { 465 | "legend": { 466 | "calcs": [ 467 | "mean", 468 | "max" 469 | ], 470 | "displayMode": "table", 471 | "placement": "right", 472 | "showLegend": true, 473 | "sortBy": "Mean", 474 | "sortDesc": true 475 | }, 476 | "tooltip": { 477 | "mode": "multi", 478 | "sort": "desc" 479 | } 480 | }, 481 | "pluginVersion": "v11.1.0", 482 | "targets": [ 483 | { 484 | "datasource": { 485 | "type": "prometheus", 486 | "uid": "$datasource" 487 | }, 488 | "expr": "histogram_quantile(0.50,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n", 489 | "legendFormat": "P50 - {{ name }}" 490 | }, 491 | { 492 | "datasource": { 493 | "type": "prometheus", 494 | "uid": "$datasource" 495 | }, 496 | "expr": "histogram_quantile(0.95,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n", 497 | "legendFormat": "P95 - {{ name }}" 498 | }, 499 | { 500 | "datasource": { 501 | "type": "prometheus", 502 | "uid": "$datasource" 503 | }, 504 | "expr": "histogram_quantile(0.99,\n sum(\n irate(\n celery_task_runtime_bucket{\n job=\"$job\",\n name=~\"$task\",\n queue_name=~\"$queue_name\"\n }[$__rate_interval]\n ) > 0\n ) by (name, job, le)\n)\n", 505 | "legendFormat": "P99 - {{ name }}" 506 | } 507 | ], 508 | "title": "Tasks Runtime", 509 | "type": "timeseries" 510 | } 511 | ], 512 | "schemaVersion": 39, 513 | "tags": [ 514 | "celery", 515 | "celery-mixin" 516 | ], 517 | "templating": { 518 | "list": [ 519 | { 520 | "label": "Data source", 521 | "name": "datasource", 522 | "query": "prometheus", 523 | "type": "datasource" 524 | }, 525 | { 526 | "datasource": { 527 | "type": "prometheus", 528 | "uid": "${datasource}" 529 | }, 530 | "includeAll": false, 531 | "label": "Namespace", 532 | "multi": false, 533 | "name": "namespace", 534 | "query": "label_values(celery_worker_up{}, namespace)", 535 | "refresh": 2, 536 | "sort": 1, 537 | "type": "query" 538 | }, 539 | { 540 | "datasource": { 541 | "type": "prometheus", 542 | "uid": "${datasource}" 543 | }, 544 | "includeAll": false, 545 | "label": "Job", 546 | "multi": false, 547 | "name": "job", 548 | "query": "label_values(celery_worker_up{namespace=\"$namespace\"}, job)", 549 | "refresh": 2, 550 | "sort": 1, 551 | "type": "query" 552 | }, 553 | { 554 | "datasource": { 555 | "type": "prometheus", 556 | "uid": "${datasource}" 557 | }, 558 | "includeAll": false, 559 | "label": "Queue Name", 560 | "multi": false, 561 | "name": "queue_name", 562 | "query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", name!~\"None\"}, queue_name)", 563 | "refresh": 2, 564 | "sort": 1, 565 | "type": "query" 566 | }, 567 | { 568 | "datasource": { 569 | "type": "prometheus", 570 | "uid": "${datasource}" 571 | }, 572 | "includeAll": false, 573 | "label": "Task", 574 | "multi": true, 575 | "name": "task", 576 | "query": "label_values(celery_task_received_total{namespace=\"$namespace\", job=\"$job\", queue_name=~\"$queue_name\", name!~\"None\"}, name)", 577 | "refresh": 2, 578 | "sort": 1, 579 | "type": "query" 580 | } 581 | ] 582 | }, 583 | "time": { 584 | "from": "now-2d", 585 | "to": "now" 586 | }, 587 | "timezone": "utc", 588 | "title": "Celery / Tasks / By Task", 589 | "uid": "celery-tasks-by-task-32s3" 590 | } 591 | -------------------------------------------------------------------------------- /celery-mixin/dashboards_out/celery-tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ ], 3 | "__requires": [ ], 4 | "annotations": { 5 | "list": [ ] 6 | }, 7 | "description": "A dashboard that monitors Celery. It is created using the Celery-mixin for the the (Celery-exporter)[https://github.com/danihodovic/celery-exporter]", 8 | "editable": false, 9 | "gnetId": null, 10 | "graphTooltip": 0, 11 | "hideControls": false, 12 | "id": null, 13 | "links": [ ], 14 | "panels": [ 15 | { 16 | "collapse": false, 17 | "collapsed": false, 18 | "gridPos": { 19 | "h": 1, 20 | "w": 24, 21 | "x": 0, 22 | "y": 0 23 | }, 24 | "id": 2, 25 | "panels": [ ], 26 | "repeat": null, 27 | "repeatIteration": null, 28 | "repeatRowId": null, 29 | "showTitle": true, 30 | "title": "Summary", 31 | "titleSize": "h6", 32 | "type": "row" 33 | }, 34 | { 35 | "datasource": "$datasource", 36 | "fieldConfig": { 37 | "defaults": { 38 | "links": [ ], 39 | "mappings": [ ], 40 | "thresholds": { 41 | "mode": "absolute", 42 | "steps": [ ] 43 | }, 44 | "unit": "none" 45 | } 46 | }, 47 | "gridPos": { 48 | "h": 4, 49 | "w": 4, 50 | "x": 0, 51 | "y": 1 52 | }, 53 | "id": 3, 54 | "links": [ ], 55 | "options": { 56 | "colorMode": "value", 57 | "graphMode": "area", 58 | "justifyMode": "auto", 59 | "orientation": "auto", 60 | "reduceOptions": { 61 | "calcs": [ 62 | "last" 63 | ], 64 | "fields": "", 65 | "values": false 66 | }, 67 | "textMode": "auto" 68 | }, 69 | "pluginVersion": "7", 70 | "targets": [ 71 | { 72 | "expr": "count(celery_worker_up{job=~\"celery|celery-exporter\"} == 1)", 73 | "format": "time_series", 74 | "intervalFactor": 1, 75 | "legendFormat": "", 76 | "refId": "A" 77 | } 78 | ], 79 | "title": "Celery Workers", 80 | "transparent": false, 81 | "type": "stat" 82 | }, 83 | { 84 | "datasource": "$datasource", 85 | "fieldConfig": { 86 | "defaults": { 87 | "links": [ ], 88 | "mappings": [ ], 89 | "thresholds": { 90 | "mode": "absolute", 91 | "steps": [ ] 92 | }, 93 | "unit": "none" 94 | } 95 | }, 96 | "gridPos": { 97 | "h": 4, 98 | "w": 5, 99 | "x": 4, 100 | "y": 1 101 | }, 102 | "id": 4, 103 | "links": [ ], 104 | "options": { 105 | "colorMode": "value", 106 | "graphMode": "area", 107 | "justifyMode": "auto", 108 | "orientation": "auto", 109 | "reduceOptions": { 110 | "calcs": [ 111 | "last" 112 | ], 113 | "fields": "", 114 | "values": false 115 | }, 116 | "textMode": "auto" 117 | }, 118 | "pluginVersion": "7", 119 | "targets": [ 120 | { 121 | "expr": "sum(celery_worker_tasks_active{job=~\"celery|celery-exporter\"})", 122 | "format": "time_series", 123 | "intervalFactor": 1, 124 | "legendFormat": "", 125 | "refId": "A" 126 | } 127 | ], 128 | "title": "Tasks Active", 129 | "transparent": false, 130 | "type": "stat" 131 | }, 132 | { 133 | "datasource": "$datasource", 134 | "fieldConfig": { 135 | "defaults": { 136 | "links": [ ], 137 | "mappings": [ ], 138 | "thresholds": { 139 | "mode": "absolute", 140 | "steps": [ ] 141 | }, 142 | "unit": "none" 143 | } 144 | }, 145 | "gridPos": { 146 | "h": 4, 147 | "w": 5, 148 | "x": 9, 149 | "y": 1 150 | }, 151 | "id": 5, 152 | "links": [ ], 153 | "options": { 154 | "colorMode": "value", 155 | "graphMode": "area", 156 | "justifyMode": "auto", 157 | "orientation": "auto", 158 | "reduceOptions": { 159 | "calcs": [ 160 | "last" 161 | ], 162 | "fields": "", 163 | "values": false 164 | }, 165 | "textMode": "auto" 166 | }, 167 | "pluginVersion": "7", 168 | "targets": [ 169 | { 170 | "expr": "sum(round(increase(celery_task_received_total{job=~\"celery|celery-exporter\"}[1d])))\n", 171 | "format": "time_series", 172 | "intervalFactor": 2, 173 | "legendFormat": "", 174 | "refId": "A" 175 | } 176 | ], 177 | "title": "Tasks received by workers last 24h", 178 | "transparent": false, 179 | "type": "stat" 180 | }, 181 | { 182 | "datasource": "$datasource", 183 | "fieldConfig": { 184 | "defaults": { 185 | "links": [ ], 186 | "mappings": [ ], 187 | "thresholds": { 188 | "mode": "absolute", 189 | "steps": [ 190 | { 191 | "color": "green", 192 | "value": 0.94999999999999996 193 | } 194 | ] 195 | }, 196 | "unit": "percentunit" 197 | } 198 | }, 199 | "gridPos": { 200 | "h": 4, 201 | "w": 5, 202 | "x": 14, 203 | "y": 1 204 | }, 205 | "id": 6, 206 | "links": [ ], 207 | "options": { 208 | "colorMode": "value", 209 | "graphMode": "area", 210 | "justifyMode": "auto", 211 | "orientation": "auto", 212 | "reduceOptions": { 213 | "calcs": [ 214 | "last" 215 | ], 216 | "fields": "", 217 | "values": false 218 | }, 219 | "textMode": "auto" 220 | }, 221 | "pluginVersion": "7", 222 | "targets": [ 223 | { 224 | "expr": "sum(round(increase(celery_task_succeeded_total{job=~\"celery|celery-exporter\"}[1d])))\n/(sum(round(increase(celery_task_succeeded_total{job=~\"celery|celery-exporter\"}[1d])))\n+sum(round(increase(celery_task_failed_total{job=~\"celery|celery-exporter\"}[1d])))\n)\n", 225 | "format": "time_series", 226 | "intervalFactor": 2, 227 | "legendFormat": "", 228 | "refId": "A" 229 | } 230 | ], 231 | "title": "Successful completion rate last 24h", 232 | "transparent": false, 233 | "type": "stat" 234 | }, 235 | { 236 | "datasource": "$datasource", 237 | "fieldConfig": { 238 | "defaults": { 239 | "links": [ ], 240 | "mappings": [ ], 241 | "thresholds": { 242 | "mode": "absolute", 243 | "steps": [ ] 244 | }, 245 | "unit": "none" 246 | } 247 | }, 248 | "gridPos": { 249 | "h": 4, 250 | "w": 5, 251 | "x": 19, 252 | "y": 1 253 | }, 254 | "id": 7, 255 | "links": [ ], 256 | "options": { 257 | "colorMode": "value", 258 | "graphMode": "area", 259 | "justifyMode": "auto", 260 | "orientation": "auto", 261 | "reduceOptions": { 262 | "calcs": [ 263 | "last" 264 | ], 265 | "fields": "", 266 | "values": false 267 | }, 268 | "textMode": "auto" 269 | }, 270 | "pluginVersion": "7", 271 | "targets": [ 272 | { 273 | "expr": "sum(rate(celery_task_runtime_sum{job=~\"celery|celery-exporter\"}[1d])) / sum(rate(celery_task_runtime_count{job=~\"celery|celery-exporter\"}[1d])) > 0\n", 274 | "format": "time_series", 275 | "intervalFactor": 2, 276 | "legendFormat": "", 277 | "refId": "A" 278 | } 279 | ], 280 | "title": "Average Runtime for Tasks last 24h", 281 | "transparent": false, 282 | "type": "stat" 283 | }, 284 | { 285 | "columns": [ ], 286 | "datasource": "$datasource", 287 | "gridPos": { 288 | "h": 8, 289 | "w": 8, 290 | "x": 0, 291 | "y": 5 292 | }, 293 | "id": 8, 294 | "links": [ ], 295 | "sort": { 296 | "col": 2, 297 | "desc": true 298 | }, 299 | "span": "4", 300 | "styles": [ 301 | { 302 | "alias": "Time", 303 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 304 | "pattern": "Time", 305 | "type": "hidden" 306 | }, 307 | { 308 | "alias": "Task", 309 | "pattern": "name" 310 | } 311 | ], 312 | "targets": [ 313 | { 314 | "expr": "round(topk(5, sum by (name) (increase(celery_task_failed_total{job=~\"celery|celery-exporter\"}[1d]) > 0 )))\n", 315 | "format": "table", 316 | "instant": true, 317 | "intervalFactor": 2, 318 | "legendFormat": "", 319 | "refId": "A" 320 | } 321 | ], 322 | "timeFrom": null, 323 | "timeShift": null, 324 | "title": "Top 5 failed tasks last 24h", 325 | "type": "table" 326 | }, 327 | { 328 | "columns": [ ], 329 | "datasource": "$datasource", 330 | "gridPos": { 331 | "h": 8, 332 | "w": 8, 333 | "x": 8, 334 | "y": 5 335 | }, 336 | "id": 9, 337 | "links": [ ], 338 | "sort": { 339 | "col": 2, 340 | "desc": true 341 | }, 342 | "span": "4", 343 | "styles": [ 344 | { 345 | "alias": "Time", 346 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 347 | "pattern": "Time", 348 | "type": "hidden" 349 | }, 350 | { 351 | "alias": "Task", 352 | "pattern": "name" 353 | } 354 | ], 355 | "targets": [ 356 | { 357 | "expr": "round(topk(5, sum by (exception) (increase(celery_task_failed_total{job=~\"celery|celery-exporter\"}[1d]) > 0 )))\n", 358 | "format": "table", 359 | "instant": true, 360 | "intervalFactor": 2, 361 | "legendFormat": "", 362 | "refId": "A" 363 | } 364 | ], 365 | "timeFrom": null, 366 | "timeShift": null, 367 | "title": "Top 5 exceptions last 24h", 368 | "type": "table" 369 | }, 370 | { 371 | "columns": [ ], 372 | "datasource": "$datasource", 373 | "gridPos": { 374 | "h": 8, 375 | "w": 8, 376 | "x": 16, 377 | "y": 5 378 | }, 379 | "id": 10, 380 | "links": [ ], 381 | "sort": { 382 | "col": 2, 383 | "desc": true 384 | }, 385 | "span": "4", 386 | "styles": [ 387 | { 388 | "alias": "Time", 389 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 390 | "pattern": "Time", 391 | "type": "hidden" 392 | }, 393 | { 394 | "alias": "Task", 395 | "pattern": "name" 396 | } 397 | ], 398 | "targets": [ 399 | { 400 | "expr": "topk(5, (sum by(name) (rate(celery_task_runtime_sum{job=~\"celery|celery-exporter\"}[1d])) / sum by (name) (rate(celery_task_runtime_count{job=~\"celery|celery-exporter\"}[1d])) > 0 ))\n", 401 | "format": "table", 402 | "instant": true, 403 | "intervalFactor": 2, 404 | "legendFormat": "", 405 | "refId": "A" 406 | } 407 | ], 408 | "timeFrom": null, 409 | "timeShift": null, 410 | "title": "Top 5 task runtime last 24h", 411 | "type": "table" 412 | }, 413 | { 414 | "collapse": false, 415 | "collapsed": false, 416 | "gridPos": { 417 | "h": 1, 418 | "w": 24, 419 | "x": 0, 420 | "y": 13 421 | }, 422 | "id": 11, 423 | "panels": [ ], 424 | "repeat": null, 425 | "repeatIteration": null, 426 | "repeatRowId": null, 427 | "showTitle": true, 428 | "title": "Individual Tasks", 429 | "titleSize": "h6", 430 | "type": "row" 431 | }, 432 | { 433 | "columns": [ ], 434 | "datasource": "$datasource", 435 | "gridPos": { 436 | "h": 8, 437 | "w": 24, 438 | "x": 0, 439 | "y": 14 440 | }, 441 | "id": 12, 442 | "links": [ ], 443 | "sort": { 444 | "col": 2, 445 | "desc": false 446 | }, 447 | "span": "6", 448 | "styles": [ 449 | { 450 | "alias": "Time", 451 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 452 | "pattern": "Time", 453 | "type": "hidden" 454 | }, 455 | { 456 | "alias": "Task", 457 | "pattern": "name" 458 | }, 459 | { 460 | "alias": "Success Rate", 461 | "pattern": "Value #A", 462 | "type": "number", 463 | "unit": "percentunit" 464 | }, 465 | { 466 | "alias": "Received", 467 | "decimals": "0", 468 | "pattern": "Value #B", 469 | "type": "number", 470 | "unit": "short" 471 | }, 472 | { 473 | "alias": "Succeeded", 474 | "decimals": "0", 475 | "pattern": "Value #C", 476 | "type": "number", 477 | "unit": "short" 478 | }, 479 | { 480 | "alias": "Failed", 481 | "decimals": "0", 482 | "pattern": "Value #D", 483 | "type": "number", 484 | "unit": "short" 485 | }, 486 | { 487 | "alias": "Rejected", 488 | "decimals": "0", 489 | "pattern": "Value #E", 490 | "type": "number", 491 | "unit": "short" 492 | }, 493 | { 494 | "alias": "Retried", 495 | "decimals": "0", 496 | "pattern": "Value #F", 497 | "type": "number", 498 | "unit": "short" 499 | }, 500 | { 501 | "alias": "Revoked", 502 | "decimals": "0", 503 | "pattern": "Value #G", 504 | "type": "number", 505 | "unit": "short" 506 | } 507 | ], 508 | "targets": [ 509 | { 510 | "expr": "sum by (name) (round(increase(celery_task_succeeded_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range])))\n/(sum by (name) (round(increase(celery_task_succeeded_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range])))\n+sum by (name) (round(increase(celery_task_failed_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range])))\n) > -1\n", 511 | "format": "table", 512 | "instant": true, 513 | "intervalFactor": 2, 514 | "legendFormat": "", 515 | "refId": "A" 516 | }, 517 | { 518 | "expr": "sum by (name) (round(increase(celery_task_received_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range]))) > 0\n", 519 | "format": "table", 520 | "instant": true, 521 | "intervalFactor": 2, 522 | "legendFormat": "", 523 | "refId": "B" 524 | }, 525 | { 526 | "expr": "sum by (name) (round(increase(celery_task_succeeded_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range]))) > 0\n", 527 | "format": "table", 528 | "instant": true, 529 | "intervalFactor": 2, 530 | "legendFormat": "", 531 | "refId": "C" 532 | }, 533 | { 534 | "expr": "sum by (name) (round(increase(celery_task_failed_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range]))) > 0\n", 535 | "format": "table", 536 | "instant": true, 537 | "intervalFactor": 2, 538 | "legendFormat": "", 539 | "refId": "D" 540 | }, 541 | { 542 | "expr": "sum by (name) (round(increase(celery_task_rejected_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range]))) > 0\n", 543 | "format": "table", 544 | "instant": true, 545 | "intervalFactor": 2, 546 | "legendFormat": "", 547 | "refId": "E" 548 | }, 549 | { 550 | "expr": "sum by (name) (round(increase(celery_task_retried_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range]))) > 0\n", 551 | "format": "table", 552 | "instant": true, 553 | "intervalFactor": 2, 554 | "legendFormat": "", 555 | "refId": "F" 556 | }, 557 | { 558 | "expr": "sum by (name) (round(increase(celery_task_revoked_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[$__range]))) > 0\n", 559 | "format": "table", 560 | "instant": true, 561 | "intervalFactor": 2, 562 | "legendFormat": "", 563 | "refId": "G" 564 | } 565 | ], 566 | "timeFrom": null, 567 | "timeShift": null, 568 | "title": "Task Stats", 569 | "type": "table" 570 | }, 571 | { 572 | "aliasColors": { }, 573 | "bars": false, 574 | "dashLength": 10, 575 | "dashes": false, 576 | "datasource": "$datasource", 577 | "fill": 1, 578 | "fillGradient": 0, 579 | "gridPos": { 580 | "h": 10, 581 | "w": 24, 582 | "x": 0, 583 | "y": 22 584 | }, 585 | "id": 13, 586 | "legend": { 587 | "alignAsTable": true, 588 | "avg": true, 589 | "current": true, 590 | "hideZero": true, 591 | "max": false, 592 | "min": false, 593 | "rightSide": true, 594 | "show": true, 595 | "sideWidth": null, 596 | "total": false, 597 | "values": true 598 | }, 599 | "lines": true, 600 | "linewidth": 1, 601 | "links": [ ], 602 | "nullPointMode": "null", 603 | "percentage": false, 604 | "pointradius": 5, 605 | "points": false, 606 | "renderer": "flot", 607 | "repeat": null, 608 | "seriesOverrides": [ ], 609 | "spaceLength": 10, 610 | "stack": false, 611 | "steppedLine": false, 612 | "targets": [ 613 | { 614 | "expr": "sum by (name) (round(increase(celery_task_succeeded_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[10m])))\n", 615 | "format": "time_series", 616 | "intervalFactor": 2, 617 | "legendFormat": "Succeeded - {{ name }}", 618 | "refId": "A" 619 | }, 620 | { 621 | "expr": "sum by (name) (round(increase(celery_task_failed_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[10m])))\n", 622 | "format": "time_series", 623 | "intervalFactor": 2, 624 | "legendFormat": "Failed - {{ name }}", 625 | "refId": "B" 626 | }, 627 | { 628 | "expr": "sum by (name) (round(increase(celery_task_received_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[10m])))\n", 629 | "format": "time_series", 630 | "intervalFactor": 2, 631 | "legendFormat": "Received - {{ name }}", 632 | "refId": "C" 633 | }, 634 | { 635 | "expr": "sum by (name) (round(increase(celery_task_retried_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[10m])))\n", 636 | "format": "time_series", 637 | "intervalFactor": 2, 638 | "legendFormat": "Retried - {{ name }}", 639 | "refId": "D" 640 | }, 641 | { 642 | "expr": "sum by (name) (round(increase(celery_task_rejected_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[10m])))\n", 643 | "format": "time_series", 644 | "intervalFactor": 2, 645 | "legendFormat": "Rejected - {{ name }}", 646 | "refId": "E" 647 | }, 648 | { 649 | "expr": "sum by (name) (round(increase(celery_task_revoked_total{job=~\"celery|celery-exporter\", name=~\"$task\"}[10m])))\n", 650 | "format": "time_series", 651 | "intervalFactor": 2, 652 | "legendFormat": "Revoked - {{ name }}", 653 | "refId": "F" 654 | } 655 | ], 656 | "thresholds": [ ], 657 | "timeFrom": null, 658 | "timeShift": null, 659 | "title": "Tasks completed with 10m intervals", 660 | "tooltip": { 661 | "shared": true, 662 | "sort": 0, 663 | "value_type": "individual" 664 | }, 665 | "type": "graph", 666 | "xaxis": { 667 | "buckets": null, 668 | "mode": "time", 669 | "name": null, 670 | "show": true, 671 | "values": [ ] 672 | }, 673 | "yaxes": [ 674 | { 675 | "format": "short", 676 | "label": null, 677 | "logBase": 1, 678 | "max": null, 679 | "min": null, 680 | "show": true 681 | }, 682 | { 683 | "format": "short", 684 | "label": null, 685 | "logBase": 1, 686 | "max": null, 687 | "min": null, 688 | "show": true 689 | } 690 | ] 691 | }, 692 | { 693 | "aliasColors": { }, 694 | "bars": false, 695 | "dashLength": 10, 696 | "dashes": false, 697 | "datasource": "$datasource", 698 | "fill": 1, 699 | "fillGradient": 0, 700 | "gridPos": { 701 | "h": 8, 702 | "w": 24, 703 | "x": 0, 704 | "y": 32 705 | }, 706 | "id": 14, 707 | "legend": { 708 | "alignAsTable": true, 709 | "avg": true, 710 | "current": true, 711 | "hideZero": true, 712 | "max": false, 713 | "min": false, 714 | "rightSide": true, 715 | "show": true, 716 | "sideWidth": null, 717 | "total": false, 718 | "values": true 719 | }, 720 | "lines": true, 721 | "linewidth": 1, 722 | "links": [ ], 723 | "nullPointMode": "null", 724 | "percentage": false, 725 | "pointradius": 5, 726 | "points": false, 727 | "renderer": "flot", 728 | "repeat": null, 729 | "seriesOverrides": [ ], 730 | "spaceLength": 10, 731 | "stack": false, 732 | "steppedLine": false, 733 | "targets": [ 734 | { 735 | "expr": "sum by (name) (rate(celery_task_runtime_sum{job=~\"celery|celery-exporter\", name=~\"$task\"}[10m])) / sum by (name) (rate(celery_task_runtime_count{job=~\"celery|celery-exporter\", name=~\"$task\"}[10m])) > 0\n", 736 | "format": "time_series", 737 | "intervalFactor": 2, 738 | "legendFormat": "{{ name }}", 739 | "refId": "A" 740 | } 741 | ], 742 | "thresholds": [ ], 743 | "timeFrom": null, 744 | "timeShift": null, 745 | "title": "Tasks Runtime with 10m intervals", 746 | "tooltip": { 747 | "shared": true, 748 | "sort": 0, 749 | "value_type": "individual" 750 | }, 751 | "type": "graph", 752 | "xaxis": { 753 | "buckets": null, 754 | "mode": "time", 755 | "name": null, 756 | "show": true, 757 | "values": [ ] 758 | }, 759 | "yaxes": [ 760 | { 761 | "format": "short", 762 | "label": null, 763 | "logBase": 1, 764 | "max": null, 765 | "min": null, 766 | "show": true 767 | }, 768 | { 769 | "format": "short", 770 | "label": null, 771 | "logBase": 1, 772 | "max": null, 773 | "min": null, 774 | "show": true 775 | } 776 | ] 777 | } 778 | ], 779 | "refresh": "", 780 | "rows": [ ], 781 | "schemaVersion": 14, 782 | "style": "dark", 783 | "tags": [ ], 784 | "templating": { 785 | "list": [ 786 | { 787 | "current": { 788 | "text": "Prometheus", 789 | "value": "Prometheus" 790 | }, 791 | "hide": 0, 792 | "label": null, 793 | "name": "datasource", 794 | "options": [ ], 795 | "query": "prometheus", 796 | "refresh": 1, 797 | "regex": "", 798 | "type": "datasource" 799 | }, 800 | { 801 | "allValue": null, 802 | "current": { 803 | "text": "", 804 | "value": "" 805 | }, 806 | "datasource": "$datasource", 807 | "hide": 0, 808 | "includeAll": true, 809 | "label": null, 810 | "multi": true, 811 | "name": "task", 812 | "options": [ ], 813 | "query": "label_values(celery_task_sent_total, name)", 814 | "refresh": 1, 815 | "regex": "", 816 | "sort": 1, 817 | "tagValuesQuery": "", 818 | "tags": [ ], 819 | "tagsQuery": "", 820 | "type": "query", 821 | "useTags": false 822 | } 823 | ] 824 | }, 825 | "time": { 826 | "from": "now-2d", 827 | "to": "now" 828 | }, 829 | "timepicker": { 830 | "refresh_intervals": [ 831 | "5s", 832 | "10s", 833 | "30s", 834 | "1m", 835 | "5m", 836 | "15m", 837 | "30m", 838 | "1h", 839 | "2h", 840 | "1d" 841 | ], 842 | "time_options": [ 843 | "5m", 844 | "15m", 845 | "1h", 846 | "6h", 847 | "12h", 848 | "24h", 849 | "2d", 850 | "7d", 851 | "30d" 852 | ] 853 | }, 854 | "timezone": "utc", 855 | "title": "Celery / Tasks", 856 | "uid": "celery-exporter", 857 | "version": 0 858 | } 859 | -------------------------------------------------------------------------------- /celery-mixin/jsonnetfile.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/grafana/grafonnet.git", 8 | "subdir": "gen/grafonnet-latest" 9 | } 10 | }, 11 | "version": "main" 12 | } 13 | ], 14 | "legacyImports": false 15 | } 16 | -------------------------------------------------------------------------------- /celery-mixin/mixin.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'alerts/alerts.libsonnet') + 2 | (import 'dashboards/dashboards.libsonnet') + 3 | (import 'config.libsonnet') 4 | -------------------------------------------------------------------------------- /celery-mixin/prometheus-alerts.yaml: -------------------------------------------------------------------------------- 1 | "groups": 2 | - "name": "celery" 3 | "rules": 4 | - "alert": "CeleryTaskHighFailRate" 5 | "annotations": 6 | "dashboard_url": "https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job={{ $labels.job }}&var-queue_name={{ $labels.queue_name }}&var-task={{ $labels.name }}" 7 | "description": "More than 5% tasks failed for the task {{ $labels.job }}/{{ $labels.queue_name }}/{{ $labels.name }} the past 10m." 8 | "summary": "Celery high task fail rate." 9 | "expr": | 10 | sum( 11 | increase( 12 | celery_task_failed_total{ 13 | job=~"celery|celery-exporter", 14 | queue_name!~"None", 15 | name!~"None" 16 | }[10m] 17 | ) 18 | ) by (job, namespace, queue_name, name) 19 | / 20 | ( 21 | sum( 22 | increase( 23 | celery_task_failed_total{ 24 | job=~"celery|celery-exporter", 25 | queue_name!~"None", 26 | name!~"None" 27 | }[10m] 28 | ) 29 | ) by (job, namespace, queue_name, name) 30 | + 31 | sum( 32 | increase( 33 | celery_task_succeeded_total{ 34 | job=~"celery|celery-exporter", 35 | queue_name!~"None", 36 | name!~"None" 37 | }[10m] 38 | ) 39 | ) by (job, namespace, queue_name, name) 40 | ) 41 | * 100 > 5 42 | "for": "1m" 43 | "labels": 44 | "severity": "warning" 45 | - "alert": "CeleryHighQueueLength" 46 | "annotations": 47 | "dashboard_url": "https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{ $labels.job }}&var-queue_name={{ $labels.queue_name }}" 48 | "description": "More than 100 tasks in the queue {{ $labels.job }}/{{ $labels.queue_name }} the past 20m." 49 | "summary": "Celery high queue length." 50 | "expr": | 51 | sum( 52 | celery_queue_length{ 53 | job=~"celery|celery-exporter", 54 | queue_name!~"None" 55 | } 56 | ) by (job, namespace, queue_name) 57 | > 100 58 | "for": "20m" 59 | "labels": 60 | "severity": "warning" 61 | - "alert": "CeleryWorkerDown" 62 | "annotations": 63 | "dashboard_url": "https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job={{ $labels.job }}" 64 | "description": "The Celery worker {{ $labels.job }}/{{ $labels.hostname }} is offline." 65 | "summary": "A Celery worker is offline." 66 | "expr": | 67 | celery_worker_up{job=~"celery|celery-exporter"} == 0 68 | "for": "15m" 69 | "labels": 70 | "severity": "warning" 71 | -------------------------------------------------------------------------------- /celery-mixin/tests.yaml: -------------------------------------------------------------------------------- 1 | # yamllint disable rule:line-length 2 | --- 3 | rule_files: 4 | - prometheus-alerts.yaml 5 | 6 | tests: 7 | - interval: 5m 8 | input_series: 9 | - series: 'celery_task_failed_total{job="celery-exporter", namespace="staging", queue_name="celery", name="test-task"}' 10 | values: "1+10x10" 11 | - series: 'celery_task_succeeded_total{job="celery-exporter", namespace="staging", queue_name="celery", name="test-task"}' 12 | values: "1+10x10" 13 | alert_rule_test: 14 | - eval_time: 15m 15 | alertname: CeleryTaskHighFailRate 16 | exp_alerts: 17 | - exp_labels: 18 | job: celery-exporter 19 | severity: warning 20 | namespace: staging 21 | queue_name: celery 22 | name: test-task 23 | exp_annotations: 24 | summary: "Celery high task fail rate." 25 | description: "More than 5% tasks failed for the task celery-exporter/celery/test-task the past 10m." 26 | dashboard_url: "https://grafana.com/d/celery-tasks-by-task-32s3/celery-tasks-by-task?var-job=celery-exporter&var-queue_name=celery&var-task=test-task" 27 | - interval: 1m 28 | input_series: 29 | - series: 'celery_queue_length{job="celery-exporter", namespace="staging", queue_name="celery-low-queue"}' 30 | values: "1+0x50" 31 | - series: 'celery_queue_length{job="celery-exporter", namespace="staging", queue_name="celery-high-queue"}' 32 | values: "1000+200x50" 33 | alert_rule_test: 34 | - eval_time: 40m 35 | alertname: CeleryHighQueueLength 36 | exp_alerts: 37 | - exp_labels: 38 | job: celery-exporter 39 | severity: warning 40 | namespace: staging 41 | queue_name: celery-high-queue 42 | exp_annotations: 43 | summary: "Celery high queue length." 44 | description: "More than 100 tasks in the queue celery-exporter/celery-high-queue the past 20m." 45 | dashboard_url: https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job=celery-exporter&var-queue_name=celery-high-queue 46 | - interval: 1m 47 | input_series: 48 | - series: 'celery_worker_up{job="celery-exporter", namespace="staging", hostname="down"}' 49 | values: "0+0x20" 50 | - series: 'celery_worker_up{job="celery-exporter", namespace="staging", hostname="up"}' 51 | values: "1+0x20" 52 | alert_rule_test: 53 | - eval_time: 20m 54 | alertname: CeleryWorkerDown 55 | exp_alerts: 56 | - exp_labels: 57 | job: celery-exporter 58 | severity: warning 59 | namespace: staging 60 | hostname: down 61 | exp_annotations: 62 | summary: "A Celery worker is offline." 63 | description: "The Celery worker celery-exporter/down is offline." 64 | dashboard_url: "https://grafana.com/d/celery-tasks-overview-32s3/celery-tasks-overview?&var-job=celery-exporter" 65 | -------------------------------------------------------------------------------- /charts/celery-exporter/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /charts/celery-exporter/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: celery-exporter 3 | description: Prometheus exporter for Celery 4 | type: application 5 | home: https://github.com/danihodovic/celery-exporter 6 | keywords: 7 | - celery 8 | - prometheus 9 | - exporter 10 | sources: 11 | - https://github.com/danihodovic/celery-exporter 12 | maintainers: 13 | - name: danihodovic 14 | - name: adinhodovic 15 | 16 | version: 0.8.0 17 | appVersion: 0.9.2 18 | -------------------------------------------------------------------------------- /charts/celery-exporter/README.md: -------------------------------------------------------------------------------- 1 | # celery-exporter 2 | 3 | ![Version: 0.7.0](https://img.shields.io/badge/Version-0.7.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.9.2](https://img.shields.io/badge/AppVersion-0.9.2-informational?style=flat-square) 4 | 5 | Prometheus exporter for Celery 6 | 7 | **Homepage:** 8 | 9 | ## Installation 10 | 11 | Add the helm repository: 12 | 13 | ```bash 14 | helm repo add danihodovic https://danihodovic.github.io/celery-exporter/ 15 | ``` 16 | 17 | Install the chart: 18 | 19 | ```bash 20 | helm install celery-exporter danihodovic/celery-exporter 21 | ``` 22 | 23 | 24 | You'll need to set the enviroment variable `CE_BROKER_URL` to the broker url of your celery instance. 25 | 26 | For example: 27 | 28 | ```bash 29 | helm install celery-exporter danihodovic/celery-exporter --set env[0].name=CE_BROKER_URL,env[0].value=redis://redis:6379/0 30 | ``` 31 | 32 | ## Maintainers 33 | 34 | | Name | Email | Url | 35 | | ---- | ------ | --- | 36 | | danihodovic | | | 37 | | adinhodovic | | | 38 | 39 | ## Source Code 40 | 41 | * 42 | 43 | ## Values 44 | 45 | | Key | Type | Default | Description | 46 | |-----|------|---------|-------------| 47 | | affinity | object | `{}` | | 48 | | env | list | `[]` | | 49 | | fullnameOverride | string | `""` | | 50 | | image.pullPolicy | string | `"IfNotPresent"` | | 51 | | image.repository | string | `"danihodovic/celery-exporter"` | | 52 | | image.tag | string | `""` | | 53 | | imagePullSecrets | list | `[]` | | 54 | | ingress.annotations | object | `{}` | | 55 | | ingress.className | string | `""` | | 56 | | ingress.enabled | bool | `false` | | 57 | | ingress.hosts[0].host | string | `"celery-exporter.example"` | | 58 | | ingress.hosts[0].paths[0].path | string | `"/"` | | 59 | | ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | | 60 | | ingress.tls | list | `[]` | | 61 | | livenessProbe | object | `{}` | | 62 | | nameOverride | string | `""` | | 63 | | nodeSelector | object | `{}` | | 64 | | podAnnotations | object | `{}` | | 65 | | podSecurityContext | object | `{}` | | 66 | | readinessProbe | object | `{}` | | 67 | | replicaCount | int | `1` | | 68 | | resources | object | `{}` | | 69 | | securityContext | object | `{}` | | 70 | | service.port | int | `9808` | | 71 | | service.type | string | `"ClusterIP"` | | 72 | | service.annotations | object | `{}` | | 73 | | serviceAccount.annotations | object | `{}` | | 74 | | serviceAccount.create | bool | `true` | | 75 | | serviceAccount.name | string | `""` | | 76 | | serviceMonitor.additionalLabels | object | `{}` | | 77 | | serviceMonitor.enabled | bool | `false` | | 78 | | serviceMonitor.metricRelabelings | list | `[]` | | 79 | | serviceMonitor.namespace | string | `""` | | 80 | | serviceMonitor.namespaceSelector | object | `{}` | | 81 | | serviceMonitor.relabelings | list | `[]` | | 82 | | serviceMonitor.scrapeInterval | string | `"30s"` | | 83 | | serviceMonitor.targetLabels | list | `[]` | | 84 | | tolerations | list | `[]` | | 85 | -------------------------------------------------------------------------------- /charts/celery-exporter/ci/test-values.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | - name: CE_BROKER_URL 3 | value: memory://localhost/ 4 | -------------------------------------------------------------------------------- /charts/celery-exporter/templates/NOTES.txt: -------------------------------------------------------------------------------- 1 | 1. Get the application URL by running these commands: 2 | {{- if .Values.ingress.enabled }} 3 | {{- range $host := .Values.ingress.hosts }} 4 | {{- range .paths }} 5 | http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} 6 | {{- end }} 7 | {{- end }} 8 | {{- else if contains "NodePort" .Values.service.type }} 9 | export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "celery-exporter.fullname" . }}) 10 | export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") 11 | echo http://$NODE_IP:$NODE_PORT 12 | {{- else if contains "LoadBalancer" .Values.service.type }} 13 | NOTE: It may take a few minutes for the LoadBalancer IP to be available. 14 | You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "celery-exporter.fullname" . }}' 15 | export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "celery-exporter.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") 16 | echo http://$SERVICE_IP:{{ .Values.service.port }} 17 | {{- else if contains "ClusterIP" .Values.service.type }} 18 | export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "celery-exporter.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") 19 | export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") 20 | echo "Visit http://127.0.0.1:8080 to use your application" 21 | kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT 22 | {{- end }} 23 | -------------------------------------------------------------------------------- /charts/celery-exporter/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "celery-exporter.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "celery-exporter.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "celery-exporter.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "celery-exporter.labels" -}} 37 | helm.sh/chart: {{ include "celery-exporter.chart" . }} 38 | {{ include "celery-exporter.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "celery-exporter.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "celery-exporter.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "celery-exporter.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "celery-exporter.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /charts/celery-exporter/templates/deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: {{ include "celery-exporter.fullname" . }} 5 | labels: 6 | {{- include "celery-exporter.labels" . | nindent 4 }} 7 | spec: 8 | replicas: {{ .Values.replicaCount }} 9 | selector: 10 | matchLabels: 11 | {{- include "celery-exporter.selectorLabels" . | nindent 6 }} 12 | template: 13 | metadata: 14 | {{- with .Values.podAnnotations }} 15 | annotations: 16 | {{- toYaml . | nindent 8 }} 17 | {{- end }} 18 | labels: 19 | {{- include "celery-exporter.selectorLabels" . | nindent 8 }} 20 | spec: 21 | {{- with .Values.imagePullSecrets }} 22 | imagePullSecrets: 23 | {{- toYaml . | nindent 8 }} 24 | {{- end }} 25 | serviceAccountName: {{ include "celery-exporter.serviceAccountName" . }} 26 | securityContext: 27 | {{- toYaml .Values.podSecurityContext | nindent 8 }} 28 | containers: 29 | - name: {{ .Chart.Name }} 30 | securityContext: 31 | {{- toYaml .Values.securityContext | nindent 12 }} 32 | image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" 33 | imagePullPolicy: {{ .Values.image.pullPolicy }} 34 | ports: 35 | - name: http 36 | containerPort: 9808 37 | protocol: TCP 38 | readinessProbe: 39 | httpGet: 40 | path: /health 41 | port: http 42 | timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds | default "5" }} 43 | failureThreshold: {{ .Values.livenessProbe.failureThreshold | default "5" }} 44 | periodSeconds: {{ .Values.livenessProbe.periodSeconds | default "10" }} 45 | successThreshold: {{ .Values.livenessProbe.successThreshold | default "1" }} 46 | livenessProbe: 47 | httpGet: 48 | path: /health 49 | port: http 50 | timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds | default "5" }} 51 | failureThreshold: {{ .Values.livenessProbe.failureThreshold | default "5" }} 52 | periodSeconds: {{ .Values.livenessProbe.periodSeconds | default "10" }} 53 | successThreshold: {{ .Values.livenessProbe.successThreshold | default "1" }} 54 | resources: 55 | {{- toYaml .Values.resources | nindent 12 }} 56 | {{- with .Values.env }} 57 | env: 58 | {{- toYaml . | nindent 12 }} 59 | {{- end }} 60 | {{- with .Values.nodeSelector }} 61 | nodeSelector: 62 | {{- toYaml . | nindent 8 }} 63 | {{- end }} 64 | {{- with .Values.affinity }} 65 | affinity: 66 | {{- toYaml . | nindent 8 }} 67 | {{- end }} 68 | {{- with .Values.tolerations }} 69 | tolerations: 70 | {{- toYaml . | nindent 8 }} 71 | {{- end }} 72 | -------------------------------------------------------------------------------- /charts/celery-exporter/templates/ingress.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.ingress.enabled -}} 2 | {{- $fullName := include "celery-exporter.fullname" . -}} 3 | {{- $svcPort := .Values.service.port -}} 4 | {{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} 5 | {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} 6 | {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} 7 | {{- end }} 8 | {{- end }} 9 | {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} 10 | apiVersion: networking.k8s.io/v1 11 | {{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} 12 | apiVersion: networking.k8s.io/v1beta1 13 | {{- else -}} 14 | apiVersion: extensions/v1beta1 15 | {{- end }} 16 | kind: Ingress 17 | metadata: 18 | name: {{ $fullName }} 19 | labels: 20 | {{- include "celery-exporter.labels" . | nindent 4 }} 21 | {{- with .Values.ingress.annotations }} 22 | annotations: 23 | {{- toYaml . | nindent 4 }} 24 | {{- end }} 25 | spec: 26 | {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} 27 | ingressClassName: {{ .Values.ingress.className }} 28 | {{- end }} 29 | {{- if .Values.ingress.tls }} 30 | tls: 31 | {{- range .Values.ingress.tls }} 32 | - hosts: 33 | {{- range .hosts }} 34 | - {{ . | quote }} 35 | {{- end }} 36 | secretName: {{ .secretName }} 37 | {{- end }} 38 | {{- end }} 39 | rules: 40 | {{- range .Values.ingress.hosts }} 41 | - host: {{ .host | quote }} 42 | http: 43 | paths: 44 | {{- range .paths }} 45 | - path: {{ .path }} 46 | {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} 47 | pathType: {{ .pathType }} 48 | {{- end }} 49 | backend: 50 | {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} 51 | service: 52 | name: {{ $fullName }} 53 | port: 54 | number: {{ $svcPort }} 55 | {{- else }} 56 | serviceName: {{ $fullName }} 57 | servicePort: {{ $svcPort }} 58 | {{- end }} 59 | {{- end }} 60 | {{- end }} 61 | {{- end }} 62 | -------------------------------------------------------------------------------- /charts/celery-exporter/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "celery-exporter.fullname" . }} 5 | labels: 6 | {{- include "celery-exporter.labels" . | nindent 4 }} 7 | {{- with .Values.service.annotations }} 8 | annotations: 9 | {{- toYaml . | nindent 4 }} 10 | {{- end }} 11 | 12 | spec: 13 | type: {{ .Values.service.type }} 14 | ports: 15 | - port: {{ .Values.service.port }} 16 | targetPort: http 17 | protocol: TCP 18 | name: http 19 | selector: 20 | {{- include "celery-exporter.selectorLabels" . | nindent 4 }} 21 | -------------------------------------------------------------------------------- /charts/celery-exporter/templates/serviceaccount.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceAccount.create -}} 2 | apiVersion: v1 3 | kind: ServiceAccount 4 | metadata: 5 | name: {{ include "celery-exporter.serviceAccountName" . }} 6 | labels: 7 | {{- include "celery-exporter.labels" . | nindent 4 }} 8 | {{- with .Values.serviceAccount.annotations }} 9 | annotations: 10 | {{- toYaml . | nindent 4 }} 11 | {{- end }} 12 | {{- end }} 13 | -------------------------------------------------------------------------------- /charts/celery-exporter/templates/servicemonitor.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.serviceMonitor.enabled -}} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: ServiceMonitor 4 | metadata: 5 | name: {{ include "celery-exporter.fullname" . }} 6 | {{- if .Values.serviceMonitor.namespace }} 7 | namespace: {{ .Values.serviceMonitor.namespace | quote }} 8 | {{- end }} 9 | labels: 10 | {{- include "celery-exporter.labels" . | nindent 4 }} 11 | {{- if .Values.serviceMonitor.additionalLabels }} 12 | {{- toYaml .Values.serviceMonitor.additionalLabels | nindent 4 }} 13 | {{- end }} 14 | spec: 15 | endpoints: 16 | - port: http 17 | interval: {{ .Values.serviceMonitor.scrapeInterval }} 18 | {{- if .Values.serviceMonitor.honorLabels }} 19 | honorLabels: true 20 | {{- end }} 21 | {{- if .Values.serviceMonitor.relabelings }} 22 | relabelings: {{ toYaml .Values.serviceMonitor.relabelings | nindent 8 }} 23 | {{- end }} 24 | {{- if .Values.serviceMonitor.metricRelabelings }} 25 | metricRelabelings: {{ toYaml .Values.serviceMonitor.metricRelabelings | nindent 8 }} 26 | {{- end }} 27 | {{- if .Values.serviceMonitor.jobLabel }} 28 | jobLabel: {{ .Values.serviceMonitor.jobLabel | quote }} 29 | {{- end }} 30 | {{- if .Values.serviceMonitor.namespaceSelector }} 31 | namespaceSelector: {{ toYaml .Values.serviceMonitor.namespaceSelector | nindent 4 }} 32 | {{- else }} 33 | namespaceSelector: 34 | matchNames: 35 | - {{ .Release.Namespace }} 36 | {{- end }} 37 | {{- if .Values.serviceMonitor.targetLabels }} 38 | targetLabels: 39 | {{- range .Values.serviceMonitor.targetLabels }} 40 | - {{ . }} 41 | {{- end }} 42 | {{- end }} 43 | selector: 44 | matchLabels: 45 | {{- include "celery-exporter.selectorLabels" . | nindent 6 }} 46 | {{- end }} 47 | -------------------------------------------------------------------------------- /charts/celery-exporter/templates/tests/test-connection.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: "{{ include "celery-exporter.fullname" . }}-test-connection" 5 | labels: 6 | {{- include "celery-exporter.labels" . | nindent 4 }} 7 | annotations: 8 | "helm.sh/hook": test 9 | spec: 10 | containers: 11 | - name: wget 12 | image: busybox 13 | command: ['wget'] 14 | args: ['{{ include "celery-exporter.fullname" . }}:{{ .Values.service.port }}'] 15 | restartPolicy: Never 16 | -------------------------------------------------------------------------------- /charts/celery-exporter/values.yaml: -------------------------------------------------------------------------------- 1 | # Default values for celery-exporter. 2 | # This is a YAML-formatted file. 3 | # Declare variables to be passed into your templates. 4 | 5 | replicaCount: 1 6 | 7 | image: 8 | repository: danihodovic/celery-exporter 9 | pullPolicy: IfNotPresent 10 | # Overrides the image tag whose default is the chart appVersion. 11 | tag: "" 12 | 13 | imagePullSecrets: [] 14 | nameOverride: "" 15 | fullnameOverride: "" 16 | 17 | serviceAccount: 18 | # Specifies whether a service account should be created 19 | create: true 20 | # Annotations to add to the service account 21 | annotations: {} 22 | # The name of the service account to use. 23 | # If not set and create is true, a name is generated using the fullname template 24 | name: "" 25 | 26 | env: [] 27 | # - name: CE_BROKER_URL 28 | # value: 29 | # - name: CE_BROKER_URL 30 | # valueFrom: 31 | # secretKeyRef: 32 | # name: MY_SECRET 33 | # key: MY_SECRET_KEY 34 | 35 | podAnnotations: {} 36 | 37 | podSecurityContext: {} 38 | # fsGroup: 2000 39 | 40 | securityContext: {} 41 | # capabilities: 42 | # drop: 43 | # - ALL 44 | # readOnlyRootFilesystem: true 45 | # runAsNonRoot: true 46 | # runAsUser: 1000 47 | 48 | service: 49 | type: ClusterIP 50 | port: 9808 51 | annotations: {} 52 | # prometheus.io/scrape: "true" 53 | # prometheus.io/port: "9808" 54 | 55 | ingress: 56 | enabled: false 57 | className: "" 58 | annotations: {} 59 | # kubernetes.io/ingress.class: nginx 60 | # kubernetes.io/tls-acme: "true" 61 | hosts: 62 | - host: celery-exporter.example 63 | paths: 64 | - path: / 65 | pathType: ImplementationSpecific 66 | tls: [] 67 | # - secretName: chart-example-tls 68 | # hosts: 69 | # - chart-example.local 70 | 71 | serviceMonitor: 72 | enabled: false 73 | additionalLabels: {} 74 | ## The label to use to retrieve the job name from. 75 | ## jobLabel: "app.kubernetes.io/name" 76 | namespace: "" 77 | namespaceSelector: {} 78 | ## Default: scrape .Release.Namespace only 79 | ## To scrape all, use the following: 80 | ## namespaceSelector: 81 | ## any: true 82 | scrapeInterval: 30s 83 | # honorLabels: true 84 | targetLabels: [] 85 | relabelings: [] 86 | metricRelabelings: [] 87 | 88 | resources: {} 89 | # We usually recommend not to specify default resources and to leave this as a conscious 90 | # choice for the user. This also increases chances charts run on environments with little 91 | # resources, such as Minikube. If you do want to specify resources, uncomment the following 92 | # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 93 | # limits: 94 | # cpu: 100m 95 | # memory: 128Mi 96 | # requests: 97 | # cpu: 100m 98 | # memory: 128Mi 99 | 100 | livenessProbe: {} 101 | # Liveness and readiness probe timeout values. 102 | # timeoutSeconds: 5 103 | # failureThreshold: 5 104 | # periodSeconds: 10 105 | # successThreshold: 1 106 | readinessProbe: {} 107 | # timeoutSeconds: 15 108 | # failureThreshold: 5 109 | # periodSeconds: 10 110 | # successThreshold: 1 111 | 112 | nodeSelector: {} 113 | 114 | tolerations: [] 115 | 116 | affinity: {} 117 | -------------------------------------------------------------------------------- /cli.py: -------------------------------------------------------------------------------- 1 | from src.cli import cli 2 | 3 | if __name__ == "__main__": 4 | # pylint: disable=no-value-for-parameter,unexpected-keyword-arg 5 | cli(auto_envvar_prefix="CE") 6 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import threading 3 | import copy 4 | 5 | import pytest 6 | 7 | from src.exporter import Exporter 8 | 9 | 10 | def pytest_addoption(parser): 11 | parser.addoption( 12 | "--broker", 13 | action="store", 14 | default="redis", 15 | help="What broker to use in tests", 16 | choices=("redis", "rabbitmq", "memory"), 17 | ) 18 | parser.addoption( 19 | "--loglevel", 20 | action="store", 21 | default="INFO", 22 | help="Log level of the exporter and celery worker in tests", 23 | choices=("DEBUG", "INFO", "WARNING", "ERROR"), 24 | ) 25 | 26 | 27 | @pytest.fixture(scope="session") 28 | def broker(request): 29 | return request.config.getoption("--broker") 30 | 31 | 32 | @pytest.fixture(scope="session") 33 | def log_level(request): 34 | return request.config.getoption("--loglevel") 35 | 36 | 37 | @pytest.fixture(scope="session") 38 | def celery_config(broker): 39 | config = dict( 40 | task_send_sent_event=True, 41 | worker_send_task_events=True, 42 | ) 43 | if broker == "redis": 44 | config["broker_url"] = "redis://localhost:6379/" # type: ignore 45 | elif broker == "rabbitmq": 46 | config["broker_url"] = "amqp://guest:guest@localhost:5672" # type: ignore 47 | elif broker == "memory": 48 | config["broker_url"] = "memory://localhost/" # type: ignore 49 | 50 | return config 51 | 52 | 53 | # https://github.com/celery/celery/pull/6632 54 | @pytest.fixture(scope="session") 55 | def celery_worker_parameters(log_level): 56 | return dict( 57 | loglevel=log_level, 58 | without_heartbeat=False, 59 | ) 60 | 61 | 62 | @pytest.fixture(scope="session") 63 | def celery_enable_logging(log_level): 64 | return log_level == "DEBUG" 65 | 66 | 67 | @pytest.fixture(scope="session") 68 | def find_free_port(): 69 | """ 70 | https://gist.github.com/bertjwregeer/0be94ced48383a42e70c3d9fff1f4ad0 71 | """ 72 | 73 | def _find_free_port(): 74 | s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 75 | s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 76 | s.bind(("0.0.0.0", 0)) 77 | portnum = s.getsockname()[1] 78 | s.close() 79 | 80 | return portnum 81 | 82 | return _find_free_port 83 | 84 | 85 | # Configurations for exporters 86 | @pytest.fixture(scope="session") 87 | def exporter_cfg_defaults(find_free_port, celery_config, log_level): 88 | cfg = { 89 | "host": "0.0.0.0", 90 | "port": find_free_port(), 91 | "broker_url": celery_config["broker_url"], 92 | "broker_transport_option": ["visibility_timeout=7200"], 93 | "broker_ssl_option": [], 94 | "retry_interval": 5, 95 | "log_level": log_level, 96 | "accept_content": None, 97 | "worker_timeout": 1, 98 | "purge_offline_worker_metrics": 10, 99 | "initial_queues": ["queue_from_command_line"], 100 | } 101 | yield cfg 102 | 103 | 104 | @pytest.fixture() 105 | def exporter_instance(exporter_cfg_defaults, find_free_port): 106 | exporter_cfg = copy.deepcopy(exporter_cfg_defaults) 107 | exporter_cfg["port"] = find_free_port() 108 | exporter = Exporter( 109 | worker_timeout_seconds=exporter_cfg["worker_timeout"], 110 | purge_offline_worker_metrics_seconds=exporter_cfg[ 111 | "purge_offline_worker_metrics" 112 | ], 113 | initial_queues=exporter_cfg["initial_queues"], 114 | ) 115 | setattr(exporter, "cfg", exporter_cfg) 116 | yield exporter 117 | 118 | 119 | @pytest.fixture() 120 | def threaded_exporter(exporter_instance): 121 | thread = threading.Thread( 122 | target=exporter_instance.run, args=(exporter_instance.cfg,), daemon=True 123 | ) 124 | thread.start() 125 | yield exporter_instance 126 | 127 | 128 | # Fixtures for same exporter, but with static labels 129 | @pytest.fixture 130 | def exporter_instance_static_labels(exporter_cfg_defaults, find_free_port): 131 | exporter_cfg = copy.deepcopy(exporter_cfg_defaults) 132 | exporter_cfg["port"] = find_free_port() 133 | exporter_cfg["static_label"] = { 134 | "test_label_1": "test_value", 135 | "test_label_2_long_named": "test_value_2_long_named", 136 | } 137 | exporter = Exporter( 138 | worker_timeout_seconds=exporter_cfg["worker_timeout"], 139 | purge_offline_worker_metrics_seconds=exporter_cfg[ 140 | "purge_offline_worker_metrics" 141 | ], 142 | initial_queues=exporter_cfg["initial_queues"], 143 | static_label=exporter_cfg["static_label"], 144 | ) 145 | setattr(exporter, "cfg", exporter_cfg) 146 | yield exporter 147 | 148 | 149 | @pytest.fixture() 150 | def threaded_exporter_static_labels(exporter_instance_static_labels): 151 | thread = threading.Thread( 152 | target=exporter_instance_static_labels.run, 153 | args=(exporter_instance_static_labels.cfg,), 154 | daemon=True, 155 | ) 156 | thread.start() 157 | yield exporter_instance_static_labels 158 | 159 | 160 | @pytest.fixture() 161 | def hostname(): 162 | return socket.gethostname() 163 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: '2.4' 3 | services: 4 | redis: 5 | image: 'redis:6' 6 | ports: ['6379:6379'] 7 | 8 | rabbitmq: 9 | image: rabbitmq:3 10 | ports: ['5672:5672'] 11 | -------------------------------------------------------------------------------- /images/celery-tasks-by-task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danihodovic/celery-exporter/e1160523e5230a44c314f37d21878d150ca97cf3/images/celery-tasks-by-task.png -------------------------------------------------------------------------------- /images/celery-tasks-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danihodovic/celery-exporter/e1160523e5230a44c314f37d21878d150ca97cf3/images/celery-tasks-overview.png -------------------------------------------------------------------------------- /jsonnetfile.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/honeylogic-io/utils-libsonnet.git", 8 | "subdir": "lib" 9 | } 10 | }, 11 | "version": "master" 12 | } 13 | ], 14 | "legacyImports": true 15 | } 16 | -------------------------------------------------------------------------------- /jsonnetfile.lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/honeylogic-io/utils-libsonnet.git", 8 | "subdir": "lib" 9 | } 10 | }, 11 | "version": "cdcd088b54cf73511db37377841361b61abd5b14", 12 | "sum": "jWIlMnQDtnLbHE5Aj8eTL0R3sLCe1v0syzXEj9BZwaI=" 13 | } 14 | ], 15 | "legacyImports": false 16 | } 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "prometheus-exporter-celery" 3 | version = "0.12.0" 4 | description = "" 5 | authors = [ 6 | "Dani Hodovic ", 7 | "Adin Hodovic ", 8 | ] 9 | license = "MIT" 10 | packages = [ 11 | { include = "src" }, 12 | ] 13 | readme = "README.md" 14 | repository = "https://github.com/danihodovic/celery-exporter" 15 | documentation = "https://github.com/danihodovic/celery-exporter" 16 | keywords = ["celery", "task-processing", "prometheus", "grafana", "monitoring"] 17 | classifiers = [ 18 | "Topic :: System :: Monitoring", 19 | "Topic :: System :: Systems Administration", 20 | "Topic :: System :: Distributed Computing", 21 | "Framework :: Celery", 22 | "Framework :: Django", 23 | ] 24 | 25 | [tool.black] 26 | skip_numeric_underscore_normalization = true 27 | exclude = ".*(venv|virtualenv|.poetry|migrations|node_modules)" 28 | 29 | [tool.isort] 30 | profile = "black" 31 | multi_line_output = 3 32 | skip = '.virtualenv,.venv,.poetry,.poetry-cache' 33 | 34 | [tool.poetry.dependencies] 35 | python = ">=3.11,<3.14" 36 | celery = "^5.5.2" 37 | prometheus-client = "^0.21.1" 38 | click = "^8.1.8" 39 | pretty-errors = "^1.2.25" 40 | loguru = "^0.7.3" 41 | redis = "^5.2.0" 42 | Flask = "^3.1.0" 43 | waitress = "^3.0.2" 44 | arrow = "^1.3.0" 45 | timy = "^0.4.2" 46 | 47 | [tool.poetry.group.dev.dependencies] 48 | pytest = "^8.2.2" 49 | black = "^24.3.0" 50 | isort = "^5.13.2" 51 | jedi = "^0.19.1" 52 | pudb = "^2024.1.3" 53 | requests = "^2.32.3" 54 | pytest-cov = "^4.1.0" 55 | ptpython = "^3.0.25" 56 | pytest-mock = "^3.12.0" 57 | pyinstaller = "^6.13.0" 58 | mypy = "^1.8.0" 59 | types-requests = "^2" 60 | types-waitress = "^3.0.1.20241117" 61 | celery-types = "^0.11.0" 62 | pre-commit = "^2.19.0" 63 | pytest-celery = "^0.0.0" 64 | pylint = "^3.3.1" 65 | certifi = "^2024.8.30" 66 | idna = "^3.7" 67 | 68 | [build-system] 69 | requires = ["poetry-core>=1.0.0a5"] 70 | build-backend = "poetry.core.masonry.api" 71 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --pdbcls=pudb.debugger:Debugger --doctest-modules 3 | python_files = tests.py test_*.py 4 | norecursedirs = .git .venv .virtualenv 5 | log_cli = true 6 | filterwarnings = 7 | ignore::celery.fixups.django.FixupWarning 8 | ignore::DeprecationWarning 9 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/danihodovic/celery-exporter/e1160523e5230a44c314f37d21878d150ca97cf3/src/__init__.py -------------------------------------------------------------------------------- /src/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | # pylint: disable=unused-import 4 | import pretty_errors # type: ignore 5 | from prometheus_client import Histogram 6 | 7 | from .exporter import Exporter 8 | from .help import cmd_help 9 | 10 | # https://github.com/pallets/click/issues/448#issuecomment-246029304 11 | # pylint: disable=protected-access 12 | click.core._verify_python3_env = lambda: None # type: ignore 13 | 14 | default_buckets_str = ",".join(map(str, Histogram.DEFAULT_BUCKETS)) 15 | 16 | 17 | def _comma_seperated_argument(_ctx, _param, value): 18 | if value is not None: 19 | return value.split(",") 20 | return [] 21 | 22 | 23 | # Accepts value string in format "key=val". Returns dict {key: val}. 24 | # * If value is None - returns empty dict 25 | def _eq_sign_separated_argument_to_dict(_ctx, _param, value): 26 | if value is not None: 27 | dict_of_key_value_pairs = {} 28 | for key_value_pair in value: 29 | key, val = key_value_pair.split("=") 30 | dict_of_key_value_pairs[key] = val 31 | return dict_of_key_value_pairs 32 | return {} 33 | 34 | 35 | @click.command(help=cmd_help) 36 | @click.option( 37 | "--broker-url", 38 | required=True, 39 | help="The url to the broker, e.g redis://1.2.3.4", 40 | ) 41 | @click.option( 42 | "--broker-transport-option", 43 | required=False, 44 | default=[None], 45 | multiple=True, 46 | help="Celery broker transport option, e.g visibility_timeout=18000", 47 | ) 48 | @click.option( 49 | "--broker-ssl-option", 50 | required=False, 51 | default=[None], 52 | multiple=True, 53 | help="Celery broker ssl option, e.g certfile=/var/ssl/amqp-server-cert.pem", 54 | ) 55 | @click.option( 56 | "--accept-content", 57 | required=False, 58 | default=None, 59 | help="Celery accept content options, e.g 'json,pickle'", 60 | ) 61 | @click.option( 62 | "--retry-interval", 63 | required=False, 64 | default=0, 65 | help="Broker exception retry interval in seconds, default is 0 for no retry", 66 | ) 67 | @click.option( 68 | "--host", 69 | default="0.0.0.0", 70 | show_default=True, 71 | help="The host the exporter will listen on", 72 | ) 73 | @click.option( 74 | "--port", 75 | type=int, 76 | default=9808, 77 | show_default=True, 78 | help="The port the exporter will listen on", 79 | ) 80 | @click.option( 81 | "--buckets", 82 | default=default_buckets_str, 83 | show_default=True, 84 | help="Buckets for runtime histogram", 85 | ) 86 | @click.option("--log-level", default="INFO", show_default=True) 87 | @click.option( 88 | "--worker-timeout", 89 | default=5 * 60, 90 | show_default=True, 91 | help="If no heartbeat has been recieved from a worker in this many seconds, " 92 | "that a worker will be considered dead. If set to 0, workers will never be " 93 | "timed out", 94 | ) 95 | @click.option( 96 | "--purge-offline-worker-metrics", 97 | default=10 * 60, 98 | show_default=True, 99 | help="If no heartbeat has been recieved from a worker in this many seconds, " 100 | "that a worker will be considered dead. Metrics will be purged for this worker " 101 | "after this many seconds. If set to 0, metrics will never be purged. Helps " 102 | "with keeping the cardinality of the metrics low.", 103 | ) 104 | @click.option( 105 | "--generic-hostname-task-sent-metric", 106 | default=False, 107 | is_flag=True, 108 | help="The metric celery_task_sent_total will be labeled with a generic hostname. " 109 | "This option helps with label cardinality when using a dynamic number of clients " 110 | "which create tasks. The default behavior is to label the metric with the client's hostname. " 111 | "Knowing which client sent a task might not be useful for many use cases as for example in " 112 | "Kubernetes environments where the client's hostname is a random string.", 113 | ) 114 | @click.option( 115 | "-Q", 116 | "--queues", 117 | default=None, 118 | show_default=False, 119 | callback=_comma_seperated_argument, 120 | help="A comma seperated list of queues to force metrics to appear for. " 121 | "Queues not included in this setting will not appear in metrics until at least one worker has " 122 | "been seen to follow that queue.", 123 | ) 124 | @click.option( 125 | "--metric-prefix", 126 | default="celery_", 127 | help="Prefix all metrics with a string. " 128 | "This option replaces the 'celery_*' part with a custom prefix. ", 129 | ) 130 | @click.option( 131 | "--default-queue-name", 132 | default="celery", 133 | help="task_default_queue option for celery." 134 | "This option is to define default queue name for celery, if queue name is not present in " 135 | "task parameters. It will be used in prom metrics label value.", 136 | ) 137 | @click.option( 138 | "--static-label", 139 | required=False, 140 | default=None, 141 | multiple=True, 142 | callback=_eq_sign_separated_argument_to_dict, 143 | help="Add label with static value to all metrics", 144 | ) 145 | def cli( # pylint: disable=too-many-arguments,too-many-positional-arguments,too-many-locals 146 | broker_url, 147 | broker_transport_option, 148 | accept_content, 149 | retry_interval, 150 | host, 151 | port, 152 | buckets, 153 | log_level, 154 | broker_ssl_option, 155 | worker_timeout, 156 | purge_offline_worker_metrics, 157 | generic_hostname_task_sent_metric, 158 | queues, 159 | metric_prefix, 160 | default_queue_name, 161 | static_label, 162 | ): # pylint: disable=unused-argument 163 | formatted_buckets = list(map(float, buckets.split(","))) 164 | ctx = click.get_current_context() 165 | Exporter( 166 | formatted_buckets, 167 | worker_timeout, 168 | purge_offline_worker_metrics, 169 | generic_hostname_task_sent_metric, 170 | queues, 171 | metric_prefix, 172 | default_queue_name, 173 | static_label, 174 | ).run(ctx.params) 175 | -------------------------------------------------------------------------------- /src/exporter.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=protected-access,,attribute-defined-outside-init 2 | import json 3 | import re 4 | import sys 5 | import time 6 | from collections import defaultdict 7 | from typing import Callable, Optional 8 | 9 | from celery import Celery 10 | from celery.events.state import State # type: ignore 11 | from celery.utils import nodesplit # type: ignore 12 | from celery.utils.time import utcoffset # type: ignore 13 | from kombu.exceptions import ChannelError # type: ignore 14 | from loguru import logger 15 | from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram 16 | 17 | from .http_server import start_http_server 18 | 19 | 20 | class Exporter: # pylint: disable=too-many-instance-attributes,too-many-branches 21 | state: State = None 22 | 23 | # pylint: disable=too-many-arguments,too-many-positional-arguments 24 | def __init__( 25 | self, 26 | buckets=None, 27 | worker_timeout_seconds=5 * 60, 28 | purge_offline_worker_metrics_seconds=10 * 60, 29 | generic_hostname_task_sent_metric=False, 30 | initial_queues=None, 31 | metric_prefix="celery_", 32 | default_queue_name="celery", 33 | static_label=None, 34 | ): 35 | self.registry = CollectorRegistry(auto_describe=True) 36 | self.queue_cache = set(initial_queues or []) 37 | self.worker_last_seen = {} 38 | self.worker_timeout_seconds = worker_timeout_seconds 39 | self.purge_offline_worker_metrics_after_seconds = ( 40 | purge_offline_worker_metrics_seconds 41 | ) 42 | self.generic_hostname_task_sent_metric = generic_hostname_task_sent_metric 43 | self.default_queue_name = default_queue_name 44 | 45 | # Static labels 46 | self.static_label = static_label or {} 47 | self.static_label_keys = self.static_label.keys() 48 | 49 | self.state_counters = { 50 | "task-sent": Counter( 51 | f"{metric_prefix}task_sent", 52 | "Sent when a task message is published.", 53 | ["name", "hostname", "queue_name", *self.static_label_keys], 54 | registry=self.registry, 55 | ), 56 | "task-received": Counter( 57 | f"{metric_prefix}task_received", 58 | "Sent when the worker receives a task.", 59 | ["name", "hostname", "queue_name", *self.static_label_keys], 60 | registry=self.registry, 61 | ), 62 | "task-started": Counter( 63 | f"{metric_prefix}task_started", 64 | "Sent just before the worker executes the task.", 65 | ["name", "hostname", "queue_name", *self.static_label_keys], 66 | registry=self.registry, 67 | ), 68 | "task-succeeded": Counter( 69 | f"{metric_prefix}task_succeeded", 70 | "Sent if the task executed successfully.", 71 | ["name", "hostname", "queue_name", *self.static_label_keys], 72 | registry=self.registry, 73 | ), 74 | "task-failed": Counter( 75 | f"{metric_prefix}task_failed", 76 | "Sent if the execution of the task failed.", 77 | [ 78 | "name", 79 | "hostname", 80 | "exception", 81 | "queue_name", 82 | *self.static_label_keys, 83 | ], 84 | registry=self.registry, 85 | ), 86 | "task-rejected": Counter( 87 | f"{metric_prefix}task_rejected", 88 | # pylint: disable=line-too-long 89 | "The task was rejected by the worker, possibly to be re-queued or moved to a dead letter queue.", 90 | ["name", "hostname", "queue_name", *self.static_label_keys], 91 | registry=self.registry, 92 | ), 93 | "task-revoked": Counter( 94 | f"{metric_prefix}task_revoked", 95 | "Sent if the task has been revoked.", 96 | ["name", "hostname", "queue_name", *self.static_label_keys], 97 | registry=self.registry, 98 | ), 99 | "task-retried": Counter( 100 | f"{metric_prefix}task_retried", 101 | "Sent if the task failed, but will be retried in the future.", 102 | ["name", "hostname", "queue_name", *self.static_label_keys], 103 | registry=self.registry, 104 | ), 105 | } 106 | self.celery_worker_up = Gauge( 107 | f"{metric_prefix}worker_up", 108 | "Indicates if a worker has recently sent a heartbeat.", 109 | ["hostname", *self.static_label_keys], 110 | registry=self.registry, 111 | ) 112 | self.worker_tasks_active = Gauge( 113 | f"{metric_prefix}worker_tasks_active", 114 | "The number of tasks the worker is currently processing", 115 | ["hostname", *self.static_label_keys], 116 | registry=self.registry, 117 | ) 118 | self.celery_task_runtime = Histogram( 119 | f"{metric_prefix}task_runtime", 120 | "Histogram of task runtime measurements.", 121 | ["name", "hostname", "queue_name", *self.static_label_keys], 122 | registry=self.registry, 123 | buckets=buckets or Histogram.DEFAULT_BUCKETS, 124 | ) 125 | self.celery_queue_length = Gauge( 126 | f"{metric_prefix}queue_length", 127 | "The number of message in broker queue.", 128 | ["queue_name", *self.static_label_keys], 129 | registry=self.registry, 130 | ) 131 | self.celery_active_consumer_count = Gauge( 132 | f"{metric_prefix}active_consumer_count", 133 | "The number of active consumer in broker queue.", 134 | ["queue_name", *self.static_label_keys], 135 | registry=self.registry, 136 | ) 137 | self.celery_active_worker_count = Gauge( 138 | f"{metric_prefix}active_worker_count", 139 | "The number of active workers in broker queue.", 140 | ["queue_name", *self.static_label_keys], 141 | registry=self.registry, 142 | ) 143 | self.celery_active_process_count = Gauge( 144 | f"{metric_prefix}active_process_count", 145 | "The number of active processes in broker queue.", 146 | ["queue_name", *self.static_label_keys], 147 | registry=self.registry, 148 | ) 149 | 150 | def scrape(self): 151 | if ( 152 | self.worker_timeout_seconds > 0 153 | or self.purge_offline_worker_metrics_after_seconds > 0 154 | ): 155 | self.track_timed_out_workers() 156 | self.track_queue_metrics() 157 | 158 | def forget_worker(self, hostname): 159 | if hostname in self.worker_last_seen: 160 | self.celery_worker_up.labels(hostname=hostname, **self.static_label).set(0) 161 | self.worker_tasks_active.labels(hostname=hostname, **self.static_label).set( 162 | 0 163 | ) 164 | logger.debug( 165 | "Updated gauge='{}' value='{}'", self.worker_tasks_active._name, 0 166 | ) 167 | logger.debug( 168 | "Updated gauge='{}' value='{}'", self.celery_worker_up._name, 0 169 | ) 170 | self.worker_last_seen[hostname]["forgotten"] = True 171 | 172 | # If purging of metrics is enabled we should keep the last seen so that we can 173 | # use the timestamp to purge the metrics later 174 | if self.purge_offline_worker_metrics_after_seconds == 0: 175 | del self.worker_last_seen[hostname] 176 | 177 | def purge_worker_metrics(self, hostname): 178 | # Prometheus stores a copy of the metrics in memory, so we need to remove them 179 | # The key of the metrics is a string sequence e.g ('celery(queue_name)', 'host-1(hostname)') 180 | for label_seq in list(self.worker_tasks_active._metrics.keys()): 181 | if hostname in label_seq: 182 | self.worker_tasks_active.remove(*label_seq) 183 | 184 | for label_seq in list(self.celery_worker_up._metrics.keys()): 185 | if hostname in label_seq: 186 | self.celery_worker_up.remove(*label_seq) 187 | 188 | for counter in self.state_counters.values(): 189 | for label_seq in list(counter._metrics.keys()): 190 | if hostname in label_seq: 191 | counter.remove(*label_seq) 192 | 193 | for label_seq in list(self.celery_task_runtime._metrics.keys()): 194 | if hostname in label_seq: 195 | self.celery_task_runtime.remove(*label_seq) 196 | 197 | del self.worker_last_seen[hostname] 198 | 199 | def track_timed_out_workers(self): 200 | now = time.time() 201 | # Make a copy of the last seen dict so we can delete from the dict with no issues 202 | for hostname, worker_status in list(self.worker_last_seen.items()): 203 | since = now - worker_status["ts"] 204 | if since > self.worker_timeout_seconds and not worker_status["forgotten"]: 205 | logger.info( 206 | f"Have not seen {hostname} for {since:0.2f} seconds. " 207 | "Removing from metrics" 208 | ) 209 | self.forget_worker(hostname) 210 | 211 | if self.purge_offline_worker_metrics_after_seconds > 0: 212 | if since > self.purge_offline_worker_metrics_after_seconds: 213 | logger.info( 214 | f"Have not seen {hostname} for {since:0.2f} seconds. " 215 | "Purging worker metrics" 216 | ) 217 | self.purge_worker_metrics(hostname) 218 | 219 | def track_queue_metrics(self): 220 | with self.app.connection() as connection: # type: ignore 221 | transport = connection.info()["transport"] 222 | acceptable_transports = [ 223 | "redis", 224 | "rediss", 225 | "amqp", 226 | "amqps", 227 | "memory", 228 | "sentinel", 229 | ] 230 | if transport not in acceptable_transports: 231 | logger.debug( 232 | f"Queue length tracking is only implemented for {acceptable_transports}" 233 | ) 234 | return 235 | 236 | concurrency_per_worker = { 237 | worker: len(stats["pool"].get("processes", [])) 238 | for worker, stats in (self.app.control.inspect().stats() or {}).items() 239 | } 240 | processes_per_queue = defaultdict(int) 241 | workers_per_queue = defaultdict(int) 242 | 243 | # request workers to response active queues 244 | # we need to cache queue info in exporter in case all workers are offline 245 | # so that no worker response to exporter will make active_queues return None 246 | queues = self.app.control.inspect().active_queues() or {} 247 | for worker, info_list in queues.items(): 248 | for queue_info in info_list: 249 | name = queue_info["name"] 250 | self.queue_cache.add(name) 251 | workers_per_queue[name] += 1 252 | processes_per_queue[name] += concurrency_per_worker.get(worker, 0) 253 | 254 | for queue in self.queue_cache: 255 | if transport in ["amqp", "amqps", "memory"]: 256 | consumer_count = rabbitmq_queue_consumer_count(connection, queue) 257 | self.celery_active_consumer_count.labels( 258 | queue_name=queue, **self.static_label 259 | ).set(consumer_count) 260 | 261 | self.celery_active_process_count.labels( 262 | queue_name=queue, **self.static_label 263 | ).set(processes_per_queue[queue]) 264 | self.celery_active_worker_count.labels( 265 | queue_name=queue, **self.static_label 266 | ).set(workers_per_queue[queue]) 267 | length = queue_length(transport, connection, queue) 268 | if length is not None: 269 | self.celery_queue_length.labels( 270 | queue_name=queue, **self.static_label 271 | ).set(length) 272 | 273 | def track_task_event(self, event): 274 | self.state.event(event) 275 | task = self.state.tasks.get(event["uuid"]) 276 | logger.debug("Received event='{}' for task='{}'", event["type"], task.name) 277 | 278 | if event["type"] not in self.state_counters: 279 | logger.warning("No counter matches task state='{}'", task.state) 280 | 281 | labels = { 282 | "name": task.name, 283 | "hostname": get_hostname(task.hostname), 284 | "queue_name": getattr(task, "queue", self.default_queue_name), 285 | **self.static_label, 286 | } 287 | if event["type"] == "task-sent" and self.generic_hostname_task_sent_metric: 288 | labels["hostname"] = "generic" 289 | 290 | for counter_name, counter in self.state_counters.items(): 291 | _labels = labels.copy() 292 | 293 | if counter_name == "task-failed": 294 | if counter_name == event["type"]: 295 | _labels["exception"] = get_exception_class_name(task.exception) 296 | else: 297 | _labels["exception"] = "" 298 | 299 | if counter_name == event["type"]: 300 | counter.labels(**_labels).inc() 301 | logger.debug( 302 | "Incremented metric='{}' labels='{}'", counter._name, labels 303 | ) 304 | elif ( 305 | event["type"] != "task-sent" 306 | ): # task-sent is sent by various hosts (webservers, task creators etc.) which cause label cardinality # pylint: disable=line-too-long 307 | # increase unaffected counters by zero in order to make them visible 308 | counter.labels(**_labels).inc(0) 309 | 310 | # observe task runtime 311 | if event["type"] == "task-succeeded": 312 | self.celery_task_runtime.labels(**labels).observe(task.runtime) 313 | logger.debug( 314 | "Observed metric='{}' labels='{}': {}s", 315 | self.celery_task_runtime._name, 316 | labels, 317 | task.runtime, 318 | ) 319 | 320 | def track_worker_status(self, event, is_online): 321 | value = 1 if is_online else 0 322 | event_name = "worker-online" if is_online else "worker-offline" 323 | hostname = get_hostname(event["hostname"]) 324 | logger.debug("Received event='{}' for hostname='{}'", event_name, hostname) 325 | self.celery_worker_up.labels(hostname=hostname, **self.static_label).set(value) 326 | 327 | if is_online: 328 | self.worker_last_seen[hostname] = { 329 | "ts": reverse_adjust_timestamp( 330 | event["timestamp"], event.get("utcoffset") 331 | ), 332 | "forgotten": False, 333 | } 334 | else: 335 | self.forget_worker(hostname) 336 | 337 | def track_worker_heartbeat(self, event): 338 | hostname = get_hostname(event["hostname"]) 339 | logger.debug("Received event='{}' for worker='{}'", event["type"], hostname) 340 | 341 | self.worker_last_seen[hostname] = { 342 | "ts": reverse_adjust_timestamp(event["timestamp"], event.get("utcoffset")), 343 | "forgotten": False, 344 | } 345 | worker_state = self.state.event(event)[0][0] 346 | active = worker_state.active or 0 347 | up = 1 if worker_state.alive else 0 348 | self.celery_worker_up.labels(hostname=hostname, **self.static_label).set(up) 349 | self.worker_tasks_active.labels(hostname=hostname, **self.static_label).set( 350 | active 351 | ) 352 | logger.debug( 353 | "Updated gauge='{}' value='{}'", self.worker_tasks_active._name, active 354 | ) 355 | logger.debug("Updated gauge='{}' value='{}'", self.celery_worker_up._name, up) 356 | 357 | def run(self, click_params): 358 | logger.remove() 359 | logger.add(sys.stdout, level=click_params["log_level"]) 360 | self.app = Celery(broker=click_params["broker_url"]) 361 | if click_params["accept_content"] is not None: 362 | accept_content_list = click_params["accept_content"].split(",") 363 | logger.info("Setting celery accept_content {}", accept_content_list) 364 | self.app.config_from_object(dict(accept_content=accept_content_list)) 365 | transport_options = {} 366 | for transport_option in click_params["broker_transport_option"]: 367 | if transport_option is not None: 368 | option, value = transport_option.split("=", 1) 369 | if option is not None: 370 | logger.debug( 371 | "Setting celery broker_transport_option {}={}", option, value 372 | ) 373 | transport_options[option] = transform_option_value(value) 374 | 375 | if transport_options is not None: 376 | self.app.conf["broker_transport_options"] = transport_options 377 | 378 | ssl_options = {} 379 | for ssl_option in click_params["broker_ssl_option"]: 380 | if ssl_option is not None: 381 | option, value = ssl_option.split("=", 1) 382 | if option is not None: 383 | logger.debug("Setting celery ssl_option {}={}", option, value) 384 | if value.isnumeric(): 385 | ssl_options[option] = int(value) 386 | else: 387 | ssl_options[option] = value 388 | 389 | if ssl_options is not None: 390 | self.app.conf["broker_use_ssl"] = ssl_options 391 | 392 | self.state = self.app.events.State() # type: ignore 393 | self.retry_interval = click_params["retry_interval"] 394 | if self.retry_interval: 395 | logger.debug("Using retry_interval of {} seconds", self.retry_interval) 396 | 397 | handlers = { 398 | "worker-heartbeat": self.track_worker_heartbeat, 399 | "worker-online": lambda event: self.track_worker_status(event, True), 400 | "worker-offline": lambda event: self.track_worker_status(event, False), 401 | } 402 | for key in self.state_counters: 403 | handlers[key] = self.track_task_event 404 | 405 | with self.app.connection() as connection: # type: ignore 406 | start_http_server( 407 | self.registry, 408 | connection, 409 | click_params["host"], 410 | click_params["port"], 411 | self.scrape, 412 | ) 413 | while True: 414 | try: 415 | recv = self.app.events.Receiver(connection, handlers=handlers) # type: ignore 416 | recv.capture(limit=None, timeout=None, wakeup=True) # type: ignore 417 | 418 | except (KeyboardInterrupt, SystemExit) as ex: 419 | raise ex 420 | 421 | except Exception as e: # pylint: disable=broad-except 422 | logger.exception( 423 | "celery-exporter exception '{}', retrying in {} seconds.", 424 | str(e), 425 | self.retry_interval, 426 | ) 427 | if self.retry_interval == 0: 428 | raise e 429 | 430 | time.sleep(self.retry_interval) 431 | 432 | 433 | exception_pattern = re.compile(r"^(\w+)\(") 434 | 435 | 436 | def reverse_adjust_timestamp( 437 | ts: float, offset: Optional[int] = None, here: Callable[..., float] = utcoffset 438 | ) -> float: 439 | """Adjust timestamp in reverse of celery, based on provided utcoffset.""" 440 | return ts + ((offset or 0) - here()) * 3600 441 | 442 | 443 | def get_exception_class_name(exception_name: str): 444 | m = exception_pattern.match(exception_name) 445 | if m: 446 | return m.group(1) 447 | return "UnknownException" 448 | 449 | 450 | def get_hostname(name: str) -> str: 451 | """ 452 | Get hostname from celery's hostname. 453 | 454 | Celery's hostname contains either worker's name or Process ID in it. 455 | >>> get_hostname("workername@hostname") 456 | 'hostname' 457 | >>> get_hostname("gen531@hostname") 458 | 'hostname' 459 | 460 | Prometheus suggests it: 461 | > Do not use labels to store dimensions with high cardinality (many different label values) 462 | """ 463 | _, hostname = nodesplit(name) 464 | return hostname 465 | 466 | 467 | def transform_option_value(value: str): 468 | """ 469 | Make an attempt to transform option value to appropriate type 470 | 471 | Result type: 472 | - int - if input contains only digits 473 | - dict - if input may be correctly decoded from JSON string 474 | - str - in any other cases 475 | """ 476 | if value.isnumeric(): 477 | return int(value) 478 | try: 479 | return json.loads(value) 480 | except ValueError: 481 | return value 482 | 483 | 484 | def redis_queue_length(connection, queue: str) -> int: 485 | return connection.default_channel.client.llen(queue) 486 | 487 | 488 | def rabbitmq_queue_length(connection, queue: str) -> int: 489 | if queue_info := rabbitmq_queue_info(connection, queue): 490 | return queue_info.message_count 491 | return 0 492 | 493 | 494 | def queue_length(transport, connection, queue: str) -> Optional[int]: 495 | if transport in ["redis", "rediss", "sentinel"]: 496 | return redis_queue_length(connection, queue) 497 | 498 | if transport in ["amqp", "amqps", "memory"]: 499 | return rabbitmq_queue_length(connection, queue) 500 | 501 | return None 502 | 503 | 504 | def rabbitmq_queue_consumer_count(connection, queue: str) -> int: 505 | if queue_info := rabbitmq_queue_info(connection, queue): 506 | return queue_info.consumer_count 507 | return 0 508 | 509 | 510 | def rabbitmq_queue_info(connection, queue: str): 511 | try: 512 | queue_info = connection.default_channel.queue_declare(queue=queue, passive=True) 513 | return queue_info 514 | except ChannelError as ex: 515 | if "NOT_FOUND" in ex.message: 516 | logger.debug(f"Queue '{queue}' not found") 517 | return None 518 | raise ex 519 | -------------------------------------------------------------------------------- /src/help.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=protected-access 2 | from .exporter import Exporter 3 | 4 | prometheus_logo = """ 5 | @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 6 | @@@@@@@@@@@@@@@@@@@((((((((((((((((((((((@@@@@@@@@@@@@@@@@@@ 7 | @@@@@@@@@@@@@@((((((((((((((@@((((((((((((((((@@@@@@@@@@@@@@ 8 | @@@@@@@@@@@((((((((((((((((@@@(((((((((((((((((((@@@@@@@@@@@ 9 | @@@@@@@@(((((((((((((((((((@@@@(((((@(((((((((((((((@@@@@@@@ 10 | @@@@@@(((((((((((((((@@((((@@@@@(((@@(((((((((((((((((@@@@@@ 11 | @@@@@((((((((((((((((@@@((@@@@@@@(@@@@((((((((((((((((((@@@@ 12 | @@@(((((((((((((((((@@@@(@@@@@@@@(@@@@@((((((((((((((((((@@@ 13 | @@(((((((((((((((((@@@@@@@@@@@@@@(@@@@@@((((((((((((((((((@@ 14 | @(((((((((((((((((@@@@@@@@@@@@@@@@@@@@@@@((((((((((((((((((@ 15 | @(((((((((((((((((@@@@@@@@@@@@@@@@@@@@@@@((((((((((((((((((@ 16 | @(((((((((((((((((@@@@@@@@@@@@@@@@@@@@@@@((((((((((((((((((( 17 | (((((((((((((((((((@@@@@@@@@@@@@@@@@@@@@@((((((((((((((((((( 18 | @(((((((((((@@@@((((@@@@@@@@@@@@@@@@@@@(((((@@@@(((((((((((( 19 | @((((((((((((@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@((((((((((((@ 20 | @((((((((((((((@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@((((((((((((((@ 21 | @@((((((((((((((((((((((((((((((((((((((((((((((((((((((((@@ 22 | @@@(((((((((((((@@@@@@@@@@@@@@@@@@@@@@@@@@@@(((((((((((((@@@ 23 | @@@@((((((((((((@@@@@@@@@@@@@@@@@@@@@@@@@@@@((((((((((((@@@@ 24 | @@@@@@((((((((((((((((((((((((((((((((((((((((((((((((@@@@@@ 25 | @@@@@@@@((((((((((((((@@@@@@@@@@@@@@@@((((((((((((((@@@@@@@@ 26 | @@@@@@@@@@(((((((((((((@@@@@@@@@@@@@@(((((((((((((@@@@@@@@@@ 27 | @@@@@@@@@@@@@@(((((((((((#@@@@@@@@@(((((((((((&@@@@@@@@@@@@@ 28 | @@@@@@@@@@@@@@@@@@((((((((((((((((((((((((@@@@@@@@@@@@@@@@@@""" 29 | 30 | cmd_help = ( 31 | prometheus_logo 32 | + """ 33 | 34 | A Prometheus exporter for Celery. 35 | 36 | Metrics exposed: 37 | """ 38 | ) 39 | 40 | temp_exporter = Exporter() 41 | 42 | for metric in temp_exporter.state_counters.values(): 43 | cmd_help += f""" 44 | \b 45 | {metric._name}_total 46 | {metric._documentation:30s} 47 | """ 48 | 49 | for metric in [ 50 | temp_exporter.celery_worker_up, 51 | temp_exporter.worker_tasks_active, 52 | temp_exporter.celery_task_runtime, 53 | ]: 54 | cmd_help += f""" 55 | \b 56 | {metric._name} 57 | {metric._documentation:30s} 58 | """ 59 | -------------------------------------------------------------------------------- /src/http_server.py: -------------------------------------------------------------------------------- 1 | from threading import Thread 2 | 3 | import kombu.exceptions 4 | from flask import Blueprint, Flask, current_app, request 5 | from loguru import logger 6 | from prometheus_client.exposition import choose_encoder 7 | from waitress import serve 8 | 9 | blueprint = Blueprint("celery_exporter", __name__) 10 | 11 | 12 | @blueprint.route("/") 13 | def index(): 14 | return """ 15 | 16 | 17 | 18 | 19 | 20 | celery-exporter 21 | 22 | 23 |

Celery Exporter

24 |

Metrics

25 | 26 | 27 | """ 28 | 29 | 30 | @blueprint.route("/metrics") 31 | def metrics(): 32 | current_app.config["metrics_puller"]() 33 | encoder, content_type = choose_encoder(request.headers.get("accept")) 34 | output = encoder(current_app.config["registry"]) 35 | return output, 200, {"Content-Type": content_type} 36 | 37 | 38 | @blueprint.route("/health") 39 | def health(): 40 | conn = current_app.config["celery_connection"] 41 | uri = conn.as_uri() 42 | 43 | try: 44 | conn.ensure_connection(max_retries=3) 45 | except kombu.exceptions.OperationalError: 46 | logger.error("Failed to connect to broker='{}'", uri) 47 | return (f"Failed to connect to broker: '{uri}'", 500) 48 | except Exception: # pylint: disable=broad-except 49 | logger.exception("Unrecognized error") 50 | return ("Unknown exception", 500) 51 | return f"Connected to the broker {conn.as_uri()}" 52 | 53 | 54 | def start_http_server(registry, celery_connection, host, port, metrics_puller): 55 | app = Flask(__name__) 56 | app.config["registry"] = registry 57 | app.config["celery_connection"] = celery_connection 58 | app.config["metrics_puller"] = metrics_puller 59 | app.register_blueprint(blueprint) 60 | Thread( 61 | target=serve, 62 | args=(app,), 63 | kwargs=dict(host=host, port=port, _quiet=True), 64 | daemon=True, 65 | ).start() 66 | logger.info("Started celery-exporter at host='{}' on port='{}'", host, port) 67 | -------------------------------------------------------------------------------- /src/test_cli.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import pytest 4 | import requests 5 | from celery.contrib.testing.worker import start_worker # type: ignore 6 | from requests.exceptions import HTTPError 7 | 8 | 9 | @pytest.mark.celery() 10 | def test_integration(broker, celery_app, threaded_exporter, hostname): 11 | exporter_url = f"http://localhost:{threaded_exporter.cfg['port']}/metrics" 12 | 13 | @celery_app.task 14 | def succeed(): 15 | pass 16 | 17 | @celery_app.task 18 | def fail(): 19 | raise HTTPError("Intentional error") 20 | 21 | time.sleep(1) 22 | # Before the first worker starts, make sure queues that the exporter is initialized 23 | # with are available anyway. Queues to be detected from workers should not be there yet 24 | res = requests.get(exporter_url, timeout=5) 25 | assert res.status_code == 200 26 | assert 'celery_queue_length{queue_name="queue_from_command_line"} 0.0' in res.text 27 | assert ( 28 | 'celery_active_worker_count{queue_name="queue_from_command_line"} 0.0' 29 | in res.text 30 | ) 31 | assert ( 32 | 'celery_active_process_count{queue_name="queue_from_command_line"} 0.0' 33 | in res.text 34 | ) 35 | assert 'celery_queue_length{queue_name="celery"}' not in res.text 36 | assert 'celery_active_worker_count{queue_name="celery"}' not in res.text 37 | assert 'celery_active_process_count{queue_name="celery"}' not in res.text 38 | 39 | # start worker first so the exporter can fetch and cache queue information 40 | with start_worker(celery_app, without_heartbeat=False): 41 | time.sleep(5) 42 | res = requests.get(exporter_url, timeout=5) 43 | assert res.status_code == 200 44 | assert 'celery_queue_length{queue_name="celery"} 0.0' in res.text, res.text 45 | 46 | # TODO: Fix this... 47 | if broker == "memory": 48 | assert ( 49 | 'celery_active_consumer_count{queue_name="celery"} 0.0' in res.text 50 | ), res.text 51 | assert 'celery_active_worker_count{queue_name="celery"} 1.0' in res.text 52 | assert 'celery_active_process_count{queue_name="celery"} 1.0' in res.text 53 | 54 | succeed.apply_async() 55 | succeed.apply_async() 56 | fail.apply_async() 57 | 58 | # assert celery_queue_length when message in broker but no worker start 59 | res = requests.get(exporter_url, timeout=3) 60 | assert res.status_code == 200 61 | assert 'celery_queue_length{queue_name="celery"} 3.0' in res.text 62 | 63 | if broker == "memory": 64 | assert 'celery_active_consumer_count{queue_name="celery"} 0.0' in res.text 65 | assert 'celery_active_worker_count{queue_name="celery"} 0.0' in res.text 66 | assert 'celery_active_process_count{queue_name="celery"} 0.0' in res.text 67 | 68 | # start worker and consume message in broker 69 | with start_worker(celery_app, without_heartbeat=False): 70 | time.sleep(2) 71 | 72 | res = requests.get(exporter_url, timeout=3) 73 | assert res.status_code == 200 74 | # pylint: disable=line-too-long 75 | assert ( 76 | f'celery_task_sent_total{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery"}} 2.0' 77 | in res.text 78 | ) 79 | assert ( 80 | f'celery_task_sent_total{{hostname="{hostname}",name="src.test_cli.fail",queue_name="celery"}} 1.0' 81 | in res.text 82 | ) 83 | assert ( 84 | f'celery_task_received_total{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery"}} 2.0' 85 | in res.text 86 | ) 87 | assert ( 88 | f'celery_task_received_total{{hostname="{hostname}",name="src.test_cli.fail",queue_name="celery"}} 1.0' 89 | in res.text 90 | ) 91 | assert ( 92 | f'celery_task_started_total{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery"}} 2.0' 93 | in res.text 94 | ) 95 | assert ( 96 | f'celery_task_started_total{{hostname="{hostname}",name="src.test_cli.fail",queue_name="celery"}} 1.0' 97 | in res.text 98 | ) 99 | assert ( 100 | f'celery_task_succeeded_total{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery"}} 2.0' 101 | in res.text 102 | ) 103 | assert ( 104 | f'celery_task_failed_total{{exception="HTTPError",hostname="{hostname}",name="src.test_cli.fail",queue_name="celery"}} 1.0' 105 | in res.text 106 | ) 107 | assert ( 108 | f'celery_task_runtime_count{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery"}} 2.0' 109 | in res.text 110 | ) 111 | assert 'celery_queue_length{queue_name="celery"} 0.0' in res.text 112 | 113 | # TODO: Fix this... 114 | if broker == "memory": 115 | assert 'celery_active_consumer_count{queue_name="celery"} 0.0' in res.text 116 | assert 'celery_active_worker_count{queue_name="celery"} 0.0' in res.text 117 | assert 'celery_active_process_count{queue_name="celery"} 0.0' in res.text 118 | 119 | 120 | # pylint: disable=too-many-statements 121 | @pytest.mark.celery() 122 | def test_integration_static_labels( 123 | broker, celery_app, threaded_exporter_static_labels, hostname 124 | ): 125 | exporter_url = ( 126 | f"http://localhost:{threaded_exporter_static_labels.cfg['port']}/metrics" 127 | ) 128 | # Substring representing static labels in metrics labels 129 | static_labels_str = ",".join( 130 | [ 131 | f'{k}="{v}"' 132 | for k, v in sorted( 133 | threaded_exporter_static_labels.cfg["static_label"].items() 134 | ) 135 | ] 136 | ) 137 | 138 | @celery_app.task 139 | def succeed(): 140 | pass 141 | 142 | @celery_app.task 143 | def fail(): 144 | raise HTTPError("Intentional error") 145 | 146 | time.sleep(1) 147 | # Before the first worker starts, make sure queues that the exporter is initialized 148 | # with are available anyway. Queues to be detected from workers should not be there yet 149 | res = requests.get(exporter_url, timeout=5) 150 | assert res.status_code == 200 151 | assert ( 152 | f'celery_queue_length{{queue_name="queue_from_command_line",{static_labels_str}}} 0.0' 153 | in res.text 154 | ) 155 | assert ( 156 | # pylint: disable=line-too-long 157 | f'celery_active_worker_count{{queue_name="queue_from_command_line",{static_labels_str}}} 0.0' 158 | in res.text 159 | ) 160 | assert ( 161 | # pylint: disable=line-too-long 162 | f'celery_active_process_count{{queue_name="queue_from_command_line",{static_labels_str}}} 0.0' 163 | in res.text 164 | ) 165 | assert ( 166 | f'celery_queue_length{{queue_name="celery",{static_labels_str}}}' 167 | not in res.text 168 | ) 169 | assert ( 170 | f'celery_active_worker_count{{queue_name="celery",{static_labels_str}}}' 171 | not in res.text 172 | ) 173 | assert ( 174 | f'celery_active_process_count{{queue_name="celery",{static_labels_str}}}' 175 | not in res.text 176 | ) 177 | 178 | # start worker first so the exporter can fetch and cache queue information 179 | with start_worker(celery_app, without_heartbeat=False): 180 | time.sleep(5) 181 | res = requests.get(exporter_url, timeout=5) 182 | assert res.status_code == 200 183 | assert ( 184 | f'celery_queue_length{{queue_name="celery",{static_labels_str}}} 0.0' 185 | in res.text 186 | ), res.text 187 | 188 | # TODO: Fix this... 189 | if broker == "memory": 190 | assert ( 191 | f'celery_active_consumer_count{{queue_name="celery",{static_labels_str}}} 0.0' 192 | in res.text 193 | ), res.text 194 | assert ( 195 | f'celery_active_worker_count{{queue_name="celery",{static_labels_str}}} 1.0' 196 | in res.text 197 | ) 198 | assert ( 199 | f'celery_active_process_count{{queue_name="celery",{static_labels_str}}} 1.0' 200 | in res.text 201 | ) 202 | 203 | succeed.apply_async() 204 | succeed.apply_async() 205 | fail.apply_async() 206 | 207 | # assert celery_queue_length when message in broker but no worker start 208 | res = requests.get(exporter_url, timeout=3) 209 | assert res.status_code == 200 210 | assert ( 211 | f'celery_queue_length{{queue_name="celery",{static_labels_str}}} 3.0' 212 | in res.text 213 | ) 214 | 215 | if broker == "memory": 216 | assert ( 217 | f'celery_active_consumer_count{{queue_name="celery",{static_labels_str}}} 0.0' 218 | in res.text 219 | ) 220 | assert ( 221 | f'celery_active_worker_count{{queue_name="celery",{static_labels_str}}} 0.0' 222 | in res.text 223 | ) 224 | assert ( 225 | f'celery_active_process_count{{queue_name="celery",{static_labels_str}}} 0.0' 226 | in res.text 227 | ) 228 | 229 | # start worker and consume message in broker 230 | with start_worker(celery_app, without_heartbeat=False): 231 | time.sleep(2) 232 | 233 | res = requests.get(exporter_url, timeout=3) 234 | assert res.status_code == 200 235 | # pylint: disable=line-too-long 236 | assert ( 237 | f'celery_task_sent_total{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery",{static_labels_str}}} 2.0' 238 | in res.text 239 | ) 240 | assert ( 241 | f'celery_task_sent_total{{hostname="{hostname}",name="src.test_cli.fail",queue_name="celery",{static_labels_str}}} 1.0' 242 | in res.text 243 | ) 244 | assert ( 245 | f'celery_task_received_total{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery",{static_labels_str}}} 2.0' 246 | in res.text 247 | ) 248 | assert ( 249 | f'celery_task_received_total{{hostname="{hostname}",name="src.test_cli.fail",queue_name="celery",{static_labels_str}}} 1.0' 250 | in res.text 251 | ) 252 | assert ( 253 | f'celery_task_started_total{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery",{static_labels_str}}} 2.0' 254 | in res.text 255 | ) 256 | assert ( 257 | f'celery_task_started_total{{hostname="{hostname}",name="src.test_cli.fail",queue_name="celery",{static_labels_str}}} 1.0' 258 | in res.text 259 | ) 260 | assert ( 261 | f'celery_task_succeeded_total{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery",{static_labels_str}}} 2.0' 262 | in res.text 263 | ) 264 | assert ( 265 | f'celery_task_failed_total{{exception="HTTPError",hostname="{hostname}",name="src.test_cli.fail",queue_name="celery",{static_labels_str}}} 1.0' 266 | in res.text 267 | ) 268 | assert ( 269 | f'celery_task_runtime_count{{hostname="{hostname}",name="src.test_cli.succeed",queue_name="celery",{static_labels_str}}} 2.0' 270 | in res.text 271 | ) 272 | assert ( 273 | f'celery_queue_length{{queue_name="celery",{static_labels_str}}} 0.0' 274 | in res.text 275 | ) 276 | 277 | # TODO: Fix this... 278 | if broker == "memory": 279 | assert ( 280 | f'celery_active_consumer_count{{queue_name="celery",{static_labels_str}}} 0.0' 281 | in res.text 282 | ) 283 | assert ( 284 | f'celery_active_worker_count{{queue_name="celery",{static_labels_str}}} 0.0' 285 | in res.text 286 | ) 287 | assert ( 288 | f'celery_active_process_count{{queue_name="celery",{static_labels_str}}} 0.0' 289 | in res.text 290 | ) 291 | -------------------------------------------------------------------------------- /src/test_exporter.py: -------------------------------------------------------------------------------- 1 | from .exporter import transform_option_value 2 | 3 | 4 | def test_transform_option_value(): 5 | test_cases = [ 6 | {"input": "1423", "expected": 1423}, 7 | {"input": '{"password": "pass"}', "expected": {"password": "pass"}}, 8 | { 9 | "input": '{invalid_json: "value"}', 10 | "expected": '{invalid_json: "value"}', 11 | }, 12 | {"input": "my_master", "expected": "my_master"}, 13 | ] 14 | 15 | for case in test_cases: 16 | assert transform_option_value(case["input"]) == case["expected"] 17 | -------------------------------------------------------------------------------- /src/test_http_server.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unused-argument 2 | import time 3 | 4 | import pytest 5 | import requests 6 | 7 | 8 | @pytest.mark.celery() 9 | def test_health(threaded_exporter): 10 | time.sleep(1) 11 | res = requests.get( 12 | f"http://localhost:{threaded_exporter.cfg['port']}/health", timeout=3 13 | ) 14 | res.raise_for_status() 15 | 16 | 17 | def test_index(threaded_exporter): 18 | time.sleep(1) 19 | res = requests.get(f"http://localhost:{threaded_exporter.cfg['port']}", timeout=3) 20 | res.raise_for_status() 21 | assert "/metrics" in res.text 22 | -------------------------------------------------------------------------------- /src/test_metrics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import pytest 5 | from celery.contrib.testing.worker import start_worker # type: ignore 6 | from celery.utils.time import adjust_timestamp # type: ignore 7 | 8 | from src.exporter import reverse_adjust_timestamp 9 | 10 | 11 | @pytest.fixture 12 | def assert_exporter_metric_called(mocker, celery_app, celery_worker, hostname): 13 | def fn(metric): 14 | labels = mocker.patch.object(metric, "labels") 15 | 16 | @celery_app.task 17 | def slow_task(): 18 | logging.info("Started the slow task") 19 | time.sleep(3) 20 | logging.info("Finished the slow task") 21 | 22 | # Reload so that the worker detects the task 23 | celery_worker.reload() 24 | slow_task.delay().get() 25 | assert labels.call_count >= 1 26 | labels.assert_called_with(hostname=hostname) 27 | labels.return_value.set.assert_any_call(1) 28 | 29 | return fn 30 | 31 | 32 | @pytest.mark.celery() 33 | def test_worker_tasks_active(broker, threaded_exporter, assert_exporter_metric_called): 34 | if broker != "memory": 35 | pytest.skip( 36 | reason="test_worker_tasks_active can only be tested for the in-memory broker" 37 | ) 38 | 39 | assert_exporter_metric_called(threaded_exporter.worker_tasks_active) 40 | 41 | 42 | @pytest.mark.celery() 43 | def test_worker_heartbeat_status( 44 | broker, threaded_exporter, assert_exporter_metric_called 45 | ): 46 | if broker != "memory": 47 | pytest.skip( 48 | reason="test_worker_tasks_active can only be tested for the in-memory broker" 49 | ) 50 | 51 | assert_exporter_metric_called(threaded_exporter.celery_worker_up) 52 | 53 | 54 | @pytest.mark.celery() 55 | def test_worker_status(threaded_exporter, celery_app, hostname): 56 | time.sleep(5) 57 | 58 | with start_worker(celery_app, without_heartbeat=False): 59 | time.sleep(2) 60 | assert ( 61 | threaded_exporter.registry.get_sample_value( 62 | "celery_worker_up", labels={"hostname": hostname} 63 | ) 64 | == 1.0 65 | ) 66 | 67 | time.sleep(2) 68 | assert ( 69 | threaded_exporter.registry.get_sample_value( 70 | "celery_worker_up", labels={"hostname": hostname} 71 | ) 72 | == 0.0 73 | ) 74 | 75 | 76 | @pytest.mark.parametrize( 77 | "input_utcoffset, sleep_seconds, expected_metric_value", 78 | [ 79 | (None, 5, 0.0), 80 | (0, 5, 0.0), 81 | (7, 5, 0.0), 82 | (7, 0, 1.0), 83 | ], # Eg: PST (America/Los_Angeles) 84 | ) 85 | def test_worker_timeout_status( 86 | input_utcoffset, sleep_seconds, expected_metric_value, threaded_exporter, hostname 87 | ): 88 | ts = adjust_timestamp(time.time(), (input_utcoffset or 0)) 89 | threaded_exporter.track_worker_status( 90 | {"hostname": hostname, "timestamp": ts, "utcoffset": input_utcoffset}, True 91 | ) 92 | assert ( 93 | threaded_exporter.registry.get_sample_value( 94 | "celery_worker_up", labels={"hostname": hostname} 95 | ) 96 | == 1.0 97 | ) 98 | assert threaded_exporter.worker_last_seen[hostname] == { 99 | "forgotten": False, 100 | "ts": reverse_adjust_timestamp(ts, input_utcoffset), 101 | } 102 | 103 | time.sleep(sleep_seconds) 104 | threaded_exporter.scrape() 105 | assert ( 106 | threaded_exporter.registry.get_sample_value( 107 | "celery_worker_up", labels={"hostname": hostname} 108 | ) 109 | == expected_metric_value 110 | ) 111 | 112 | 113 | @pytest.mark.parametrize( 114 | "input_utcoffset, sleep_seconds, expected_metric_value", 115 | [ 116 | (None, 15, None), 117 | (0, 15, None), 118 | (7, 15, None), 119 | (7, 0, 1.0), 120 | ], # Eg: PST (America/Los_Angeles) 121 | ) 122 | def test_purge_offline_worker_metrics( 123 | input_utcoffset, sleep_seconds, expected_metric_value, threaded_exporter, hostname 124 | ): 125 | ts = adjust_timestamp(time.time(), (input_utcoffset or 0)) 126 | threaded_exporter.track_worker_status( 127 | {"hostname": hostname, "timestamp": ts, "utcoffset": input_utcoffset}, True 128 | ) 129 | threaded_exporter.worker_tasks_active.labels(hostname=hostname).inc() 130 | threaded_exporter.celery_task_runtime.labels( 131 | name="boosh", hostname=hostname, queue_name="test" 132 | ).observe(1.0) 133 | threaded_exporter.state_counters["task-sent"].labels( 134 | name="boosh", hostname=hostname, queue_name="test" 135 | ).inc() 136 | 137 | assert ( 138 | threaded_exporter.registry.get_sample_value( 139 | "celery_worker_up", labels={"hostname": hostname} 140 | ) 141 | == 1.0 142 | ) 143 | assert ( 144 | threaded_exporter.registry.get_sample_value( 145 | "celery_worker_tasks_active", labels={"hostname": hostname} 146 | ) 147 | == 1.0 148 | ) 149 | assert ( 150 | threaded_exporter.registry.get_sample_value( 151 | "celery_task_runtime_count", 152 | labels={"hostname": hostname, "queue_name": "test", "name": "boosh"}, 153 | ) 154 | == 1.0 155 | ) 156 | assert ( 157 | threaded_exporter.registry.get_sample_value( 158 | "celery_task_sent_total", 159 | labels={"hostname": hostname, "queue_name": "test", "name": "boosh"}, 160 | ) 161 | == 1.0 162 | ) 163 | 164 | assert threaded_exporter.worker_last_seen[hostname] == { 165 | "forgotten": False, 166 | "ts": reverse_adjust_timestamp(ts, input_utcoffset), 167 | } 168 | 169 | time.sleep(sleep_seconds) 170 | threaded_exporter.scrape() 171 | assert ( 172 | threaded_exporter.registry.get_sample_value( 173 | "celery_worker_up", labels={"hostname": hostname} 174 | ) 175 | == expected_metric_value 176 | ) 177 | assert ( 178 | threaded_exporter.registry.get_sample_value( 179 | "celery_worker_tasks_active", labels={"hostname": hostname} 180 | ) 181 | == expected_metric_value 182 | ) 183 | assert ( 184 | threaded_exporter.registry.get_sample_value( 185 | "celery_task_runtime_count", 186 | labels={"hostname": hostname, "queue_name": "test", "name": "boosh"}, 187 | ) 188 | == expected_metric_value 189 | ) 190 | assert ( 191 | threaded_exporter.registry.get_sample_value( 192 | "celery_task_sent_total", 193 | labels={"hostname": hostname, "queue_name": "test", "name": "boosh"}, 194 | ) 195 | == expected_metric_value 196 | ) 197 | 198 | 199 | def test_worker_generic_task_sent_hostname(threaded_exporter, celery_app): 200 | threaded_exporter.generic_hostname_task_sent_metric = True 201 | time.sleep(5) 202 | 203 | @celery_app.task 204 | def succeed(): 205 | pass 206 | 207 | succeed.apply_async() 208 | 209 | with start_worker(celery_app, without_heartbeat=False): 210 | time.sleep(5) 211 | assert ( 212 | threaded_exporter.registry.get_sample_value( 213 | "celery_task_sent_total", 214 | labels={ 215 | "hostname": "generic", 216 | "name": "src.test_metrics.succeed", 217 | "queue_name": "celery", 218 | }, 219 | ) 220 | == 1.0 221 | ) 222 | -------------------------------------------------------------------------------- /vendor/github.com/honeylogic-io/utils-libsonnet/lib/celery.libsonnet: -------------------------------------------------------------------------------- 1 | local k = import 'github.com/grafana/jsonnet-libs/ksonnet-util/kausal.libsonnet'; 2 | local statefulSet = k.apps.v1.statefulSet; 3 | local container = k.core.v1.container; 4 | local deployment = k.apps.v1.deployment; 5 | 6 | { 7 | createContainers(name, image, command, args, env):: container.new(name, image) + 8 | container.withCommand(command) + 9 | container.withArgs(args) + 10 | container.withEnvMap(env) + 11 | container.withImagePullPolicy('Always'), 12 | 13 | worker: { 14 | new(name, image, replicas=1, command=['celery'], args, env): { 15 | local containers = $.createContainers(name, image, command, args, env), 16 | statefulSet: statefulSet.new(name, replicas, containers) + 17 | statefulSet.spec.withServiceName(name), 18 | }, 19 | }, 20 | beat: { 21 | new(name, image, command=['celery'], args, env): { 22 | local containers = $.createContainers(name, image, command, args, env), 23 | deployment: deployment.new(name, replicas=1, containers=containers), 24 | }, 25 | }, 26 | } 27 | -------------------------------------------------------------------------------- /vendor/github.com/honeylogic-io/utils-libsonnet/lib/django.libsonnet: -------------------------------------------------------------------------------- 1 | local k = import 'github.com/grafana/jsonnet-libs/ksonnet-util/kausal.libsonnet'; 2 | local deployment = k.apps.v1.deployment; 3 | local container = k.core.v1.container; 4 | local port = k.core.v1.containerPort; 5 | local service = k.core.v1.service; 6 | local withInitContainers = deployment.spec.template.spec.withInitContainers; 7 | local withArgs = container.withArgs; 8 | 9 | { 10 | new(name, image, envMap): { 11 | local containers = container.new(name, image) + 12 | container.withImagePullPolicy('Always') + 13 | container.withVolumeMounts([{ 14 | name: 'staticfiles', 15 | mountPath: '/app/staticfiles', 16 | }]) + 17 | container.withEnvMap(envMap), 18 | local webArgs = withArgs(['config.wsgi', '--bind=0.0.0.0:80']), 19 | local webContainer = containers + container.withPorts([port.new('http', 80)]) + 20 | container.withCommand(['gunicorn']) + 21 | webArgs, 22 | local collectstaticArgs = withArgs(['collectstatic', '--no-input', '--clear']), 23 | local collectstatic = containers + 24 | container.withName('collectstatic') + 25 | container.withCommand(['./manage.py']) + 26 | collectstaticArgs, 27 | local migrate = containers + container.withName('migrate') + 28 | container.withCommand(['./manage.py']) + 29 | withArgs(['migrate']), 30 | 31 | deployment: deployment.new(name, replicas=1, containers=webContainer) 32 | + withInitContainers([collectstatic, migrate]) 33 | + deployment.spec.template.spec.withVolumes([{ 34 | name: 'staticfiles', 35 | emptyDir: { 36 | medium: 'Memory', 37 | }, 38 | }]), 39 | service: k.util.serviceFor(self.deployment), 40 | }, 41 | } 42 | -------------------------------------------------------------------------------- /vendor/github.com/honeylogic-io/utils-libsonnet/lib/drone.libsonnet: -------------------------------------------------------------------------------- 1 | local pythonStepCommon = { 2 | depends_on: ['install-python-deps'], 3 | commands: [ 4 | '. .poetry/env && . $(poetry env info -p)/bin/activate', 5 | ], 6 | }; 7 | 8 | local installDepsStep = pythonStepCommon { 9 | name: 'install-python-deps', 10 | depends_on: ['restore-cache'], 11 | environment: { 12 | POETRY_CACHE_DIR: '/drone/src/.poetry-cache', 13 | POETRY_VIRTUALENVS_IN_PROJECT: 'false', 14 | }, 15 | commands: [ 16 | ||| 17 | export POETRY_HOME=$DRONE_WORKSPACE/.poetry 18 | if [ ! -d "$POETRY_HOME" ]; then 19 | curl -fsS -o /tmp/get-poetry.py https://raw.githubusercontent.com/python-poetry/poetry/master/get-poetry.py 20 | python /tmp/get-poetry.py -y 21 | fi 22 | |||, 23 | '. .poetry/env', 24 | 'poetry install --no-root', 25 | ], 26 | }; 27 | 28 | local formatStep = pythonStepCommon { 29 | name: 'format', 30 | commands+: [ 31 | 'black . --check', 32 | 'isort --check-only .', 33 | ], 34 | }; 35 | 36 | local mypyStep = pythonStepCommon { 37 | name: 'typecheck', 38 | commands+: [ 39 | 'mypy .', 40 | ], 41 | }; 42 | 43 | 44 | local pylintStep = pythonStepCommon { 45 | name: 'lint', 46 | commands+: [ 47 | "pylint $(git ls-files -- '*.py' ':!:**/migrations/*.py')", 48 | ], 49 | }; 50 | 51 | local testStep = pythonStepCommon { 52 | name: 'test', 53 | commands+: ['pytest --ignore .poetry --ignore .poetry-cache --cov'], 54 | }; 55 | 56 | 57 | local pipelineCommon(image) = { 58 | kind: 'pipeline', 59 | type: 'docker', 60 | name: 'python', 61 | trigger: { 62 | event: [ 63 | 'push', 64 | ], 65 | }, 66 | volumes: [ 67 | { 68 | name: 'cache', 69 | host: { 70 | path: '/tmp/cache', 71 | }, 72 | }, 73 | ], 74 | steps: [ 75 | installDepsStep { image: image }, 76 | formatStep { image: image }, 77 | mypyStep { image: image }, 78 | pylintStep { image: image }, 79 | testStep { image: image }, 80 | ], 81 | }; 82 | 83 | { 84 | pythonPipeline: { 85 | new(pipeline, image): pipelineCommon(image) + pipeline, 86 | }, 87 | dockerPipeline: { 88 | kind: 'pipeline', 89 | type: 'docker', 90 | }, 91 | } 92 | -------------------------------------------------------------------------------- /vendor/github.com/honeylogic-io/utils-libsonnet/lib/ingress.libsonnet: -------------------------------------------------------------------------------- 1 | local k = import 'github.com/grafana/jsonnet-libs/ksonnet-util/kausal.libsonnet'; 2 | local ingress = k.networking.v1.ingress; 3 | 4 | local mapRules(host, service, servicePort) = ({ host: host, http: { paths: [{ 5 | path: '/', 6 | pathType: 'Prefix', 7 | backend: { service: { name: service, port: { number: servicePort } } }, 8 | }] } }); 9 | 10 | { 11 | new(name, hosts, service, servicePort, annotations): 12 | ingress.new(name) 13 | + ingress.metadata.withAnnotations(annotations) 14 | + ingress.spec.withTls([{ hosts: hosts, secretName: name + '-cert' }]) 15 | + ingress.spec.withRules([mapRules(host, service, servicePort) for host in hosts]), 16 | } 17 | -------------------------------------------------------------------------------- /vendor/lib: -------------------------------------------------------------------------------- 1 | github.com/honeylogic-io/utils-libsonnet/lib --------------------------------------------------------------------------------