├── requirements.txt
├── Dockerfile
├── pyproject.toml
├── .github
    ├── workflows
    │   └── lint.yml
    └── ISSUE_TEMPLATE
    │   └── bug_report.md
├── .gitignore
├── icinga2
    ├── command.conf
    └── service.conf
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── README.md
├── LICENSE
├── grafana
    └── pve-metrics-dashboard.json
└── check_pve.py


/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | argparse
3 | packaging
4 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3
 2 | 
 3 | # Plain local files: COPY is sufficient, ADD's URL/archive handling is not needed.
 4 | COPY check_pve.py requirements.txt /
 5 | 
 6 | # Refresh the package index and install in ONE layer so a cached, stale index
 7 | # is never reused by a later install; drop the index afterwards.
 8 | RUN apt-get update \
 9 |     && apt-get install -y python3 python3-requests python3-packaging \
10 |     && rm -rf /var/lib/apt/lists/*
11 | RUN pip3 install --no-cache-dir -r requirements.txt
12 | 
13 | # Keep the container alive so the check can be invoked through `docker exec`.
14 | CMD ["tail", "-f", "/dev/null"]
15 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.black]
 2 | line-length = 100  # same limit as [tool.ruff] below
 3 | 
 4 | [tool.ruff]
 5 | line-length = 100  # same limit as [tool.black] above
 6 | lint.select = [
 7 |     "ANN", # flake8-annotations
 8 |     "B",   # flake8-bugbear
 9 |     "D",   # pydocstyle
10 |     "E",   # pycodestyle
11 |     "F",   # Pyflakes
12 |     "Q",   # flake8-quotes
13 | ]
14 | lint.ignore = [
15 |     "ANN101", # missing-type-self
16 |     "D107",   # undocumented-public-init
17 | ]
18 | 
19 | [tool.ruff.lint.mccabe]
20 | max-complexity = 10  # NOTE(review): lint.select above has no "C90" (mccabe) entry, so this limit looks unused - confirm
21 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | on: [push, pull_request]
 2 | name: Linter
 3 | 
 4 | jobs:
 5 |   build:  # runs for every push; for pull requests only when the head repo is not this repository (see `if`)
 6 |     if:
 7 |       github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - name: Checkout repository
11 |         uses: actions/checkout@v4
12 | 
13 |       - name: Setup Python
14 |         uses: actions/setup-python@v5
15 |         with:
16 |           python-version: "*"  # NOTE(review): presumably resolves to the newest available Python - confirm against setup-python docs
17 | 
18 |       - name: Install dependencies  # only the two tools invoked by the steps below
19 |         run: |
20 |           python -m pip install --upgrade pip
21 |           python -m pip install black ruff
22 | 
23 |       - name: Check styling with black  # --check reports violations without rewriting files
24 |         run: |
25 |           black --check *.py
26 | 
27 |       - name: Run ruff linter  # rule selection comes from pyproject.toml
28 |         run: |
29 |           ruff check *.py
30 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: "[BUG]"
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **Environment (please complete the following information):**
27 |  - OS: [e.g. Debian Bullseye, CentOS 7, ...]
28 |  - Python Version: [e.g. 3.6, 3.9, ...]
29 |  - PVE Version: [e.g. 6.5, 7.1-8, ...]
30 |  - Monitoring Tool: [e.g. Icinga2, Nagios, ...]
31 | 
32 | **Additional context**
33 | Add any other context about the problem here.
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 


--------------------------------------------------------------------------------
/icinga2/command.conf:
--------------------------------------------------------------------------------
 1 | object CheckCommand "pve" {  // Icinga2 command definition wrapping check_pve.py
 2 |         import "plugin-check-command"
 3 | 
 4 |         command = [ PluginDir + "/check_pve.py" ]
 5 | 
 6 |         arguments = {  // each key is a check_pve.py option, filled from the pve_* custom variables
 7 |                 "-e" = {
 8 |                         value = "$pve_host$"
 9 |                         required = true
10 |                         description = "Hostname for PVE API"
11 |                 }
12 |                 "-u" = {
13 |                         value = "$pve_user$"
14 |                         required = true
15 |                         description = "API user (ex. monitoring@pve)"
16 |                 }
17 |                 "-p" = {  // use either -p or -t
18 |                         value = "$pve_password$"
19 |                         description = "API user password"
20 |                 }
21 |                 "-t" = {
22 |                         value = "$pve_token$"
23 |                         description = "API user token"
24 |                 }
25 |                 "-k" = {  // switch without value: only passed when the variable is set
26 |                         set_if = "$pve_insecure_connection$"
27 |                         description = "Don't verify HTTPS certificate"
28 |                 }
29 |                 "-m" = {
30 |                         value = "$pve_mode$"
31 |                         required = true
32 |                         description = "Check mode (cluster, version, updates, subscription, storage, cpu, memory, io_wait, vm, replication)"
33 |                 }
34 |                 "-n" = {
35 |                         value = "$pve_node$"
36 |                         description = "Node to check (necessary for all modes except cluster, version and backup)"
37 |                 }
38 |                 "--name" = {
39 |                         value = "$pve_resource_name$"
40 |                         description = "Name of storage or vm to check"
41 |                 }
42 |                 "--expected-vm-status" = {
43 |                         value = "$pve_expected_vm_status$"
44 |                         description = "Expected status of the VM"
45 |                 }
46 |                 "--ignore-service" = {
47 |                         repeat_key = true  // option is repeated once per array element
48 |                         value = "$pve_ignore_services$"
49 |                         description = "Ignore services in check"
50 |                 }
51 |                 "--ignore-disk" = {
52 |                         repeat_key = true  // option is repeated once per array element
53 |                         value = "$pve_ignore_disks$"
54 |                         description = "Ignore disks in check"
55 |                 }
56 |                 "--ignore-vm-status" = {
57 |                         set_if = "$pve_ignore_vm_status$"
58 |                         description = "Ignore VM status in check"
59 |                 }
60 |                 "-w" = {
61 |                         value = "$pve_warning$"
62 |                         description = "Warning threshold"
63 |                 }
64 |                 "-c" = {
65 |                         value = "$pve_critical$"
66 |                         description = "Critical threshold"
67 |                 }
68 |                 "-M" = {
69 |                         set_if = "$pve_tresholds_mb$"  // variable name spelling kept as-is for existing configurations
70 |                         description = "Unit of thresholds and values is MB"
71 |                 }
72 |                 "-V" = {
73 |                         value = "$pve_min_version$"
74 |                         description = "Minimal pve version. Everything lower than this will return CRITICAL."
75 |                 }
76 |         }
77 | }
78 | 


--------------------------------------------------------------------------------
/icinga2/service.conf:
--------------------------------------------------------------------------------
  1 | template Host "proxmox-host" {  // shared defaults imported by every PVE host object
  2 |   import "generic-host"
  3 | 
  4 |   vars.pve_host = name
  5 |   vars.pve_node = name.split(".")[0]  // part of the object name before the first dot
  6 |   // ... or if not matching the fqdn (nodename.domain.example)
  7 |   // vars.pve_node = "proxmox-host"
  8 | 
  9 |   // if your icinga host doesn't trust your pve certificate, you'll have to uncomment this line
 10 |   // vars.pve_insecure_connection = true
 11 |   vars.pve_user = "monitor@pve"
 12 |   // either use password or token
 13 |   // vars.pve_password = "SuperSecretPassw0rd"
 14 |   // vars.pve_token = "monitoring=GeneratedToken"
 15 | 
 16 |   // change to false, if node is no member of a pve cluster
 17 |   vars.pve_cluster = true
 18 | }
 19 | 
 20 | object Host "proxmox-host.domain.example" {  // example host; adjust name, address and resources
 21 |   import "proxmox-host"
 22 | 
 23 |   address = "192.168.42.42"
 24 | 
 25 |   vars.pve_storage["flashpool"] = {  // one "storage" service is generated per entry (see apply rule below)
 26 |     pve_warning = 80
 27 |     pve_critical = 90
 28 |   }
 29 | 
 30 |   vars.pve_storage["diskpool"] = {
 31 |     pve_warning = 80
 32 |     pve_critical = 90
 33 |   }
 34 | 
 35 |   // Ignore these disks in health check (USB sticks, SD cards, etc.)
 36 |   vars.pve_ignore_disks = [ "sdn", "sdg" ]
 37 | 
 38 |   vars.virtual_machines["vm-01"] = {  // empty config: the "pve-vm" apply rule adds mode and name
 39 |   }
 40 | }
 41 | 
 42 | template Service "pve-service" {  // base template: all services below run the "pve" check command
 43 |   import "generic-service"
 44 | 
 45 |   check_command = "pve"
 46 | }
 47 | 
 48 | apply Service "cluster" {
 49 |   import "pve-service"
 50 | 
 51 |   vars.pve_mode = "cluster"
 52 | 
 53 |   assign where host.vars.pve_host && host.vars.pve_cluster  // only hosts flagged as cluster members
 54 | }
 55 | 
 56 | apply Service "services" {
 57 |   import "pve-service"
 58 | 
 59 |   vars.pve_mode = "services"
 60 | 
 61 |   // Ignore cluster status on single nodes
 62 |   if (!host.vars.pve_cluster) {
 63 |     vars.pve_ignore_services = host.vars.pve_ignore_services || []  // fall back to an empty list when the host defines none
 64 |     vars.pve_ignore_services.add("corosync")
 65 |   }
 66 | 
 67 |   assign where host.vars.pve_host
 68 | }
 69 | 
 70 | apply Service "updates" {
 71 |   import "pve-service"
 72 | 
 73 |   check_interval = 12h  // overrides the interval inherited via "pve-service"
 74 |   retry_interval = 2h
 75 |   max_check_attempts = 3
 76 | 
 77 |   vars.pve_mode = "updates"
 78 | 
 79 |   assign where host.vars.pve_host
 80 | }
 81 | 
 82 | apply Service "disk-health" {
 83 |   import "pve-service"
 84 | 
 85 |   vars.pve_mode = "disk-health"
 86 | 
 87 |   assign where host.vars.pve_host
 88 | }
 89 | 
 90 | apply Service "io_wait" {
 91 |   import "pve-service"
 92 | 
 93 |   vars.pve_mode = "io_wait"
 94 | 
 95 |   vars.pve_warning = 10
 96 |   vars.pve_critical = 30
 97 | 
 98 |   assign where host.vars.pve_host
 99 | }
100 | 
101 | apply Service "cpu" {
102 |   import "pve-service"
103 | 
104 |   vars.pve_mode = "cpu"
105 | 
106 |   vars.pve_warning = 70
107 |   vars.pve_critical = 90
108 | 
109 |   assign where host.vars.pve_host
110 | }
111 | 
112 | apply Service "memory" {
113 |   import "pve-service"
114 | 
115 |   vars.pve_mode = "memory"
116 | 
117 |   vars.pve_warning = 80
118 |   vars.pve_critical = 90
119 | 
120 |   assign where host.vars.pve_host
121 | }
122 | 
123 | apply Service "storage " for (storage => config in host.vars.pve_storage) {  // one service per pve_storage key, e.g. "storage flashpool"
124 |   import "pve-service"
125 | 
126 |   vars += config  // per-storage settings such as pve_warning / pve_critical
127 | 
128 |   vars.pve_mode = "storage"
129 |   vars.pve_resource_name = storage
130 | }
131 | 
132 | apply Service "pve-vm " for (vm => config in host.vars.virtual_machines) {  // one service per virtual_machines key, e.g. "pve-vm vm-01"
133 |   import "pve-service"
134 | 
135 |   vars += config  // per-VM overrides from the host definition
136 | 
137 |   vars.pve_mode = "vm"
138 |   vars.pve_resource_name = vm
139 | 
140 |   assign where host.vars.pve_host
141 | }
142 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | # Contributor Covenant Code of Conduct
  2 | 
  3 | ## Our Pledge
  4 | 
  5 | We as members, contributors, and leaders pledge to make participation in our
  6 | community a harassment-free experience for everyone, regardless of age, body
  7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  8 | identity and expression, level of experience, education, socio-economic status,
  9 | nationality, personal appearance, race, religion, or sexual identity
 10 | and orientation.
 11 | 
 12 | We pledge to act and interact in ways that contribute to an open, welcoming,
 13 | diverse, inclusive, and healthy community.
 14 | 
 15 | ## Our Standards
 16 | 
 17 | Examples of behavior that contributes to a positive environment for our
 18 | community include:
 19 | 
 20 | * Demonstrating empathy and kindness toward other people
 21 | * Being respectful of differing opinions, viewpoints, and experiences
 22 | * Giving and gracefully accepting constructive feedback
 23 | * Accepting responsibility and apologizing to those affected by our mistakes,
 24 |   and learning from the experience
 25 | * Focusing on what is best not just for us as individuals, but for the
 26 |   overall community
 27 | 
 28 | Examples of unacceptable behavior include:
 29 | 
 30 | * The use of sexualized language or imagery, and sexual attention or
 31 |   advances of any kind
 32 | * Trolling, insulting or derogatory comments, and personal or political attacks
 33 | * Public or private harassment
 34 | * Publishing others' private information, such as a physical or email
 35 |   address, without their explicit permission
 36 | * Other conduct which could reasonably be considered inappropriate in a
 37 |   professional setting
 38 | 
 39 | ## Enforcement Responsibilities
 40 | 
 41 | Community leaders are responsible for clarifying and enforcing our standards of
 42 | acceptable behavior and will take appropriate and fair corrective action in
 43 | response to any behavior that they deem inappropriate, threatening, offensive,
 44 | or harmful.
 45 | 
 46 | Community leaders have the right and responsibility to remove, edit, or reject
 47 | comments, commits, code, wiki edits, issues, and other contributions that are
 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 49 | decisions when appropriate.
 50 | 
 51 | ## Scope
 52 | 
 53 | This Code of Conduct applies within all community spaces, and also applies when
 54 | an individual is officially representing the community in public spaces.
 55 | Examples of representing our community include using an official e-mail address,
 56 | posting via an official social media account, or acting as an appointed
 57 | representative at an online or offline event.
 58 | 
 59 | ## Enforcement
 60 | 
 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 62 | reported to the community leaders responsible for enforcement.
 63 | All complaints will be reviewed and investigated promptly and fairly.
 64 | 
 65 | All community leaders are obligated to respect the privacy and security of the
 66 | reporter of any incident.
 67 | 
 68 | ## Enforcement Guidelines
 69 | 
 70 | Community leaders will follow these Community Impact Guidelines in determining
 71 | the consequences for any action they deem in violation of this Code of Conduct:
 72 | 
 73 | ### 1. Correction
 74 | 
 75 | **Community Impact**: Use of inappropriate language or other behavior deemed
 76 | unprofessional or unwelcome in the community.
 77 | 
 78 | **Consequence**: A private, written warning from community leaders, providing
 79 | clarity around the nature of the violation and an explanation of why the
 80 | behavior was inappropriate. A public apology may be requested.
 81 | 
 82 | ### 2. Warning
 83 | 
 84 | **Community Impact**: A violation through a single incident or series
 85 | of actions.
 86 | 
 87 | **Consequence**: A warning with consequences for continued behavior. No
 88 | interaction with the people involved, including unsolicited interaction with
 89 | those enforcing the Code of Conduct, for a specified period of time. This
 90 | includes avoiding interactions in community spaces as well as external channels
 91 | like social media. Violating these terms may lead to a temporary or
 92 | permanent ban.
 93 | 
 94 | ### 3. Temporary Ban
 95 | 
 96 | **Community Impact**: A serious violation of community standards, including
 97 | sustained inappropriate behavior.
 98 | 
 99 | **Consequence**: A temporary ban from any sort of interaction or public
100 | communication with the community for a specified period of time. No public or
101 | private interaction with the people involved, including unsolicited interaction
102 | with those enforcing the Code of Conduct, is allowed during this period.
103 | Violating these terms may lead to a permanent ban.
104 | 
105 | ### 4. Permanent Ban
106 | 
107 | **Community Impact**: Demonstrating a pattern of violation of community
108 | standards, including sustained inappropriate behavior,  harassment of an
109 | individual, or aggression toward or disparagement of classes of individuals.
110 | 
111 | **Consequence**: A permanent ban from any sort of public interaction within
112 | the community.
113 | 
114 | ## Attribution
115 | 
116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
117 | version 2.0, available at
118 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
119 | 
120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
121 | enforcement ladder](https://github.com/mozilla/diversity).
122 | 
123 | [homepage]: https://www.contributor-covenant.org
124 | 
125 | For answers to common questions about this code of conduct, see the FAQ at
126 | https://www.contributor-covenant.org/faq. Translations are available at
127 | https://www.contributor-covenant.org/translations.
128 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | <!-- omit in toc -->
  2 | # Contributing to check_pve
  3 | 
  4 | First off, thanks for taking the time to contribute! ❤️
  5 | 
  6 | All types of contributions are encouraged and valued. See the [Table of Contents](#table-of-contents) for different ways to help and details about how this project handles them. Please make sure to read the relevant section before making your contribution. It will make it a lot easier for us maintainers and smooth out the experience for all involved. The community looks forward to your contributions. 🎉
  7 | 
  8 | > And if you like the project, but just don't have time to contribute, that's fine. There are other easy ways to support the project and show your appreciation, which we would also be very happy about:
  9 | > - Star the project
 10 | > - Tweet about it
 11 | > - Refer this project in your project's readme
 12 | > - Mention the project at local meetups and tell your friends/colleagues
 13 | 
 14 | <!-- omit in toc -->
 15 | ## Table of Contents
 16 | 
 17 | - [I Have a Question](#i-have-a-question)
 18 | - [I Want To Contribute](#i-want-to-contribute)
 19 |   - [Reporting Bugs](#reporting-bugs)
 20 |   - [Suggesting Enhancements](#suggesting-enhancements)
 21 |   - [Your First Code Contribution](#your-first-code-contribution)
 22 |   - [Improving The Documentation](#improving-the-documentation)
 23 | - [Styleguides](#styleguides)
 24 |   - [Commit Messages](#commit-messages)
 25 | - [Join The Project Team](#join-the-project-team)
 26 | 
 27 | 
 28 | 
 29 | ## I Have a Question
 30 | 
 31 | > If you want to ask a question, we assume that you have read the available [Documentation](README.md).
 32 | 
 33 | Before you ask a question, it is best to search for existing [Issues](https://github.com/nbuchwitz/check_pve/issues) that might help you. In case you have found a suitable issue and still need clarification, you can write your question in this issue. It is also advisable to search the internet for answers first.
 34 | 
 35 | If you then still feel the need to ask a question and need clarification, we recommend the following:
 36 | 
 37 | - Open an [Issue](https://github.com/nbuchwitz/check_pve/issues/new).
 38 | - Provide as much context as you can about what you're running into.
 39 | - Provide project and platform versions (Python, os, icinga or other monitoring tool version etc), depending on what seems relevant.
 40 | 
 41 | We will then take care of the issue as soon as possible.
 42 | 
 43 | ## I Want To Contribute
 44 | 
 45 | > ### Legal Notice <!-- omit in toc -->
 46 | > When contributing to this project, you must agree that you have authored 100% of the content, that you have the necessary rights to the content and that the content you contribute may be provided under the project license.
 47 | 
 48 | ### Reporting Bugs
 49 | 
 50 | <!-- omit in toc -->
 51 | #### Before Submitting a Bug Report
 52 | 
 53 | A good bug report shouldn't leave others needing to chase you up for more information. Therefore, we ask you to investigate carefully, collect information and describe the issue in detail in your report. Please complete the following steps in advance to help us fix any potential bug as fast as possible.
 54 | 
 55 | - Make sure that you are using the latest version.
 56 | - Determine if your bug is really a bug and not an error on your side, e.g. using incompatible environment components/versions (make sure that you have read the [README](README.md)).
 57 | - To see if other users have experienced (and potentially already solved) the same issue you are having, check if there is not already a bug report existing for your bug or error in the [bug tracker](https://github.com/nbuchwitz/check_pve/issues?q=label%3Abug).
 58 | - Also make sure to search the internet to see if users outside of the GitHub community have discussed the issue.
 59 | - Collect information about the bug:
 60 |   - Stack trace (Traceback)
 61 |   - OS, Platform and Version (*BSD, Linux, x86, ARM)
 62 |   - Version of the Python interpreter, icinga or monitoring tool version, depending on what seems relevant.
 63 |   - Possibly your input and the output
 64 |   - Can you reliably reproduce the issue? And can you also reproduce it with older versions?
 65 | 
 66 | <!-- omit in toc -->
 67 | #### How Do I Submit a Good Bug Report?
 68 | 
 69 | > You must never report security related issues, vulnerabilities or bugs including sensitive information to the issue tracker, or elsewhere in public. Instead sensitive bugs must be sent by email to <nb+checkpve@tipi-net.de>.
 70 | 
 71 | We use GitHub issues to track bugs and errors. If you run into an issue with the project:
 72 | 
 73 | - Open an [Issue](https://github.com/nbuchwitz/check_pve/issues/new). (Since we can't be sure at this point whether it is a bug or not, we ask you not to talk about a bug yet and not to label the issue.)
 74 | - Explain the behavior you would expect and the actual behavior.
 75 | - Please provide as much context as possible and describe the *reproduction steps* that someone else can follow to recreate the issue on their own. This usually includes your code. For good bug reports you should isolate the problem and create a reduced test case.
 76 | - Provide the information you collected in the previous section.
 77 | 
 78 | Once it's filed, we will label the issue accordingly and try to reproduce the issue with your provided steps. If there are no reproduction steps or no obvious way to reproduce the issue, we will ask you for additional details.
 79 | 
 80 | ### Suggesting Enhancements
 81 | 
 82 | This section guides you through submitting an enhancement suggestion for check_pve, **including completely new features and minor improvements to existing functionality**. Following these guidelines will help maintainers and the community to understand your suggestion and find related suggestions.
 83 | 
 84 | <!-- omit in toc -->
 85 | #### Before Submitting an Enhancement
 86 | 
 87 | - Make sure that you are using the latest version.
 88 | - Read the [documentation](README.md) carefully and find out if the functionality is already covered, maybe by an individual configuration.
 89 | - Perform a [search](https://github.com/nbuchwitz/check_pve/issues) to see if the enhancement has already been suggested. If it has, add a comment to the existing issue instead of opening a new one.
 90 | - Find out whether your idea fits with the scope and aims of the project. It's up to you to make a strong case to convince the project's developers of the merits of this feature. Keep in mind that we want features that will be useful to the majority of our users and not just a small subset. If you're just targeting a minority of users, consider writing an add-on/plugin library.
 91 | 
 92 | <!-- omit in toc -->
 93 | #### How Do I Submit a Good Enhancement Suggestion?
 94 | 
 95 | Enhancement suggestions are tracked as [GitHub issues](https://github.com/nbuchwitz/check_pve/issues).
 96 | 
 97 | - Use a **clear and descriptive title** for the issue to identify the suggestion.
 98 | - Provide a **step-by-step description of the suggested enhancement** in as many details as possible.
 99 | - **Describe the current behavior** and **explain which behavior you expected to see instead** and why. At this point you can also tell which alternatives do not work for you.
100 | - **Explain why this enhancement would be useful** to most check_pve users. You may also want to point out the other projects that solved it better and which could serve as inspiration.
101 | 
102 | ### Your First Code Contribution
103 | 1. Fork the repository and create a feature branch (eg. `git checkout -b my-feature`)
104 | 2. Make the changes in the code base
105 | 3. Update README.md if needed
106 | 4. Commit the changes. Keep in mind to break functionality into logical chunks, each represented by one commit. Also don't forget about the [format of the commit message](#commit-messages) and the [Developer Certificate of Origin (DCO)](https://wiki.linuxfoundation.org/dco)
107 | 5. Push your feature branch to your fork and open a pull request
108 | 
109 | ## Styleguides
110 | ### Commit Messages
111 | 
112 | The project commit messages are usually written according to the [conventional commit](https://www.conventionalcommits.org/en/v1.0.0/) format.
113 | 
114 | ### Code Style
115 | 
116 | The Python source code files are formatted with [black](https://github.com/psf/black).
117 | 
118 | <!-- omit in toc -->
119 | ## Attribution
120 | This guide is based on the **contributing-gen**. [Make your own](https://github.com/bttger/contributing-gen)!
121 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # check_pve
  2 | Icinga check command for Proxmox VE via API
  3 | 
  4 | ![Linter](https://github.com/nbuchwitz/check_pve/actions/workflows/lint.yml/badge.svg)
  5 | 
  6 | ## Setup
  7 | 
  8 | ### Requirements
  9 | 
 10 | This check command depends on **Python 3** and the following modules:
 11 |  * requests
 12 |  * argparse
 13 |  * packaging
 14 | 
 15 | **Installation on Debian / Ubuntu**
 16 | ```
 17 | apt install python3 python3-requests python3-packaging
 18 | ```
 19 | 
 20 | **Installation on Rocky / Alma Linux 9**
 21 | ```
 22 | yum install python3 python3-requests python3-packaging
 23 | ```
 24 | 
 25 | **Installation on FreeBSD**
 26 | ```
 27 | pkg install python3 py39-requests py39-packaging
 28 | ```
 29 | 
 30 | **Installation from requirements file**
 31 | ```
 32 | pip3 install -r requirements.txt
 33 | ```
 34 | 
 35 | **Installation as Docker container**
 36 | ```
 37 | docker build -t check_pve .
 38 | ```
 39 | After this, you can start the container like so:
 40 | ```
 41 | docker run -d --name check_pve --rm check_pve
 42 | ```
 43 | The container keeps running in the background, so the host itself does not need any of the requirements listed above (useful for environments where they cannot be installed).
 44 | Running a check is as simple as:
 45 | ```
 46 | docker exec check_pve python check_pve.py ....rest of the default arguments listed below....
 47 | ```
 48 | 
 49 | ### Create an API user in Proxmox VE
 50 | 
 51 | Create a role named ``Monitoring`` and assign necessary privileges:
 52 | 
 53 | ```
 54 | pveum roleadd Monitoring
 55 | pveum rolemod Monitoring --privs VM.Monitor,Sys.Audit,Sys.Modify,Datastore.Audit,VM.Audit
 56 | ```
 57 | 
 58 | Create a user named ``monitoring``:
 59 | 
 60 | ```
 61 | pveum useradd monitoring@pve --comment "The ICINGA 2 monitoring user"
 62 | ```
 63 | 
 64 | #### Use token based authorization (recommended)
 65 | 
 66 | Create an API token named `monitoring` for the user `monitoring` with backend `pve`:
 67 | 
 68 | ```
 69 | pveum user token add monitoring@pve monitoring
 70 | ```
 71 | 
 72 | Please save the token secret as there isn't any way to fetch it at a later point.
 73 | 
 74 | Assign role `Monitoring` to token `monitoring` and the user `monitoring@pve`:
 75 | 
 76 | ```
 77 | pveum acl modify / --roles Monitoring --user 'monitoring@pve'
 78 | pveum acl modify / --roles Monitoring --tokens 'monitoring@pve!monitoring'
 79 | ```
 80 | 
 81 | You can now use the check command like this: `./check_pve.py -u monitoring@pve -t monitoring=abcdef12-3456-7890-abcd-deadbeef1234 ...`
 82 | 
 83 | #### Use password based authorization
 84 | 
 85 | Set password for the user `monitoring`:
 86 | 
 87 | ```
 88 | pveum passwd monitoring@pve
 89 | ```
 90 | 
 91 | Assign the ``Monitoring`` role to the user ``monitoring``:
 92 | 
 93 | ```
 94 | pveum acl modify / --users monitoring@pve --roles Monitoring
 95 | ```
 96 | 
 97 | For further information about the Proxmox VE privilege system have a look into the [documentation](https://pve.proxmox.com/pve-docs/pve-admin-guide.html#_strong_pveum_strong_proxmox_ve_user_manager).
 98 | 
 99 | 
100 | ## Usage
101 | 
102 | The ``icinga2`` folder contains the command definition and service examples for use with Icinga2.
103 | 
104 | ```
105 | usage: check_pve.py [-h] [--version] [-e API_ENDPOINT] [--api-port API_PORT] [-u API_USER] [-p API_PASSWORD |
106 |                     -P API_PASSWORD_FILE | -t API_TOKEN | -T API_TOKEN_FILE] [-k]
107 |                     [-m {cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup,snapshot-age}]
108 |                     [-n NODE] [--name NAME] [--vmid VMID] [--expected-vm-status {running,stopped,paused}]
109 |                     [--ignore-vmid VMID] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME]
110 |                     [--ignore-pools NAME] [-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION]
111 |                     [--unit {GB,MB,KB,GiB,MiB,KiB,B}]
112 | 
113 | Check command for PVE hosts via API
114 | 
115 | options:
116 |   -h, --help            show this help message and exit
117 |   --version             Show version of check command
118 | 
119 | API Options:
120 |   -e, -H, --api-endpoint API_ENDPOINT
121 |                         PVE api endpoint hostname or ip address (no additional data like paths)
122 |   --api-port API_PORT   PVE api endpoint port
123 |   -u, --username API_USER
124 |                         PVE api user (e.g. icinga2@pve or icinga2@pam, depending on which backend you have chosen
125 |                         in proxmox)
126 |   -p, --password API_PASSWORD
127 |                         PVE API user password
128 |   -P, --password-file API_PASSWORD_FILE
129 |                         PVE API user password in a file
130 |   -t, --api-token API_TOKEN
131 |                         PVE API token (format: TOKEN_ID=TOKEN_SECRET)
132 |   -T, --api-token-file API_TOKEN_FILE
133 |                         PVE API token contained in a file (format: TOKEN_ID=TOKEN_SECRET)
134 |   -k, --insecure        Don't verify HTTPS certificate
135 | 
136 | Check Options:
137 |   -m, --mode {cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup,snapshot-age}
138 |                         Mode to use.
139 |   -n, --node NODE       Node to check (necessary for all modes except cluster, version and backup)
140 |   --name NAME           Name of storage, vm, or container
141 |   --vmid VMID           ID of virtual machine or container
142 |   --expected-vm-status {running,stopped,paused}
143 |                         Expected VM status
144 |   --ignore-vmid VMID    Ignore VM with vmid in checks
145 |   --ignore-vm-status    Ignore VM status in checks
146 |   --ignore-service NAME
147 |                         Ignore service NAME in checks
148 |   --ignore-disk NAME    Ignore disk NAME in health check
149 |   --ignore-pools NAME   Ignore vms and containers in pool(s) NAME in checks
150 |   -w, --warning THRESHOLD_WARNING
151 |                         Warning threshold for check value. Mutiple thresholds with name:value,name:value
152 |   -c, --critical THRESHOLD_CRITICAL
153 |                         Critical threshold for check value. Mutiple thresholds with name:value,name:value
154 |   -M                    Values are shown in the unit which is set with --unit (if available). Thresholds are also
155 |                         treated in this unit
156 |   -V, --min-version MIN_VERSION
157 |                         The minimal pve version to check for. Any version lower than this will return CRITICAL.
158 |   --unit {GB,MB,KB,GiB,MiB,KiB,B}
159 |                         Unit which is used for performance data and other values
160 | ```
161 | 
162 | ## Check examples
163 | 
164 | 
165 | **Check cluster health**
166 | ```
167 | ./check_pve.py -u <API_USER> -t <API_TOKEN> -e <API_ENDPOINT> -m cluster
168 | OK - Cluster 'proxmox1' is healthy'
169 | ```
170 | 
171 | **Check PVE version**
172 | ```
173 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m version -V 5.0.0
174 | OK - Your pve instance version '5.2' (0fcd7879) is up to date
175 | ```
176 | 
177 | **Check CPU load**
178 | ```
179 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m cpu -n node1
180 | OK - CPU usage is 2.4%|usage=2.4%;;
181 | ```
182 | 
183 | **Check memory usage**
184 | ```
185 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m memory -n node1
186 | OK - Memory usage is 37.44%|usage=37.44%;; used=96544.72MB;;;257867.91
187 | ```
188 | 
189 | **Check disk-health**
190 | ```
191 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m disk-health -n node1
192 | OK - All disks are healthy|wearout_sdb=96%;; wearout_sdc=96%;; wearout_sdd=96%;; wearout_sde=96%;;
193 | ```
194 | 
195 | **Check storage usage**
196 | ```
197 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m storage -n node1 --name local
198 | OK - Storage usage is 54.23%|usage=54.23%;; used=128513.11MB;;;236980.36
199 | 
200 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m storage -n node1 --name vms-disx
201 | CRITICAL - Storage 'vms-disx' doesn't exist on node 'node1'
202 | ```
203 | 
204 | **Check subscription status**
205 | ```
206 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m subscription -n node1 -w 50 -c 10
207 | OK - Subscription of level 'Community' is valid until 2019-01-09
208 | ```
209 | 
210 | **Check VM status**
211 | 
212 | Without specifying a node name:
213 | ```
214 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm
215 | OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;; memory=8.33%;;
216 | ```
217 | 
218 | You can also pass a container name for the VM check:
219 | ```
220 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-lxc
221 | OK - LXC 'test-lxc' on node 'node1' is running|cpu=0.11%;; memory=13.99%;;
222 | ```
223 | 
224 | With memory thresholds:
225 | ```
226 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm -w 50 -c 80
227 | OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;; memory=40.33%;50.0;80.0
228 | ```
229 | 
230 | With a specified node name, the check plugin verifies on which node the VM runs.
231 | ```
232 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm -n node1 --name test-vm
233 | OK - VM 'test-vm' is running on node 'node1'|cpu=1.85%;; memory=8.33%;;
234 | 
235 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm -n node1 --name test-vm
236 | WARNING - VM 'test-vm' is running on node 'node2' instead of 'node1'|cpu=1.85%;; memory=8.33%;;
237 | ```
238 | 
239 | If you only want to gather metrics and don't care about the VM status, add the ``--ignore-vm-status`` flag:
240 | ```
241 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm --ignore-vm-status
242 | OK - VM 'test-vm' is not running
243 | ```
244 | 
245 | Specify the expected VM status:
246 | ```
247 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm --expected-vm-status stopped
248 | OK - VM 'test-vm' is not running
249 | 
250 | ```
251 | 
252 | For hostalive checks without gathering performance data use ``vm_status`` instead of ``vm``. The parameters are the same as with ``vm``.
253 | 
254 | **Check swap usage**
255 | ```
256 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m swap -n pve
257 | OK - Swap usage is 0.0 %|usage=0.0%;; used=0.0MB;;;8192.0
258 | ```
259 | 
260 | **Check storage replication status**
261 | ```
262 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m replication -n node1
263 | OK - No failed replication jobs on node1
264 | ```
265 | 
266 | **Check ceph cluster health**
267 | ```
268 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m ceph-health
269 | WARNING - Ceph Cluster is in warning state
270 | ```
271 | 
272 | **Check ZFS pool health**
273 | ```
274 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve
275 | OK - All ZFS pools are healthy
276 | ```
277 | 
278 | Check for specific pool:
279 | ```
280 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-health -n pve --name rpool
281 | OK - ZFS pool 'rpool' is healthy
282 | ```
283 | 
284 | **Check ZFS pool fragmentation**
285 | ```
286 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-fragmentation -n pve -w 40 -c 60
287 | CRITICAL - 2 of 2 ZFS pools are above fragmentation thresholds:
288 | 
289 | - rpool (71 %) is CRITICAL
290 | - diskpool (50 %) is WARNING
291 | |fragmentation_diskpool=50%;40.0;60.0 fragmentation_rpool=71%;40.0;60.0
292 | 
293 | ```
294 | 
295 | Check for specific pool:
296 | ```
297 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m zfs-fragmentation -n pve --name diskpool -w 40 -c 60
298 | WARNING - Fragmentation of ZFS pool 'diskpool' is above thresholds: 50 %|fragmentation=50%;40.0;60.0
299 | ```
300 | 
301 | **Check VZDump Backups**
302 | 
303 | Check task history on all nodes:
304 | 
305 | ```
306 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m backup
307 | CRITICAL - 8 backup tasks successful, 3 backup tasks failed
308 | ```
309 | 
310 | Check for specific node and time frame:
311 | 
312 | ```
313 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m backup -n pve -c 86400
314 | OK - 2 backup tasks successful, 0 backup tasks failed within the last 86400.0s
315 | ```
316 | 
317 | Exclude a VM from the backup check by its id:
318 | ```
319 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m backup --ignore-vmid 123
320 | ```
321 | 
322 | **Check snapshot age**
323 | Check age of snapshots on all nodes (thresholds are specified in seconds):
324 | ```
325 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m snapshot-age -w 43200 -c 86400
326 | ```
327 | You can filter by a specific node:
328 | ```
329 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m snapshot-age -n pve -w 43200 -c 86400
330 | ```
331 | Or by VM/Container:
332 | ```
333 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m snapshot-age --name test-vm -w 43200 -c 86400
334 | ```
335 | Or both:
336 | ```
337 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m snapshot-age -n pve --name test-vm -w 43200 -c 86400
338 | ```
339 | You can also filter by VM/Container id:
340 | ```
341 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m snapshot-age -n pve --vmid 123 -w 43200 -c 86400
342 | ```
343 | 
344 | ## FAQ
345 | 
346 | ### Individual thresholds per metric
347 | 
348 | You can either specify a single warning or critical threshold, which is then applied to all metrics, or define individual thresholds per metric like this (`name:value,name:value,...`):
349 | 
350 | ```
351 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm -w memory:50 -c cpu:50,memory:80
352 | OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;;50.0 memory=40.33%;50.0;80.0
353 | ```
354 | 
355 | ### Could not connect to PVE API: Failed to resolve hostname
356 | 
357 | Verify that your DNS server is working and can resolve your hostname. If everything is fine, check for proxy server environment variables (`HTTP_PROXY`, `HTTPS_PROXY`), which may prevent communication with port 8006.
358 | 
359 | ## Contributors
360 | 
361 | Thank you to everyone who is contributing to `check_pve`: https://github.com/nbuchwitz/check_pve/graphs/contributors.
362 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/grafana/pve-metrics-dashboard.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "annotations": {
  3 |     "list": [
  4 |       {
  5 |         "builtIn": 1,
  6 |         "datasource": "-- Grafana --",
  7 |         "enable": true,
  8 |         "hide": true,
  9 |         "iconColor": "rgba(0, 211, 255, 1)",
 10 |         "name": "Annotations & Alerts",
 11 |         "type": "dashboard"
 12 |       }
 13 |     ]
 14 |   },
 15 |   "editable": true,
 16 |   "gnetId": null,
 17 |   "graphTooltip": 0,
 18 |   "hideControls": false,
 19 |   "id": 11,
 20 |   "links": [],
 21 |   "refresh": "30s",
 22 |   "rows": [
 23 |     {
 24 |       "collapse": false,
 25 |       "height": "250px",
 26 |       "panels": [
 27 |         {
 28 |           "aliasColors": {},
 29 |           "bars": false,
 30 |           "dashLength": 10,
 31 |           "dashes": false,
 32 |           "datasource": "icinga2",
 33 |           "fill": 1,
 34 |           "id": 1,
 35 |           "legend": {
 36 |             "alignAsTable": true,
 37 |             "avg": true,
 38 |             "current": true,
 39 |             "hideEmpty": false,
 40 |             "hideZero": false,
 41 |             "max": true,
 42 |             "min": true,
 43 |             "rightSide": false,
 44 |             "show": true,
 45 |             "total": false,
 46 |             "values": true
 47 |           },
 48 |           "lines": true,
 49 |           "linewidth": 1,
 50 |           "links": [],
 51 |           "nullPointMode": "null",
 52 |           "percentage": false,
 53 |           "pointradius": 5,
 54 |           "points": false,
 55 |           "renderer": "flot",
 56 |           "seriesOverrides": [
 57 |             {
 58 |               "alias": "CRITICAL",
 59 |               "color": "#BF1B00",
 60 |               "fill": 0,
 61 |               "legend": false
 62 |             },
 63 |             {
 64 |               "alias": "WARNING",
 65 |               "color": "#EAB839",
 66 |               "fill": 0,
 67 |               "legend": false
 68 |             },
 69 |             {
 70 |               "alias": "memory used",
 71 |               "color": "#0A437C",
 72 |               "yaxis": 2
 73 |             },
 74 |             {
 75 |               "alias": "memory used",
 76 |               "fill": 0
 77 |             }
 78 |           ],
 79 |           "spaceLength": 10,
 80 |           "span": 4,
 81 |           "stack": false,
 82 |           "steppedLine": false,
 83 |           "targets": [
 84 |             {
 85 |               "alias": "$service usage",
 86 |               "dsType": "influxdb",
 87 |               "groupBy": [
 88 |                 {
 89 |                   "params": [
 90 |                     "$__interval"
 91 |                   ],
 92 |                   "type": "time"
 93 |                 },
 94 |                 {
 95 |                   "params": [
 96 |                     "metric"
 97 |                   ],
 98 |                   "type": "tag"
 99 |                 },
100 |                 {
101 |                   "params": [
102 |                     "none"
103 |                   ],
104 |                   "type": "fill"
105 |                 }
106 |               ],
107 |               "hide": false,
108 |               "measurement": "pve",
109 |               "orderByTime": "ASC",
110 |               "policy": "default",
111 |               "refId": "A",
112 |               "resultFormat": "time_series",
113 |               "select": [
114 |                 [
115 |                   {
116 |                     "params": [
117 |                       "value"
118 |                     ],
119 |                     "type": "field"
120 |                   },
121 |                   {
122 |                     "params": [],
123 |                     "type": "mean"
124 |                   }
125 |                 ]
126 |               ],
127 |               "tags": [
128 |                 {
129 |                   "key": "hostname",
130 |                   "operator": "=~",
131 |                   "value": "/^$hostname$/"
132 |                 },
133 |                 {
134 |                   "condition": "AND",
135 |                   "key": "service",
136 |                   "operator": "=~",
137 |                   "value": "/^$service$/"
138 |                 },
139 |                 {
140 |                   "condition": "AND",
141 |                   "key": "metric",
142 |                   "operator": "=",
143 |                   "value": "usage"
144 |                 }
145 |               ]
146 |             },
147 |             {
148 |               "alias": "WARNING",
149 |               "dsType": "influxdb",
150 |               "groupBy": [
151 |                 {
152 |                   "params": [
153 |                     "$__interval"
154 |                   ],
155 |                   "type": "time"
156 |                 },
157 |                 {
158 |                   "params": [
159 |                     "metric"
160 |                   ],
161 |                   "type": "tag"
162 |                 },
163 |                 {
164 |                   "params": [
165 |                     "none"
166 |                   ],
167 |                   "type": "fill"
168 |                 }
169 |               ],
170 |               "hide": false,
171 |               "measurement": "pve",
172 |               "orderByTime": "ASC",
173 |               "policy": "default",
174 |               "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
175 |               "rawQuery": false,
176 |               "refId": "C",
177 |               "resultFormat": "time_series",
178 |               "select": [
179 |                 [
180 |                   {
181 |                     "params": [
182 |                       "warn"
183 |                     ],
184 |                     "type": "field"
185 |                   },
186 |                   {
187 |                     "params": [],
188 |                     "type": "mean"
189 |                   }
190 |                 ]
191 |               ],
192 |               "tags": [
193 |                 {
194 |                   "key": "hostname",
195 |                   "operator": "=~",
196 |                   "value": "/^$hostname$/"
197 |                 },
198 |                 {
199 |                   "condition": "AND",
200 |                   "key": "service",
201 |                   "operator": "=~",
202 |                   "value": "/^$service$/"
203 |                 },
204 |                 {
205 |                   "condition": "AND",
206 |                   "key": "metric",
207 |                   "operator": "=",
208 |                   "value": "usage"
209 |                 }
210 |               ]
211 |             },
212 |             {
213 |               "alias": "CRITICAL",
214 |               "dsType": "influxdb",
215 |               "groupBy": [
216 |                 {
217 |                   "params": [
218 |                     "$__interval"
219 |                   ],
220 |                   "type": "time"
221 |                 },
222 |                 {
223 |                   "params": [
224 |                     "metric"
225 |                   ],
226 |                   "type": "tag"
227 |                 },
228 |                 {
229 |                   "params": [
230 |                     "none"
231 |                   ],
232 |                   "type": "fill"
233 |                 }
234 |               ],
235 |               "hide": false,
236 |               "measurement": "pve",
237 |               "orderByTime": "ASC",
238 |               "policy": "default",
239 |               "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
240 |               "rawQuery": false,
241 |               "refId": "B",
242 |               "resultFormat": "time_series",
243 |               "select": [
244 |                 [
245 |                   {
246 |                     "params": [
247 |                       "crit"
248 |                     ],
249 |                     "type": "field"
250 |                   },
251 |                   {
252 |                     "params": [],
253 |                     "type": "mean"
254 |                   }
255 |                 ]
256 |               ],
257 |               "tags": [
258 |                 {
259 |                   "key": "hostname",
260 |                   "operator": "=~",
261 |                   "value": "/^$hostname$/"
262 |                 },
263 |                 {
264 |                   "condition": "AND",
265 |                   "key": "service",
266 |                   "operator": "=~",
267 |                   "value": "/^$service$/"
268 |                 },
269 |                 {
270 |                   "condition": "AND",
271 |                   "key": "metric",
272 |                   "operator": "=",
273 |                   "value": "usage"
274 |                 }
275 |               ]
276 |             }
277 |           ],
278 |           "thresholds": [],
279 |           "timeFrom": null,
280 |           "timeShift": null,
281 |           "title": "$service usage",
282 |           "tooltip": {
283 |             "shared": true,
284 |             "sort": 0,
285 |             "value_type": "individual"
286 |           },
287 |           "type": "graph",
288 |           "xaxis": {
289 |             "buckets": null,
290 |             "mode": "time",
291 |             "name": null,
292 |             "show": true,
293 |             "values": []
294 |           },
295 |           "yaxes": [
296 |             {
297 |               "format": "percent",
298 |               "label": "% usage",
299 |               "logBase": 1,
300 |               "max": null,
301 |               "min": "0",
302 |               "show": true
303 |             },
304 |             {
305 |               "format": "bytes",
306 |               "label": "used MB",
307 |               "logBase": 1,
308 |               "max": null,
309 |               "min": "0",
310 |               "show": false
311 |             }
312 |           ]
313 |         },
314 |         {
315 |           "aliasColors": {},
316 |           "bars": false,
317 |           "dashLength": 10,
318 |           "dashes": false,
319 |           "datasource": "icinga2",
320 |           "fill": 1,
321 |           "id": 2,
322 |           "legend": {
323 |             "alignAsTable": true,
324 |             "avg": true,
325 |             "current": true,
326 |             "hideEmpty": false,
327 |             "hideZero": false,
328 |             "max": true,
329 |             "min": true,
330 |             "rightSide": false,
331 |             "show": true,
332 |             "total": false,
333 |             "values": true
334 |           },
335 |           "lines": true,
336 |           "linewidth": 1,
337 |           "links": [],
338 |           "nullPointMode": "null",
339 |           "percentage": false,
340 |           "pointradius": 5,
341 |           "points": false,
342 |           "renderer": "flot",
343 |           "seriesOverrides": [
344 |             {
345 |               "alias": "CRITICAL",
346 |               "color": "#BF1B00",
347 |               "fill": 0,
348 |               "legend": false
349 |             },
350 |             {
351 |               "alias": "WARNING",
352 |               "color": "#EAB839",
353 |               "fill": 0,
354 |               "legend": false
355 |             }
356 |           ],
357 |           "spaceLength": 10,
358 |           "span": 4,
359 |           "stack": false,
360 |           "steppedLine": false,
361 |           "targets": [
362 |             {
363 |               "alias": "$service used",
364 |               "dsType": "influxdb",
365 |               "groupBy": [
366 |                 {
367 |                   "params": [
368 |                     "$__interval"
369 |                   ],
370 |                   "type": "time"
371 |                 },
372 |                 {
373 |                   "params": [
374 |                     "metric"
375 |                   ],
376 |                   "type": "tag"
377 |                 },
378 |                 {
379 |                   "params": [
380 |                     "none"
381 |                   ],
382 |                   "type": "fill"
383 |                 }
384 |               ],
385 |               "hide": false,
386 |               "measurement": "pve",
387 |               "orderByTime": "ASC",
388 |               "policy": "default",
389 |               "refId": "A",
390 |               "resultFormat": "time_series",
391 |               "select": [
392 |                 [
393 |                   {
394 |                     "params": [
395 |                       "value"
396 |                     ],
397 |                     "type": "field"
398 |                   },
399 |                   {
400 |                     "params": [],
401 |                     "type": "mean"
402 |                   }
403 |                 ]
404 |               ],
405 |               "tags": [
406 |                 {
407 |                   "key": "hostname",
408 |                   "operator": "=~",
409 |                   "value": "/^$hostname$/"
410 |                 },
411 |                 {
412 |                   "condition": "AND",
413 |                   "key": "service",
414 |                   "operator": "=~",
415 |                   "value": "/^$service$/"
416 |                 },
417 |                 {
418 |                   "condition": "AND",
419 |                   "key": "metric",
420 |                   "operator": "=",
421 |                   "value": "used"
422 |                 }
423 |               ]
424 |             },
425 |             {
426 |               "alias": "WARNING",
427 |               "dsType": "influxdb",
428 |               "groupBy": [
429 |                 {
430 |                   "params": [
431 |                     "$__interval"
432 |                   ],
433 |                   "type": "time"
434 |                 },
435 |                 {
436 |                   "params": [
437 |                     "metric"
438 |                   ],
439 |                   "type": "tag"
440 |                 },
441 |                 {
442 |                   "params": [
443 |                     "none"
444 |                   ],
445 |                   "type": "fill"
446 |                 }
447 |               ],
448 |               "hide": false,
449 |               "measurement": "pve",
450 |               "orderByTime": "ASC",
451 |               "policy": "default",
452 |               "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
453 |               "rawQuery": false,
454 |               "refId": "C",
455 |               "resultFormat": "time_series",
456 |               "select": [
457 |                 [
458 |                   {
459 |                     "params": [
460 |                       "warn"
461 |                     ],
462 |                     "type": "field"
463 |                   },
464 |                   {
465 |                     "params": [],
466 |                     "type": "mean"
467 |                   }
468 |                 ]
469 |               ],
470 |               "tags": [
471 |                 {
472 |                   "key": "hostname",
473 |                   "operator": "=~",
474 |                   "value": "/^$hostname$/"
475 |                 },
476 |                 {
477 |                   "condition": "AND",
478 |                   "key": "service",
479 |                   "operator": "=~",
480 |                   "value": "/^$service$/"
481 |                 },
482 |                 {
483 |                   "condition": "AND",
484 |                   "key": "metric",
485 |                   "operator": "=",
486 |                   "value": "used"
487 |                 }
488 |               ]
489 |             },
490 |             {
491 |               "alias": "CRITICAL",
492 |               "dsType": "influxdb",
493 |               "groupBy": [
494 |                 {
495 |                   "params": [
496 |                     "$__interval"
497 |                   ],
498 |                   "type": "time"
499 |                 },
500 |                 {
501 |                   "params": [
502 |                     "metric"
503 |                   ],
504 |                   "type": "tag"
505 |                 },
506 |                 {
507 |                   "params": [
508 |                     "none"
509 |                   ],
510 |                   "type": "fill"
511 |                 }
512 |               ],
513 |               "hide": false,
514 |               "measurement": "pve",
515 |               "orderByTime": "ASC",
516 |               "policy": "default",
517 |               "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
518 |               "rawQuery": false,
519 |               "refId": "B",
520 |               "resultFormat": "time_series",
521 |               "select": [
522 |                 [
523 |                   {
524 |                     "params": [
525 |                       "crit"
526 |                     ],
527 |                     "type": "field"
528 |                   },
529 |                   {
530 |                     "params": [],
531 |                     "type": "mean"
532 |                   }
533 |                 ]
534 |               ],
535 |               "tags": [
536 |                 {
537 |                   "key": "hostname",
538 |                   "operator": "=~",
539 |                   "value": "/^$hostname$/"
540 |                 },
541 |                 {
542 |                   "condition": "AND",
543 |                   "key": "service",
544 |                   "operator": "=~",
545 |                   "value": "/^$service$/"
546 |                 },
547 |                 {
548 |                   "condition": "AND",
549 |                   "key": "metric",
550 |                   "operator": "=",
551 |                   "value": "used"
552 |                 }
553 |               ]
554 |             }
555 |           ],
556 |           "thresholds": [],
557 |           "timeFrom": null,
558 |           "timeShift": null,
559 |           "title": "$service used",
560 |           "tooltip": {
561 |             "shared": true,
562 |             "sort": 0,
563 |             "value_type": "individual"
564 |           },
565 |           "type": "graph",
566 |           "xaxis": {
567 |             "buckets": null,
568 |             "mode": "time",
569 |             "name": null,
570 |             "show": true,
571 |             "values": []
572 |           },
573 |           "yaxes": [
574 |             {
575 |               "format": "bytes",
576 |               "label": "used",
577 |               "logBase": 1,
578 |               "max": null,
579 |               "min": "0",
580 |               "show": true
581 |             },
582 |             {
583 |               "format": "bytes",
584 |               "label": "used MB",
585 |               "logBase": 1,
586 |               "max": null,
587 |               "min": "0",
588 |               "show": false
589 |             }
590 |           ]
591 |         },
592 |         {
593 |           "aliasColors": {},
594 |           "bars": false,
595 |           "dashLength": 10,
596 |           "dashes": false,
597 |           "datasource": "icinga2",
598 |           "fill": 1,
599 |           "id": 3,
600 |           "legend": {
601 |             "alignAsTable": true,
602 |             "avg": true,
603 |             "current": true,
604 |             "hideEmpty": false,
605 |             "hideZero": false,
606 |             "max": true,
607 |             "min": true,
608 |             "rightSide": false,
609 |             "show": true,
610 |             "total": false,
611 |             "values": true
612 |           },
613 |           "lines": true,
614 |           "linewidth": 1,
615 |           "links": [],
616 |           "nullPointMode": "null",
617 |           "percentage": false,
618 |           "pointradius": 5,
619 |           "points": false,
620 |           "renderer": "flot",
621 |           "seriesOverrides": [
622 |             {
623 |               "alias": "CRITICAL",
624 |               "color": "#BF1B00",
625 |               "fill": 0,
626 |               "legend": false
627 |             },
628 |             {
629 |               "alias": "WARNING",
630 |               "color": "#EAB839",
631 |               "fill": 0,
632 |               "legend": false
633 |             },
634 |             {
635 |               "alias": "memory used",
636 |               "color": "#0A437C",
637 |               "yaxis": 2
638 |             },
639 |             {
640 |               "alias": "memory used",
641 |               "fill": 0
642 |             }
643 |           ],
644 |           "spaceLength": 10,
645 |           "span": 4,
646 |           "stack": false,
647 |           "steppedLine": false,
648 |           "targets": [
649 |             {
650 |               "alias": "I/O wait",
651 |               "dsType": "influxdb",
652 |               "groupBy": [
653 |                 {
654 |                   "params": [
655 |                     "$__interval"
656 |                   ],
657 |                   "type": "time"
658 |                 },
659 |                 {
660 |                   "params": [
661 |                     "metric"
662 |                   ],
663 |                   "type": "tag"
664 |                 },
665 |                 {
666 |                   "params": [
667 |                     "none"
668 |                   ],
669 |                   "type": "fill"
670 |                 }
671 |               ],
672 |               "hide": false,
673 |               "measurement": "pve",
674 |               "orderByTime": "ASC",
675 |               "policy": "default",
676 |               "refId": "A",
677 |               "resultFormat": "time_series",
678 |               "select": [
679 |                 [
680 |                   {
681 |                     "params": [
682 |                       "value"
683 |                     ],
684 |                     "type": "field"
685 |                   },
686 |                   {
687 |                     "params": [],
688 |                     "type": "mean"
689 |                   }
690 |                 ]
691 |               ],
692 |               "tags": [
693 |                 {
694 |                   "key": "hostname",
695 |                   "operator": "=~",
696 |                   "value": "/^$hostname$/"
697 |                 },
698 |                 {
699 |                   "condition": "AND",
700 |                   "key": "service",
701 |                   "operator": "=~",
702 |                   "value": "/^$service$/"
703 |                 },
704 |                 {
705 |                   "condition": "AND",
706 |                   "key": "metric",
707 |                   "operator": "=",
708 |                   "value": "wait"
709 |                 }
710 |               ]
711 |             },
712 |             {
713 |               "alias": "WARNING",
714 |               "dsType": "influxdb",
715 |               "groupBy": [
716 |                 {
717 |                   "params": [
718 |                     "$__interval"
719 |                   ],
720 |                   "type": "time"
721 |                 },
722 |                 {
723 |                   "params": [
724 |                     "metric"
725 |                   ],
726 |                   "type": "tag"
727 |                 },
728 |                 {
729 |                   "params": [
730 |                     "none"
731 |                   ],
732 |                   "type": "fill"
733 |                 }
734 |               ],
735 |               "hide": false,
736 |               "measurement": "pve",
737 |               "orderByTime": "ASC",
738 |               "policy": "default",
739 |               "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
740 |               "rawQuery": false,
741 |               "refId": "C",
742 |               "resultFormat": "time_series",
743 |               "select": [
744 |                 [
745 |                   {
746 |                     "params": [
747 |                       "warn"
748 |                     ],
749 |                     "type": "field"
750 |                   },
751 |                   {
752 |                     "params": [],
753 |                     "type": "mean"
754 |                   }
755 |                 ]
756 |               ],
757 |               "tags": [
758 |                 {
759 |                   "key": "hostname",
760 |                   "operator": "=~",
761 |                   "value": "/^$hostname$/"
762 |                 },
763 |                 {
764 |                   "condition": "AND",
765 |                   "key": "service",
766 |                   "operator": "=~",
767 |                   "value": "/^$service$/"
768 |                 },
769 |                 {
770 |                   "condition": "AND",
771 |                   "key": "metric",
772 |                   "operator": "=",
773 |                   "value": "wait"
774 |                 }
775 |               ]
776 |             },
777 |             {
778 |               "alias": "CRITICAL",
779 |               "dsType": "influxdb",
780 |               "groupBy": [
781 |                 {
782 |                   "params": [
783 |                     "$__interval"
784 |                   ],
785 |                   "type": "time"
786 |                 },
787 |                 {
788 |                   "params": [
789 |                     "metric"
790 |                   ],
791 |                   "type": "tag"
792 |                 },
793 |                 {
794 |                   "params": [
795 |                     "none"
796 |                   ],
797 |                   "type": "fill"
798 |                 }
799 |               ],
800 |               "hide": false,
801 |               "measurement": "pve",
802 |               "orderByTime": "ASC",
803 |               "policy": "default",
804 |               "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)",
805 |               "rawQuery": false,
806 |               "refId": "B",
807 |               "resultFormat": "time_series",
808 |               "select": [
809 |                 [
810 |                   {
811 |                     "params": [
812 |                       "crit"
813 |                     ],
814 |                     "type": "field"
815 |                   },
816 |                   {
817 |                     "params": [],
818 |                     "type": "mean"
819 |                   }
820 |                 ]
821 |               ],
822 |               "tags": [
823 |                 {
824 |                   "key": "hostname",
825 |                   "operator": "=~",
826 |                   "value": "/^$hostname$/"
827 |                 },
828 |                 {
829 |                   "condition": "AND",
830 |                   "key": "service",
831 |                   "operator": "=~",
832 |                   "value": "/^$service$/"
833 |                 },
834 |                 {
835 |                   "condition": "AND",
836 |                   "key": "metric",
837 |                   "operator": "=",
838 |                   "value": "wait"
839 |                 }
840 |               ]
841 |             }
842 |           ],
843 |           "thresholds": [],
844 |           "timeFrom": null,
845 |           "timeShift": null,
846 |           "title": "I/O wait",
847 |           "tooltip": {
848 |             "shared": true,
849 |             "sort": 0,
850 |             "value_type": "individual"
851 |           },
852 |           "type": "graph",
853 |           "xaxis": {
854 |             "buckets": null,
855 |             "mode": "time",
856 |             "name": null,
857 |             "show": true,
858 |             "values": []
859 |           },
860 |           "yaxes": [
861 |             {
862 |               "format": "percent",
863 |               "label": "% I/O wait",
864 |               "logBase": 1,
865 |               "max": null,
866 |               "min": "0",
867 |               "show": true
868 |             },
869 |             {
870 |               "format": "bytes",
871 |               "label": "used MB",
872 |               "logBase": 1,
873 |               "max": null,
874 |               "min": "0",
875 |               "show": false
876 |             }
877 |           ]
878 |         }
879 |       ],
880 |       "repeat": null,
881 |       "repeatIteration": null,
882 |       "repeatRowId": null,
883 |       "showTitle": false,
884 |       "title": "pve checks",
885 |       "titleSize": "h6"
886 |     }
887 |   ],
888 |   "schemaVersion": 14,
889 |   "style": "dark",
890 |   "tags": [],
891 |   "templating": {
892 |     "list": [
893 |       {
894 |         "allValue": null,
895 |         "current": {
896 |           "text": "pve01.willi-graf.local",
897 |           "value": "pve01.willi-graf.local"
898 |         },
899 |         "datasource": "icinga2",
900 |         "hide": 0,
901 |         "includeAll": false,
902 |         "label": null,
903 |         "multi": false,
904 |         "name": "hostname",
905 |         "options": [],
906 |         "query": "SHOW TAG VALUES  WITH KEY = \"hostname\"",
907 |         "refresh": 1,
908 |         "regex": "",
909 |         "sort": 1,
910 |         "tagValuesQuery": "",
911 |         "tags": [],
912 |         "tagsQuery": "",
913 |         "type": "query",
914 |         "useTags": false
915 |       },
916 |       {
917 |         "allValue": null,
918 |         "current": {
919 |           "text": "io_wait",
920 |           "value": "io_wait"
921 |         },
922 |         "datasource": "icinga2",
923 |         "hide": 0,
924 |         "includeAll": false,
925 |         "label": null,
926 |         "multi": false,
927 |         "name": "service",
928 |         "options": [],
929 |         "query": "SHOW TAG VALUES  WITH KEY = \"service\" where hostname =~ /^$hostname$/",
930 |         "refresh": 1,
931 |         "regex": "",
932 |         "sort": 1,
933 |         "tagValuesQuery": "",
934 |         "tags": [],
935 |         "tagsQuery": "",
936 |         "type": "query",
937 |         "useTags": false
938 |       }
939 |     ]
940 |   },
941 |   "time": {
942 |     "from": "now-2m",
943 |     "to": "now"
944 |   },
945 |   "timepicker": {
946 |     "refresh_intervals": [
947 |       "5s",
948 |       "10s",
949 |       "30s",
950 |       "1m",
951 |       "5m",
952 |       "15m",
953 |       "30m",
954 |       "1h",
955 |       "2h",
956 |       "1d"
957 |     ],
958 |     "time_options": [
959 |       "5m",
960 |       "15m",
961 |       "1h",
962 |       "6h",
963 |       "12h",
964 |       "24h",
965 |       "2d",
966 |       "7d",
967 |       "30d"
968 |     ]
969 |   },
970 |   "timezone": "browser",
971 |   "title": "icinga-pve-metrics",
972 |   "version": 23
973 | }
974 | 


--------------------------------------------------------------------------------
/check_pve.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env python3
   2 | # -*- coding: utf-8 -*-
   3 | 
   4 | # ------------------------------------------------------------------------------
   5 | # check_pve.py - A check plugin for Proxmox Virtual Environment (PVE).
   6 | # Copyright (C) 2018-2025  Nicolai Buchwitz <nb@tipi-net.de>
   7 | #
   8 | # Version: 1.5.0
   9 | #
  10 | # ------------------------------------------------------------------------------
  11 | # This program is free software; you can redistribute it and/or
  12 | # modify it under the terms of the GNU General Public License
  13 | # as published by the Free Software Foundation; either version 2
  14 | # of the License, or (at your option) any later version.
  15 | #
  16 | # This program is distributed in the hope that it will be useful,
  17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  19 | # GNU General Public License for more details.
  20 | #
  21 | # You should have received a copy of the GNU General Public License
  22 | # along with this program; if not, write to the Free Software
  23 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  24 | # ------------------------------------------------------------------------------
  25 | 
  26 | """Proxmox VE monitoring check command for various monitoring systems like Icinga and others."""
  27 | 
  28 | import re
  29 | import sys
  30 | from typing import Callable, Dict, Optional, Union, List
  31 | 
  32 | try:
  33 |     import argparse
  34 |     from datetime import datetime, timezone
  35 |     from enum import Enum
  36 | 
  37 |     import requests
  38 |     from packaging import version
  39 |     from requests.packages.urllib3.exceptions import InsecureRequestWarning
  40 | 
  41 | except ImportError as e:
  42 |     print(f"Missing python module: {str(e)}")
  43 |     sys.exit(255)
  44 | 
  45 | # Timeout for API requests in seconds
  46 | CHECK_API_TIMEOUT = 30
  47 | 
  48 | 
  49 | def compare_thresholds(
  50 |     threshold_warning: Dict, threshold_critical: Dict, comparator: Callable
  51 | ) -> bool:
  52 |     """Perform sanity checks on thresholds parameters (used for argparse validation)."""
  53 |     ok = True
  54 |     keys = set(list(threshold_warning.keys()) + list(threshold_critical.keys()))
  55 |     for key in keys:
  56 |         if (key in threshold_warning and key in threshold_critical) or (
  57 |             None in threshold_warning and None in threshold_critical
  58 |         ):
  59 |             ok = ok and comparator(threshold_warning[key], threshold_critical[key])
  60 |         elif key in threshold_warning and None in threshold_critical:
  61 |             ok = ok and comparator(threshold_warning[key], threshold_critical[None])
  62 |         elif key in threshold_critical and None in threshold_warning:
  63 |             ok = ok and comparator(threshold_warning[None], threshold_critical[key])
  64 | 
  65 |     return ok
  66 | 
  67 | 
class CheckState(Enum):
    """Check return values."""

    # CheckPVE.output() prints the member name as prefix of the status line and
    # passes the member value to sys.exit() as the process return code.
    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3
  75 | 
  76 | 
  77 | class CheckThreshold:
  78 |     """Threshold representation used by the check command."""
  79 | 
  80 |     def __init__(self, value: float) -> None:
  81 |         self.value = value
  82 | 
  83 |     def __eq__(self, other: "CheckThreshold") -> bool:
  84 |         """Threshold is equal to given one."""
  85 |         return self.value == other.value
  86 | 
  87 |     def __lt__(self, other: "CheckThreshold") -> bool:
  88 |         """Threshold is lower to given one."""
  89 |         return self.value < other.value
  90 | 
  91 |     def __le__(self, other: "CheckThreshold") -> bool:
  92 |         """Threshold is lower or equal to given one."""
  93 |         return self.value <= other.value
  94 | 
  95 |     def __gt__(self, other: "CheckThreshold") -> bool:
  96 |         """Threshold is greater than given one."""
  97 |         return self.value > other.value
  98 | 
  99 |     def __ge__(self, other: "CheckThreshold") -> bool:
 100 |         """Threshold is greater or equal than given one."""
 101 |         return self.value >= other.value
 102 | 
 103 |     def check(self, value: float, lower: bool = False) -> bool:
 104 |         """Check threshold value as upper or lower boundary for given value."""
 105 |         if lower:
 106 |             return value < self.value
 107 | 
 108 |         return value > self.value
 109 | 
 110 |     @staticmethod
 111 |     def threshold_type(arg: str) -> Dict[str, "CheckThreshold"]:
 112 |         """Convert string argument(s) to threshold dict."""
 113 |         thresholds = {}
 114 | 
 115 |         try:
 116 |             thresholds[None] = CheckThreshold(float(arg))
 117 |         except ValueError:
 118 |             for t in arg.split(","):
 119 |                 m = re.match("([a-z_0-9]+):([0-9.]+)", t)
 120 | 
 121 |                 if m:
 122 |                     thresholds[m.group(1)] = CheckThreshold(float(m.group(2)))
 123 |                 else:
 124 |                     raise argparse.ArgumentTypeError(f"Invalid threshold format: {t}")  # noqa: B904
 125 | 
 126 |         return thresholds
 127 | 
 128 | 
 129 | class RequestError(Exception):
 130 |     """Exception for request related errors."""
 131 | 
 132 |     def __init__(self, message: str, rc: int) -> None:
 133 |         self.message = message
 134 |         self.rc = rc
 135 | 
 136 |         super().__init__(self.message)
 137 | 
 138 | 
 139 | class CheckPVE:
 140 |     """Check command for Proxmox VE."""
 141 | 
 142 |     VERSION = "1.5.0"
 143 |     API_URL = "https://{hostname}:{port}/api2/json/{command}"
 144 |     UNIT_SCALE = {
 145 |         "GB": 10**9,
 146 |         "MB": 10**6,
 147 |         "KB": 10**3,
 148 |         "GiB": 2**30,
 149 |         "MiB": 2**20,
 150 |         "KiB": 2**10,
 151 |         "B": 1,
 152 |     }
 153 | 
 154 |     def check_output(self) -> None:
 155 |         """Print check command output with perfdata and return code."""
 156 |         message = self.check_message
 157 |         if self.perfdata:
 158 |             message += self.get_perfdata()
 159 | 
 160 |         self.output(self.check_result, message)
 161 | 
 162 |     @staticmethod
 163 |     def output(rc: CheckState, message: str) -> None:
 164 |         """Print message to stdout and exit with given return code."""
 165 |         prefix = rc.name
 166 |         print(f"{prefix} - {message}")
 167 |         sys.exit(rc.value)
 168 | 
 169 |     def get_url(self, command: str) -> str:
 170 |         """Get API url for specific command."""
 171 |         return self.API_URL.format(
 172 |             hostname=self.options.api_endpoint, command=command, port=self.options.api_port
 173 |         )
 174 | 
 175 |     def get_file_line(self, filename: str) -> str:
 176 |         """Read the first line of a file and return it without the newline."""
 177 |         return open(filename, "r").readline().strip()
 178 | 
 179 |     def request(self, url: str, method: str = "get", **kwargs: Dict) -> Union[Dict, None]:
 180 |         """Execute request against Proxmox VE API and return json data."""
 181 |         response = None
 182 |         try:
 183 |             if method == "post":
 184 |                 response = requests.post(
 185 |                     url,
 186 |                     verify=not self.options.api_insecure,
 187 |                     data=kwargs.get("data", None),
 188 |                     timeout=5,
 189 |                 )
 190 |             elif method == "get":
 191 |                 response = requests.get(
 192 |                     url,
 193 |                     verify=not self.options.api_insecure,
 194 |                     cookies=self.__cookies,
 195 |                     headers=self.__headers,
 196 |                     params=kwargs.get("params", None),
 197 |                     timeout=CHECK_API_TIMEOUT,
 198 |                 )
 199 |             else:
 200 |                 self.output(CheckState.CRITICAL, f"Unsupport request method: {method}")
 201 |         except requests.exceptions.ConnectTimeout:
 202 |             self.output(CheckState.UNKNOWN, "Could not connect to PVE API: Connection timeout")
 203 |         except requests.exceptions.SSLError:
 204 |             self.output(
 205 |                 CheckState.UNKNOWN, "Could not connect to PVE API: Certificate validation failed"
 206 |             )
 207 |         except requests.exceptions.ConnectionError:
 208 |             self.output(
 209 |                 CheckState.UNKNOWN, "Could not connect to PVE API: Failed to resolve hostname"
 210 |             )
 211 | 
 212 |         if response.ok:
 213 |             return response.json()["data"]
 214 | 
 215 |         message = "Could not fetch data from API: "
 216 |         if response.status_code == 401:
 217 |             message += "Could not connection to PVE API: invalid username or password"
 218 |         elif response.status_code == 403:
 219 |             message += (
 220 |                 "Access denied. Please check if API user has sufficient permissions / "
 221 |                 "the correct role has been assigned."
 222 |             )
 223 |         else:
 224 |             message += f"HTTP error code was {response.status_code}"
 225 | 
 226 |         if kwargs.get("raise_error", False):
 227 |             raise RequestError(message, response.status_code)
 228 | 
 229 |         self.output(CheckState.UNKNOWN, message)
 230 | 
 231 |     def get_ticket(self) -> str:
 232 |         """Perform login and fetch ticket for further API calls."""
 233 |         url = self.get_url("access/ticket")
 234 |         data = {"username": self.options.api_user, "password": self.options.api_password}
 235 |         result = self.request(url, "post", data=data)
 236 | 
 237 |         return result["ticket"]
 238 | 
 239 |     def check_api_value(self, url: StopIteration, message: str, **kwargs: Dict) -> None:
 240 |         """Perform simple threshold based check command."""
 241 |         result = self.request(url)
 242 |         used = None
 243 | 
 244 |         if "key" in kwargs:
 245 |             result = result[kwargs.get("key")]
 246 | 
 247 |         if isinstance(result, (dict,)):
 248 |             used_percent = self.get_value(result["used"], result["total"])
 249 |             used = self.get_value(result["used"])
 250 |             total = self.get_value(result["total"])
 251 | 
 252 |             self.add_perfdata(kwargs.get("perfkey", "usage"), used_percent)
 253 |             self.add_perfdata(
 254 |                 kwargs.get("perfkey", "used"), used, max=total, unit=self.options.unit
 255 |             )
 256 |         else:
 257 |             used_percent = round(float(result) * 100, 2)
 258 |             self.add_perfdata(kwargs.get("perfkey", "usage"), used_percent)
 259 | 
 260 |         if self.options.values_mb:
 261 |             message += f" {used} {self.options.unit}"
 262 |             value = used
 263 |         else:
 264 |             message += f" {used_percent} %"
 265 |             value = used_percent
 266 | 
 267 |         self.check_thresholds(value, message)
 268 | 
 269 |     def check_vm_status(self, idx: Union[str, int], **kwargs: str) -> None:
 270 |         """Check status of virtual machine by vmid or name."""
 271 |         url = self.get_url(
 272 |             "cluster/resources",
 273 |         )
 274 |         data = self.request(url, params={"type": "vm"})
 275 | 
 276 |         expected_state = kwargs.get("expected_state", "running")
 277 |         only_status = kwargs.get("only_status", False)
 278 | 
 279 |         found = False
 280 |         for vm in data:
 281 |             if idx in (vm.get("name", None), vm.get("vmid", None)):
 282 |                 # Check if VM (default) or LXC
 283 |                 vm_type = "VM"
 284 |                 if vm["type"] == "lxc":
 285 |                     vm_type = "LXC"
 286 | 
 287 |                 if vm["status"] != expected_state:
 288 |                     self.check_message = (
 289 |                         f"{vm_type} '{vm['name']}' is {vm['status']} (expected: {expected_state})"
 290 |                     )
 291 |                     if not self.options.ignore_vm_status:
 292 |                         self.check_result = CheckState.CRITICAL
 293 |                 else:
 294 |                     if self.options.node and self.options.node != vm["node"]:
 295 |                         self.check_message = (
 296 |                             f"{vm_type} '{vm['name']}' is {expected_state}, "
 297 |                             f"but located on node '{vm['node']}' instead of '{self.options.node}'"
 298 |                         )
 299 |                         self.check_result = CheckState.WARNING
 300 |                     else:
 301 |                         self.check_message = (
 302 |                             f"{vm_type} '{vm['name']}' is {expected_state} on node '{vm['node']}'"
 303 |                         )
 304 | 
 305 |                 if vm["status"] == "running" and not only_status:
 306 |                     cpu = round(vm["cpu"] * 100, 2)
 307 |                     self.add_perfdata("cpu", cpu)
 308 | 
 309 |                     if self.options.values_mb:
 310 |                         memory = self.scale_value(vm["mem"])
 311 |                         self.add_perfdata(
 312 |                             "memory",
 313 |                             memory,
 314 |                             unit=self.options.unit,
 315 |                             max=self.scale_value(vm["maxmem"]),
 316 |                         )
 317 |                         disk = self.scale_value(vm["disk"])
 318 |                         self.add_perfdata(
 319 |                             "disk",
 320 |                             disk,
 321 |                             unit=self.options.unit,
 322 |                             max=self.scale_value(vm["maxdisk"]),
 323 |                         )
 324 | 
 325 |                     else:
 326 |                         memory = self.get_value(vm["mem"], vm["maxmem"])
 327 |                         self.add_perfdata("memory", memory)
 328 |                         disk = self.get_value(vm["disk"], vm["maxdisk"])
 329 |                         self.add_perfdata("disk", disk)
 330 | 
 331 |                     self.check_thresholds(
 332 |                         {"cpu": cpu, "memory": memory, "disk": disk}, message=self.check_message
 333 |                     )
 334 | 
 335 |                 found = True
 336 |                 break
 337 | 
 338 |         if not found:
 339 |             self.check_message = f"VM or LXC '{idx}' not found"
 340 |             self.check_result = CheckState.WARNING
 341 | 
 342 |     def check_disks(self) -> None:
 343 |         """Check disk health on specific Proxmox VE node."""
 344 |         url = self.get_url(f"nodes/{self.options.node}/disks")
 345 | 
 346 |         failed = []
 347 |         unknown = []
 348 |         disks = self.request(url + "/list")
 349 |         for disk in disks:
 350 |             name = disk["devpath"].replace("/dev/", "")
 351 | 
 352 |             if name in self.options.ignore_disks:
 353 |                 continue
 354 | 
 355 |             if disk["health"] == "UNKNOWN":
 356 |                 self.check_result = CheckState.WARNING
 357 |                 unknown.append({"serial": disk["serial"], "device": disk["devpath"]})
 358 | 
 359 |             elif disk["health"] not in ("PASSED", "OK"):
 360 |                 self.check_result = CheckState.WARNING
 361 |                 failed.append({"serial": disk["serial"], "device": disk["devpath"]})
 362 | 
 363 |             if disk["wearout"] != "N/A":
 364 |                 self.add_perfdata(f"wearout_{name}", disk["wearout"])
 365 | 
 366 |         if failed:
 367 |             self.check_message = f"{len(failed)} of {len(disks)} disks failed the health test:\n"
 368 |             for disk in failed:
 369 |                 self.check_message += f"- {disk['device']} with serial '{disk['serial']}'\n"
 370 | 
 371 |         if unknown:
 372 |             self.check_message += (
 373 |                 f"{len(unknown)} of {len(disks)} disks have unknown health status:\n"
 374 |             )
 375 |             for disk in unknown:
 376 |                 self.check_message += f"- {disk['device']} with serial '{disk['serial']}'\n"
 377 | 
 378 |         if not failed and not unknown:
 379 |             self.check_message = "All disks are healthy"
 380 | 
 381 |     def check_replication(self) -> None:
 382 |         """Check replication status for either all or one specific vm / container."""
 383 |         url = self.get_url(f"nodes/{self.options.node}/replication")
 384 | 
 385 |         if self.options.vmid:
 386 |             data = self.request(url, params={"guest": self.options.vmid})
 387 |         else:
 388 |             data = self.request(url)
 389 | 
 390 |         failed_jobs = []  # format: [{guest: str, fail_count: int, error: str}]
 391 |         performance_data = []
 392 | 
 393 |         for job in data:
 394 |             if job["fail_count"] > 0:
 395 |                 failed_jobs.append(
 396 |                     {"guest": job["guest"], "fail_count": job["fail_count"], "error": job["error"]}
 397 |                 )
 398 |             else:
 399 |                 performance_data.append({"id": job["id"], "duration": job["duration"]})
 400 | 
 401 |         if len(failed_jobs) > 0:
 402 |             message = f"Failed replication jobs on {self.options.node}: "
 403 |             for job in failed_jobs:
 404 |                 message = (
 405 |                     message
 406 |                     + "GUEST: {j[guest]}, FAIL_COUNT: {j[fail_count]}, ERROR: {j[error]} ; ".format(
 407 |                         j=job
 408 |                     )
 409 |                 )
 410 |             self.check_message = message
 411 |             self.check_result = CheckState.WARNING
 412 |         else:
 413 |             self.check_message = f"No failed replication jobs on {self.options.node}"
 414 |             self.check_result = CheckState.OK
 415 | 
 416 |         if len(performance_data) > 0:
 417 |             for metric in performance_data:
 418 |                 self.add_perfdata("duration_" + metric["id"], metric["duration"], unit="s")
 419 | 
 420 |     def check_services(self) -> None:
 421 |         """Check state of core services on Proxmox VE node."""
 422 |         url = self.get_url(f"nodes/{self.options.node}/services")
 423 |         data = self.request(url)
 424 | 
 425 |         failed = {}
 426 |         for service in data:
 427 |             if (
 428 |                 service["state"] != "running"
 429 |                 and service.get("active-state", "active") == "active"
 430 |                 and service["name"] not in self.options.ignore_services
 431 |             ):
 432 |                 failed[service["name"]] = service["desc"]
 433 | 
 434 |         if failed:
 435 |             self.check_result = CheckState.CRITICAL
 436 |             message = f"{len(failed)} services are not running:\n\n"
 437 |             for name, description in failed.items():
 438 |                 message += f"- {description} ({name}) is not running\n"
 439 |             self.check_message = message
 440 |         else:
 441 |             self.check_message = "All services are running"
 442 | 
 443 |     def check_subscription(self) -> None:
 444 |         """Check subscription status on Proxmox VE node."""
 445 |         url = self.get_url(f"nodes/{self.options.node}/subscription")
 446 |         data = self.request(url)
 447 | 
 448 |         # 'status' is an enum, values are documented in Proxmox's API viewer:
 449 |         # https://pve.proxmox.com/pve-docs/api-viewer/#/nodes/{node}/subscription
 450 |         if data["status"].lower() == "new":
 451 |             self.check_result = CheckState.WARNING
 452 |             self.check_message = "Subscription not yet checked"
 453 |         elif data["status"].lower() == "notfound":
 454 |             self.check_result = CheckState.WARNING
 455 |             self.check_message = "No valid subscription found"
 456 |         elif data["status"].lower() == "suspended":
 457 |             self.check_result = CheckState.WARNING
 458 |             self.check_message = "Subscription suspended"
 459 |         elif data["status"].lower() == "expired":
 460 |             self.check_result = CheckState.CRITICAL
 461 |             self.check_message = "Subscription expired"
 462 |         elif data["status"].lower() == "invalid":
 463 |             self.check_result = CheckState.CRITICAL
 464 |             self.check_message = "Subscription invalid"
 465 |         elif data["status"].lower() == "active":
 466 |             subscription_due_date = data["nextduedate"]
 467 |             subscription_product_name = data["productname"]
 468 | 
 469 |             date_expire = datetime.strptime(subscription_due_date, "%Y-%m-%d")
 470 |             date_today = datetime.today()
 471 |             delta = (date_expire - date_today).days
 472 | 
 473 |             message = f"{subscription_product_name} is valid until {subscription_due_date}"
 474 |             message_warning_critical = (
 475 |                 f"{subscription_product_name} will expire in {delta} days ({subscription_due_date})"
 476 |             )
 477 | 
 478 |             self.check_thresholds(
 479 |                 delta,
 480 |                 message,
 481 |                 messageWarning=message_warning_critical,
 482 |                 messageCritical=message_warning_critical,
 483 |                 lowerValue=True,
 484 |             )
 485 |         else:
 486 |             self.check_result = CheckState.UNKNOWN
 487 |             self.check_message = "PVE API returned unexpected status '{}'".format(data["status"])
 488 | 
 489 |     def check_updates(self) -> None:
 490 |         """Check for package updates on Proxmox VE node."""
 491 |         url = self.get_url(f"nodes/{self.options.node}/apt/update")
 492 |         count = len(self.request(url))
 493 | 
 494 |         if count:
 495 |             self.check_result = CheckState.WARNING
 496 |             msg = "{} pending update"
 497 |             if count > 1:
 498 |                 msg += "s"
 499 |             self.check_message = msg.format(count)
 500 |         else:
 501 |             self.check_message = "System up to date"
 502 | 
 503 |     def check_cluster_status(self) -> None:
 504 |         """Check if cluster is operational."""
 505 |         url = self.get_url("cluster/status")
 506 |         data = self.request(url)
 507 | 
 508 |         nodes = {}
 509 |         quorate = None
 510 |         cluster = ""
 511 |         for elem in data:
 512 |             if elem["type"] == "cluster":
 513 |                 quorate = elem["quorate"]
 514 |                 cluster = elem["name"]
 515 |             elif elem["type"] == "node":
 516 |                 nodes[elem["name"]] = elem["online"]
 517 | 
 518 |         if quorate is None:
 519 |             self.check_message = "No cluster configuration found"
 520 |         elif quorate:
 521 |             node_count = len(nodes)
 522 |             nodes_online_count = len({k: v for k, v in nodes.items() if v})
 523 | 
 524 |             if node_count > nodes_online_count:
 525 |                 diff = node_count - nodes_online_count
 526 |                 self.check_result = CheckState.WARNING
 527 |                 self.check_message = f"Cluster '{cluster}' is healthy, but {diff} node(s) offline'"
 528 |             else:
 529 |                 self.check_message = f"Cluster '{cluster}' is healthy'"
 530 | 
 531 |             self.add_perfdata("nodes_total", node_count, unit="")
 532 |             self.add_perfdata("nodes_online", nodes_online_count, unit="")
 533 |         else:
 534 |             self.check_result = CheckState.CRITICAL
 535 |             self.check_message = "Cluster is unhealthy - no quorum"
 536 | 
 537 |     def check_zfs_fragmentation(self, name: Optional[str] = None) -> None:
 538 |         """Check all or one specific ZFS pool for fragmentation."""
 539 |         url = self.get_url(f"nodes/{self.options.node}/disks/zfs")
 540 |         data = self.request(url)
 541 | 
 542 |         warnings = []
 543 |         critical = []
 544 |         found = name is None
 545 |         for pool in data:
 546 |             found = found or name == pool["name"]
 547 |             if (name is not None and name == pool["name"]) or name is None:
 548 |                 key = "fragmentation"
 549 |                 if name is None:
 550 |                     key += f"_{pool['name']}"
 551 |                 self.add_perfdata(key, pool["frag"])
 552 | 
 553 |                 threshold_name = f"fragmentation_{name}"
 554 |                 threshold_warning = self.threshold_warning(threshold_name)
 555 |                 threshold_critical = self.threshold_critical(threshold_name)
 556 | 
 557 |                 if threshold_critical is not None and pool["frag"] > float(
 558 |                     threshold_critical.value
 559 |                 ):
 560 |                     critical.append(pool)
 561 |                 elif threshold_warning is not None and pool["frag"] > float(
 562 |                     threshold_warning.value
 563 |                 ):
 564 |                     warnings.append(pool)
 565 | 
 566 |         if not found:
 567 |             self.check_result = CheckState.UNKNOWN
 568 |             self.check_message = f"Could not fetch fragmentation of ZFS pool '{name}'"
 569 |         else:
 570 |             if warnings or critical:
 571 |                 value = None
 572 |                 if critical:
 573 |                     self.check_result = CheckState.CRITICAL
 574 |                     if name is not None:
 575 |                         value = critical[0]["frag"]
 576 |                 else:
 577 |                     self.check_result = CheckState.WARNING
 578 |                     if name is not None:
 579 |                         value = warnings[0]["frag"]
 580 | 
 581 |                 if name is not None:
 582 |                     self.check_message = (
 583 |                         f"Fragmentation of ZFS pool '{name}' is above thresholds: {value} %"
 584 |                     )
 585 |                 else:
 586 |                     pool_above = len(warnings) + len(critical)
 587 |                     message = (
 588 |                         f"{pool_above} of {len(data)} ZFS pools are above fragmentation "
 589 |                         "thresholds:\n\n"
 590 |                     )
 591 |                     message += "\n".join(
 592 |                         [f"- {pool['name']} ({pool['frag']} %) is CRITICAL\n" for pool in critical]
 593 |                     )
 594 |                     message += "\n".join(
 595 |                         [f"- {pool['name']} ({pool['frag']} %) is WARNING\n" for pool in warnings]
 596 |                     )
 597 |                     self.check_message = message
 598 |             else:
 599 |                 self.check_result = CheckState.OK
 600 |                 if name is not None:
 601 |                     self.check_message = f"Fragmentation of ZFS pool '{name}' is OK"
 602 |                 else:
 603 |                     self.check_message = "Fragmentation of all ZFS pools is OK"
 604 | 
 605 |     def check_zfs_health(self, name: Optional[str] = None) -> None:
 606 |         """Check all or one specific ZFS pool for health."""
 607 |         url = self.get_url(f"nodes/{self.options.node}/disks/zfs")
 608 |         data = self.request(url)
 609 | 
 610 |         unhealthy = []
 611 |         found = name is None
 612 |         healthy_conditions = ["online"]
 613 |         for pool in data:
 614 |             found = found or name == pool["name"]
 615 |             if (name is not None and name == pool["name"]) or name is None:
 616 |                 if pool["health"].lower() not in healthy_conditions:
 617 |                     unhealthy.append(pool)
 618 | 
 619 |         if not found:
 620 |             self.check_result = CheckState.UNKNOWN
 621 |             self.check_message = f"Could not fetch health of ZFS pool '{name}'"
 622 |         else:
 623 |             if unhealthy:
 624 |                 self.check_result = CheckState.CRITICAL
 625 |                 message = f"{len(unhealthy)} ZFS pools are not healthy:\n\n"
 626 |                 message += "\n".join(
 627 |                     [f"- {pool['name']} ({pool['health']}) is not healthy" for pool in unhealthy]
 628 |                 )
 629 |                 self.check_message = message
 630 |             else:
 631 |                 self.check_result = CheckState.OK
 632 |                 if name is not None:
 633 |                     self.check_message = f"ZFS pool '{name}' is healthy"
 634 |                 else:
 635 |                     self.check_message = "All ZFS pools are healthy"
 636 | 
 637 |     def check_ceph_health(self) -> None:
 638 |         """Check health of CEPH cluster."""
 639 |         url = self.get_url("cluster/ceph/status")
 640 |         data = self.request(url)
 641 |         ceph_health = data.get("health", {})
 642 | 
 643 |         if "status" not in ceph_health:
 644 |             self.check_result = CheckState.UNKNOWN
 645 |             self.check_message = (
 646 |                 "Could not fetch Ceph status from API. "
 647 |                 "Check the output of 'pvesh get cluster/ceph' on your node"
 648 |             )
 649 |             return
 650 | 
 651 |         if ceph_health["status"] == "HEALTH_OK":
 652 |             self.check_result = CheckState.OK
 653 |             self.check_message = "Ceph Cluster is healthy"
 654 |         elif ceph_health["status"] == "HEALTH_WARN":
 655 |             self.check_result = CheckState.WARNING
 656 |             self.check_message = "Ceph Cluster is in warning state"
 657 |         elif ceph_health["status"] == "HEALTH_CRIT":
 658 |             self.check_result = CheckState.CRITICAL
 659 |             self.check_message = "Ceph Cluster is in critical state"
 660 |         else:
 661 |             self.check_result = CheckState.UNKNOWN
 662 |             self.check_message = "Ceph Cluster is in unknown state"
 663 | 
 664 |     def check_storage(self, name: str) -> None:
 665 |         """Check if storage exists and return usage."""
 666 |         url = self.get_url(f"nodes/{self.options.node}/storage")
 667 |         data = self.request(url)
 668 | 
 669 |         if not any(s["storage"] == name for s in data):
 670 |             self.check_result = CheckState.CRITICAL
 671 |             self.check_message = f"Storage '{name}' doesn't exist on node '{self.options.node}'"
 672 |             return
 673 | 
 674 |         url = self.get_url(f"nodes/{self.options.node}/storage/{name}/status")
 675 |         self.check_api_value(url, f"Usage of storage '{name}' is")
 676 | 
 677 |     def check_version(self) -> None:
 678 |         """Check PVE version."""
 679 |         url = self.get_url("version")
 680 |         data = self.request(url)
 681 |         if not data["version"]:
 682 |             self.check_result = CheckState.UNKNOWN
 683 |             self.check_message = "Unable to determine pve version"
 684 |         elif self.options.min_version and version.parse(self.options.min_version) > version.parse(
 685 |             data["version"]
 686 |         ):
 687 |             self.check_result = CheckState.CRITICAL
 688 |             self.check_message = (
 689 |                 f"Current PVE version '{data['version']}' "
 690 |                 f"({data['repoid']}) is lower than the min. "
 691 |                 f"required version '{self.options.min_version}'"
 692 |             )
 693 |         else:
 694 |             self.check_message = (
 695 |                 f"Your PVE instance version '{data['version']}' ({data['repoid']}) is up to date"
 696 |             )
 697 | 
 698 |     def _get_pool_members(self, pool: str) -> List[int]:
 699 |         """Get a list of vmids, which are members of a given resource pool.
 700 | 
 701 |         NOTE: The request needs the Pool.Audit permission!
 702 |         """
 703 |         members = []
 704 | 
 705 |         try:
 706 |             url = self.get_url(f"pools/{pool}")
 707 |             pools = self.request(url, raise_error=True)
 708 |             for pool in pools.get("members", []):
 709 |                 members.append(pool["vmid"])
 710 |         except RequestError:
 711 |             print(
 712 |                 f"Unable to fetch members of pool '{pool}'. "
 713 |                 "Check if the name is correct and the role has the 'Pool.Audit' permission"
 714 |             )
 715 | 
 716 |         return members
 717 | 
 718 |     def check_vzdump_backup(self, name: Optional[str] = None) -> None:
 719 |         """Check for failed vzdump backup jobs."""
 720 |         tasks_url = self.get_url("cluster/tasks")
 721 |         tasks = self.request(tasks_url)
 722 |         tasks = [t for t in tasks if t["type"] == "vzdump"]
 723 | 
 724 |         # Filter by node id, if one is provided
 725 |         if self.options.node is not None:
 726 |             tasks = [t for t in tasks if t["node"] == self.options.node]
 727 | 
 728 |         # Filter by timestamp, if provided
 729 |         delta = self.threshold_critical("delta")
 730 |         if delta is not None:
 731 |             now = datetime.now(timezone.utc).timestamp()
 732 | 
 733 |             tasks = [t for t in tasks if not delta.check(now - t["starttime"])]
 734 | 
 735 |         # absent status = job still running
 736 |         tasks = [t for t in tasks if "status" in t]
 737 |         failed = len([t for t in tasks if t["status"] != "OK"])
 738 |         success = len(tasks) - failed
 739 |         self.check_message = f"{success} backup tasks successful, {failed} backup tasks failed"
 740 | 
 741 |         if failed > 0:
 742 |             self.check_result = CheckState.CRITICAL
 743 |         else:
 744 |             self.check_result = CheckState.OK
 745 |         if delta is not None:
 746 |             self.check_message += f" within the last {delta.value}s"
 747 | 
 748 |         nbu_url = self.get_url("cluster/backup-info/not-backed-up")
 749 |         not_backed_up = self.request(nbu_url)
 750 | 
 751 |         if len(not_backed_up) > 0:
 752 |             guest_ids = []
 753 | 
 754 |             for guest in not_backed_up:
 755 |                 guest_ids.append(guest["vmid"])
 756 | 
 757 |             ignored_vmids = []
 758 |             for pool in self.options.ignore_pools:
 759 |                 # ignore vms based on their membership of a certain pool
 760 |                 ignored_vmids += self._get_pool_members(pool)
 761 | 
 762 |             if self.options.ignore_vmids:
 763 |                 # ignore vms based on their id
 764 |                 ignored_vmids = ignored_vmids + self.options.ignore_vmids
 765 | 
 766 |             remaining_not_backed_up = sorted(list(set(guest_ids) - set(ignored_vmids)))
 767 |             if len(remaining_not_backed_up) > 0:
 768 |                 if self.check_result not in [CheckState.CRITICAL, CheckState.UNKNOWN]:
 769 |                     self.check_result = CheckState.WARNING
 770 |                     self.check_message += (
 771 |                         "\nThere are unignored guests not covered by any backup schedule: "
 772 |                         + ", ".join(map(str, remaining_not_backed_up))
 773 |                     )
 774 | 
 775 |     def check_snapshot_age(self, idx: Optional[Union[str, int]]) -> None:
 776 |         """Check age of snapshots."""
 777 |         url = self.get_url(
 778 |             "cluster/resources",
 779 |         )
 780 |         data = self.request(url, params={"type": "vm"})
 781 | 
 782 |         warnings = []
 783 |         criticals = []
 784 |         snapshots_exist = False
 785 |         found = False
 786 |         for vm in data:
 787 |             vm_type = "qemu"
 788 |             if vm["type"] == "lxc":
 789 |                 vm_type = "lxc"
 790 |             vm_name = vm.get("name", None)
 791 |             vm_id = vm.get("vmid", None)
 792 | 
 793 |             if not self.options.node:
 794 |                 node_name = vm.get("node", None)
 795 |             else:
 796 |                 node_name = self.options.node
 797 |             if node_name != vm.get("node", None):
 798 |                 continue
 799 |             url = self.get_url(f"nodes/{node_name}/{vm_type}/{vm_id}/snapshot")
 800 |             data = self.request(url)
 801 | 
 802 |             for snapshot in data:
 803 |                 snapshot_name = snapshot.get("name", None)
 804 | 
 805 |                 if snapshot_name == "current":
 806 |                     continue
 807 |                 snapshots_exist = True
 808 | 
 809 |                 threshold_name = f"snapshot_age_{vm_name}_{snapshot_name}"
 810 |                 threshold_warning = self.threshold_warning(threshold_name)
 811 |                 threshold_critical = self.threshold_critical(threshold_name)
 812 | 
 813 |                 snapshot_time = snapshot.get("snaptime", None)
 814 |                 snapshot_age = int(datetime.now(timezone.utc).timestamp()) - snapshot_time
 815 | 
 816 |                 if threshold_critical is not None and snapshot_age > int(threshold_critical.value):
 817 |                     criticals.append([vm_id, vm_name, snapshot_name, snapshot_time])
 818 |                 elif threshold_warning is not None and snapshot_age > int(threshold_warning.value):
 819 |                     warnings.append([vm_id, vm_name, snapshot_name, snapshot_time])
 820 | 
 821 |             if idx and idx in (vm.get("name", None), vm.get("vmid", None)):
 822 |                 found = True
 823 |                 break
 824 | 
 825 |         if idx and not found:
 826 |             self.check_result = CheckState.UNKNOWN
 827 |             self.check_message = f"VM or LXC '{idx}' not found"
 828 |         elif not snapshots_exist:
 829 |             self.check_result = CheckState.OK
 830 |             if idx:
 831 |                 self.check_message = f"No snapshots of '{idx}' exist"
 832 |             else:
 833 |                 self.check_message = "No snapshots exist"
 834 |         else:
 835 |             if idx:
 836 |                 self.check_message = f"Age of snapshots of '{idx}' is "
 837 |             else:
 838 |                 self.check_message = "Age of snapshots is "
 839 |             if criticals or warnings:
 840 |                 if criticals:
 841 |                     self.check_result = CheckState.CRITICAL
 842 |                 else:
 843 |                     self.check_result = CheckState.WARNING
 844 |                 self.check_message += "above thresholds"
 845 |                 for snapshot in criticals:
 846 |                     snap_time = datetime.fromtimestamp(snapshot[3]).strftime("%Y-%m-%d %H:%M:%S")
 847 |                     self.check_message += (
 848 |                         f"\n{snapshot[0]} ({snapshot[1]}): snapshot "
 849 |                         + f"'{snapshot[2]}' taken on {snap_time} is CRITICAL"
 850 |                     )
 851 |                 for snapshot in warnings:
 852 |                     snap_time = datetime.fromtimestamp(snapshot[3]).strftime("%Y-%m-%d %H:%M:%S")
 853 |                     self.check_message += (
 854 |                         f"\n{snapshot[0]} ({snapshot[1]}): snapshot "
 855 |                         + f"'{snapshot[2]}' taken on {snap_time} is WARNING"
 856 |                     )
 857 |             else:
 858 |                 self.check_result = CheckState.OK
 859 |                 self.check_message += "OK"
 860 | 
 861 |     def check_memory(self) -> None:
 862 |         """Check memory usage of Proxmox VE node."""
 863 |         url = self.get_url(f"nodes/{self.options.node}/status")
 864 |         self.check_api_value(url, "Memory usage is", key="memory")
 865 | 
 866 |     def check_swap(self) -> None:
 867 |         """Check swap usage of Proxmox VE node."""
 868 |         url = self.get_url(f"nodes/{self.options.node}/status")
 869 |         self.check_api_value(url, "Swap usage is", key="swap")
 870 | 
 871 |     def check_cpu(self) -> None:
 872 |         """Check cpu usage of Proxmox VE node."""
 873 |         url = self.get_url(f"nodes/{self.options.node}/status")
 874 |         self.check_api_value(url, "CPU usage is", key="cpu")
 875 | 
 876 |     def check_io_wait(self) -> None:
 877 |         """Check io wait of Proxmox VE node."""
 878 |         url = self.get_url(f"nodes/{self.options.node}/status")
 879 |         self.check_api_value(url, "IO wait is", key="wait", perfkey="wait")
 880 | 
 881 |     def check_thresholds(
 882 |         self,
 883 |         values: Union[Dict[str, Union[int, float]], Union[int, float]],
 884 |         message: str,
 885 |         **kwargs: Dict,
 886 |     ) -> None:
 887 |         """Check numeric value against threshold for given metric name."""
 888 |         is_warning = False
 889 |         is_critical = False
 890 | 
 891 |         if not isinstance(values, dict):
 892 |             values = {None: values}
 893 | 
 894 |         for metric, value in values.items():
 895 |             value_warning = self.threshold_warning(metric)
 896 |             if value_warning is not None:
 897 |                 is_warning = is_warning or value_warning.check(
 898 |                     value, kwargs.get("lowerValue", False)
 899 |                 )
 900 | 
 901 |             value_critical = self.threshold_critical(metric)
 902 |             if value_critical is not None:
 903 |                 is_critical = is_critical or value_critical.check(
 904 |                     value, kwargs.get("lowerValue", False)
 905 |                 )
 906 | 
 907 |         if is_critical:
 908 |             self.check_result = CheckState.CRITICAL
 909 |             self.check_message = kwargs.get("messageCritical", message)
 910 |         elif is_warning:
 911 |             self.check_result = CheckState.WARNING
 912 |             self.check_message = kwargs.get("messageWarning", message)
 913 |         else:
 914 |             self.check_message = message
 915 | 
 916 |     def scale_value(self, value: Union[int, float]) -> float:
 917 |         """Scale value according to unit."""
 918 |         if self.options.unit in self.UNIT_SCALE:
 919 |             return value / self.UNIT_SCALE[self.options.unit]
 920 | 
 921 |         raise ValueError("wrong unit")
 922 | 
 923 |     def threshold_warning(self, name: str) -> CheckThreshold:
 924 |         """Get warning threshold for metric name (empty if none)."""
 925 |         return self.options.threshold_warning.get(
 926 |             name, self.options.threshold_warning.get(None, None)
 927 |         )
 928 | 
 929 |     def threshold_critical(self, name: str) -> CheckThreshold:
 930 |         """Get critical threshold for metric name (empty if none)."""
 931 |         return self.options.threshold_critical.get(
 932 |             name, self.options.threshold_critical.get(None, None)
 933 |         )
 934 | 
 935 |     def get_value(
 936 |         self, value: Union[int, float], total: Optional[Union[int, float]] = None
 937 |     ) -> float:
 938 |         """Get value scaled or as percentage."""
 939 |         value = float(value)
 940 | 
 941 |         if total:
 942 |             value /= float(total) / 100
 943 |         else:
 944 |             value = self.scale_value(value)
 945 | 
 946 |         return round(value, 2)
 947 | 
 948 |     def add_perfdata(self, name: str, value: Union[int, float], **kwargs: Dict) -> None:
 949 |         """Add metric to perfdata output."""
 950 |         unit = kwargs.get("unit", "%")
 951 | 
 952 |         perfdata = f"{name}={value}{unit}"
 953 | 
 954 |         threshold_warning = self.threshold_warning(name)
 955 |         threshold_critical = self.threshold_critical(name)
 956 | 
 957 |         perfdata += ";"
 958 |         if threshold_warning:
 959 |             perfdata += str(threshold_warning.value)
 960 | 
 961 |         perfdata += ";"
 962 |         if threshold_critical:
 963 |             perfdata += str(threshold_critical.value)
 964 | 
 965 |         perfdata += ";" + str(kwargs.get("min", 0))
 966 |         perfdata += ";" + str(kwargs.get("max", ""))
 967 | 
 968 |         self.perfdata.append(perfdata)
 969 | 
 970 |     def get_perfdata(self) -> str:
 971 |         """Get perfdata string."""
 972 |         perfdata = ""
 973 | 
 974 |         if self.perfdata:
 975 |             perfdata = "|"
 976 |             perfdata += " ".join(self.perfdata)
 977 | 
 978 |         return perfdata
 979 | 
 980 |     def check(self) -> None:
 981 |         """Execute the real check command."""
 982 |         self.check_result = CheckState.OK
 983 | 
 984 |         if self.options.mode == "cluster":
 985 |             self.check_cluster_status()
 986 |         elif self.options.mode == "version":
 987 |             self.check_version()
 988 |         elif self.options.mode == "memory":
 989 |             self.check_memory()
 990 |         elif self.options.mode == "swap":
 991 |             self.check_swap()
 992 |         elif self.options.mode in ("io_wait", "io-wait"):
 993 |             self.check_io_wait()
 994 |         elif self.options.mode == "disk-health":
 995 |             self.check_disks()
 996 |         elif self.options.mode == "cpu":
 997 |             self.check_cpu()
 998 |         elif self.options.mode == "services":
 999 |             self.check_services()
1000 |         elif self.options.mode == "updates":
1001 |             self.check_updates()
1002 |         elif self.options.mode == "subscription":
1003 |             self.check_subscription()
1004 |         elif self.options.mode == "storage":
1005 |             self.check_storage(self.options.name)
1006 |         elif self.options.mode in ["vm", "vm_status", "vm-status"]:
1007 |             only_status = self.options.mode in ["vm_status", "vm-status"]
1008 | 
1009 |             if self.options.name:
1010 |                 idx = self.options.name
1011 |             else:
1012 |                 idx = self.options.vmid
1013 | 
1014 |             if self.options.expected_vm_status:
1015 |                 self.check_vm_status(
1016 |                     idx, expected_state=self.options.expected_vm_status, only_status=only_status
1017 |                 )
1018 |             else:
1019 |                 self.check_vm_status(idx, only_status=only_status)
1020 |         elif self.options.mode == "replication":
1021 |             self.check_replication()
1022 |         elif self.options.mode == "ceph-health":
1023 |             self.check_ceph_health()
1024 |         elif self.options.mode == "zfs-health":
1025 |             self.check_zfs_health(self.options.name)
1026 |         elif self.options.mode == "zfs-fragmentation":
1027 |             self.check_zfs_fragmentation(self.options.name)
1028 |         elif self.options.mode == "backup":
1029 |             self.check_vzdump_backup(self.options.name)
1030 |         elif self.options.mode == "snapshot-age":
1031 |             if self.options.name:
1032 |                 idx = self.options.name
1033 |             else:
1034 |                 idx = self.options.vmid
1035 | 
1036 |             self.check_snapshot_age(idx)
1037 |         else:
1038 |             message = f"Check mode '{self.options.mode}' not known"
1039 |             self.output(CheckState.UNKNOWN, message)
1040 | 
1041 |         self.check_output()
1042 | 
1043 |     def parse_args(self) -> None:
1044 |         """Parse CLI arguments."""
1045 |         p = argparse.ArgumentParser(description="Check command for PVE hosts via API")
1046 | 
1047 |         p.add_argument(
1048 |             "--version", help="Show version of check command", action="store_true", default=False
1049 |         )
1050 | 
1051 |         api_opts = p.add_argument_group("API Options")
1052 | 
1053 |         api_opts.add_argument(
1054 |             "-e",
1055 |             "-H",
1056 |             "--api-endpoint",
1057 |             help="PVE api endpoint hostname or ip address (no additional data like paths)",
1058 |         )
1059 |         api_opts.add_argument("--api-port", required=False, help="PVE api endpoint port")
1060 | 
1061 |         api_opts.add_argument(
1062 |             "-u",
1063 |             "--username",
1064 |             dest="api_user",
1065 |             help="PVE api user (e.g. icinga2@pve or icinga2@pam, depending on which backend you "
1066 |             "have chosen in proxmox)",
1067 |         )
1068 | 
1069 |         group = api_opts.add_mutually_exclusive_group()
1070 |         group.add_argument("-p", "--password", dest="api_password", help="PVE API user password")
1071 |         group.add_argument(
1072 |             "-P",
1073 |             "--password-file",
1074 |             dest="api_password_file",
1075 |             help="PVE API user password in a file",
1076 |         )
1077 |         group.add_argument(
1078 |             "-t",
1079 |             "--api-token",
1080 |             dest="api_token",
1081 |             help="PVE API token (format: TOKEN_ID=TOKEN_SECRET)",
1082 |         )
1083 |         group.add_argument(
1084 |             "-T",
1085 |             "--api-token-file",
1086 |             dest="api_token_file",
1087 |             help="PVE API token contained in a file (format: TOKEN_ID=TOKEN_SECRET)",
1088 |         )
1089 | 
1090 |         api_opts.add_argument(
1091 |             "-k",
1092 |             "--insecure",
1093 |             dest="api_insecure",
1094 |             action="store_true",
1095 |             default=False,
1096 |             help="Don't verify HTTPS certificate",
1097 |         )
1098 | 
1099 |         api_opts.set_defaults(api_port=8006)
1100 | 
1101 |         check_opts = p.add_argument_group("Check Options")
1102 | 
1103 |         check_opts.add_argument(
1104 |             "-m",
1105 |             "--mode",
1106 |             choices=(
1107 |                 "cluster",
1108 |                 "version",
1109 |                 "cpu",
1110 |                 "memory",
1111 |                 "swap",
1112 |                 "storage",
1113 |                 "io_wait",
1114 |                 "io-wait",
1115 |                 "updates",
1116 |                 "services",
1117 |                 "subscription",
1118 |                 "vm",
1119 |                 "vm_status",
1120 |                 "vm-status",
1121 |                 "replication",
1122 |                 "disk-health",
1123 |                 "ceph-health",
1124 |                 "zfs-health",
1125 |                 "zfs-fragmentation",
1126 |                 "backup",
1127 |                 "snapshot-age",
1128 |             ),
1129 |             help="Mode to use.",
1130 |         )
1131 | 
1132 |         check_opts.add_argument(
1133 |             "-n",
1134 |             "--node",
1135 |             dest="node",
1136 |             help="Node to check (necessary for all modes except cluster, version and backup)",
1137 |         )
1138 | 
1139 |         check_opts.add_argument("--name", dest="name", help="Name of storage, vm, or container")
1140 | 
1141 |         check_opts.add_argument(
1142 |             "--vmid", dest="vmid", type=int, help="ID of virtual machine or container"
1143 |         )
1144 | 
1145 |         check_opts.add_argument(
1146 |             "--expected-vm-status",
1147 |             choices=("running", "stopped", "paused"),
1148 |             help="Expected VM status",
1149 |         )
1150 | 
1151 |         check_opts.add_argument(
1152 |             "--ignore-vmid",
1153 |             dest="ignore_vmids",
1154 |             metavar="VMID",
1155 |             action="append",
1156 |             help="Ignore VM with vmid in checks",
1157 |             default=[],
1158 |             type=int,
1159 |         )
1160 | 
1161 |         check_opts.add_argument(
1162 |             "--ignore-vm-status",
1163 |             dest="ignore_vm_status",
1164 |             action="store_true",
1165 |             help="Ignore VM status in checks",
1166 |             default=False,
1167 |         )
1168 | 
1169 |         check_opts.add_argument(
1170 |             "--ignore-service",
1171 |             dest="ignore_services",
1172 |             action="append",
1173 |             metavar="NAME",
1174 |             help="Ignore service NAME in checks",
1175 |             default=[],
1176 |         )
1177 | 
1178 |         check_opts.add_argument(
1179 |             "--ignore-disk",
1180 |             dest="ignore_disks",
1181 |             action="append",
1182 |             metavar="NAME",
1183 |             help="Ignore disk NAME in health check",
1184 |             default=[],
1185 |         )
1186 | 
1187 |         check_opts.add_argument(
1188 |             "--ignore-pools",
1189 |             dest="ignore_pools",
1190 |             action="append",
1191 |             metavar="NAME",
1192 |             help="Ignore vms and containers in pool(s) NAME in checks",
1193 |             default=[],
1194 |         )
1195 | 
1196 |         check_opts.add_argument(
1197 |             "-w",
1198 |             "--warning",
1199 |             dest="threshold_warning",
1200 |             type=CheckThreshold.threshold_type,
1201 |             default={},
1202 |             help="Warning threshold for check value. Mutiple thresholds with name:value,name:value",
1203 |         )
1204 |         check_opts.add_argument(
1205 |             "-c",
1206 |             "--critical",
1207 |             dest="threshold_critical",
1208 |             type=CheckThreshold.threshold_type,
1209 |             default={},
1210 |             help=(
1211 |                 "Critical threshold for check value. "
1212 |                 "Mutiple thresholds with name:value,name:value"
1213 |             ),
1214 |         )
1215 |         check_opts.add_argument(
1216 |             "-M",
1217 |             dest="values_mb",
1218 |             action="store_true",
1219 |             default=False,
1220 |             help=(
1221 |                 "Values are shown in the unit which is set with --unit (if available). "
1222 |                 "Thresholds are also treated in this unit"
1223 |             ),
1224 |         )
1225 |         check_opts.add_argument(
1226 |             "-V",
1227 |             "--min-version",
1228 |             dest="min_version",
1229 |             type=str,
1230 |             help="The minimal pve version to check for. Any version lower than this will return "
1231 |             "CRITICAL.",
1232 |         )
1233 | 
1234 |         check_opts.add_argument(
1235 |             "--unit",
1236 |             choices=self.UNIT_SCALE.keys(),
1237 |             default="MiB",
1238 |             help="Unit which is used for performance data and other values",
1239 |         )
1240 | 
1241 |         options = p.parse_args()
1242 | 
1243 |         if options.version:
1244 |             print(f"check_pve version {self.VERSION}")
1245 |             sys.exit(0)
1246 | 
1247 |         missing = []
1248 |         if not options.api_endpoint:
1249 |             missing.append("--api-endpoint")
1250 |         if not options.api_user:
1251 |             missing.append("--username")
1252 |         if not (
1253 |             options.api_password
1254 |             or options.api_password_file
1255 |             or options.api_token
1256 |             or options.api_token_file
1257 |         ):
1258 |             missing.append("--password, --api-password-file, --api-token or --api-token-file")
1259 |         if not options.mode:
1260 |             missing.append("--mode")
1261 | 
1262 |         if missing:
1263 |             p.error(f"The following arguments are required: {', '.join(missing)}")
1264 | 
1265 |         if not options.node and options.mode not in [
1266 |             "cluster",
1267 |             "vm",
1268 |             "vm_status",
1269 |             "version",
1270 |             "ceph-health",
1271 |             "backup",
1272 |             "snapshot-age",
1273 |         ]:
1274 |             p.print_usage()
1275 |             message = f"{p.prog}: error: --mode {options.mode} requires node name (--node)"
1276 |             self.output(CheckState.UNKNOWN, message)
1277 | 
1278 |         if (
1279 |             not options.vmid
1280 |             and not options.name
1281 |             and options.mode in ("vm", "vm_status", "vm-status")
1282 |         ):
1283 |             p.print_usage()
1284 |             message = (
1285 |                 f"{p.prog}: error: --mode {options.mode} requires either "
1286 |                 "vm name (--name) or id (--vmid)"
1287 |             )
1288 |             self.output(CheckState.UNKNOWN, message)
1289 | 
1290 |         if not options.name and options.mode == "storage":
1291 |             p.print_usage()
1292 |             message = f"{p.prog}: error: --mode {options.mode} requires storage name (--name)"
1293 |             self.output(CheckState.UNKNOWN, message)
1294 | 
1295 |         if options.threshold_warning and options.threshold_critical:
1296 |             if options.mode != "subscription" and not compare_thresholds(
1297 |                 options.threshold_warning, options.threshold_critical, lambda w, c: w <= c
1298 |             ):
1299 |                 p.error("Critical value must be greater than warning value")
1300 |             elif options.mode == "subscription" and not compare_thresholds(
1301 |                 options.threshold_warning, options.threshold_critical, lambda w, c: w >= c
1302 |             ):
1303 |                 p.error("Critical value must be lower than warning value")
1304 | 
1305 |         self.options = options
1306 | 
1307 |     def __init__(self) -> None:
1308 |         self.options = {}
1309 |         self.ticket = None
1310 |         self.perfdata = []
1311 |         self.check_result = CheckState.UNKNOWN
1312 |         self.check_message = ""
1313 | 
1314 |         self.__headers = {}
1315 |         self.__cookies = {}
1316 | 
1317 |         self.parse_args()
1318 | 
1319 |         if self.options.api_insecure:
1320 |             # disable urllib3 warning about insecure requests
1321 |             requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
1322 | 
1323 |         if self.options.api_token_file is not None:
1324 |             self.options.api_token = self.get_file_line(self.options.api_token_file)
1325 |         if self.options.api_password_file is not None:
1326 |             self.options.api_password = self.get_file_line(self.options.api_password_file)
1327 |         if self.options.api_password is not None:
1328 |             self.__cookies["PVEAuthCookie"] = self.get_ticket()
1329 |         elif self.options.api_token is not None:
1330 |             token = f"{self.options.api_user}!{self.options.api_token}"
1331 |             self.__headers["Authorization"] = f"PVEAPIToken={token}"
1332 | 
1333 | 
if __name__ == "__main__":
    # The constructor parses the CLI arguments and prepares the authentication,
    # check() then runs the selected mode.
    CheckPVE().check()
1337 | 


--------------------------------------------------------------------------------