├── requirements.txt ├── Dockerfile ├── pyproject.toml ├── .github ├── workflows │ └── lint.yml └── ISSUE_TEMPLATE │ └── bug_report.md ├── .gitignore ├── icinga2 ├── command.conf └── service.conf ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── README.md ├── LICENSE ├── grafana └── pve-metrics-dashboard.json └── check_pve.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | argparse 3 | packaging 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | ADD check_pve.py / 4 | ADD requirements.txt / 5 | RUN apt-get update 6 | RUN apt install -y python3 python3-requests python3-packaging 7 | RUN pip3 install -r requirements.txt 8 | 9 | 10 | CMD ["tail", "-f", "/dev/null"] 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | 4 | [tool.ruff] 5 | line-length = 100 6 | lint.select = [ 7 | "ANN", # flake8-annotations 8 | "B", # flake8-bugbear 9 | "D", # pydocstyle 10 | "E", # pycodestyle 11 | "F", # Pyflakes 12 | "Q", # flake8-quotes 13 | ] 14 | lint.ignore = [ 15 | "ANN101", # missing-type-self 16 | "D107", # undocumented-public-init 17 | ] 18 | 19 | [tool.ruff.lint.mccabe] 20 | max-complexity = 10 21 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | name: Linter 3 | 4 | jobs: 5 | build: 6 | if: 7 | github.event_name == 'push' || github.event.pull_request.head.repo.full_name != github.repository 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v4 12 | 13 | - name: Setup 
Python 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: "*" 17 | 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | python -m pip install black ruff 22 | 23 | - name: Check styling with black 24 | run: | 25 | black --check *.py 26 | 27 | - name: Run ruff linter 28 | run: | 29 | ruff check *.py 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Environment (please complete the following information):** 27 | - OS: [e.g. Debian Bullseye, CentOS 7, ...] 28 | - Python Version [e.g. 3.6, 3.9, ...] 29 | - PVE Version [e.g. 6.5, 7.1-8, ...] 30 | - Monitoring Tool [e.g. Icinga2, Nagios, ...] 31 | 32 | **Additional context** 33 | Add any other context about the problem here. 
34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /icinga2/command.conf: 
-------------------------------------------------------------------------------- 1 | object CheckCommand "pve" { 2 | import "plugin-check-command" 3 | 4 | command = [ PluginDir + "/check_pve.py" ] 5 | 6 | arguments = { 7 | "-e" = { 8 | value = "$pve_host$" 9 | required = true 10 | description = "Hostname for PVE API" 11 | } 12 | "-u" = { 13 | value = "$pve_user$" 14 | required = true 15 | description = "API user (ex. monitoring@pve)" 16 | } 17 | "-p" = { 18 | value = "$pve_password$" 19 | description = "API user password" 20 | } 21 | "-t" = { 22 | value = "$pve_token$" 23 | description = "API user token" 24 | } 25 | "-k" = { 26 | set_if = "$pve_insecure_connection$" 27 | description = "Don't verify HTTPS certificate" 28 | } 29 | "-m" = { 30 | value = "$pve_mode$" 31 | required = true 32 | description = "Check mode (cluster, version, updates, subscription, storage, cpu, memory, io_wait, vm, replication)" 33 | } 34 | "-n" = { 35 | value = "$pve_node$" 36 | description = "Node to check (necessary for all modes except cluster and version)" 37 | } 38 | "--name" = { 39 | value = "$pve_resource_name$" 40 | description = "Name of storage or vm to check" 41 | } 42 | "--expected-vm-status" = { 43 | value = "$pve_expected_vm_status$" 44 | description = "Expected status of the VM" 45 | } 46 | "--ignore-service" = { 47 | repeat_key = true 48 | value = "$pve_ignore_services$" 49 | description = "Ignore services in check" 50 | } 51 | "--ignore-disk" = { 52 | repeat_key = true 53 | value = "$pve_ignore_disks$" 54 | description = "Ignore disks in check" 55 | } 56 | "--ignore-vm-status" = { 57 | set_if = "$pve_ignore_vm_status$" 58 | description = "Ignore VM status in check" 59 | } 60 | "-w" = { 61 | value = "$pve_warning$" 62 | description = "Warning threshold" 63 | } 64 | "-c" = { 65 | value = "$pve_critical$" 66 | description = "Critical threshold" 67 | } 68 | "-M" = { 69 | set_if = "$pve_tresholds_mb$" 70 | description = "Unit of thresholds and values is MB" 71 | } 72 | 
"-V" = { 73 | value = "$pve_min_version$" 74 | description = "Minimal pve version. Everything lower than this will return CRITICAL." 75 | } 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /icinga2/service.conf: -------------------------------------------------------------------------------- 1 | template Host "proxmox-host" { 2 | import "generic-host" 3 | 4 | vars.pve_host = name 5 | vars.pve_node = name.split(".")[0] 6 | // ... or if not matching the fqdn (nodename.domain.example) 7 | // vars.pve_node = "proxmox-host" 8 | 9 | // if your icinga host don't trust your pve certificate, you'll have to uncomment this line 10 | // vars.pve_insecure_connection = true 11 | vars.pve_user = "monitor@pve" 12 | // either use password or token 13 | // vars.pve_password = "SuperSecretPassw0rd" 14 | // vars.pve_token = "monitoring=GeneratedToken" 15 | 16 | // change to false, if node is no member of a pve cluster 17 | vars.pve_cluster = true 18 | } 19 | 20 | object Host "proxmox-host.domain.example" { 21 | import "proxmox-host" 22 | 23 | address = "192.168.42.42" 24 | 25 | vars.pve_storage["flashpool"] = { 26 | pve_warning = 80 27 | pve_critical = 90 28 | } 29 | 30 | vars.pve_storage["diskpool"] = { 31 | pve_warning = 80 32 | pve_critical = 90 33 | } 34 | 35 | // Ignore these disks in health check (USB sticks, SD cards, etc.) 
36 | vars.pve_ignore_disks = [ "sdn", "sdg" ] 37 | 38 | vars.virtual_machines["vm-01"] = { 39 | } 40 | } 41 | 42 | template Service "pve-service" { 43 | import "generic-service" 44 | 45 | check_command = "pve" 46 | } 47 | 48 | apply Service "cluster" { 49 | import "pve-service" 50 | 51 | vars.pve_mode = "cluster" 52 | 53 | assign where host.vars.pve_host && host.vars.pve_cluster 54 | } 55 | 56 | apply Service "services" { 57 | import "pve-service" 58 | 59 | vars.pve_mode = "services" 60 | 61 | // Ignore cluster status on single nodes 62 | if (!host.vars.pve_cluster) { 63 | vars.pve_ignore_services = host.vars.pve_ignore_services || [] 64 | vars.pve_ignore_services.add("corosync") 65 | } 66 | 67 | assign where host.vars.pve_host 68 | } 69 | 70 | apply Service "updates" { 71 | import "pve-service" 72 | 73 | check_interval = 12h 74 | retry_interval = 2h 75 | max_check_attempts = 3 76 | 77 | vars.pve_mode = "updates" 78 | 79 | assign where host.vars.pve_host 80 | } 81 | 82 | apply Service "disk-health" { 83 | import "pve-service" 84 | 85 | vars.pve_mode = "disk-health" 86 | 87 | assign where host.vars.pve_host 88 | } 89 | 90 | apply Service "io_wait" { 91 | import "pve-service" 92 | 93 | vars.pve_mode = "io_wait" 94 | 95 | vars.pve_warning = 10 96 | vars.pve_critical = 30 97 | 98 | assign where host.vars.pve_host 99 | } 100 | 101 | apply Service "cpu" { 102 | import "pve-service" 103 | 104 | vars.pve_mode = "cpu" 105 | 106 | vars.pve_warning = 70 107 | vars.pve_critical = 90 108 | 109 | assign where host.vars.pve_host 110 | } 111 | 112 | apply Service "memory" { 113 | import "pve-service" 114 | 115 | vars.pve_mode = "memory" 116 | 117 | vars.pve_warning = 80 118 | vars.pve_critical = 90 119 | 120 | assign where host.vars.pve_host 121 | } 122 | 123 | apply Service "storage " for (storage => config in host.vars.pve_storage) { 124 | import "pve-service" 125 | 126 | vars += config 127 | 128 | vars.pve_mode = "storage" 129 | vars.pve_resource_name = storage 130 | } 131 | 
132 | apply Service "pve-vm " for (vm => config in host.vars.virtual_machines) { 133 | import "pve-service" 134 | 135 | vars += config 136 | 137 | vars.pve_mode = "vm" 138 | vars.pve_resource_name = vm 139 | 140 | assign where host.vars.pve_host 141 | } 142 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement. 63 | All complaints will be reviewed and investigated promptly and fairly. 64 | 65 | All community leaders are obligated to respect the privacy and security of the 66 | reporter of any incident. 67 | 68 | ## Enforcement Guidelines 69 | 70 | Community leaders will follow these Community Impact Guidelines in determining 71 | the consequences for any action they deem in violation of this Code of Conduct: 72 | 73 | ### 1. Correction 74 | 75 | **Community Impact**: Use of inappropriate language or other behavior deemed 76 | unprofessional or unwelcome in the community. 77 | 78 | **Consequence**: A private, written warning from community leaders, providing 79 | clarity around the nature of the violation and an explanation of why the 80 | behavior was inappropriate. A public apology may be requested. 81 | 82 | ### 2. Warning 83 | 84 | **Community Impact**: A violation through a single incident or series 85 | of actions. 86 | 87 | **Consequence**: A warning with consequences for continued behavior. No 88 | interaction with the people involved, including unsolicited interaction with 89 | those enforcing the Code of Conduct, for a specified period of time. This 90 | includes avoiding interactions in community spaces as well as external channels 91 | like social media. Violating these terms may lead to a temporary or 92 | permanent ban. 93 | 94 | ### 3. Temporary Ban 95 | 96 | **Community Impact**: A serious violation of community standards, including 97 | sustained inappropriate behavior. 
98 | 99 | **Consequence**: A temporary ban from any sort of interaction or public 100 | communication with the community for a specified period of time. No public or 101 | private interaction with the people involved, including unsolicited interaction 102 | with those enforcing the Code of Conduct, is allowed during this period. 103 | Violating these terms may lead to a permanent ban. 104 | 105 | ### 4. Permanent Ban 106 | 107 | **Community Impact**: Demonstrating a pattern of violation of community 108 | standards, including sustained inappropriate behavior, harassment of an 109 | individual, or aggression toward or disparagement of classes of individuals. 110 | 111 | **Consequence**: A permanent ban from any sort of public interaction within 112 | the community. 113 | 114 | ## Attribution 115 | 116 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 117 | version 2.0, available at 118 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 119 | 120 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 121 | enforcement ladder](https://github.com/mozilla/diversity). 122 | 123 | [homepage]: https://www.contributor-covenant.org 124 | 125 | For answers to common questions about this code of conduct, see the FAQ at 126 | https://www.contributor-covenant.org/faq. Translations are available at 127 | https://www.contributor-covenant.org/translations. 128 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributing to check_pve 3 | 4 | First off, thanks for taking the time to contribute! ❤️ 5 | 6 | All types of contributions are encouraged and valued. See the [Table of Contents](#table-of-contents) for different ways to help and details about how this project handles them. Please make sure to read the relevant section before making your contribution. 
It will make it a lot easier for us maintainers and smooth out the experience for all involved. The community looks forward to your contributions. 🎉 7 | 8 | > And if you like the project, but just don't have time to contribute, that's fine. There are other easy ways to support the project and show your appreciation, which we would also be very happy about: 9 | > - Star the project 10 | > - Tweet about it 11 | > - Refer to this project in your project's readme 12 | > - Mention the project at local meetups and tell your friends/colleagues 13 | 14 | 15 | ## Table of Contents 16 | 17 | - [I Have a Question](#i-have-a-question) 18 | - [I Want To Contribute](#i-want-to-contribute) 19 | - [Reporting Bugs](#reporting-bugs) 20 | - [Suggesting Enhancements](#suggesting-enhancements) 21 | - [Your First Code Contribution](#your-first-code-contribution) 22 | - [Improving The Documentation](#improving-the-documentation) 23 | - [Styleguides](#styleguides) 24 | - [Commit Messages](#commit-messages) 25 | - [Join The Project Team](#join-the-project-team) 26 | 27 | 28 | 29 | ## I Have a Question 30 | 31 | > If you want to ask a question, we assume that you have read the available [Documentation](README.md). 32 | 33 | Before you ask a question, it is best to search for existing [Issues](https://github.com/nbuchwitz/check_pve/issues) that might help you. In case you have found a suitable issue and still need clarification, you can write your question in this issue. It is also advisable to search the internet for answers first. 34 | 35 | If you then still feel the need to ask a question and need clarification, we recommend the following: 36 | 37 | - Open an [Issue](https://github.com/nbuchwitz/check_pve/issues/new). 38 | - Provide as much context as you can about what you're running into. 39 | - Provide project and platform versions (Python, OS, Icinga or other monitoring tool version etc.), depending on what seems relevant. 40 | 41 | We will then take care of the issue as soon as possible. 
42 | 43 | ## I Want To Contribute 44 | 45 | > ### Legal Notice 46 | > When contributing to this project, you must agree that you have authored 100% of the content, that you have the necessary rights to the content and that the content you contribute may be provided under the project license. 47 | 48 | ### Reporting Bugs 49 | 50 | 51 | #### Before Submitting a Bug Report 52 | 53 | A good bug report shouldn't leave others needing to chase you up for more information. Therefore, we ask you to investigate carefully, collect information and describe the issue in detail in your report. Please complete the following steps in advance to help us fix any potential bug as fast as possible. 54 | 55 | - Make sure that you are using the latest version. 56 | - Determine if your bug is really a bug and not an error on your side, e.g. using incompatible environment components/versions (make sure that you have read the [README](README.md)). 57 | - To see if other users have experienced (and potentially already solved) the same issue you are having, check if there is not already a bug report existing for your bug or error in the [bug tracker](https://github.com/nbuchwitz/check_pve/issues?q=label%3Abug). 58 | - Also make sure to search the internet to see if users outside of the GitHub community have discussed the issue. 59 | - Collect information about the bug: 60 | - Stack trace (Traceback) 61 | - OS, Platform and Version (*BSD, Linux, x86, ARM) 62 | - Version of the Python interpreter, icinga or monitoring tool version, depending on what seems relevant. 63 | - Possibly your input and the output 64 | - Can you reliably reproduce the issue? And can you also reproduce it with older versions? 65 | 66 | 67 | #### How Do I Submit a Good Bug Report? 68 | 69 | > You must never report security related issues, vulnerabilities or bugs including sensitive information to the issue tracker, or elsewhere in public. Instead, sensitive bugs must be sent by email to the project maintainers. 
70 | 71 | We use GitHub issues to track bugs and errors. If you run into an issue with the project: 72 | 73 | - Open an [Issue](https://github.com/nbuchwitz/check_pve/issues/new). (Since we can't be sure at this point whether it is a bug or not, we ask you not to talk about a bug yet and not to label the issue.) 74 | - Explain the behavior you would expect and the actual behavior. 75 | - Please provide as much context as possible and describe the *reproduction steps* that someone else can follow to recreate the issue on their own. This usually includes your code. For good bug reports you should isolate the problem and create a reduced test case. 76 | - Provide the information you collected in the previous section. 77 | 78 | Once it's filed, we will label the issue accordingly and try to reproduce the issue with your provided steps. If there are no reproduction steps or no obvious way to reproduce the issue, we will ask you for additional details. 79 | 80 | ### Suggesting Enhancements 81 | 82 | This section guides you through submitting an enhancement suggestion for check_pve, **including completely new features and minor improvements to existing functionality**. Following these guidelines will help maintainers and the community to understand your suggestion and find related suggestions. 83 | 84 | 85 | #### Before Submitting an Enhancement 86 | 87 | - Make sure that you are using the latest version. 88 | - Read the [documentation](README.md) carefully and find out if the functionality is already covered, maybe by an individual configuration. 89 | - Perform a [search](https://github.com/nbuchwitz/check_pve/issues) to see if the enhancement has already been suggested. If it has, add a comment to the existing issue instead of opening a new one. 90 | - Find out whether your idea fits with the scope and aims of the project. It's up to you to make a strong case to convince the project's developers of the merits of this feature. 
Keep in mind that we want features that will be useful to the majority of our users and not just a small subset. If you're just targeting a minority of users, consider writing an add-on/plugin library. 91 | 92 | 93 | #### How Do I Submit a Good Enhancement Suggestion? 94 | 95 | Enhancement suggestions are tracked as [GitHub issues](https://github.com/nbuchwitz/check_pve/issues). 96 | 97 | - Use a **clear and descriptive title** for the issue to identify the suggestion. 98 | - Provide a **step-by-step description of the suggested enhancement** in as many details as possible. 99 | - **Describe the current behavior** and **explain which behavior you expected to see instead** and why. At this point you can also tell which alternatives do not work for you. 100 | - **Explain why this enhancement would be useful** to most check_pve users. You may also want to point out the other projects that solved it better and which could serve as inspiration. 101 | 102 | ### Your First Code Contribution 103 | 1. Fork the repository and create a feature branch (e.g. `git checkout -b my-feature`) 104 | 2. Make the changes in the code base 105 | 3. Update README.md if needed 106 | 4. Commit the changes. Keep in mind to break functionality into logical chunks, represented by one commit each. Also don't forget about the [format of the commit message](#commit-messages) and the [Developer Certificate of Origin (DCO)](https://wiki.linuxfoundation.org/dco) 107 | 5. Push your feature branch to your fork and open a pull request 108 | 109 | ## Styleguides 110 | ### Commit Messages 111 | 112 | The project commit messages are usually written according to the [conventional commit](https://www.conventionalcommits.org/en/v1.0.0/) format. 113 | 114 | ### Code Style 115 | 116 | The Python source code files are formatted with [black](https://github.com/psf/black). 117 | 118 | 119 | ## Attribution 120 | This guide is based on the **contributing-gen**. 
[Make your own](https://github.com/bttger/contributing-gen)! 121 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # check_pve 2 | Icinga check command for Proxmox VE via API 3 | 4 | ![Linter](https://github.com/nbuchwitz/check_pve/actions/workflows/lint.yml/badge.svg) 5 | 6 | ## Setup 7 | 8 | ### Requirements 9 | 10 | This check command depends on **Python 3** and the following modules: 11 | * requests 12 | * argparse 13 | * packaging 14 | 15 | **Installation on Debian / Ubuntu** 16 | ``` 17 | apt install python3 python3-requests python3-packaging 18 | ``` 19 | 20 | **Installation on Rocky / Alma Linux 9** 21 | ``` 22 | yum install python3 python3-requests python3-packaging 23 | ``` 24 | 25 | **Installation on FreeBSD** 26 | ``` 27 | pkg install python3 py39-requests py39-packaging 28 | ``` 29 | 30 | **Installation from requirements file** 31 | ``` 32 | pip3 install -r requirements.txt 33 | ``` 34 | 35 | **Installation as Docker container** 36 | ``` 37 | docker build -t check_pve . 38 | ``` 39 | After this, you can start the container like so: 40 | ``` 41 | docker run -d --name check_pve --rm check_pve 42 | ``` 43 | The container will keep running without having the need for any of the requirements listed above (for environments that do not support this). 44 | Running a check is as simple as: 45 | ``` 46 | docker exec check_pve python check_pve.py ....rest of the default arguments listed below.... 
47 | ``` 48 | 49 | ### Create an API user in Proxmox VE 50 | 51 | Create a role named ``Monitoring`` and assign necessary privileges: 52 | 53 | ``` 54 | pveum roleadd Monitoring 55 | pveum rolemod Monitoring --privs VM.Monitor,Sys.Audit,Sys.Modify,Datastore.Audit,VM.Audit 56 | ``` 57 | 58 | Create a user named ``monitoring`` and set password: 59 | 60 | ``` 61 | pveum useradd monitoring@pve --comment "The ICINGA 2 monitoring user" 62 | ``` 63 | 64 | #### Use token based authorization (recommended) 65 | 66 | Create an API token named `monitoring` for the user `monitoring` with backend `pve`: 67 | 68 | ``` 69 | pveum user token add monitoring@pve monitoring 70 | ``` 71 | 72 | Please save the token secret as there isn't any way to fetch it at a later point. 73 | 74 | Assign role `Monitoring` to token `monitoring` and the user `monitoring@pve`: 75 | 76 | ``` 77 | pveum acl modify / --roles Monitoring --user 'monitoring@pve' 78 | pveum acl modify / --roles Monitoring --tokens 'monitoring@pve!monitoring' 79 | ``` 80 | 81 | You can now use the check command like this: `./check_pve.py -u monitoring@pve -t monitoring=abcdef12-3456-7890-abcd-deadbeef1234 ...` 82 | 83 | #### Use password based authorization 84 | 85 | Set password for the user `monitoring`: 86 | 87 | ``` 88 | pveum passwd monitoring@pve 89 | ``` 90 | 91 | Assign the ``Monitoring`` role to the user ``monitoring``: 92 | 93 | ``` 94 | pveum acl modify / --users monitoring@pve --roles Monitoring 95 | ``` 96 | 97 | For further information about the Proxmox VE privilege system have a look into the [documentation](https://pve.proxmox.com/pve-docs/pve-admin-guide.html#_strong_pveum_strong_proxmox_ve_user_manager). 98 | 99 | 100 | ## Usage 101 | 102 | The ``icinga2`` folder contains the command definition and service examples for use with Icinga2. 
103 | 104 | ``` 105 | usage: check_pve.py [-h] [--version] [-e API_ENDPOINT] [--api-port API_PORT] [-u API_USER] [-p API_PASSWORD | 106 | -P API_PASSWORD_FILE | -t API_TOKEN | -T API_TOKEN_FILE] [-k] 107 | [-m {cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup,snapshot-age}] 108 | [-n NODE] [--name NAME] [--vmid VMID] [--expected-vm-status {running,stopped,paused}] 109 | [--ignore-vmid VMID] [--ignore-vm-status] [--ignore-service NAME] [--ignore-disk NAME] 110 | [--ignore-pools NAME] [-w THRESHOLD_WARNING] [-c THRESHOLD_CRITICAL] [-M] [-V MIN_VERSION] 111 | [--unit {GB,MB,KB,GiB,MiB,KiB,B}] 112 | 113 | Check command for PVE hosts via API 114 | 115 | options: 116 | -h, --help show this help message and exit 117 | --version Show version of check command 118 | 119 | API Options: 120 | -e, -H, --api-endpoint API_ENDPOINT 121 | PVE api endpoint hostname or ip address (no additional data like paths) 122 | --api-port API_PORT PVE api endpoint port 123 | -u, --username API_USER 124 | PVE api user (e.g. icinga2@pve or icinga2@pam, depending on which backend you have chosen 125 | in proxmox) 126 | -p, --password API_PASSWORD 127 | PVE API user password 128 | -P, --password-file API_PASSWORD_FILE 129 | PVE API user password in a file 130 | -t, --api-token API_TOKEN 131 | PVE API token (format: TOKEN_ID=TOKEN_SECRET) 132 | -T, --api-token-file API_TOKEN_FILE 133 | PVE API token contained in a file (format: TOKEN_ID=TOKEN_SECRET) 134 | -k, --insecure Don't verify HTTPS certificate 135 | 136 | Check Options: 137 | -m, --mode {cluster,version,cpu,memory,swap,storage,io_wait,io-wait,updates,services,subscription,vm,vm_status,vm-status,replication,disk-health,ceph-health,zfs-health,zfs-fragmentation,backup,snapshot-age} 138 | Mode to use. 
139 | -n, --node NODE Node to check (necessary for all modes except cluster, version and backup) 140 | --name NAME Name of storage, vm, or container 141 | --vmid VMID ID of virtual machine or container 142 | --expected-vm-status {running,stopped,paused} 143 | Expected VM status 144 | --ignore-vmid VMID Ignore VM with vmid in checks 145 | --ignore-vm-status Ignore VM status in checks 146 | --ignore-service NAME 147 | Ignore service NAME in checks 148 | --ignore-disk NAME Ignore disk NAME in health check 149 | --ignore-pools NAME Ignore vms and containers in pool(s) NAME in checks 150 | -w, --warning THRESHOLD_WARNING 151 | Warning threshold for check value. Mutiple thresholds with name:value,name:value 152 | -c, --critical THRESHOLD_CRITICAL 153 | Critical threshold for check value. Mutiple thresholds with name:value,name:value 154 | -M Values are shown in the unit which is set with --unit (if available). Thresholds are also 155 | treated in this unit 156 | -V, --min-version MIN_VERSION 157 | The minimal pve version to check for. Any version lower than this will return CRITICAL. 
158 | --unit {GB,MB,KB,GiB,MiB,KiB,B} 159 | Unit which is used for performance data and other values 160 | ``` 161 | 162 | ## Check examples 163 | 164 | 165 | **Check cluster health** 166 | ``` 167 | ./check_pve.py -u <API_USER> -t <API_TOKEN> -e <API_ENDPOINT> -m cluster 168 | OK - Cluster 'proxmox1' is healthy' 169 | ``` 170 | 171 | **Check PVE version** 172 | ``` 173 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m version -V 5.0.0 174 | OK - Your pve instance version '5.2' (0fcd7879) is up to date 175 | ``` 176 | 177 | **Check CPU load** 178 | ``` 179 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m cpu -n node1 180 | OK - CPU usage is 2.4%|usage=2.4%;; 181 | ``` 182 | 183 | **Check memory usage** 184 | ``` 185 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m memory -n node1 186 | OK - Memory usage is 37.44%|usage=37.44%;; used=96544.72MB;;;257867.91 187 | ``` 188 | 189 | **Check disk-health** 190 | ``` 191 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m disk-health -n node1 192 | OK - All disks are healthy|wearout_sdb=96%;; wearout_sdc=96%;; wearout_sdd=96%;; wearout_sde=96%;; 193 | ``` 194 | 195 | **Check storage usage** 196 | ``` 197 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m storage -n node1 --name local 198 | OK - Storage usage is 54.23%|usage=54.23%;; used=128513.11MB;;;236980.36 199 | 200 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m storage -n node1 --name vms-disx 201 | CRITICAL - Storage 'vms-disx' doesn't exist on node 'node1' 202 | ``` 203 | 204 | **Check subscription status** 205 | ``` 206 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m subscription -n node1 -w 50 -c 10 207 | OK - Subscription of level 'Community' is valid until 2019-01-09 208 | ``` 209 | 210 | **Check VM status** 211 | 212 | Without specifying a node name: 213 | ``` 214 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-vm 215 | OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;; memory=8.33%;; 216 | ``` 217 | 218 | You can also pass a container name for the VM check: 219 | ``` 220 | ./check_pve.py -u <API_USER> -p <API_PASSWORD> -e <API_ENDPOINT> -m vm --name test-lxc 221 | OK - LXC 'test-lxc' on node 'node1' is running|cpu=0.11%;; memory=13.99%;; 222 | ``` 223 | 224 | With memory 
thresholds: 225 | ``` 226 | ./check_pve.py -u -p -e -m vm --name test-vm -w 50 -c 80 227 | OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;; memory=40.33%;50.0;80.0 228 | ``` 229 | 230 | With a specified node name, the check plugin verifies on which node the VM runs. 231 | ``` 232 | ./check_pve.py -u -p -e -m vm -n node1 --name test-vm 233 | OK - VM 'test-vm' is running on node 'node1'|cpu=1.85%;; memory=8.33%;; 234 | 235 | ./check_pve.py -u -p -e -m vm -n node1 --name test-vm 236 | WARNING - VM 'test-vm' is running on node 'node2' instead of 'node1'|cpu=1.85%;; memory=8.33%;; 237 | ``` 238 | 239 | If you only want to gather metrics and don't care about the vm status add the ``--ignore-vm-status`` flag: 240 | ``` 241 | ./check_pve.py -u -p -e -m vm --name test-vm --ignore-vm-status 242 | OK - VM 'test-vm' is not running 243 | ``` 244 | 245 | Specify the expected VM status: 246 | ``` 247 | ./check_pve.py -u -p -e -m vm --name test-vm --expected-vm-status stopped 248 | OK - VM 'test-vm' is not running 249 | 250 | ``` 251 | 252 | For hostalive checks without gathering performance data use ``vm_status`` instead of ``vm``. The parameters are the same as with ``vm``. 
253 | 254 | **Check swap usage** 255 | ``` 256 | ./check_pve.py -u -p -e -m swap -n pve 257 | OK - Swap usage is 0.0 %|usage=0.0%;; used=0.0MB;;;8192.0 258 | ``` 259 | 260 | **Check storage replication status** 261 | ``` 262 | ./check_pve.py -u -p -e -m replication -n node1 263 | OK - No failed replication jobs on node1 264 | ``` 265 | 266 | **Check ceph cluster health** 267 | ``` 268 | ./check_pve.py -u -p -e -m ceph-health 269 | WARNING - Ceph Cluster is in warning state 270 | ``` 271 | 272 | **Check ZFS pool health** 273 | ``` 274 | ./check_pve.py -u -p -e -m zfs-health -n pve 275 | OK - All ZFS pools are healthy 276 | ``` 277 | 278 | Check for specific pool: 279 | ``` 280 | ./check_pve.py -u -p -e -m zfs-health -n pve --name rpool 281 | OK - ZFS pool 'rpool' is healthy 282 | ``` 283 | 284 | **Check ZFS pool fragmentation** 285 | ``` 286 | ./check_pve.py -u -p -e -m zfs-fragmentation -n pve -w 40 -c 60 287 | CRITICAL - 2 of 2 ZFS pools are above fragmentation thresholds: 288 | 289 | - rpool (71 %) is CRITICAL 290 | - diskpool (50 %) is WARNING 291 | |fragmentation_diskpool=50%;40.0;60.0 fragmentation_rpool=71%;40.0;60.0 292 | 293 | ``` 294 | 295 | Check for specific pool: 296 | ``` 297 | ./check_pve.py -u -p -e -m zfs-fragmentation -n pve --name diskpool -w 40 -c 60 298 | WARNING - Fragmentation of ZFS pool 'diskpool' is above thresholds: 50 %|fragmentation=50%;40.0;60.0 299 | ``` 300 | 301 | **Check VZDump Backups** 302 | 303 | Check task history on all nodes: 304 | 305 | ``` 306 | ./check_pve.py -u -p -e -m backup 307 | CRITICAL - 8 backup tasks successful, 3 backup tasks failed 308 | ``` 309 | 310 | Check for specific node and time frame: 311 | 312 | ``` 313 | ./check_pve.py -u -p -e -m backup -n pve -c 86400 314 | OK - 2 backup tasks successful, 0 backup tasks failed within the last 86400.0s 315 | ``` 316 | 317 | Ignore a VM by its id in the backup check: 318 | ``` 319 | ./check_pve.py -u -p -e -m backup --ignore-vmid 123 320 | ``` 321 | 322 | **Check 
snapshots age** 323 | Check age of snapshots on all nodes (thresholds are specified in seconds): 324 | ``` 325 | ./check_pve.py -u -p -e -m snapshot-age -w 43200 -c 86400 326 | ``` 327 | You can filter by a specific node: 328 | ``` 329 | ./check_pve.py -u -p -e -m snapshot-age -n pve -w 43200 -c 86400 330 | ``` 331 | Or by VM/Container: 332 | ``` 333 | ./check_pve.py -u -p -e -m snapshot-age --name test-vm -w 43200 -c 86400 334 | ``` 335 | Or both: 336 | ``` 337 | ./check_pve.py -u -p -e -m snapshot-age -n pve --name test-vm -w 43200 -c 86400 338 | ``` 339 | You can also filter by VM/Container id: 340 | ``` 341 | ./check_pve.py -u -p -e -m snapshot-age -n pve --vmid 123 -w 43200 -c 86400 342 | ``` 343 | 344 | ## FAQ 345 | 346 | ### Individual thresholds per metric 347 | 348 | You can either specify a threshold for warning or critical which is applied to all metrics or define individual thresholds like this (`name:value,name:value,...`): 349 | 350 | ``` 351 | ./check_pve.py -u -p -e -m vm --name test-vm -w memory:50 -c cpu:50,memory:80 352 | OK - VM 'test-vm' is running on 'node1'|cpu=1.85%;50.0; memory=40.33%;50.0;80.0 353 | ``` 354 | 355 | ### Could not connect to PVE API: Failed to resolve hostname 356 | 357 | Verify that your DNS server is working and can resolve your hostname. If everything is fine, check for proxy server environment variables (HTTP_PROXY, HTTPS_PROXY), which may not allow communication to port 8006. 358 | 359 | ## Contributors 360 | 361 | Thank you to everyone who is contributing to `check_pve`: https://github.com/nbuchwitz/check_pve/graphs/contributors. 
362 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 
32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 
97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. 
Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. 
Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. 
For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 
279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 
319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 
340 | -------------------------------------------------------------------------------- /grafana/pve-metrics-dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "hideControls": false, 19 | "id": 11, 20 | "links": [], 21 | "refresh": "30s", 22 | "rows": [ 23 | { 24 | "collapse": false, 25 | "height": "250px", 26 | "panels": [ 27 | { 28 | "aliasColors": {}, 29 | "bars": false, 30 | "dashLength": 10, 31 | "dashes": false, 32 | "datasource": "icinga2", 33 | "fill": 1, 34 | "id": 1, 35 | "legend": { 36 | "alignAsTable": true, 37 | "avg": true, 38 | "current": true, 39 | "hideEmpty": false, 40 | "hideZero": false, 41 | "max": true, 42 | "min": true, 43 | "rightSide": false, 44 | "show": true, 45 | "total": false, 46 | "values": true 47 | }, 48 | "lines": true, 49 | "linewidth": 1, 50 | "links": [], 51 | "nullPointMode": "null", 52 | "percentage": false, 53 | "pointradius": 5, 54 | "points": false, 55 | "renderer": "flot", 56 | "seriesOverrides": [ 57 | { 58 | "alias": "CRITICAL", 59 | "color": "#BF1B00", 60 | "fill": 0, 61 | "legend": false 62 | }, 63 | { 64 | "alias": "WARNING", 65 | "color": "#EAB839", 66 | "fill": 0, 67 | "legend": false 68 | }, 69 | { 70 | "alias": "memory used", 71 | "color": "#0A437C", 72 | "yaxis": 2 73 | }, 74 | { 75 | "alias": "memory used", 76 | "fill": 0 77 | } 78 | ], 79 | "spaceLength": 10, 80 | "span": 4, 81 | "stack": false, 82 | "steppedLine": false, 83 | "targets": [ 84 | { 85 | "alias": "$service usage", 86 | "dsType": "influxdb", 87 | "groupBy": [ 88 | { 89 | "params": [ 90 | "$__interval" 91 | ], 92 | "type": "time" 93 | }, 94 | { 95 | "params": [ 96 | "metric" 
97 | ], 98 | "type": "tag" 99 | }, 100 | { 101 | "params": [ 102 | "none" 103 | ], 104 | "type": "fill" 105 | } 106 | ], 107 | "hide": false, 108 | "measurement": "pve", 109 | "orderByTime": "ASC", 110 | "policy": "default", 111 | "refId": "A", 112 | "resultFormat": "time_series", 113 | "select": [ 114 | [ 115 | { 116 | "params": [ 117 | "value" 118 | ], 119 | "type": "field" 120 | }, 121 | { 122 | "params": [], 123 | "type": "mean" 124 | } 125 | ] 126 | ], 127 | "tags": [ 128 | { 129 | "key": "hostname", 130 | "operator": "=~", 131 | "value": "/^$hostname$/" 132 | }, 133 | { 134 | "condition": "AND", 135 | "key": "service", 136 | "operator": "=~", 137 | "value": "/^$service$/" 138 | }, 139 | { 140 | "condition": "AND", 141 | "key": "metric", 142 | "operator": "=", 143 | "value": "usage" 144 | } 145 | ] 146 | }, 147 | { 148 | "alias": "WARNING", 149 | "dsType": "influxdb", 150 | "groupBy": [ 151 | { 152 | "params": [ 153 | "$__interval" 154 | ], 155 | "type": "time" 156 | }, 157 | { 158 | "params": [ 159 | "metric" 160 | ], 161 | "type": "tag" 162 | }, 163 | { 164 | "params": [ 165 | "none" 166 | ], 167 | "type": "fill" 168 | } 169 | ], 170 | "hide": false, 171 | "measurement": "pve", 172 | "orderByTime": "ASC", 173 | "policy": "default", 174 | "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", 175 | "rawQuery": false, 176 | "refId": "C", 177 | "resultFormat": "time_series", 178 | "select": [ 179 | [ 180 | { 181 | "params": [ 182 | "warn" 183 | ], 184 | "type": "field" 185 | }, 186 | { 187 | "params": [], 188 | "type": "mean" 189 | } 190 | ] 191 | ], 192 | "tags": [ 193 | { 194 | "key": "hostname", 195 | "operator": "=~", 196 | "value": "/^$hostname$/" 197 | }, 198 | { 199 | "condition": "AND", 200 | "key": "service", 201 | "operator": "=~", 202 | "value": "/^$service$/" 203 | }, 204 | { 205 | "condition": "AND", 206 | 
"key": "metric", 207 | "operator": "=", 208 | "value": "usage" 209 | } 210 | ] 211 | }, 212 | { 213 | "alias": "CRITICAL", 214 | "dsType": "influxdb", 215 | "groupBy": [ 216 | { 217 | "params": [ 218 | "$__interval" 219 | ], 220 | "type": "time" 221 | }, 222 | { 223 | "params": [ 224 | "metric" 225 | ], 226 | "type": "tag" 227 | }, 228 | { 229 | "params": [ 230 | "none" 231 | ], 232 | "type": "fill" 233 | } 234 | ], 235 | "hide": false, 236 | "measurement": "pve", 237 | "orderByTime": "ASC", 238 | "policy": "default", 239 | "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", 240 | "rawQuery": false, 241 | "refId": "B", 242 | "resultFormat": "time_series", 243 | "select": [ 244 | [ 245 | { 246 | "params": [ 247 | "crit" 248 | ], 249 | "type": "field" 250 | }, 251 | { 252 | "params": [], 253 | "type": "mean" 254 | } 255 | ] 256 | ], 257 | "tags": [ 258 | { 259 | "key": "hostname", 260 | "operator": "=~", 261 | "value": "/^$hostname$/" 262 | }, 263 | { 264 | "condition": "AND", 265 | "key": "service", 266 | "operator": "=~", 267 | "value": "/^$service$/" 268 | }, 269 | { 270 | "condition": "AND", 271 | "key": "metric", 272 | "operator": "=", 273 | "value": "usage" 274 | } 275 | ] 276 | } 277 | ], 278 | "thresholds": [], 279 | "timeFrom": null, 280 | "timeShift": null, 281 | "title": "$service usage", 282 | "tooltip": { 283 | "shared": true, 284 | "sort": 0, 285 | "value_type": "individual" 286 | }, 287 | "type": "graph", 288 | "xaxis": { 289 | "buckets": null, 290 | "mode": "time", 291 | "name": null, 292 | "show": true, 293 | "values": [] 294 | }, 295 | "yaxes": [ 296 | { 297 | "format": "percent", 298 | "label": "% usage", 299 | "logBase": 1, 300 | "max": null, 301 | "min": "0", 302 | "show": true 303 | }, 304 | { 305 | "format": "bytes", 306 | "label": "used MB", 307 | "logBase": 1, 308 | "max": null, 309 | "min": "0", 310 
| "show": false 311 | } 312 | ] 313 | }, 314 | { 315 | "aliasColors": {}, 316 | "bars": false, 317 | "dashLength": 10, 318 | "dashes": false, 319 | "datasource": "icinga2", 320 | "fill": 1, 321 | "id": 2, 322 | "legend": { 323 | "alignAsTable": true, 324 | "avg": true, 325 | "current": true, 326 | "hideEmpty": false, 327 | "hideZero": false, 328 | "max": true, 329 | "min": true, 330 | "rightSide": false, 331 | "show": true, 332 | "total": false, 333 | "values": true 334 | }, 335 | "lines": true, 336 | "linewidth": 1, 337 | "links": [], 338 | "nullPointMode": "null", 339 | "percentage": false, 340 | "pointradius": 5, 341 | "points": false, 342 | "renderer": "flot", 343 | "seriesOverrides": [ 344 | { 345 | "alias": "CRITICAL", 346 | "color": "#BF1B00", 347 | "fill": 0, 348 | "legend": false 349 | }, 350 | { 351 | "alias": "WARNING", 352 | "color": "#EAB839", 353 | "fill": 0, 354 | "legend": false 355 | } 356 | ], 357 | "spaceLength": 10, 358 | "span": 4, 359 | "stack": false, 360 | "steppedLine": false, 361 | "targets": [ 362 | { 363 | "alias": "$service used", 364 | "dsType": "influxdb", 365 | "groupBy": [ 366 | { 367 | "params": [ 368 | "$__interval" 369 | ], 370 | "type": "time" 371 | }, 372 | { 373 | "params": [ 374 | "metric" 375 | ], 376 | "type": "tag" 377 | }, 378 | { 379 | "params": [ 380 | "none" 381 | ], 382 | "type": "fill" 383 | } 384 | ], 385 | "hide": false, 386 | "measurement": "pve", 387 | "orderByTime": "ASC", 388 | "policy": "default", 389 | "refId": "A", 390 | "resultFormat": "time_series", 391 | "select": [ 392 | [ 393 | { 394 | "params": [ 395 | "value" 396 | ], 397 | "type": "field" 398 | }, 399 | { 400 | "params": [], 401 | "type": "mean" 402 | } 403 | ] 404 | ], 405 | "tags": [ 406 | { 407 | "key": "hostname", 408 | "operator": "=~", 409 | "value": "/^$hostname$/" 410 | }, 411 | { 412 | "condition": "AND", 413 | "key": "service", 414 | "operator": "=~", 415 | "value": "/^$service$/" 416 | }, 417 | { 418 | "condition": "AND", 419 | "key": 
"metric", 420 | "operator": "=", 421 | "value": "used" 422 | } 423 | ] 424 | }, 425 | { 426 | "alias": "WARNING", 427 | "dsType": "influxdb", 428 | "groupBy": [ 429 | { 430 | "params": [ 431 | "$__interval" 432 | ], 433 | "type": "time" 434 | }, 435 | { 436 | "params": [ 437 | "metric" 438 | ], 439 | "type": "tag" 440 | }, 441 | { 442 | "params": [ 443 | "none" 444 | ], 445 | "type": "fill" 446 | } 447 | ], 448 | "hide": false, 449 | "measurement": "pve", 450 | "orderByTime": "ASC", 451 | "policy": "default", 452 | "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", 453 | "rawQuery": false, 454 | "refId": "C", 455 | "resultFormat": "time_series", 456 | "select": [ 457 | [ 458 | { 459 | "params": [ 460 | "warn" 461 | ], 462 | "type": "field" 463 | }, 464 | { 465 | "params": [], 466 | "type": "mean" 467 | } 468 | ] 469 | ], 470 | "tags": [ 471 | { 472 | "key": "hostname", 473 | "operator": "=~", 474 | "value": "/^$hostname$/" 475 | }, 476 | { 477 | "condition": "AND", 478 | "key": "service", 479 | "operator": "=~", 480 | "value": "/^$service$/" 481 | }, 482 | { 483 | "condition": "AND", 484 | "key": "metric", 485 | "operator": "=", 486 | "value": "used" 487 | } 488 | ] 489 | }, 490 | { 491 | "alias": "CRITICAL", 492 | "dsType": "influxdb", 493 | "groupBy": [ 494 | { 495 | "params": [ 496 | "$__interval" 497 | ], 498 | "type": "time" 499 | }, 500 | { 501 | "params": [ 502 | "metric" 503 | ], 504 | "type": "tag" 505 | }, 506 | { 507 | "params": [ 508 | "none" 509 | ], 510 | "type": "fill" 511 | } 512 | ], 513 | "hide": false, 514 | "measurement": "pve", 515 | "orderByTime": "ASC", 516 | "policy": "default", 517 | "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", 518 | "rawQuery": 
false, 519 | "refId": "B", 520 | "resultFormat": "time_series", 521 | "select": [ 522 | [ 523 | { 524 | "params": [ 525 | "crit" 526 | ], 527 | "type": "field" 528 | }, 529 | { 530 | "params": [], 531 | "type": "mean" 532 | } 533 | ] 534 | ], 535 | "tags": [ 536 | { 537 | "key": "hostname", 538 | "operator": "=~", 539 | "value": "/^$hostname$/" 540 | }, 541 | { 542 | "condition": "AND", 543 | "key": "service", 544 | "operator": "=~", 545 | "value": "/^$service$/" 546 | }, 547 | { 548 | "condition": "AND", 549 | "key": "metric", 550 | "operator": "=", 551 | "value": "used" 552 | } 553 | ] 554 | } 555 | ], 556 | "thresholds": [], 557 | "timeFrom": null, 558 | "timeShift": null, 559 | "title": "$service used", 560 | "tooltip": { 561 | "shared": true, 562 | "sort": 0, 563 | "value_type": "individual" 564 | }, 565 | "type": "graph", 566 | "xaxis": { 567 | "buckets": null, 568 | "mode": "time", 569 | "name": null, 570 | "show": true, 571 | "values": [] 572 | }, 573 | "yaxes": [ 574 | { 575 | "format": "bytes", 576 | "label": "used", 577 | "logBase": 1, 578 | "max": null, 579 | "min": "0", 580 | "show": true 581 | }, 582 | { 583 | "format": "bytes", 584 | "label": "used MB", 585 | "logBase": 1, 586 | "max": null, 587 | "min": "0", 588 | "show": false 589 | } 590 | ] 591 | }, 592 | { 593 | "aliasColors": {}, 594 | "bars": false, 595 | "dashLength": 10, 596 | "dashes": false, 597 | "datasource": "icinga2", 598 | "fill": 1, 599 | "id": 3, 600 | "legend": { 601 | "alignAsTable": true, 602 | "avg": true, 603 | "current": true, 604 | "hideEmpty": false, 605 | "hideZero": false, 606 | "max": true, 607 | "min": true, 608 | "rightSide": false, 609 | "show": true, 610 | "total": false, 611 | "values": true 612 | }, 613 | "lines": true, 614 | "linewidth": 1, 615 | "links": [], 616 | "nullPointMode": "null", 617 | "percentage": false, 618 | "pointradius": 5, 619 | "points": false, 620 | "renderer": "flot", 621 | "seriesOverrides": [ 622 | { 623 | "alias": "CRITICAL", 624 | "color": 
"#BF1B00", 625 | "fill": 0, 626 | "legend": false 627 | }, 628 | { 629 | "alias": "WARNING", 630 | "color": "#EAB839", 631 | "fill": 0, 632 | "legend": false 633 | }, 634 | { 635 | "alias": "memory used", 636 | "color": "#0A437C", 637 | "yaxis": 2 638 | }, 639 | { 640 | "alias": "memory used", 641 | "fill": 0 642 | } 643 | ], 644 | "spaceLength": 10, 645 | "span": 4, 646 | "stack": false, 647 | "steppedLine": false, 648 | "targets": [ 649 | { 650 | "alias": "I/O wait", 651 | "dsType": "influxdb", 652 | "groupBy": [ 653 | { 654 | "params": [ 655 | "$__interval" 656 | ], 657 | "type": "time" 658 | }, 659 | { 660 | "params": [ 661 | "metric" 662 | ], 663 | "type": "tag" 664 | }, 665 | { 666 | "params": [ 667 | "none" 668 | ], 669 | "type": "fill" 670 | } 671 | ], 672 | "hide": false, 673 | "measurement": "pve", 674 | "orderByTime": "ASC", 675 | "policy": "default", 676 | "refId": "A", 677 | "resultFormat": "time_series", 678 | "select": [ 679 | [ 680 | { 681 | "params": [ 682 | "value" 683 | ], 684 | "type": "field" 685 | }, 686 | { 687 | "params": [], 688 | "type": "mean" 689 | } 690 | ] 691 | ], 692 | "tags": [ 693 | { 694 | "key": "hostname", 695 | "operator": "=~", 696 | "value": "/^$hostname$/" 697 | }, 698 | { 699 | "condition": "AND", 700 | "key": "service", 701 | "operator": "=~", 702 | "value": "/^$service$/" 703 | }, 704 | { 705 | "condition": "AND", 706 | "key": "metric", 707 | "operator": "=", 708 | "value": "wait" 709 | } 710 | ] 711 | }, 712 | { 713 | "alias": "WARNING", 714 | "dsType": "influxdb", 715 | "groupBy": [ 716 | { 717 | "params": [ 718 | "$__interval" 719 | ], 720 | "type": "time" 721 | }, 722 | { 723 | "params": [ 724 | "metric" 725 | ], 726 | "type": "tag" 727 | }, 728 | { 729 | "params": [ 730 | "none" 731 | ], 732 | "type": "fill" 733 | } 734 | ], 735 | "hide": false, 736 | "measurement": "pve", 737 | "orderByTime": "ASC", 738 | "policy": "default", 739 | "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ 
AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", 740 | "rawQuery": false, 741 | "refId": "C", 742 | "resultFormat": "time_series", 743 | "select": [ 744 | [ 745 | { 746 | "params": [ 747 | "warn" 748 | ], 749 | "type": "field" 750 | }, 751 | { 752 | "params": [], 753 | "type": "mean" 754 | } 755 | ] 756 | ], 757 | "tags": [ 758 | { 759 | "key": "hostname", 760 | "operator": "=~", 761 | "value": "/^$hostname$/" 762 | }, 763 | { 764 | "condition": "AND", 765 | "key": "service", 766 | "operator": "=~", 767 | "value": "/^$service$/" 768 | }, 769 | { 770 | "condition": "AND", 771 | "key": "metric", 772 | "operator": "=", 773 | "value": "wait" 774 | } 775 | ] 776 | }, 777 | { 778 | "alias": "CRITICAL", 779 | "dsType": "influxdb", 780 | "groupBy": [ 781 | { 782 | "params": [ 783 | "$__interval" 784 | ], 785 | "type": "time" 786 | }, 787 | { 788 | "params": [ 789 | "metric" 790 | ], 791 | "type": "tag" 792 | }, 793 | { 794 | "params": [ 795 | "none" 796 | ], 797 | "type": "fill" 798 | } 799 | ], 800 | "hide": false, 801 | "measurement": "pve", 802 | "orderByTime": "ASC", 803 | "policy": "default", 804 | "query": "SELECT mean(\"value\") FROM \"pve\" WHERE (\"hostname\" =~ /^$hostname$/ AND \"service\" =~ /^$service$/ AND \"metric\" = 'used') AND $timeFilter GROUP BY time($__interval) fill(none)", 805 | "rawQuery": false, 806 | "refId": "B", 807 | "resultFormat": "time_series", 808 | "select": [ 809 | [ 810 | { 811 | "params": [ 812 | "crit" 813 | ], 814 | "type": "field" 815 | }, 816 | { 817 | "params": [], 818 | "type": "mean" 819 | } 820 | ] 821 | ], 822 | "tags": [ 823 | { 824 | "key": "hostname", 825 | "operator": "=~", 826 | "value": "/^$hostname$/" 827 | }, 828 | { 829 | "condition": "AND", 830 | "key": "service", 831 | "operator": "=~", 832 | "value": "/^$service$/" 833 | }, 834 | { 835 | "condition": "AND", 836 | "key": "metric", 837 | "operator": "=", 838 | "value": "wait" 839 | } 840 | ] 841 | } 842 | 
], 843 | "thresholds": [], 844 | "timeFrom": null, 845 | "timeShift": null, 846 | "title": "I/O wait", 847 | "tooltip": { 848 | "shared": true, 849 | "sort": 0, 850 | "value_type": "individual" 851 | }, 852 | "type": "graph", 853 | "xaxis": { 854 | "buckets": null, 855 | "mode": "time", 856 | "name": null, 857 | "show": true, 858 | "values": [] 859 | }, 860 | "yaxes": [ 861 | { 862 | "format": "percent", 863 | "label": "% usage", 864 | "logBase": 1, 865 | "max": null, 866 | "min": "0", 867 | "show": true 868 | }, 869 | { 870 | "format": "bytes", 871 | "label": "used MB", 872 | "logBase": 1, 873 | "max": null, 874 | "min": "0", 875 | "show": false 876 | } 877 | ] 878 | } 879 | ], 880 | "repeat": null, 881 | "repeatIteration": null, 882 | "repeatRowId": null, 883 | "showTitle": false, 884 | "title": "icmp checks", 885 | "titleSize": "h6" 886 | } 887 | ], 888 | "schemaVersion": 14, 889 | "style": "dark", 890 | "tags": [], 891 | "templating": { 892 | "list": [ 893 | { 894 | "allValue": null, 895 | "current": { 896 | "text": "pve01.willi-graf.local", 897 | "value": "pve01.willi-graf.local" 898 | }, 899 | "datasource": "icinga2", 900 | "hide": 0, 901 | "includeAll": false, 902 | "label": null, 903 | "multi": false, 904 | "name": "hostname", 905 | "options": [], 906 | "query": "SHOW TAG VALUES WITH KEY = \"hostname\"", 907 | "refresh": 1, 908 | "regex": "", 909 | "sort": 1, 910 | "tagValuesQuery": "", 911 | "tags": [], 912 | "tagsQuery": "", 913 | "type": "query", 914 | "useTags": false 915 | }, 916 | { 917 | "allValue": null, 918 | "current": { 919 | "text": "io_wait", 920 | "value": "io_wait" 921 | }, 922 | "datasource": "icinga2", 923 | "hide": 0, 924 | "includeAll": false, 925 | "label": null, 926 | "multi": false, 927 | "name": "service", 928 | "options": [], 929 | "query": "SHOW TAG VALUES WITH KEY = \"service\" where hostname =~ /^$hostname$/", 930 | "refresh": 1, 931 | "regex": "", 932 | "sort": 1, 933 | "tagValuesQuery": "", 934 | "tags": [], 935 | "tagsQuery": 
"", 936 | "type": "query", 937 | "useTags": false 938 | } 939 | ] 940 | }, 941 | "time": { 942 | "from": "now-2m", 943 | "to": "now" 944 | }, 945 | "timepicker": { 946 | "refresh_intervals": [ 947 | "5s", 948 | "10s", 949 | "30s", 950 | "1m", 951 | "5m", 952 | "15m", 953 | "30m", 954 | "1h", 955 | "2h", 956 | "1d" 957 | ], 958 | "time_options": [ 959 | "5m", 960 | "15m", 961 | "1h", 962 | "6h", 963 | "12h", 964 | "24h", 965 | "2d", 966 | "7d", 967 | "30d" 968 | ] 969 | }, 970 | "timezone": "browser", 971 | "title": "icinga-pve-metrics", 972 | "version": 23 973 | } 974 | -------------------------------------------------------------------------------- /check_pve.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # ------------------------------------------------------------------------------ 5 | # check_pve.py - A check plugin for Proxmox Virtual Environment (PVE). 6 | # Copyright (C) 2018-2025 Nicolai Buchwitz 7 | # 8 | # Version: 1.5.0 9 | # 10 | # ------------------------------------------------------------------------------ 11 | # This program is free software; you can redistribute it and/or 12 | # modify it under the terms of the GNU General Public License 13 | # as published by the Free Software Foundation; either version 2 14 | # of the License, or (at your option) any later version. 15 | # 16 | # This program is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with this program; if not, write to the Free Software 23 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
def compare_thresholds(
    threshold_warning: Dict, threshold_critical: Dict, comparator: Callable
) -> bool:
    """Perform sanity checks on thresholds parameters (used for argparse validation).

    Both arguments are threshold dicts as produced by ``CheckThreshold.threshold_type``:
    the key ``None`` holds the unnamed default threshold, every other key is a metric name.

    For each key present in either dict the warning value is compared with the critical
    value of the same key; if only one side defines the key, the other side's default
    (``None``) entry is used instead. Keys without any counterpart are skipped.

    Returns True when ``comparator(warning, critical)`` holds for every comparable pair.
    """
    for key in set(threshold_warning) | set(threshold_critical):
        if key in threshold_warning and key in threshold_critical:
            warning, critical = threshold_warning[key], threshold_critical[key]
        elif key in threshold_warning and None in threshold_critical:
            warning, critical = threshold_warning[key], threshold_critical[None]
        elif key in threshold_critical and None in threshold_warning:
            warning, critical = threshold_warning[None], threshold_critical[key]
        else:
            # Nothing to compare this key against.
            continue

        if not comparator(warning, critical):
            return False

    return True


class CheckState(Enum):
    """Check return values."""

    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3


class CheckThreshold:
    """Threshold representation used by the check command."""

    def __init__(self, value: float) -> None:
        self.value = value

    def __eq__(self, other: object) -> bool:
        """Threshold is equal to given one."""
        if not isinstance(other, CheckThreshold):
            return NotImplemented
        return self.value == other.value

    def __lt__(self, other: "CheckThreshold") -> bool:
        """Threshold is lower to given one."""
        if not isinstance(other, CheckThreshold):
            return NotImplemented
        return self.value < other.value

    def __le__(self, other: "CheckThreshold") -> bool:
        """Threshold is lower or equal to given one."""
        if not isinstance(other, CheckThreshold):
            return NotImplemented
        return self.value <= other.value

    def __gt__(self, other: "CheckThreshold") -> bool:
        """Threshold is greater than given one."""
        if not isinstance(other, CheckThreshold):
            return NotImplemented
        return self.value > other.value

    def __ge__(self, other: "CheckThreshold") -> bool:
        """Threshold is greater or equal than given one."""
        if not isinstance(other, CheckThreshold):
            return NotImplemented
        return self.value >= other.value

    def check(self, value: float, lower: bool = False) -> bool:
        """Check threshold value as upper or lower boundary for given value.

        Returns True when ``value`` violates the threshold: strictly above it by
        default, strictly below it when ``lower`` is set.
        """
        if lower:
            return value < self.value

        return value > self.value

    @staticmethod
    def threshold_type(arg: str) -> Dict[Optional[str], "CheckThreshold"]:
        """Convert string argument(s) to threshold dict.

        A plain number yields ``{None: threshold}``. Otherwise the argument is read as a
        comma separated list of ``name:value`` pairs and yields ``{name: threshold}``.

        Raises argparse.ArgumentTypeError for entries that cannot be parsed.
        """
        thresholds: Dict[Optional[str], "CheckThreshold"] = {}

        try:
            thresholds[None] = CheckThreshold(float(arg))
        except ValueError:
            for t in arg.split(","):
                # Prefix match: anything following the numeric part is ignored.
                m = re.match(r"([a-z_0-9]+):([0-9.]+)", t)
                if m is None:
                    raise argparse.ArgumentTypeError(
                        f"Invalid threshold format: {t}"
                    ) from None

                try:
                    thresholds[m.group(1)] = CheckThreshold(float(m.group(2)))
                except ValueError:
                    # e.g. "cpu:1.2.3" matches the pattern but is not a valid float
                    raise argparse.ArgumentTypeError(
                        f"Invalid threshold format: {t}"
                    ) from None

        return thresholds


class RequestError(Exception):
    """Exception for request related errors."""

    def __init__(self, message: str, rc: int) -> None:
        self.message = message
        self.rc = rc

        super().__init__(self.message)
def get_file_line(self, filename: str) -> str:
    """Read the first line of a file and return it without surrounding whitespace.

    The file is opened through a context manager so the handle is closed
    deterministically instead of being left to the garbage collector.
    """
    with open(filename, "r") as handle:
        return handle.readline().strip()
def check_api_value(self, url: str, message: str, **kwargs: str) -> None:
    """Perform simple threshold based check command.

    Fetches ``url`` via ``self.request`` and, when the ``key`` keyword is given,
    descends into that key of the response.

    * dict result: expects ``used`` and ``total`` entries; records a percentage
      and an absolute perfdata value, both derived through ``self.get_value``.
    * any other result: treated as a ratio and converted to a percentage.

    The ``perfkey`` keyword overrides the perfdata label.
    NOTE(review): for dict results the same ``perfkey`` is applied to both the
    percentage and the absolute entry - confirm callers only pass it for scalars.

    The chosen value and the extended message are handed to ``self.check_thresholds``.
    """
    result = self.request(url)
    if "key" in kwargs:
        result = result[kwargs["key"]]

    used = None
    if isinstance(result, dict):
        used_percent = self.get_value(result["used"], result["total"])
        used = self.get_value(result["used"])
        total = self.get_value(result["total"])

        self.add_perfdata(kwargs.get("perfkey", "usage"), used_percent)
        self.add_perfdata(
            kwargs.get("perfkey", "used"), used, max=total, unit=self.options.unit
        )
    else:
        used_percent = round(float(result) * 100, 2)
        self.add_perfdata(kwargs.get("perfkey", "usage"), used_percent)

    # Absolute values only exist for dict results; for scalar results fall back to
    # the percentage instead of reporting "None <unit>" and checking None.
    if self.options.values_mb and used is not None:
        message += f" {used} {self.options.unit}"
        value = used
    else:
        message += f" {used_percent} %"
        value = used_percent

    self.check_thresholds(value, message)
def check_disks(self) -> None:
    """Check disk health on specific Proxmox VE node.

    Requests ``nodes/<node>/disks/list`` and inspects every disk whose device name
    (``devpath`` without the ``/dev/`` prefix) is not listed in
    ``self.options.ignore_disks``:

    * health ``UNKNOWN`` -> WARNING, reported as unknown
    * health other than ``PASSED`` / ``OK`` -> WARNING, reported as failed
    * a ``wearout`` value other than ``"N/A"`` is recorded as ``wearout_<name>`` perfdata

    The message is assembled locally and assigned once, so it never depends on
    whatever ``self.check_message`` contained before the call.
    """
    url = self.get_url(f"nodes/{self.options.node}/disks")
    disks = self.request(url + "/list")

    failed = []
    unknown = []
    for disk in disks:
        name = disk["devpath"].replace("/dev/", "")

        if name in self.options.ignore_disks:
            continue

        if disk["health"] == "UNKNOWN":
            self.check_result = CheckState.WARNING
            unknown.append({"serial": disk["serial"], "device": disk["devpath"]})
        elif disk["health"] not in ("PASSED", "OK"):
            self.check_result = CheckState.WARNING
            failed.append({"serial": disk["serial"], "device": disk["devpath"]})

        if disk["wearout"] != "N/A":
            self.add_perfdata(f"wearout_{name}", disk["wearout"])

    if not failed and not unknown:
        self.check_message = "All disks are healthy"
        return

    parts = []
    if failed:
        parts.append(f"{len(failed)} of {len(disks)} disks failed the health test:\n")
        parts.extend(
            f"- {disk['device']} with serial '{disk['serial']}'\n" for disk in failed
        )
    if unknown:
        parts.append(f"{len(unknown)} of {len(disks)} disks have unknown health status:\n")
        parts.extend(
            f"- {disk['device']} with serial '{disk['serial']}'\n" for disk in unknown
        )

    self.check_message = "".join(parts)
def check_services(self) -> None:
    """Check state of core services on Proxmox VE node.

    A service counts as failed when it is not running, its ``active-state``
    (assumed ``active`` when the field is absent) is ``active`` and its name is
    not listed in ``self.options.ignore_services``. Any failed service turns the
    check CRITICAL.
    """
    endpoint = self.get_url(f"nodes/{self.options.node}/services")
    services = self.request(endpoint)

    not_running = {}
    for entry in services:
        if entry["state"] == "running":
            continue
        if entry.get("active-state", "active") != "active":
            continue
        if entry["name"] in self.options.ignore_services:
            continue
        not_running[entry["name"]] = entry["desc"]

    if not not_running:
        self.check_message = "All services are running"
        return

    self.check_result = CheckState.CRITICAL
    lines = [f"- {desc} ({svc}) is not running\n" for svc, desc in not_running.items()]
    self.check_message = f"{len(not_running)} services are not running:\n\n" + "".join(lines)
def check_cluster_status(self) -> None:
    """Check if cluster is operational.

    Reads ``cluster/status`` and evaluates the ``cluster`` entry (quorum flag and
    name) together with the ``online`` flag of every ``node`` entry:

    * no cluster entry -> message only, result left untouched
    * no quorum -> CRITICAL
    * quorum, but at least one node offline -> WARNING
    * quorum and all nodes online -> healthy

    With quorum, ``nodes_total`` and ``nodes_online`` are recorded as perfdata.
    """
    url = self.get_url("cluster/status")
    data = self.request(url)

    nodes = {}
    quorate = None
    cluster = ""
    for elem in data:
        if elem["type"] == "cluster":
            quorate = elem["quorate"]
            cluster = elem["name"]
        elif elem["type"] == "node":
            nodes[elem["name"]] = elem["online"]

    if quorate is None:
        self.check_message = "No cluster configuration found"
        return

    if not quorate:
        self.check_result = CheckState.CRITICAL
        self.check_message = "Cluster is unhealthy - no quorum"
        return

    node_count = len(nodes)
    nodes_online_count = sum(1 for online in nodes.values() if online)

    if node_count > nodes_online_count:
        diff = node_count - nodes_online_count
        self.check_result = CheckState.WARNING
        self.check_message = f"Cluster '{cluster}' is healthy, but {diff} node(s) offline"
    else:
        self.check_message = f"Cluster '{cluster}' is healthy"

    self.add_perfdata("nodes_total", node_count, unit="")
    self.add_perfdata("nodes_online", nodes_online_count, unit="")
= [] 543 | critical = [] 544 | found = name is None 545 | for pool in data: 546 | found = found or name == pool["name"] 547 | if (name is not None and name == pool["name"]) or name is None: 548 | key = "fragmentation" 549 | if name is None: 550 | key += f"_{pool['name']}" 551 | self.add_perfdata(key, pool["frag"]) 552 | 553 | threshold_name = f"fragmentation_{name}" 554 | threshold_warning = self.threshold_warning(threshold_name) 555 | threshold_critical = self.threshold_critical(threshold_name) 556 | 557 | if threshold_critical is not None and pool["frag"] > float( 558 | threshold_critical.value 559 | ): 560 | critical.append(pool) 561 | elif threshold_warning is not None and pool["frag"] > float( 562 | threshold_warning.value 563 | ): 564 | warnings.append(pool) 565 | 566 | if not found: 567 | self.check_result = CheckState.UNKNOWN 568 | self.check_message = f"Could not fetch fragmentation of ZFS pool '{name}'" 569 | else: 570 | if warnings or critical: 571 | value = None 572 | if critical: 573 | self.check_result = CheckState.CRITICAL 574 | if name is not None: 575 | value = critical[0]["frag"] 576 | else: 577 | self.check_result = CheckState.WARNING 578 | if name is not None: 579 | value = warnings[0]["frag"] 580 | 581 | if name is not None: 582 | self.check_message = ( 583 | f"Fragmentation of ZFS pool '{name}' is above thresholds: {value} %" 584 | ) 585 | else: 586 | pool_above = len(warnings) + len(critical) 587 | message = ( 588 | f"{pool_above} of {len(data)} ZFS pools are above fragmentation " 589 | "thresholds:\n\n" 590 | ) 591 | message += "\n".join( 592 | [f"- {pool['name']} ({pool['frag']} %) is CRITICAL\n" for pool in critical] 593 | ) 594 | message += "\n".join( 595 | [f"- {pool['name']} ({pool['frag']} %) is WARNING\n" for pool in warnings] 596 | ) 597 | self.check_message = message 598 | else: 599 | self.check_result = CheckState.OK 600 | if name is not None: 601 | self.check_message = f"Fragmentation of ZFS pool '{name}' is OK" 602 | else: 603 | 
def check_ceph_health(self) -> None:
    """Check health of CEPH cluster.

    Reads ``cluster/ceph/status`` and maps the reported ``health.status`` to a
    check state. A response without a health status, or with a status that is
    not listed below, results in UNKNOWN.
    """
    url = self.get_url("cluster/ceph/status")
    data = self.request(url)
    ceph_health = data.get("health", {})

    if "status" not in ceph_health:
        self.check_result = CheckState.UNKNOWN
        self.check_message = (
            "Could not fetch Ceph status from API. "
            "Check the output of 'pvesh get cluster/ceph' on your node"
        )
        return

    known_states = {
        "HEALTH_OK": (CheckState.OK, "Ceph Cluster is healthy"),
        "HEALTH_WARN": (CheckState.WARNING, "Ceph Cluster is in warning state"),
        # Ceph reports its error state as HEALTH_ERR; HEALTH_CRIT is kept because
        # earlier versions of this check matched on it.
        "HEALTH_ERR": (CheckState.CRITICAL, "Ceph Cluster is in critical state"),
        "HEALTH_CRIT": (CheckState.CRITICAL, "Ceph Cluster is in critical state"),
    }
    self.check_result, self.check_message = known_states.get(
        ceph_health["status"], (CheckState.UNKNOWN, "Ceph Cluster is in unknown state")
    )
" 691 | f"required version '{self.options.min_version}'" 692 | ) 693 | else: 694 | self.check_message = ( 695 | f"Your PVE instance version '{data['version']}' ({data['repoid']}) is up to date" 696 | ) 697 | 698 | def _get_pool_members(self, pool: str) -> List[int]: 699 | """Get a list of vmids, which are members of a given resource pool. 700 | 701 | NOTE: The request needs the Pool.Audit permission! 702 | """ 703 | members = [] 704 | 705 | try: 706 | url = self.get_url(f"pools/{pool}") 707 | pools = self.request(url, raise_error=True) 708 | for pool in pools.get("members", []): 709 | members.append(pool["vmid"]) 710 | except RequestError: 711 | print( 712 | f"Unable to fetch members of pool '{pool}'. " 713 | "Check if the name is correct and the role has the 'Pool.Audit' permission" 714 | ) 715 | 716 | return members 717 | 718 | def check_vzdump_backup(self, name: Optional[str] = None) -> None: 719 | """Check for failed vzdump backup jobs.""" 720 | tasks_url = self.get_url("cluster/tasks") 721 | tasks = self.request(tasks_url) 722 | tasks = [t for t in tasks if t["type"] == "vzdump"] 723 | 724 | # Filter by node id, if one is provided 725 | if self.options.node is not None: 726 | tasks = [t for t in tasks if t["node"] == self.options.node] 727 | 728 | # Filter by timestamp, if provided 729 | delta = self.threshold_critical("delta") 730 | if delta is not None: 731 | now = datetime.now(timezone.utc).timestamp() 732 | 733 | tasks = [t for t in tasks if not delta.check(now - t["starttime"])] 734 | 735 | # absent status = job still running 736 | tasks = [t for t in tasks if "status" in t] 737 | failed = len([t for t in tasks if t["status"] != "OK"]) 738 | success = len(tasks) - failed 739 | self.check_message = f"{success} backup tasks successful, {failed} backup tasks failed" 740 | 741 | if failed > 0: 742 | self.check_result = CheckState.CRITICAL 743 | else: 744 | self.check_result = CheckState.OK 745 | if delta is not None: 746 | self.check_message += f" within the 
last {delta.value}s" 747 | 748 | nbu_url = self.get_url("cluster/backup-info/not-backed-up") 749 | not_backed_up = self.request(nbu_url) 750 | 751 | if len(not_backed_up) > 0: 752 | guest_ids = [] 753 | 754 | for guest in not_backed_up: 755 | guest_ids.append(guest["vmid"]) 756 | 757 | ignored_vmids = [] 758 | for pool in self.options.ignore_pools: 759 | # ignore vms based on their membership of a certain pool 760 | ignored_vmids += self._get_pool_members(pool) 761 | 762 | if self.options.ignore_vmids: 763 | # ignore vms based on their id 764 | ignored_vmids = ignored_vmids + self.options.ignore_vmids 765 | 766 | remaining_not_backed_up = sorted(list(set(guest_ids) - set(ignored_vmids))) 767 | if len(remaining_not_backed_up) > 0: 768 | if self.check_result not in [CheckState.CRITICAL, CheckState.UNKNOWN]: 769 | self.check_result = CheckState.WARNING 770 | self.check_message += ( 771 | "\nThere are unignored guests not covered by any backup schedule: " 772 | + ", ".join(map(str, remaining_not_backed_up)) 773 | ) 774 | 775 | def check_snapshot_age(self, idx: Optional[Union[str, int]]) -> None: 776 | """Check age of snapshots.""" 777 | url = self.get_url( 778 | "cluster/resources", 779 | ) 780 | data = self.request(url, params={"type": "vm"}) 781 | 782 | warnings = [] 783 | criticals = [] 784 | snapshots_exist = False 785 | found = False 786 | for vm in data: 787 | vm_type = "qemu" 788 | if vm["type"] == "lxc": 789 | vm_type = "lxc" 790 | vm_name = vm.get("name", None) 791 | vm_id = vm.get("vmid", None) 792 | 793 | if not self.options.node: 794 | node_name = vm.get("node", None) 795 | else: 796 | node_name = self.options.node 797 | if node_name != vm.get("node", None): 798 | continue 799 | url = self.get_url(f"nodes/{node_name}/{vm_type}/{vm_id}/snapshot") 800 | data = self.request(url) 801 | 802 | for snapshot in data: 803 | snapshot_name = snapshot.get("name", None) 804 | 805 | if snapshot_name == "current": 806 | continue 807 | snapshots_exist = True 808 | 809 | 
threshold_name = f"snapshot_age_{vm_name}_{snapshot_name}" 810 | threshold_warning = self.threshold_warning(threshold_name) 811 | threshold_critical = self.threshold_critical(threshold_name) 812 | 813 | snapshot_time = snapshot.get("snaptime", None) 814 | snapshot_age = int(datetime.now(timezone.utc).timestamp()) - snapshot_time 815 | 816 | if threshold_critical is not None and snapshot_age > int(threshold_critical.value): 817 | criticals.append([vm_id, vm_name, snapshot_name, snapshot_time]) 818 | elif threshold_warning is not None and snapshot_age > int(threshold_warning.value): 819 | warnings.append([vm_id, vm_name, snapshot_name, snapshot_time]) 820 | 821 | if idx and idx in (vm.get("name", None), vm.get("vmid", None)): 822 | found = True 823 | break 824 | 825 | if idx and not found: 826 | self.check_result = CheckState.UNKNOWN 827 | self.check_message = f"VM or LXC '{idx}' not found" 828 | elif not snapshots_exist: 829 | self.check_result = CheckState.OK 830 | if idx: 831 | self.check_message = f"No snapshots of '{idx}' exist" 832 | else: 833 | self.check_message = "No snapshots exist" 834 | else: 835 | if idx: 836 | self.check_message = f"Age of snapshots of '{idx}' is " 837 | else: 838 | self.check_message = "Age of snapshots is " 839 | if criticals or warnings: 840 | if criticals: 841 | self.check_result = CheckState.CRITICAL 842 | else: 843 | self.check_result = CheckState.WARNING 844 | self.check_message += "above thresholds" 845 | for snapshot in criticals: 846 | snap_time = datetime.fromtimestamp(snapshot[3]).strftime("%Y-%m-%d %H:%M:%S") 847 | self.check_message += ( 848 | f"\n{snapshot[0]} ({snapshot[1]}): snapshot " 849 | + f"'{snapshot[2]}' taken on {snap_time} is CRITICAL" 850 | ) 851 | for snapshot in warnings: 852 | snap_time = datetime.fromtimestamp(snapshot[3]).strftime("%Y-%m-%d %H:%M:%S") 853 | self.check_message += ( 854 | f"\n{snapshot[0]} ({snapshot[1]}): snapshot " 855 | + f"'{snapshot[2]}' taken on {snap_time} is WARNING" 856 | ) 857 | 
else: 858 | self.check_result = CheckState.OK 859 | self.check_message += "OK" 860 | 861 | def check_memory(self) -> None: 862 | """Check memory usage of Proxmox VE node.""" 863 | url = self.get_url(f"nodes/{self.options.node}/status") 864 | self.check_api_value(url, "Memory usage is", key="memory") 865 | 866 | def check_swap(self) -> None: 867 | """Check swap usage of Proxmox VE node.""" 868 | url = self.get_url(f"nodes/{self.options.node}/status") 869 | self.check_api_value(url, "Swap usage is", key="swap") 870 | 871 | def check_cpu(self) -> None: 872 | """Check cpu usage of Proxmox VE node.""" 873 | url = self.get_url(f"nodes/{self.options.node}/status") 874 | self.check_api_value(url, "CPU usage is", key="cpu") 875 | 876 | def check_io_wait(self) -> None: 877 | """Check io wait of Proxmox VE node.""" 878 | url = self.get_url(f"nodes/{self.options.node}/status") 879 | self.check_api_value(url, "IO wait is", key="wait", perfkey="wait") 880 | 881 | def check_thresholds( 882 | self, 883 | values: Union[Dict[str, Union[int, float]], Union[int, float]], 884 | message: str, 885 | **kwargs: Dict, 886 | ) -> None: 887 | """Check numeric value against threshold for given metric name.""" 888 | is_warning = False 889 | is_critical = False 890 | 891 | if not isinstance(values, dict): 892 | values = {None: values} 893 | 894 | for metric, value in values.items(): 895 | value_warning = self.threshold_warning(metric) 896 | if value_warning is not None: 897 | is_warning = is_warning or value_warning.check( 898 | value, kwargs.get("lowerValue", False) 899 | ) 900 | 901 | value_critical = self.threshold_critical(metric) 902 | if value_critical is not None: 903 | is_critical = is_critical or value_critical.check( 904 | value, kwargs.get("lowerValue", False) 905 | ) 906 | 907 | if is_critical: 908 | self.check_result = CheckState.CRITICAL 909 | self.check_message = kwargs.get("messageCritical", message) 910 | elif is_warning: 911 | self.check_result = CheckState.WARNING 912 | 
self.check_message = kwargs.get("messageWarning", message) 913 | else: 914 | self.check_message = message 915 | 916 | def scale_value(self, value: Union[int, float]) -> float: 917 | """Scale value according to unit.""" 918 | if self.options.unit in self.UNIT_SCALE: 919 | return value / self.UNIT_SCALE[self.options.unit] 920 | 921 | raise ValueError("wrong unit") 922 | 923 | def threshold_warning(self, name: str) -> CheckThreshold: 924 | """Get warning threshold for metric name (empty if none).""" 925 | return self.options.threshold_warning.get( 926 | name, self.options.threshold_warning.get(None, None) 927 | ) 928 | 929 | def threshold_critical(self, name: str) -> CheckThreshold: 930 | """Get critical threshold for metric name (empty if none).""" 931 | return self.options.threshold_critical.get( 932 | name, self.options.threshold_critical.get(None, None) 933 | ) 934 | 935 | def get_value( 936 | self, value: Union[int, float], total: Optional[Union[int, float]] = None 937 | ) -> float: 938 | """Get value scaled or as percentage.""" 939 | value = float(value) 940 | 941 | if total: 942 | value /= float(total) / 100 943 | else: 944 | value = self.scale_value(value) 945 | 946 | return round(value, 2) 947 | 948 | def add_perfdata(self, name: str, value: Union[int, float], **kwargs: Dict) -> None: 949 | """Add metric to perfdata output.""" 950 | unit = kwargs.get("unit", "%") 951 | 952 | perfdata = f"{name}={value}{unit}" 953 | 954 | threshold_warning = self.threshold_warning(name) 955 | threshold_critical = self.threshold_critical(name) 956 | 957 | perfdata += ";" 958 | if threshold_warning: 959 | perfdata += str(threshold_warning.value) 960 | 961 | perfdata += ";" 962 | if threshold_critical: 963 | perfdata += str(threshold_critical.value) 964 | 965 | perfdata += ";" + str(kwargs.get("min", 0)) 966 | perfdata += ";" + str(kwargs.get("max", "")) 967 | 968 | self.perfdata.append(perfdata) 969 | 970 | def get_perfdata(self) -> str: 971 | """Get perfdata string.""" 972 | 
perfdata = "" 973 | 974 | if self.perfdata: 975 | perfdata = "|" 976 | perfdata += " ".join(self.perfdata) 977 | 978 | return perfdata 979 | 980 | def check(self) -> None: 981 | """Execute the real check command.""" 982 | self.check_result = CheckState.OK 983 | 984 | if self.options.mode == "cluster": 985 | self.check_cluster_status() 986 | elif self.options.mode == "version": 987 | self.check_version() 988 | elif self.options.mode == "memory": 989 | self.check_memory() 990 | elif self.options.mode == "swap": 991 | self.check_swap() 992 | elif self.options.mode in ("io_wait", "io-wait"): 993 | self.check_io_wait() 994 | elif self.options.mode == "disk-health": 995 | self.check_disks() 996 | elif self.options.mode == "cpu": 997 | self.check_cpu() 998 | elif self.options.mode == "services": 999 | self.check_services() 1000 | elif self.options.mode == "updates": 1001 | self.check_updates() 1002 | elif self.options.mode == "subscription": 1003 | self.check_subscription() 1004 | elif self.options.mode == "storage": 1005 | self.check_storage(self.options.name) 1006 | elif self.options.mode in ["vm", "vm_status", "vm-status"]: 1007 | only_status = self.options.mode in ["vm_status", "vm-status"] 1008 | 1009 | if self.options.name: 1010 | idx = self.options.name 1011 | else: 1012 | idx = self.options.vmid 1013 | 1014 | if self.options.expected_vm_status: 1015 | self.check_vm_status( 1016 | idx, expected_state=self.options.expected_vm_status, only_status=only_status 1017 | ) 1018 | else: 1019 | self.check_vm_status(idx, only_status=only_status) 1020 | elif self.options.mode == "replication": 1021 | self.check_replication() 1022 | elif self.options.mode == "ceph-health": 1023 | self.check_ceph_health() 1024 | elif self.options.mode == "zfs-health": 1025 | self.check_zfs_health(self.options.name) 1026 | elif self.options.mode == "zfs-fragmentation": 1027 | self.check_zfs_fragmentation(self.options.name) 1028 | elif self.options.mode == "backup": 1029 | 
self.check_vzdump_backup(self.options.name) 1030 | elif self.options.mode == "snapshot-age": 1031 | if self.options.name: 1032 | idx = self.options.name 1033 | else: 1034 | idx = self.options.vmid 1035 | 1036 | self.check_snapshot_age(idx) 1037 | else: 1038 | message = f"Check mode '{self.options.mode}' not known" 1039 | self.output(CheckState.UNKNOWN, message) 1040 | 1041 | self.check_output() 1042 | 1043 | def parse_args(self) -> None: 1044 | """Parse CLI arguments.""" 1045 | p = argparse.ArgumentParser(description="Check command for PVE hosts via API") 1046 | 1047 | p.add_argument( 1048 | "--version", help="Show version of check command", action="store_true", default=False 1049 | ) 1050 | 1051 | api_opts = p.add_argument_group("API Options") 1052 | 1053 | api_opts.add_argument( 1054 | "-e", 1055 | "-H", 1056 | "--api-endpoint", 1057 | help="PVE api endpoint hostname or ip address (no additional data like paths)", 1058 | ) 1059 | api_opts.add_argument("--api-port", required=False, help="PVE api endpoint port") 1060 | 1061 | api_opts.add_argument( 1062 | "-u", 1063 | "--username", 1064 | dest="api_user", 1065 | help="PVE api user (e.g. 
icinga2@pve or icinga2@pam, depending on which backend you " 1066 | "have chosen in proxmox)", 1067 | ) 1068 | 1069 | group = api_opts.add_mutually_exclusive_group() 1070 | group.add_argument("-p", "--password", dest="api_password", help="PVE API user password") 1071 | group.add_argument( 1072 | "-P", 1073 | "--password-file", 1074 | dest="api_password_file", 1075 | help="PVE API user password in a file", 1076 | ) 1077 | group.add_argument( 1078 | "-t", 1079 | "--api-token", 1080 | dest="api_token", 1081 | help="PVE API token (format: TOKEN_ID=TOKEN_SECRET)", 1082 | ) 1083 | group.add_argument( 1084 | "-T", 1085 | "--api-token-file", 1086 | dest="api_token_file", 1087 | help="PVE API token contained in a file (format: TOKEN_ID=TOKEN_SECRET)", 1088 | ) 1089 | 1090 | api_opts.add_argument( 1091 | "-k", 1092 | "--insecure", 1093 | dest="api_insecure", 1094 | action="store_true", 1095 | default=False, 1096 | help="Don't verify HTTPS certificate", 1097 | ) 1098 | 1099 | api_opts.set_defaults(api_port=8006) 1100 | 1101 | check_opts = p.add_argument_group("Check Options") 1102 | 1103 | check_opts.add_argument( 1104 | "-m", 1105 | "--mode", 1106 | choices=( 1107 | "cluster", 1108 | "version", 1109 | "cpu", 1110 | "memory", 1111 | "swap", 1112 | "storage", 1113 | "io_wait", 1114 | "io-wait", 1115 | "updates", 1116 | "services", 1117 | "subscription", 1118 | "vm", 1119 | "vm_status", 1120 | "vm-status", 1121 | "replication", 1122 | "disk-health", 1123 | "ceph-health", 1124 | "zfs-health", 1125 | "zfs-fragmentation", 1126 | "backup", 1127 | "snapshot-age", 1128 | ), 1129 | help="Mode to use.", 1130 | ) 1131 | 1132 | check_opts.add_argument( 1133 | "-n", 1134 | "--node", 1135 | dest="node", 1136 | help="Node to check (necessary for all modes except cluster, version and backup)", 1137 | ) 1138 | 1139 | check_opts.add_argument("--name", dest="name", help="Name of storage, vm, or container") 1140 | 1141 | check_opts.add_argument( 1142 | "--vmid", dest="vmid", type=int, help="ID 
of virtual machine or container" 1143 | ) 1144 | 1145 | check_opts.add_argument( 1146 | "--expected-vm-status", 1147 | choices=("running", "stopped", "paused"), 1148 | help="Expected VM status", 1149 | ) 1150 | 1151 | check_opts.add_argument( 1152 | "--ignore-vmid", 1153 | dest="ignore_vmids", 1154 | metavar="VMID", 1155 | action="append", 1156 | help="Ignore VM with vmid in checks", 1157 | default=[], 1158 | type=int, 1159 | ) 1160 | 1161 | check_opts.add_argument( 1162 | "--ignore-vm-status", 1163 | dest="ignore_vm_status", 1164 | action="store_true", 1165 | help="Ignore VM status in checks", 1166 | default=False, 1167 | ) 1168 | 1169 | check_opts.add_argument( 1170 | "--ignore-service", 1171 | dest="ignore_services", 1172 | action="append", 1173 | metavar="NAME", 1174 | help="Ignore service NAME in checks", 1175 | default=[], 1176 | ) 1177 | 1178 | check_opts.add_argument( 1179 | "--ignore-disk", 1180 | dest="ignore_disks", 1181 | action="append", 1182 | metavar="NAME", 1183 | help="Ignore disk NAME in health check", 1184 | default=[], 1185 | ) 1186 | 1187 | check_opts.add_argument( 1188 | "--ignore-pools", 1189 | dest="ignore_pools", 1190 | action="append", 1191 | metavar="NAME", 1192 | help="Ignore vms and containers in pool(s) NAME in checks", 1193 | default=[], 1194 | ) 1195 | 1196 | check_opts.add_argument( 1197 | "-w", 1198 | "--warning", 1199 | dest="threshold_warning", 1200 | type=CheckThreshold.threshold_type, 1201 | default={}, 1202 | help="Warning threshold for check value. Mutiple thresholds with name:value,name:value", 1203 | ) 1204 | check_opts.add_argument( 1205 | "-c", 1206 | "--critical", 1207 | dest="threshold_critical", 1208 | type=CheckThreshold.threshold_type, 1209 | default={}, 1210 | help=( 1211 | "Critical threshold for check value. 
" 1212 | "Mutiple thresholds with name:value,name:value" 1213 | ), 1214 | ) 1215 | check_opts.add_argument( 1216 | "-M", 1217 | dest="values_mb", 1218 | action="store_true", 1219 | default=False, 1220 | help=( 1221 | "Values are shown in the unit which is set with --unit (if available). " 1222 | "Thresholds are also treated in this unit" 1223 | ), 1224 | ) 1225 | check_opts.add_argument( 1226 | "-V", 1227 | "--min-version", 1228 | dest="min_version", 1229 | type=str, 1230 | help="The minimal pve version to check for. Any version lower than this will return " 1231 | "CRITICAL.", 1232 | ) 1233 | 1234 | check_opts.add_argument( 1235 | "--unit", 1236 | choices=self.UNIT_SCALE.keys(), 1237 | default="MiB", 1238 | help="Unit which is used for performance data and other values", 1239 | ) 1240 | 1241 | options = p.parse_args() 1242 | 1243 | if options.version: 1244 | print(f"check_pve version {self.VERSION}") 1245 | sys.exit(0) 1246 | 1247 | missing = [] 1248 | if not options.api_endpoint: 1249 | missing.append("--api-endpoint") 1250 | if not options.api_user: 1251 | missing.append("--username") 1252 | if not ( 1253 | options.api_password 1254 | or options.api_password_file 1255 | or options.api_token 1256 | or options.api_token_file 1257 | ): 1258 | missing.append("--password, --api-password-file, --api-token or --api-token-file") 1259 | if not options.mode: 1260 | missing.append("--mode") 1261 | 1262 | if missing: 1263 | p.error(f"The following arguments are required: {', '.join(missing)}") 1264 | 1265 | if not options.node and options.mode not in [ 1266 | "cluster", 1267 | "vm", 1268 | "vm_status", 1269 | "version", 1270 | "ceph-health", 1271 | "backup", 1272 | "snapshot-age", 1273 | ]: 1274 | p.print_usage() 1275 | message = f"{p.prog}: error: --mode {options.mode} requires node name (--node)" 1276 | self.output(CheckState.UNKNOWN, message) 1277 | 1278 | if ( 1279 | not options.vmid 1280 | and not options.name 1281 | and options.mode in ("vm", "vm_status", "vm-status") 
1282 | ): 1283 | p.print_usage() 1284 | message = ( 1285 | f"{p.prog}: error: --mode {options.mode} requires either " 1286 | "vm name (--name) or id (--vmid)" 1287 | ) 1288 | self.output(CheckState.UNKNOWN, message) 1289 | 1290 | if not options.name and options.mode == "storage": 1291 | p.print_usage() 1292 | message = f"{p.prog}: error: --mode {options.mode} requires storage name (--name)" 1293 | self.output(CheckState.UNKNOWN, message) 1294 | 1295 | if options.threshold_warning and options.threshold_critical: 1296 | if options.mode != "subscription" and not compare_thresholds( 1297 | options.threshold_warning, options.threshold_critical, lambda w, c: w <= c 1298 | ): 1299 | p.error("Critical value must be greater than warning value") 1300 | elif options.mode == "subscription" and not compare_thresholds( 1301 | options.threshold_warning, options.threshold_critical, lambda w, c: w >= c 1302 | ): 1303 | p.error("Critical value must be lower than warning value") 1304 | 1305 | self.options = options 1306 | 1307 | def __init__(self) -> None: 1308 | self.options = {} 1309 | self.ticket = None 1310 | self.perfdata = [] 1311 | self.check_result = CheckState.UNKNOWN 1312 | self.check_message = "" 1313 | 1314 | self.__headers = {} 1315 | self.__cookies = {} 1316 | 1317 | self.parse_args() 1318 | 1319 | if self.options.api_insecure: 1320 | # disable urllib3 warning about insecure requests 1321 | requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning) 1322 | 1323 | if self.options.api_token_file is not None: 1324 | self.options.api_token = self.get_file_line(self.options.api_token_file) 1325 | if self.options.api_password_file is not None: 1326 | self.options.api_password = self.get_file_line(self.options.api_password_file) 1327 | if self.options.api_password is not None: 1328 | self.__cookies["PVEAuthCookie"] = self.get_ticket() 1329 | elif self.options.api_token is not None: 1330 | token = f"{self.options.api_user}!{self.options.api_token}" 1331 | 
self.__headers["Authorization"] = f"PVEAPIToken={token}" 1332 | 1333 | 1334 | if __name__ == "__main__": 1335 | pve = CheckPVE() 1336 | pve.check() 1337 | --------------------------------------------------------------------------------