├── .github └── workflows │ ├── code-tests.yml │ └── integration-tests.yml ├── .gitignore ├── .safety-policy.yml ├── Dockerfile ├── LICENSE ├── README.md ├── debian ├── changelog ├── compat ├── control ├── copyright ├── dirs ├── rebootmgr.install ├── rebootmgr.links ├── rebootmgr.postinst ├── rebootmgr.postrm └── rules ├── docker-compose.yml ├── docs ├── design.md ├── flowdiagram.png ├── install.md └── reference.md ├── extra └── notify-reboot-required ├── rebootmgr ├── __init__.py └── main.py ├── renovate.json ├── requirements.txt ├── setup.cfg ├── setup.py ├── systemd └── rebootmgr.service ├── tests ├── conftest.py ├── test_config.py ├── test_lock.py ├── test_post_reboot.py ├── test_reboot.py ├── test_stopflag.py ├── test_tasks.py ├── test_triggers.py └── test_whitelist.py └── tox.ini /.github/workflows/code-tests.yml: -------------------------------------------------------------------------------- 1 | name: Code and Syntax tests 2 | on: 3 | - push 4 | jobs: 5 | Syntax-Lint: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Install python prerequisites 9 | run: "sudo pip3 install tox" 10 | - name: Checkout repository 11 | uses: actions/checkout@v3 12 | - name: Perform syntax lint 13 | run: "cd ${{ github.workspace }} && tox -e lint" 14 | Safety-Checks: 15 | runs-on: ubuntu-latest 16 | continue-on-error: true 17 | steps: 18 | - name: Install python prerequisites 19 | run: "sudo pip3 install tox" 20 | - name: Checkout repository 21 | uses: actions/checkout@v3 22 | - name: Perform safety checks 23 | run: "cd ${{ github.workspace }} && tox -e safety" 24 | -------------------------------------------------------------------------------- /.github/workflows/integration-tests.yml: -------------------------------------------------------------------------------- 1 | name: Integration tests 2 | on: 3 | - push 4 | jobs: 5 | Integration-Test-Python: 6 | name: Integration Tests for Python 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python_version: [36, 37, 38, 39] 11 | steps: 12 | - name: Install python prerequisites 13 | run: "sudo pip3 install docker-compose" 14 | - name: Checkout repository 15 | uses: actions/checkout@v3 16 | - name: Perform integration tests for python version ${{ matrix.python_version }} 17 | run: "cd ${{ github.workspace }} && docker-compose run --rm integration_tests_py${{ matrix.python_version }}" 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Mac OS 107 | .DS_Store 108 | 109 | # Vim 110 | *.sw[po] -------------------------------------------------------------------------------- /.safety-policy.yml: -------------------------------------------------------------------------------- 1 | security: 2 | ignore-vulnerabilities: 3 | 58755: 4 | reason: we implemented the recommended workaround 5 | expires: '2024-07-01' 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PYTHON_VERSION 2 | FROM python:$PYTHON_VERSION-alpine 3 | 4 | RUN apk add bash git socat 5 | RUN pip install tox 6 | 7 | WORKDIR /src 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 SysEleven GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unattended Reboot Manager 2 | 3 | ## Overview 4 | 5 | Rebootmgr is an operations tool, that can help you safely automate reboots of nodes in complex, distributed environments. 
6 | 
7 | We created rebootmgr for our public cloud offering SysEleven Stack, because we wanted to make sure that our services are always up-to-date and secure.
8 | 
9 | We noticed that rebootmgr not only saves valuable time for our engineers, it also reboots more reliably, because it is more vigilant than a human, always keeping an eye on the cluster's health.
10 | 
11 | ## Design
12 | 
13 | Using Consul, rebootmgr keeps an overview of the health of your cluster's services. It also uses the locking and key-value store features of Consul to make sure that only one node in the cluster is rebooting at a time.
14 | 
15 | For a deep dive into how exactly rebootmgr works internally and why we created it, see our [design document](docs/design.md).
16 | 
17 | ## Getting started
18 | 
19 | If you want to try rebootmgr hands-on, have a look at our [installation guide](docs/install.md).
20 | 
21 | ## Reference
22 | 
23 | For a deep dive into rebootmgr usage scenarios, have a look at our [reference guide](docs/reference.md).
24 | 
25 | ## Testing
26 | 
27 | For running the integration tests you need docker-compose. For running the linter and safety checks, you need tox.
28 | 
29 | ```
30 | # Run integration tests with different python versions
31 | $ docker-compose run --rm integration_tests_py38
32 | $ docker-compose run --rm integration_tests_py37
33 | $ docker-compose run --rm integration_tests_py36
34 | 
35 | # Run linter and safety checks
36 | $ tox -e lint
37 | $ tox -e safety
38 | 
39 | # Clean up docker
40 | $ docker-compose down --rmi local -v
41 | ```
42 | 
43 | ## Contributing
44 | 
45 | We would love to see community contributions to rebootmgr, and we are eager to collaborate with you.
46 | --------------------------------------------------------------------------------
/debian/changelog:
--------------------------------------------------------------------------------
1 | rebootmgr (0.0.25-0+syseleven1) focal; urgency=medium
2 | 
3 | * Add option to skip the creation of the reboot_in_progress consul key
4 | 
5 | -- Dennis Kuhn  Mon, 08 May 2023 17:09:00 +0100
6 | 
7 | rebootmgr (0.0.22-0+syseleven1) xenial; urgency=medium
8 | 
9 | * Only allow reboots if per-host configuration is present
10 | 
11 | -- Olaf Seibert  Tue, 10 Dec 2019 17:09:00 +0100
12 | 
13 | rebootmgr (0.0.21-0+syseleven1) xenial; urgency=medium
14 | 
15 | * In dry-run pass env variable REBOOTMGR_DRY_RUN to tasks
16 | 
17 | -- Bodo Petermann  Wed, 19 Dec 2018 15:08:00 +0100
18 | rebootmgr (0.0.20-0+syseleven1) xenial; urgency=medium
19 | 
20 | * Don't restart rebootmgr after update/installation
21 | 
22 | -- Dennis Kuhn  Wed, 22 Aug 2018 15:03:37 +0200
23 | rebootmgr (0.0.19-0+syseleven1) xenial; urgency=medium
24 | 
25 | * Change default cli options
26 | 
27 | -- Dennis Kuhn  Fri, 17 Aug 2018 15:03:37 +0200
28 | rebootmgr (0.0.18-0+syseleven1) xenial; urgency=medium
29 | 
30 | * Remove systemd timer from package
31 | 
32 | -- Dennis Kuhn  Fri, 27 Jul 2018 15:03:37 +0200
33 | rebootmgr (0.0.17-0+syseleven1) xenial; urgency=medium
34 | 
35 | * Version update
36 | 
37 | -- Dennis Kuhn  Fri, 29 Jun 2018 15:03:37 +0200
38 | rebootmgr (0.0.16-0+syseleven1) xenial; urgency=medium
39 | 
40 | * Version update
41 | 
42 | -- Dennis Kuhn  Wed, 11 Sep 2018 15:03:37 +0200
43 | rebootmgr (0.0.15-0+syseleven1) xenial; urgency=medium
44 | 
45 | * Version update
46 | 
47 | -- Steffen Neubauer  Wed, 04 Sep 2017 15:03:37 +0200
48 | rebootmgr (0.0.14-0+syseleven1) xenial; urgency=medium
49 | 
50 | * Version update
51 | 
52 | -- Dennis Kuhn  Wed, 16 Aug 2017 15:03:37 +0200
53 | rebootmgr 
(0.0.13-0+syseleven1) xenial; urgency=medium 54 | 55 | * Version update 56 | 57 | -- Dennis Kuhn Mon,17 Jul 2017 15:03:37 +0200 58 | rebootmgr (0.0.12-0+syseleven1) xenial; urgency=medium 59 | 60 | * Version update 61 | 62 | -- Dennis Kuhn Tue,4 Jul 2017 15:03:37 +0200 63 | rebootmgr (0.0.11-0+syseleven1) xenial; urgency=medium 64 | 65 | * Version update 66 | 67 | -- Dennis Kuhn Thu,3 Jul 2017 15:03:37 +0200 68 | rebootmgr (0.0.10-0+syseleven1) xenial; urgency=medium 69 | 70 | * Version update 71 | 72 | -- Dennis Kuhn Thu, 20 Apr 2017 15:03:37 +0200 73 | rebootmgr (0.0.9-0+syseleven1) xenial; urgency=medium 74 | 75 | * Version update 76 | 77 | -- Dennis Kuhn Thu, 20 Apr 2017 15:03:37 +0200 78 | rebootmgr (0.0.8-0+syseleven1) xenial; urgency=medium 79 | 80 | * Version update 81 | 82 | -- Dennis Kuhn Thu, 20 Apr 2017 15:03:37 +0200 83 | rebootmgr (0.0.7-0+syseleven1) xenial; urgency=medium 84 | 85 | * Version update 86 | 87 | -- Dennis Kuhn Wed, 22 Mar 2017 15:03:37 +0200 88 | rebootmgr (0.0.6-0+syseleven2) xenial; urgency=medium 89 | 90 | * use symlink for etc/kernel/postinst.d/update-notifier 91 | 92 | -- Dennis Kuhn Wed, 22 Mar 2017 15:03:37 +0200 93 | rebootmgr (0.0.6-0+syseleven1) xenial; urgency=medium 94 | 95 | * Version update to 0.0.6 96 | 97 | -- Dennis Kuhn Wed, 22 Mar 2017 15:03:37 +0200 98 | rebootmgr (0.0.5-0+syseleven1) xenial; urgency=medium 99 | 100 | * Version update to 0.0.5 101 | 102 | -- Dennis Kuhn Fri, 17 Mar 2017 15:03:37 +0200 103 | rebootmgr (0.0.4-0+syseleven1) xenial; urgency=medium 104 | 105 | * Version update to 0.0.4 106 | 107 | -- Dennis Kuhn Thu, 09 Mar 2017 15:03:37 +0200 108 | rebootmgr (0.0.3-0+syseleven1) xenial; urgency=medium 109 | 110 | * Version update 111 | 112 | -- Dennis Kuhn Thu, 09 Mar 2017 15:03:37 +0200 113 | rebootmgr (0.0.2-0+syseleven1) xenial; urgency=medium 114 | 115 | * fix pybuild install 116 | 117 | -- Dennis Kuhn Thu, 09 Mar 2017 15:03:37 +0200 118 | rebootmgr (0.0.1-0+syseleven1) xenial; urgency=medium 119 | 120 | * Initial build of this package for Ubuntu 16.04 121 | 122 | -- Dennis Kuhn Thu, 09 Mar 2017 15:03:37 +0200 123 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: rebootmgr 2 | Maintainer: Dennis Kuhn 3 | Section: python 4 | Priority: optional 5 | Build-Depends: debhelper (>= 9), 6 | python3-setuptools (>= 0.6b3), 7 | python3-consul, 8 | python3-requests, 9 | python3-urllib3, 10 | python3-click, 11 | python3-colorlog, 12 | python3-retrying, 13 | python3-all, 14 | dh-systemd, 15 | dh-python 16 | X-Python3-Version: >= 3.2 17 | Standards-Version: 3.9.7 18 | Homepage: https://github.com/syseleven/rebootmgr 19 | 20 | Package: rebootmgr 21 | Architecture: all 22 | Depends: ${misc:Depends}, ${python3:Depends}, python3-consul-lib, python3-holidays 23 | Conflicts: update-notifier-common 24 | Description: Rebootmgr 25 | Rebootmgr reboots a single node. Rebootmgr checks all 26 | consul checks with the tag "rebootmgr" and executes all task 27 | in /etc/rebootmgr/post_boot_tasks or /etc/rebootmgr/pre_boot_tasks 28 | . 29 | This package contains the binary in /usr/bin and 30 | systemd service. 
31 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: rebootmgr 3 | Source: https://gitlab.syseleven.de/openstack/underlay/tree/master/rebootmgr 4 | 5 | Files: * 6 | Copyright: 2016 SysEleven GmbH 7 | License: Apache-2 8 | 9 | Files: debian/* 10 | Copyright: 2016 SysEleven GmbH 11 | License: Apache-2 12 | 13 | License: Apache-2 14 | Licensed under the Apache License, Version 2.0 (the "License"); 15 | you may not use this file except in compliance with the License. 16 | You may obtain a copy of the License at 17 | . 18 | http://www.apache.org/licenses/LICENSE-2.0 19 | . 20 | Unless required by applicable law or agreed to in writing, software 21 | distributed under the License is distributed on an "AS IS" BASIS, 22 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 23 | See the License for the specific language governing permissions and 24 | limitations under the License. 25 | . 26 | On Debian-based systems the full text of the Apache version 2.0 license 27 | can be found in `/usr/share/common-licenses/Apache-2.0'. 28 | -------------------------------------------------------------------------------- /debian/dirs: -------------------------------------------------------------------------------- 1 | /etc/rebootmgr/post_boot_tasks 2 | /etc/rebootmgr/pre_boot_tasks 3 | -------------------------------------------------------------------------------- /debian/rebootmgr.install: -------------------------------------------------------------------------------- 1 | systemd/rebootmgr.service lib/systemd/system 2 | extra/notify-reboot-required usr/share/update-notifier/ 3 | -------------------------------------------------------------------------------- /debian/rebootmgr.links: -------------------------------------------------------------------------------- 1 | usr/share/update-notifier/notify-reboot-required etc/kernel/postinst.d/update-notifier 2 | -------------------------------------------------------------------------------- /debian/rebootmgr.postinst: -------------------------------------------------------------------------------- 1 | if [ -d /run/systemd/system ]; then 2 | systemctl --system daemon-reload >/dev/null || true 3 | fi 4 | -------------------------------------------------------------------------------- /debian/rebootmgr.postrm: -------------------------------------------------------------------------------- 1 | if [ -d /run/systemd/system ]; then 2 | systemctl --system daemon-reload >/dev/null || true 3 | fi 4 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | export PBR_VERSION=0.0.1 4 | 5 | %: 6 | dh $@ --with python3,systemd --buildsystem=pybuild 7 | 8 | override_dh_systemd_start: 9 | echo "don't run dh_systemd_start" 10 | 11 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | integration_tests_py36: 4 | build: 5 | context: . 
6 |       dockerfile: Dockerfile
7 |       args:
8 |         PYTHON_VERSION: 3.6
9 |     volumes:
10 |       - .:/src
11 |     depends_on: [consul1, consul2, consul3, consul4]
12 |     command:
13 |       ["tox", "-e", "py36"]
14 |   integration_tests_py37:
15 |     build:
16 |       context: .
17 |       dockerfile: Dockerfile
18 |       args:
19 |         PYTHON_VERSION: 3.7
20 |     volumes:
21 |       - .:/src
22 |     depends_on: [consul1, consul2, consul3, consul4]
23 |     command:
24 |       ["tox", "-e", "py37"]
25 |   integration_tests_py38:
26 |     build:
27 |       context: .
28 |       dockerfile: Dockerfile
29 |       args:
30 |         PYTHON_VERSION: 3.8
31 |     volumes:
32 |       - .:/src
33 |     depends_on: [consul1, consul2, consul3, consul4]
34 |     command:
35 |       ["tox", "-e", "py38"]
36 |   integration_tests_py39:
37 |     build:
38 |       context: .
39 |       dockerfile: Dockerfile
40 |       args:
41 |         PYTHON_VERSION: 3.9
42 |     volumes:
43 |       - .:/src
44 |     depends_on: [consul1, consul2, consul3, consul4]
45 |     command:
46 |       ["tox", "-e", "py39"]
47 |   lint:
48 |     build:
49 |       context: .
50 |       dockerfile: Dockerfile
51 |       args:
52 |         PYTHON_VERSION: 3.5
53 |     volumes:
54 |       - .:/src
55 |     command:
56 |       ["tox", "-e", "lint"]
57 |   consul1:
58 |     image: consul:1.15.4
59 |     ports:
60 |       - 8500:8500
61 |     command: 'agent -client=0.0.0.0 -datacenter=test -server -bootstrap -node=consul1 -ui'
62 |   consul2:
63 |     image: consul:1.15.4
64 |     command: 'agent -client=0.0.0.0 -datacenter=test -retry-join consul1 -retry-interval 1s -node=consul2'
65 |   consul3:
66 |     image: consul:1.15.4
67 |     command: 'agent -client=0.0.0.0 -datacenter=test -retry-join consul1 -retry-interval 1s -node=consul3'
68 |   consul4:
69 |     image: consul:1.15.4
70 |     command: 'agent -client=0.0.0.0 -datacenter=test -retry-join consul1 -retry-interval 1s -node=consul4'
71 | --------------------------------------------------------------------------------
/docs/design.md:
--------------------------------------------------------------------------------
1 | # Problem Statement
2 | We need to reboot nodes for e.g. updates of the kernel or Quobyte. Currently we do this by hand, but that already doesn't scale: just rebooting the nodes requires a full day of work with the number of nodes we have now. It is not an option to never reboot, since we need to keep our systems up to date with the latest security or performance improvements.
3 | 
4 | We are not able to reboot the entire cluster without any coordination. At least in the first step, we only allow rebooting one node at a time. There is already a solution by CoreOS called locksmith that solves exactly this problem, but with a limitation that is not acceptable to us: it does not allow any constraints other than the number of nodes rebooting at the same time. Another limitation is that it is tightly coupled to CoreOS and its update engine.
5 | 
6 | We need to come up with our own solution.
7 | 
8 | # Proposed Design
9 | We will create a CLI tool and a systemd service, rebootmgr (for now), which is designed to run on every node every 5 minutes.
10 | We need to take care that only one instance is running at a time. We use Consul to coordinate the reboot and to check the required services.
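As an illustration, such a periodic run could be wired up with a systemd timer. This is only a sketch; the unit name and schedule here are illustrative assumptions, not the units shipped with the package (the changelog entry for 0.0.18 notes that the systemd timer was later removed from the package):

```
# rebootmgr.timer -- illustrative sketch only
[Unit]
Description=Run rebootmgr every 5 minutes

[Timer]
# Fire at every fifth minute of each hour
OnCalendar=*:0/5

[Install]
WantedBy=timers.target
```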
11 | 
12 | We have the following constraints:
13 | 
14 | - Don't reboot if another reboot is currently in progress
15 | - Never reboot a gateway/loadbalancer if there is only one
16 | - Never reboot a controller if there are fewer than 3
17 | - Never reboot a controller if
18 |   - Galera is not healthy
19 |   - ZooKeeper is not healthy
20 |   - the Cassandra cluster is not healthy
21 |   - RabbitMQ is not ok
22 | - Never reboot a compute node with running VMs
23 | - Never reboot if "dkms status" is not ok
24 | - Never reboot a compute/controller node when Quobyte is not "green"
25 | - Never reboot a node that is currently updating
26 | 
27 | Some of these constraints are larger tasks than others. For example, we don't even want to reboot a controller if a single Galera service is unavailable while the rest are healthy. For compute nodes we need to talk to the OpenStack API to evacuate and disable the node. We may also query `virsh list` for still-running VMs, and not proceed before the node is in fact empty.
28 | 
29 | We already register important services with the Consul service discovery. We can reuse the check mechanism built into the service discovery to get the health of the services.
30 | 
31 | ## Reboot events
32 | 
33 | The rebootmgr should reboot a node when any of the following is true:
34 | - the file `/var/run/reboot-required` exists
35 | - the consul key `service/rebootmgr/nodes/<nodename>/reboot_required` exists
36 | - an operator runs `rebootmgr` on the command line
37 | 
38 | ## Pre- and Post-reboot tasks
39 | 
40 | Different kinds of nodes need different kinds of preparation before rebooting. For example, on compute nodes we need to talk to the OpenStack API to evacuate and disable the node. We may also query `virsh list` for still-running VMs, and not proceed before the node is in fact empty. After the reboot we want to re-enable the node.
41 | 
42 | We will accomplish this with a simple task system: tasks are simple scripts located in `/etc/rebootmgr/pre_boot_tasks` and `/etc/rebootmgr/post_boot_tasks`. The scripts in these directories are executed in alphabetical order. When a script fails with an exit code > 0, rebootmgr exits.
43 | 
44 | ## Consul checks
45 | 
46 | Rebootmgr will only consider checks with the tag `rebootmgr`. Every check command needs a timeout of 2 minutes, and the maximum interval is one minute. Rebootmgr therefore has to wait for more than 2 minutes (timeout + interval) after the pre_boot_tasks were executed.
47 | 
48 | The Consul services may have additional tags:
49 | 
50 | - `ignore_maintenance`: Consul adds a check with the id "_node_maintenance" if a node is in maintenance mode. This check is ignored if the tag is present.
51 | - `min_passing=<n>`: Rebootmgr will only consider the service healthy if a minimum of `<n>` nodes in the cluster report the service to be healthy.
52 | - `max_failing=<n>`: Rebootmgr will only consider the service healthy if a maximum of `<n>` nodes in the cluster report the service to be failing.
53 | 
54 | ## Global stop flag
55 | 
56 | If something goes wrong, we want to be able to stop rebootmgr as fast as possible. It should not do anything at all while `service/rebootmgr/stop` is set in the Consul KV store, regardless of the key's content.
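In practice, setting and clearing this flag is a single Consul KV operation. For example, using the standard Consul CLI (the key is the real one used by rebootmgr; the reason text is just an illustration):

```
# Halt rebootmgr cluster-wide; the value is free-form and documents the reason
consul kv put service/rebootmgr/stop "stopped while investigating cluster issues"

# Resume normal operation
consul kv delete service/rebootmgr/stop
```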
57 | 
58 | ## Flow Diagram
59 | 
60 | ![Rebootmgr Flow Diagram](flowdiagram.png)
61 | --------------------------------------------------------------------------------
/docs/flowdiagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/syseleven/rebootmgr/5a9bea418b7d5bd017de4d23c2818f45e385a1b7/docs/flowdiagram.png
--------------------------------------------------------------------------------
/docs/install.md:
--------------------------------------------------------------------------------
1 | # Getting started with rebootmgr
2 | 
3 | At SysEleven, we are using Debian packages to install rebootmgr.
4 | 
5 | As soon as we publish the first release on GitHub, we will add a public Launchpad repository with a getting-started guide here.
6 | 
7 | On all nodes to be managed with rebootmgr, run `rebootmgr --ensure-config`
8 | to ensure that a default configuration for the node is present in the
9 | Consul key/value store (if it is not present yet).
--------------------------------------------------------------------------------
/docs/reference.md:
--------------------------------------------------------------------------------
1 | # Rebootmgr reference guide
2 | 
3 | ## Overview
4 | 
5 | On a very high level, the functionality can be summarized with the following bullet points:
6 | - `rebootmgr` is a command-line tool that can be used as a safer replacement for the `reboot` command
7 | - It will only reboot when safe and/or necessary
8 | - You can run it as a systemd timer, or manually
9 | - It relies on Consul service discovery for an overview of the cluster
10 | - It can execute tasks before and after rebooting
11 | 
12 | ## Configuration options
13 | 
14 | You can configure rebootmgr in your cluster using the Consul key/value (kv) store.
15 | 
16 | ### Global stop flag (`service/rebootmgr/stop`)
17 | 
18 | If this key exists in the Consul key/value store, rebootmgr won't do anything unless you specify the option `--ignore-global-stop-flag`.
19 | 
20 | The content of the key does not matter. You can use the content to explain the reason for stopping rebootmgr.
21 | 
22 | Example for enabling the stop flag:
23 | 
24 | ```
25 | $ consul kv put service/rebootmgr/stop "Stop rebootmgr for some reason"
26 | ```
27 | 
28 | ### Ignore failed checks on certain hosts (`service/rebootmgr/ignore_failed_checks`)
29 | 
30 | If there are failed checks on certain hosts that you want rebootmgr to ignore, you can configure a list of hostnames whose failed checks should be ignored.
31 | 
32 | Example for ignoring a host:
33 | 
34 | ```
35 | $ consul kv put service/rebootmgr/ignore_failed_checks '["some_hostname"]'
36 | ```
37 | 
38 | ### Host-specific configuration (`service/rebootmgr/nodes/{hostname}/config`)
39 | 
40 | You can enable or disable rebootmgr on individual hosts using this key.
41 | 
42 | Rebootmgr will allow reboots only if this configuration is present and
43 | well-formed, so by default, rebootmgr will be disabled[^1].
44 | 
45 | To ensure that the configuration is present:
46 | ```
47 | some_hostname$ rebootmgr --ensure-config
48 | ```
49 | If the configuration is absent or invalid, this will create it, allowing reboots.
50 | 
51 | Example for disabling rebootmgr on `some_hostname`:
52 | ```
53 | $ consul kv put service/rebootmgr/nodes/some_hostname/config '{"disabled": true}'
54 | ```
55 | 
56 | [^1]: This is the reverse of earlier versions. 
We decided for safety reasons to
57 | allow reboots only when the configuration is properly present.
58 | 
59 | ## Consul service monitoring
60 | 
61 | For an overview of how to register services and checks in Consul, please refer to [the Consul documentation](https://www.consul.io/docs/agent/services.html).
62 | 
63 | ### Relevant services
64 | 
65 | - If service X is registered to the agent that is being rebooted, the health of all instances of service X in the whole cluster (including those on other nodes) is taken into consideration.
66 | - Only services with the tags `rebootmgr`, `rebootmgr_postboot` and `rebootmgr_preboot` will be taken into consideration.
67 | - Services tagged with `rebootmgr` are considered before and after the reboot, `rebootmgr_preboot` only before and `rebootmgr_postboot` only after a reboot.
68 | - Rebootmgr will consider services with Consul maintenance mode enabled as broken, unless the service is tagged with `ignore_maintenance`.
69 | - Rebootmgr assumes that `check_interval + check_timeout < 2 minutes`.
70 | 
71 | Example service definition:
72 | 
73 | ```
74 | {
75 |   "ID": "nova-compute",
76 |   "Name": "nova-compute",
77 |   "Tags": [
78 |     "openstack",
79 |     "rebootmgr"  # if the service were left untagged, it would be ignored by rebootmgr, but could still be queried by a MAT
80 |   ],
81 |   "Check": {
82 |     "Script": "check_nova_compute.sh",
83 |     "Interval": "30s"
84 |   }
85 | }
86 | ```
87 | 
88 | ## Task system
89 | 
90 | Before and after rebooting, rebootmgr can run tasks.
91 | 
92 | Tasks are simple executable files (usually shell or Python scripts).
93 | 
94 | Rebootmgr looks for them in `/etc/rebootmgr/pre_boot_tasks/` (for tasks that should be executed before rebooting) and `/etc/rebootmgr/post_boot_tasks/` (for tasks that should run after rebooting).
95 | 
96 | Tasks run in alphabetical order by filename.
97 | 
98 | If a task's runtime exceeds two hours (the default task timeout), rebootmgr will fail and disable itself on that node.
99 | 
100 | If a task exits with any code other than `0`, rebootmgr will fail and not reboot.
101 | 
102 | ## Reboot triggers
103 | 
104 | Rebootmgr will reboot when one of the following is true:
105 | 
106 | - It has been invoked without the option `--check-triggers`
107 | - The consul key `service/rebootmgr/nodes/{hostname}/reboot_required` is set
108 | - The file `/var/run/reboot-required` exists
109 | 
110 | ## Holidays
111 | 
112 | If the option `--check-holidays` is specified, rebootmgr will refuse to reboot on German holidays.
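For reference, the guard behind `--check-holidays` is a plain date lookup with the `holidays` package. A minimal sketch of what rebootmgr does internally before entering the pre-reboot state (see `pre_reboot_state` in `rebootmgr/main.py`):

```python
import datetime

import holidays

# On a German holiday, rebootmgr logs "Refuse to run on holiday" and
# exits with EXIT_HOLIDAY (6), one of the transient (< 100) error codes.
if datetime.date.today() in holidays.DE():
    raise SystemExit(6)
```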
113 | -------------------------------------------------------------------------------- /extra/notify-reboot-required: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script is called by some packages (kernel, systemd, libc6) from the postinstall script 4 | # This is a simplified version from the update-notifier-common package 5 | 6 | if [ "$0" = "/etc/kernel/postinst.d/update-notifier" ]; then 7 | DPKG_MAINTSCRIPT_PACKAGE=linux-base 8 | fi 9 | 10 | touch /var/run/reboot-required 11 | echo "$DPKG_MAINTSCRIPT_PACKAGE" >> /var/run/reboot-required.pkgs 12 | -------------------------------------------------------------------------------- /rebootmgr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/syseleven/rebootmgr/5a9bea418b7d5bd017de4d23c2818f45e385a1b7/rebootmgr/__init__.py -------------------------------------------------------------------------------- /rebootmgr/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import click 3 | import getpass 4 | import logging 5 | import socket 6 | import sys 7 | import json 8 | import subprocess 9 | import time 10 | import colorlog 11 | import holidays 12 | import datetime 13 | from typing import List 14 | 15 | from retrying import retry 16 | from consul import Consul 17 | from consul_lib import Lock 18 | from consul_lib.services import get_local_checks, get_failed_cluster_checks 19 | 20 | LOG = logging.getLogger(__name__) 21 | 22 | EXIT_UNKNOWN_ERROR = 1 23 | 24 | # exit codes < 100 are transient 25 | EXIT_CONSUL_CHECKS_FAILED = 2 26 | EXIT_CONSUL_NODE_FAILED = 3 27 | EXIT_CONSUL_LOCK_FAILED = 4 28 | EXIT_CONSUL_LOST_LOCK = 5 29 | EXIT_HOLIDAY = 6 30 | 31 | # exit codes >= 100 are permanent 32 | EXIT_TASK_FAILED = 100 33 | EXIT_NODE_DISABLED = 101 34 | EXIT_GLOBAL_STOP_FLAG_SET = 102 35 | EXIT_DID_NOT_REALLY_REBOOT = 103 36 | EXIT_CONFIGURATION_IS_MISSING = 104 37 | 38 | 39 | def logsetup(verbosity): 40 | level = logging.WARNING 41 | 42 | if verbosity > 0: 43 | level = logging.INFO 44 | if verbosity > 1: 45 | level = logging.DEBUG 46 | 47 | stderr_formatter = colorlog.ColoredFormatter("%(log_color)s%(name)s [%(levelname)s] %(message)s") 48 | stderr_handler = logging.StreamHandler() 49 | stderr_handler.setFormatter(stderr_formatter) 50 | 51 | logging.basicConfig(handlers=[stderr_handler], level=level) 52 | 53 | LOG.info("Verbose logging enabled") 54 | LOG.debug("Debug logging enabled") 55 | 56 | 57 | def run_tasks(tasktype, con, hostname, dryrun, task_timeout): 58 | """ 59 | run every script in /etc/rebootmgr/pre_boot_tasks or 60 | /etc/rebootmgr/post_boot_tasks 61 | 62 | tasktype is either pre_boot or post_boot 63 | dryrun If true the environment variable REBOOTMGR_DRY_RUN=1 is passed to 64 | the scripts 65 | """ 66 | env = dict(os.environ) 67 | if dryrun: 68 | env["REBOOTMGR_DRY_RUN"] = "1" 69 | 70 | for task in sorted(os.listdir("/etc/rebootmgr/%s_tasks/" % tasktype)): 71 | task = os.path.join("/etc/rebootmgr/%s_tasks" % tasktype, task) 72 | LOG.info("Run task %s" % task) 73 | p = subprocess.Popen(task, env=env) 74 | try: 75 | ret = p.wait(timeout=(task_timeout * 60)) 76 | except subprocess.TimeoutExpired: 77 | p.terminate() 78 | try: 79 | p.wait(timeout=10) 80 | except subprocess.TimeoutExpired: 81 | p.kill() 82 | LOG.error("Could not finish task %s in %i minutes. 
Exit" % (task, task_timeout)) 83 | LOG.error("Disable rebootmgr in consul for this node") 84 | data = get_config(con, hostname) 85 | data["enabled"] = False 86 | data["message"] = "Could not finish task %s in %i minutes" % (task, task_timeout) 87 | put_config(con, hostname, data) 88 | con.kv.delete("service/rebootmgr/reboot_in_progress") 89 | sys.exit(EXIT_TASK_FAILED) 90 | if ret != 0: 91 | LOG.error("Task %s failed with return code %s. Exit" % (task, ret)) 92 | sys.exit(EXIT_TASK_FAILED) 93 | LOG.info("task %s finished" % task) 94 | 95 | 96 | def get_whitelist(con) -> List[str]: 97 | """ 98 | Reads a list of hosts which should be ignored. May be absent. 99 | """ 100 | k, v = con.kv.get("service/rebootmgr/ignore_failed_checks") 101 | if v and "Value" in v.keys() and v["Value"]: 102 | return json.loads(v["Value"].decode()) 103 | return [] 104 | 105 | 106 | def check_consul_services(con, hostname, ignore_failed_checks: bool, tags: List[str], wait_until_healthy=False): 107 | """ 108 | check all consul services for this node with the tag "rebootmgr" 109 | """ 110 | whitelist = get_whitelist(con) 111 | 112 | if whitelist: 113 | LOG.warning("Checks from the following hosts will be ignored, " + 114 | "because service/rebootmgr/ignore_failed_checks is set: {}".format(", ".join(whitelist))) 115 | 116 | local_checks = get_local_checks(con, tags=tags) 117 | LOG.debug("local_checks: %s" % local_checks) 118 | 119 | if ignore_failed_checks: 120 | LOG.warning("All consul service checks are ignored.") 121 | else: 122 | failed_cluster_checks = get_failed_cluster_checks(con, local_checks).items() 123 | failed_names = [] 124 | 125 | LOG.debug("failed_cluster_checks: %s" % failed_cluster_checks) 126 | for name, check in failed_cluster_checks: 127 | if check["Node"] not in whitelist: 128 | # If the check is failing because the node is us and it is the 129 | # is-in-maintenance-mode check, ignore it. 130 | if name == '_node_maintenance' and check["Node"] == hostname: 131 | pass 132 | else: 133 | failed_names.append(name + " on " + check["Node"]) 134 | 135 | if failed_names: 136 | if wait_until_healthy: 137 | LOG.error("There were failed consul checks (%s). Trying again in 2 minutes.", failed_names) 138 | time.sleep(120) 139 | check_consul_services(con, hostname, ignore_failed_checks, tags, wait_until_healthy) 140 | else: 141 | LOG.error("There were failed consul checks (%s). Exit.", failed_names) 142 | sys.exit(EXIT_CONSUL_CHECKS_FAILED) 143 | else: 144 | LOG.info("All consul checks passed.") 145 | 146 | 147 | @retry(wait_fixed=2000, stop_max_delay=20000) 148 | def check_reboot_in_progress(con): 149 | """ 150 | Check for the key service/rebootmgr/reboot_in_progress. 151 | If the key contains the nodename, this node is in post reboot state. 152 | The key may be absent. 153 | """ 154 | k, v = con.kv.get("service/rebootmgr/reboot_in_progress") 155 | if v and "Value" in v.keys() and v["Value"]: 156 | return v["Value"].decode() 157 | return "" 158 | 159 | 160 | @retry(wait_fixed=2000, stop_max_delay=20000) 161 | def check_stop_flag(con) -> bool: 162 | """ 163 | Check the global stop flag. Present is True, absent is False. 164 | """ 165 | k, v = con.kv.get("service/rebootmgr/stop") 166 | if v: 167 | return True 168 | return False 169 | 170 | 171 | @retry(wait_fixed=2000, stop_max_delay=20000) 172 | def is_reboot_required(con, nodename) -> bool: 173 | """ 174 | Check the node's reboot_required flags. Present is True, absent is False. 
175 |     """
176 |     k, v = con.kv.get("service/rebootmgr/nodes/%s/reboot_required" % nodename)
177 |     if v:
178 |         LOG.debug("Found key %s. Reboot required" % nodename)
179 |         return True
180 |     if os.path.isfile("/var/run/reboot-required"):
181 |         LOG.debug("Found file /var/run/reboot-required. Reboot required")
182 |         return True
183 |     LOG.info("No reboot necessary")
184 |     return False
185 | 
186 | 
187 | def uptime() -> float:
188 |     with open('/proc/uptime', 'r') as f:
189 |         uptime = float(f.readline().split()[0])
190 |     return uptime
191 | 
192 | 
193 | def check_consul_cluster(con, ignore_failed_checks: bool) -> None:
194 |     whitelist = get_whitelist(con)
195 |     if whitelist:
196 |         LOG.warning("Status of the following hosts will be ignored, " +
197 |                     "because service/rebootmgr/ignore_failed_checks is set: {}".format(", ".join(whitelist)))
198 |     if ignore_failed_checks:
199 |         LOG.warning("All consul cluster checks are ignored.")
200 |     else:
201 |         for member in con.agent.members():
202 |             # Consul member status 1 = Alive, 3 = Left
203 |             if "Status" in member.keys() and member["Status"] not in [1, 3] and member["Name"] not in whitelist:
204 |                 LOG.error("Consul cluster not healthy: Node %s failed. Exit" % member["Name"])
205 |                 sys.exit(EXIT_CONSUL_NODE_FAILED)
206 | 
207 | 
208 | @retry(wait_fixed=2000, stop_max_delay=20000)
209 | def is_node_disabled(con, hostname) -> bool:
210 |     data = get_config(con, hostname)
211 |     return not data.get('enabled', False)
212 | 
213 | 
214 | def post_reboot_state(con, consul_lock, hostname, flags, wait_until_healthy, task_timeout):
215 |     LOG.info("Found my hostname in service/rebootmgr/reboot_in_progress")
216 | 
217 |     # Uptime greater than 2 hours
218 |     if flags.get("check_uptime") and uptime() > 2 * 60 * 60:
219 |         LOG.error("We are in post reboot state but uptime is higher than 2 hours. Exit")
220 |         sys.exit(EXIT_DID_NOT_REALLY_REBOOT)
221 | 
222 |     LOG.info("Entering post reboot state")
223 | 
224 |     check_consul_services(con, hostname, flags.get("ignore_failed_checks"), ["rebootmgr", "rebootmgr_postboot"], wait_until_healthy)
225 |     run_tasks("post_boot", con, hostname, flags.get("dryrun"), task_timeout)
226 |     check_consul_services(con, hostname, flags.get("ignore_failed_checks"), ["rebootmgr", "rebootmgr_postboot"], wait_until_healthy)
227 | 
228 |     # Disable consul (and Zabbix) maintenance
229 |     con.agent.maintenance(False)
230 | 
231 |     LOG.info("Remove consul key service/rebootmgr/nodes/%s/reboot_required" % hostname)
232 |     con.kv.delete("service/rebootmgr/nodes/%s/reboot_required" % hostname)
233 |     LOG.info("Remove consul key service/rebootmgr/reboot_in_progress")
234 |     con.kv.delete("service/rebootmgr/reboot_in_progress")
235 | 
236 |     consul_lock.release()
237 | 
238 | 
239 | def pre_reboot_state(con, consul_lock, hostname, flags, task_timeout):
240 |     today = datetime.date.today()
241 |     if flags.get("check_holidays") and today in holidays.DE():
242 |         LOG.info("Refuse to run on holiday")
243 |         sys.exit(EXIT_HOLIDAY)
244 | 
245 |     if check_stop_flag(con) and not flags.get("ignore_global_stop_flag"):
246 |         LOG.info("Global stop flag is set: exit")
247 |         sys.exit(EXIT_GLOBAL_STOP_FLAG_SET)
248 | 
249 |     if is_node_disabled(con, hostname) and not flags.get("ignore_node_disabled"):
250 |         LOG.info("Rebootmgr is disabled in consul config for this node. 
Exit") 251 | sys.exit(EXIT_NODE_DISABLED) 252 | 253 | if flags.get("check_triggers") and not is_reboot_required(con, hostname): 254 | sys.exit(0) 255 | 256 | LOG.info("Entering pre reboot state") 257 | 258 | check_consul_services(con, hostname, flags.get("ignore_failed_checks"), ["rebootmgr", "rebootmgr_preboot"]) 259 | 260 | LOG.info("Executing pre reboot tasks") 261 | run_tasks("pre_boot", con, hostname, flags.get("dryrun"), task_timeout) 262 | 263 | if not flags.get("lazy_consul_checks"): 264 | LOG.info("Sleep for 2 minutes. Waiting for consul checks.") 265 | time.sleep((60 * 2) + 10) 266 | 267 | check_consul_cluster(con, flags.get("ignore_failed_checks")) 268 | check_consul_services(con, hostname, flags.get("ignore_failed_checks"), ["rebootmgr", "rebootmgr_preboot"]) 269 | 270 | if not consul_lock.acquired: 271 | LOG.error("Lost consul lock. Exit") 272 | sys.exit(EXIT_CONSUL_LOST_LOCK) 273 | 274 | if check_stop_flag(con) and not flags.get("ignore_global_stop_flag"): 275 | LOG.info("Global stop flag is set: exit") 276 | sys.exit(EXIT_GLOBAL_STOP_FLAG_SET) 277 | 278 | # check again if reboot is still required 279 | if flags.get("check_triggers") and not is_reboot_required(con, hostname): 280 | sys.exit(0) 281 | 282 | if not flags.get("skip_reboot_in_progress_key"): 283 | if not flags.get("dryrun"): 284 | LOG.debug("Write %s in key service/rebootmgr/reboot_in_progress" % hostname) 285 | con.kv.put("service/rebootmgr/reboot_in_progress", hostname) 286 | else: 287 | LOG.debug("Would write %s in key service/rebootmgr/reboot_in_progress" % hostname) 288 | 289 | consul_lock.release() 290 | 291 | 292 | def get_config(con, hostname) -> dict: 293 | """ 294 | Get the node's config data. It should be a JSON dictionary. 295 | 296 | If the config is absent, the rebootmgr should consider itself disabled. 297 | """ 298 | idx, data = con.kv.get("service/rebootmgr/nodes/%s/config" % hostname) 299 | 300 | try: 301 | if data and "Value" in data.keys() and data["Value"]: 302 | config = json.loads(data["Value"].decode()) 303 | if isinstance(config, dict): 304 | maybe_migrate_config(con, hostname, config) 305 | return config 306 | except Exception: 307 | pass 308 | 309 | LOG.error("Configuration data missing or malformed.") 310 | return {} 311 | 312 | 313 | def maybe_migrate_config(con, hostname, config): 314 | if 'disabled' in config and 'enabled' not in config: 315 | config['enabled'] = not config['disabled'] 316 | del config['disabled'] 317 | put_config(con, hostname, config) 318 | 319 | 320 | def put_config(con, hostname, config): 321 | con.kv.put("service/rebootmgr/nodes/%s/config" % hostname, json.dumps(config)) 322 | 323 | 324 | def config_is_present_and_valid(con, hostname) -> bool: 325 | """ 326 | Checks if there is configuration for this node and does minimal validation. 327 | 328 | If the config is absent or not valid, 329 | the rebootmgr should consider itself disabled. 330 | """ 331 | config = get_config(con, hostname) 332 | if 'enabled' not in config: 333 | return False 334 | 335 | return True 336 | 337 | 338 | def ensure_configuration(con, hostname, dryrun) -> bool: 339 | """ 340 | Make sure there is a configuration set up for this node. 341 | 342 | If there already is one that looks valid, don't change it. 343 | """ 344 | if not config_is_present_and_valid(con, hostname): 345 | config = { 346 | "enabled": True, # maybe default should be False? 
347 |             "message": "Default config created",
348 |         }
349 |         if not dryrun:
350 |             put_config(con, hostname, config)
351 |         return True
352 |     return False
353 | 
354 | 
355 | def getuser():
356 |     user = os.environ.get('SUDO_USER')
357 |     return user or getpass.getuser()
358 | 
359 | 
360 | def do_set_global_stop_flag(con, dc, reason=None):
361 |     msg_parts = ["Set by", getuser(), str(datetime.datetime.now())]
362 |     if reason:
363 |         msg_parts.append(reason)
364 |     message = " ".join(msg_parts)
365 |     con.kv.put("service/rebootmgr/stop", message, dc=dc)
366 |     LOG.warning("Set %s global stop flag: %s", dc, message)
367 | 
368 | 
369 | def do_unset_global_stop_flag(con, dc):
370 |     con.kv.delete("service/rebootmgr/stop", dc=dc)
371 |     LOG.warning("Remove %s global stop flag", dc)
372 | 
373 | 
374 | def do_set_local_stop_flag(con, hostname, reason=None):
375 |     msg_parts = ["Node disabled by", getuser(), str(datetime.datetime.now())]
376 |     if reason:
377 |         msg_parts.append(reason)
378 |     message = " ".join(msg_parts)
379 |     config = get_config(con, hostname)
380 |     config["enabled"] = False
381 |     config["message"] = message
382 |     put_config(con, hostname, config)
383 |     LOG.warning("Set %s local stop flag: %s", hostname, message)
384 | 
385 | 
386 | def do_unset_local_stop_flag(con, hostname):
387 |     config = get_config(con, hostname)
388 |     config["enabled"] = True
389 |     config["message"] = ""
390 |     put_config(con, hostname, config)
391 |     LOG.warning("Unset %s local stop flag", hostname)
392 | 
393 | 
394 | @click.command()
395 | @click.option("-v", "--verbose", count=True, help="Once for INFO logging, twice for DEBUG")
396 | @click.option("--check-triggers", help="Only reboot if a reboot is necessary", is_flag=True)
397 | @click.option("-n", "--dryrun", help="Run tasks and check services but don't reboot", is_flag=True)
398 | @click.option("-u", "--check-uptime", help="Make sure that the uptime is less than 2 hours.", is_flag=True)
399 | @click.option("-s", "--ignore-global-stop-flag", help="Ignore the global stop flag (service/rebootmgr/stop).", is_flag=True)
400 | @click.option("--check-holidays", help="Don't reboot on holidays", is_flag=True)
401 | @click.option("--post-reboot-wait-until-healthy", help="Wait until healthy in post reboot, instead of exiting", is_flag=True)
402 | @click.option("--lazy-consul-checks", help="Don't repeat consul checks after two minutes", is_flag=True)
403 | @click.option("-l", "--ignore-node-disabled", help="Ignore the node-specific stop flag (service/rebootmgr/nodes/<hostname>/config)", is_flag=True)
404 | @click.option("--ignore-failed-checks", help="Reboot even if consul checks fail", is_flag=True)
405 | @click.option("--maintenance-reason", help="""Reason for the downtime in consul. If the text starts with "reboot", """ +
406 |               "a 15 minute maintenance period is scheduled in Zabbix\nDefault: reboot by rebootmgr",
407 |               default="reboot by rebootmgr")
408 | @click.option("--consul", metavar="CONSUL_IP_ADDR", help="Address of Consul. Default env REBOOTMGR_CONSUL_ADDR or 127.0.0.1.",
409 |               default=os.environ.get("REBOOTMGR_CONSUL_ADDR", "127.0.0.1"))
410 | @click.option("--consul-port", help="Port of Consul. 
Default env REBOOTMGR_CONSUL_PORT or 8500",
411 |               default=os.environ.get("REBOOTMGR_CONSUL_PORT", 8500))
412 | @click.option("--ensure-config", help="If there is no valid configuration in consul, create a default one.", is_flag=True)
413 | @click.option("--set-global-stop-flag", metavar="CLUSTER", help="Stop the rebootmgr cluster-wide in the specified cluster")
414 | @click.option("--unset-global-stop-flag", metavar="CLUSTER", help="Remove the cluster-wide stop flag in the specified cluster")
415 | @click.option("--set-local-stop-flag", help="Stop the rebootmgr on this node", is_flag=True)
416 | @click.option("--unset-local-stop-flag", help="Remove the stop flag on this node", is_flag=True)
417 | @click.option("--stop-reason", help="Reason to set the stop flag")
418 | @click.option("--skip-reboot-in-progress-key", help="Don't set the reboot_in_progress consul key before rebooting", is_flag=True)
419 | @click.option("--task-timeout", help="Minutes that rebootmgr waits for each task to finish. Default is 120 minutes", default=120, type=int)
420 | @click.version_option()
421 | def cli(verbose, consul, consul_port, check_triggers, check_uptime, dryrun, maintenance_reason, ignore_global_stop_flag,
422 |         ignore_node_disabled, ignore_failed_checks, check_holidays, post_reboot_wait_until_healthy, lazy_consul_checks,
423 |         ensure_config, set_global_stop_flag, unset_global_stop_flag, set_local_stop_flag, unset_local_stop_flag, stop_reason,
424 |         skip_reboot_in_progress_key, task_timeout):
425 |     """Reboot Manager
426 | 
427 |     Default values of parameters are taken from environment variables (if set)
428 |     """
429 |     logsetup(verbose)
430 | 
431 |     con = Consul(host=consul, port=int(consul_port))
432 |     hostname = socket.gethostname().split(".")[0]
433 | 
434 |     if ensure_config:
435 |         if ensure_configuration(con, hostname, dryrun):
436 |             LOG.warning("Created default configuration, "
437 |                         "since it was missing or invalid. Exit.")
438 |         else:
439 |             LOG.debug("Did not create default configuration, "
440 |                       "since there already was one. Exit.")
441 |         sys.exit(0)
442 | 
443 |     if set_global_stop_flag:
444 |         do_set_global_stop_flag(con, set_global_stop_flag, reason=stop_reason)
445 |         sys.exit(0)
446 | 
447 |     if unset_global_stop_flag:
448 |         do_unset_global_stop_flag(con, unset_global_stop_flag)
449 |         sys.exit(0)
450 | 
451 |     if set_local_stop_flag:
452 |         do_set_local_stop_flag(con, hostname, reason=stop_reason)
453 |         sys.exit(0)
454 | 
455 |     if unset_local_stop_flag:
456 |         do_unset_local_stop_flag(con, hostname)
457 |         sys.exit(0)
458 | 
459 |     if not config_is_present_and_valid(con, hostname):
460 |         LOG.error("The configuration of this node (%s) seems to be missing. "
461 |                   "Exit." % hostname)
462 |         sys.exit(EXIT_CONFIGURATION_IS_MISSING)
463 | 
464 |     flags = {"check_triggers": check_triggers,
465 |              "check_uptime": check_uptime,
466 |              "dryrun": dryrun,
467 |              "maintenance_reason": maintenance_reason,
468 |              "ignore_global_stop_flag": ignore_global_stop_flag,
469 |              "ignore_node_disabled": ignore_node_disabled,
470 |              "ignore_failed_checks": ignore_failed_checks,
471 |              "check_holidays": check_holidays,
472 |              "lazy_consul_checks": lazy_consul_checks,
473 |              "skip_reboot_in_progress_key": skip_reboot_in_progress_key}
474 | 
475 |     check_consul_cluster(con, ignore_failed_checks)
476 | 
477 |     consul_lock = Lock(con, "service/rebootmgr/lock")
478 |     try:
479 |         # Try to get Lock without waiting
480 |         if not consul_lock.acquire(blocking=False):
481 |             LOG.error("Could not get consul lock. 
Exit.") 482 | sys.exit(EXIT_CONSUL_LOCK_FAILED) 483 | 484 | reboot_in_progress = check_reboot_in_progress(con) 485 | if reboot_in_progress: 486 | if reboot_in_progress.startswith(hostname): 487 | # We are in post_reboot state 488 | post_reboot_state(con, consul_lock, hostname, flags, post_reboot_wait_until_healthy, task_timeout) 489 | sys.exit(0) 490 | # Another node has the lock 491 | else: 492 | LOG.info("Another Node %s is rebooting. Exit." % reboot_in_progress) 493 | sys.exit(EXIT_CONSUL_LOCK_FAILED) 494 | # consul-key reboot_in_progress does not exist 495 | # we are free to reboot 496 | else: 497 | # We are in pre_reboot state 498 | pre_reboot_state(con, consul_lock, hostname, flags, task_timeout) 499 | 500 | if not dryrun: 501 | # Set a consul maintenance, which creates a 15 maintenance window in Zabbix 502 | con.agent.maintenance(True, maintenance_reason) 503 | 504 | LOG.warning("Reboot now ...") 505 | try: 506 | # NOTE(sneubauer): Reboot after 1 minutes. This was added 507 | # for the MachineDB reboot task, so it can report success 508 | # to the API before the actual reboot happens. 509 | subprocess.run(["shutdown", "-r", "+1"], check=True) 510 | except Exception as e: 511 | LOG.error("Could not run reboot") 512 | LOG.error("Remove consul key service/rebootmgr/reboot_in_progress") 513 | con.kv.delete("service/rebootmgr/reboot_in_progress") 514 | raise e 515 | finally: 516 | consul_lock.release() 517 | 518 | 519 | if __name__ == "__main__": 520 | cli() # pragma: no cover 521 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:base" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click==8.0.4 2 | colorlog==6.7.0 3 | python-consul>=0.7.1 4 | requests==2.27.1 5 | urllib3==1.26.16 6 | retrying==1.3.4 7 | holidays>=0.9.8 8 | git+https://github.com/syseleven/consul_lib.git#egg=consul_lib 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 160 3 | max-complexity = 10 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from setuptools import setup, find_packages 4 | 5 | if not sys.version_info >= (3, 5): 6 | sys.exit("This tool was developed on Python 3.5, please upgrade") 7 | 8 | setup( 9 | name="rebootmgr", 10 | version="0.0.27", 11 | packages=find_packages(), 12 | maintainer="Syseleven Cloudstackers", 13 | maintainer_email="cloudstackers@syseleven.de", 14 | url="https://github.com/syseleven/rebootmgr", 15 | include_package_data=True, 16 | install_requires=[ 17 | "click>=6.0", 18 | "colorlog>=3.1", 19 | "python-consul>=0.7.1", 20 | "requests>=2.20", 21 | "retrying>=1.3", 22 | "holidays>=0.9", 23 | # TODO(sneubauer): Pin consul_lib once it is released on pypi 24 | "consul_lib", 25 | ], 26 | entry_points=""" 27 | [console_scripts] 28 | rebootmgr=rebootmgr.main:cli 29 | """, 30 | ) 31 | -------------------------------------------------------------------------------- /systemd/rebootmgr.service: -------------------------------------------------------------------------------- 1 | 
[Unit] 2 | Description=Rebootmgr 3 | After=network-online.target consul.service 4 | 5 | Documentation=https://github.com/syseleven/rebootmgr/blob/master/docs/design.md 6 | 7 | [Service] 8 | Type=oneshot 9 | ExecStart=/usr/bin/rebootmgr -v --check-holidays --check-uptime --check-triggers --post-reboot-wait-until-healthy 10 | # see rebootmgr/rebootmgr/main.py for a list of error codes 11 | SuccessExitStatus=0 3 4 101 102 12 | 13 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import logging 4 | import requests 5 | import socket 6 | import subprocess 7 | 8 | from unittest.mock import DEFAULT 9 | from unittest.mock import MagicMock 10 | 11 | import pytest 12 | import consul 13 | 14 | 15 | @pytest.fixture 16 | def consul_cluster(mocker): 17 | clients = [consul.Consul(host="consul{}".format(i + 1)) for i in range(4)] 18 | 19 | while not clients[0].status.leader(): 20 | time.sleep(.1) 21 | 22 | snapshot_url = 'http://consul1:8500/v1/snapshot' 23 | snapshot = requests.get(snapshot_url, allow_redirects=False) 24 | snapshot.raise_for_status() 25 | 26 | # Pretend we are the same host as clients[0] 27 | def fake_gethostname(): 28 | return "consul1" 29 | 30 | mocker.patch('socket.gethostname', new=fake_gethostname) 31 | 32 | try: 33 | yield clients 34 | finally: 35 | # Consul cluster state includes the KV store. 36 | resp = requests.put(snapshot_url, data=snapshot.content) 37 | resp.raise_for_status() 38 | for c in clients: 39 | c.agent.maintenance(False) 40 | for name, service in c.agent.services().items(): 41 | c.agent.service.deregister(name) 42 | 43 | 44 | @pytest.fixture 45 | def mock_subprocess_popen(mocker): 46 | """ 47 | Fixture for testing with mocked `subprocess.Popen`. 48 | 49 | Returns a configured `MagicMock` instance. 50 | 51 | You can optionally pass a `side_effect` as a second argument 52 | which will be used as a side_effect for Popen.wait. 53 | 54 | `side_effect` can be an Exception and will then be raised; 55 | see the `MagicMock.side_effect` documentation for more information. 56 | 57 | Example: 58 | 59 | mocked_popen = mock_subprocess_popen(["reboot"]) 60 | 61 | call_your_tested_code() 62 | 63 | mocked_popen.assert_any_call(["reboot"]) 64 | mocked_popen.wait.assert_called() 65 | """ 66 | wait_results = {} 67 | 68 | def get_wait_result(command): 69 | if isinstance(command, str): 70 | command = [command] 71 | elif isinstance(command, list): 72 | pass 73 | else: 74 | raise ValueError("command must be either string or list") 75 | 76 | return wait_results[json.dumps(command)] 77 | 78 | def get_mocked_popen(command, *args, **kwargs): 79 | mock = MagicMock() 80 | return_value, side_effect = get_wait_result(command) 81 | mock.wait.return_value = return_value 82 | mock.wait.side_effect = side_effect 83 | return mock 84 | 85 | mocked_popen = mocker.patch("subprocess.Popen") 86 | mocked_popen.side_effect = get_mocked_popen 87 | 88 | def add(command, wait_return_value=None, wait_side_effect=None): 89 | wait_results[json.dumps(command)] = wait_return_value, wait_side_effect 90 | return mocked_popen 91 | 92 | return add 93 | 94 | 95 | @pytest.fixture 96 | def mock_subprocess_run(mocker): 97 | """ 98 | Fixture for testing `subprocess.run`. Returns and configures a `MagicMock` instance. 99 | 100 | You can optionally pass a `side_effect` as a second argument. 
101 | 102 | `side_effect` can be an Exception and will then be raised; see the `MagicMock.side_effect` documentation for more information. 103 | 104 | Example: 105 | 106 | mocked_run = mock_subprocess_run(["reboot"]) 107 | 108 | call_your_tested_code() 109 | 110 | mocked_run.assert_any_call(["reboot"]) 111 | """ 112 | side_effects = {} 113 | 114 | def get_side_effect(command, *args, **kwargs): 115 | if isinstance(command, str): 116 | command = [command] 117 | elif isinstance(command, list): 118 | pass 119 | else: 120 | raise ValueError("command must be either string or list") 121 | 122 | side_effect = side_effects[json.dumps(command)] 123 | if isinstance(side_effect, Exception): 124 | raise side_effect 125 | if side_effect: 126 | return side_effect 127 | return DEFAULT 128 | 129 | mocked_run = mocker.patch("subprocess.run") 130 | mocked_run.side_effect = get_side_effect 131 | 132 | def add(command, side_effect=None): 133 | side_effects[json.dumps(command)] = side_effect 134 | return mocked_run 135 | 136 | return add 137 | 138 | 139 | @pytest.fixture 140 | def run_cli(): 141 | from click.testing import CliRunner 142 | 143 | def run(*args, catch_exceptions=False, **kwargs): 144 | # See https://github.com/pallets/click/issues/1053 145 | logging.getLogger("").handlers = [] 146 | 147 | runner = CliRunner(mix_stderr=True) 148 | result = runner.invoke(*args, catch_exceptions=catch_exceptions, **kwargs) 149 | print(result.output) 150 | return result 151 | 152 | return run 153 | 154 | 155 | @pytest.fixture 156 | def reboot_task(mocker, mock_subprocess_popen): 157 | tasks = {"pre_boot": [], "post_boot": []} 158 | 159 | def listdir(directory): 160 | # TODO: Make task directories configurable to avoid mocking them in tests. 161 | # Hence, we would be able to use Pytest's tmpdir fixture. 162 | if directory == "/etc/rebootmgr/pre_boot_tasks/": 163 | return tasks["pre_boot"] 164 | elif directory == "/etc/rebootmgr/post_boot_tasks/": 165 | return tasks["post_boot"] 166 | else: 167 | raise FileNotFoundError 168 | mocker.patch("os.listdir", new=listdir) 169 | 170 | def create_task(tasktype, filename, exit_code=0, raise_timeout_expired=False): 171 | assert tasktype in ["pre_boot", "post_boot"], "task type must be either pre_boot or post_boot" 172 | 173 | tasks[tasktype] += [filename] 174 | 175 | if raise_timeout_expired: 176 | return_value = None 177 | side_effect = subprocess.TimeoutExpired(filename, 1234) 178 | else: 179 | return_value = exit_code 180 | side_effect = None 181 | 182 | return mock_subprocess_popen( 183 | ["/etc/rebootmgr/{}_tasks/{}".format(tasktype, filename)], 184 | wait_return_value=return_value, 185 | wait_side_effect=side_effect) 186 | 187 | return create_task 188 | 189 | 190 | @pytest.fixture 191 | def default_config(consul_cluster): 192 | hostname = socket.gethostname() 193 | key = "service/rebootmgr/nodes/%s/config" % hostname 194 | consul_cluster[0].kv.put(key, '{"enabled": true}') 195 | 196 | yield 197 | 198 | consul_cluster[0].kv.delete("service/rebootmgr", recurse=True) 199 | 200 | 201 | @pytest.fixture 202 | def forward_consul_port(forward_port, consul_cluster): 203 | forward_port.consul(consul_cluster[0]) 204 | 205 | 206 | @pytest.fixture 207 | def forward_port(): 208 | """ 209 | Forwards tcp ports. 210 | 211 | We need this, because rebootmgr assumes that consul is reachable on localhost:8500. 
212 | 213 | This example will forward `127.0.0.1:8500` to `10.0.0.1:8500`: 214 | 215 | forward_port.tcp(8500, "10.0.0.1", 8500) 216 | """ 217 | f = _PortForwardingFixture() 218 | try: 219 | yield f 220 | finally: 221 | f.restore() 222 | 223 | 224 | class _PortForwardingFixture: 225 | """ 226 | See the `forward_port` fixture for an explanation and an example. 227 | """ 228 | def __init__(self): 229 | self.forwarders = [] 230 | 231 | def consul(self, con): 232 | self.tcp(8500, con.http.host, con.http.port) 233 | 234 | def tcp(self, listen_port, forward_host, forward_port): 235 | forwarder = _TCPPortForwarder(listen_port, forward_host, forward_port) 236 | forwarder.start() 237 | self.forwarders += [forwarder] 238 | 239 | def restore(self): 240 | for forwarder in self.forwarders: 241 | forwarder.stop() 242 | 243 | 244 | class _TCPPortForwarder: 245 | """ 246 | Forwards a TCP port, using socat under the hood. 247 | 248 | See the `forward_port` fixture for an explanation and an example. 249 | 250 | This is known to be a hack; using socat was the simplest and most reliable solution I found. 251 | """ 252 | def __init__(self, listen_port, forward_host, forward_port): 253 | self.listen_port = listen_port 254 | self.forward_host = forward_host 255 | self.forward_port = forward_port 256 | self.process = None 257 | 258 | def start(self): 259 | self.process = subprocess.Popen([ 260 | "socat", 261 | "tcp-listen:{},reuseaddr,fork".format(self.listen_port), 262 | "tcp:{}:{}".format(self.forward_host, self.forward_port) 263 | ]) 264 | # XXX(sneubauer): Dirty fix for race condition, where socat is not ready yet when the test runs. 265 | time.sleep(0.05) 266 | 267 | def stop(self): 268 | self.process.terminate() 269 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from rebootmgr.main import cli as rebootmgr 2 | 3 | import json 4 | import pytest 5 | import socket 6 | 7 | 8 | def test_ensure_config_when_already_valid(run_cli, forward_consul_port, default_config): 9 | result = run_cli(rebootmgr, ["-vv", "--ensure-config"]) 10 | 11 | assert "Did not create default configuration, since there already was one." in result.output 12 | assert result.exit_code == 0 13 | 14 | 15 | def test_ensure_config_when_old_style_config_present( 16 | run_cli, forward_consul_port, consul_cluster): 17 | hostname = socket.gethostname() 18 | consul_cluster[0].kv.put("service/rebootmgr/nodes/%s/config" % hostname, '{"disabled": false}') 19 | 20 | result = run_cli(rebootmgr, ["-vv", "--ensure-config"]) 21 | 22 | assert "Did not create default configuration, since there already was one."
in result.output 23 | 24 | _, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format( 25 | hostname)) 26 | assert json.loads(data["Value"].decode()) == { 27 | "enabled": True, 28 | } 29 | 30 | assert result.exit_code == 0 31 | 32 | 33 | @pytest.mark.parametrize("bad_config", 34 | [None, '', '{}', 'disabled', '{"somekey": false}']) 35 | def test_ensure_config_when_invalid(run_cli, forward_consul_port, 36 | consul_cluster, bad_config): 37 | hostname = socket.gethostname() 38 | if bad_config is None: 39 | consul_cluster[0].kv.delete("service/rebootmgr/nodes/%s/config" % hostname) 40 | else: 41 | consul_cluster[0].kv.put("service/rebootmgr/nodes/%s/config" % hostname, bad_config) 42 | 43 | result = run_cli(rebootmgr, ["-v", "--ensure-config"]) 44 | 45 | assert "Created default configuration, since it was missing or invalid." in result.output 46 | 47 | _, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format( 48 | hostname)) 49 | assert json.loads(data["Value"].decode()) == { 50 | "enabled": True, 51 | "message": "Default config created" 52 | } 53 | 54 | assert result.exit_code == 0 55 | -------------------------------------------------------------------------------- /tests/test_lock.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import PropertyMock 2 | 3 | from rebootmgr.main import cli as rebootmgr 4 | from consul_lib import Lock 5 | 6 | 7 | def test_consul_lock_fails( 8 | run_cli, forward_consul_port, consul_cluster, default_config): 9 | with Lock(consul_cluster[0], "service/rebootmgr/lock"): 10 | result = run_cli(rebootmgr, ["-v"], catch_exceptions=True) 11 | 12 | assert "Could not get consul lock. Exit" in result.output 13 | assert result.exit_code == 4 14 | 15 | 16 | def test_consul_lock_fails_later( 17 | run_cli, forward_consul_port, consul_cluster, default_config, 18 | reboot_task, mocker): 19 | mocked_sleep = mocker.patch("time.sleep") 20 | # Lock.acquired is called only once, after the sleep period. 21 | mocker.patch("consul_lib.lock.Lock.acquired", 22 | new_callable=PropertyMock, 23 | return_value=False) 24 | 25 | result = run_cli(rebootmgr, ["-v"], catch_exceptions=True) 26 | 27 | assert "Lost consul lock. Exit" in result.output 28 | mocked_sleep.assert_any_call(130) 29 | assert result.exit_code == 5 30 | -------------------------------------------------------------------------------- /tests/test_post_reboot.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import socket 3 | import time 4 | 5 | from consul import Check 6 | from rebootmgr.main import cli as rebootmgr 7 | from rebootmgr.main import EXIT_CONSUL_CHECKS_FAILED, \ 8 | EXIT_DID_NOT_REALLY_REBOOT 9 | from unittest.mock import mock_open 10 | 11 | WAIT_UNTIL_HEALTHY_SLEEP_TIME = 120 12 | 13 | 14 | @pytest.fixture 15 | def reboot_in_progress(consul_cluster): 16 | # Set up kv so that rebootmgr runs in post-reboot mode 17 | hostname = socket.gethostname().split(".")[0] 18 | try: 19 | consul_cluster[0].kv.put("service/rebootmgr/reboot_in_progress", hostname) 20 | yield 21 | finally: 22 | consul_cluster[0].kv.delete("service/rebootmgr/reboot_in_progress") 23 | 24 | 25 | def test_post_reboot_consul_checks_passing( 26 | run_cli, consul_cluster, forward_consul_port, default_config, 27 | reboot_in_progress, reboot_task, mocker): 28 | """ 29 | Test if we succeed if consul checks are passing after reboot. 
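In this case neither `subprocess.run` nor `subprocess.Popen` may be invoked.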
30 | """ 31 | mocker.patch("time.sleep") 32 | mocked_run = mocker.patch("subprocess.run") 33 | mocked_popen = mocker.patch("subprocess.Popen") 34 | 35 | result = run_cli(rebootmgr, ["-v"]) 36 | 37 | mocked_run.assert_not_called() 38 | mocked_popen.assert_not_called() 39 | assert result.exit_code == 0 40 | 41 | 42 | def test_post_reboot_consul_checks_failing( 43 | run_cli, consul_cluster, forward_consul_port, default_config, 44 | reboot_in_progress, reboot_task, mocker): 45 | """ 46 | Test if we fail if consul checks are failing after reboot. 47 | """ 48 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 49 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"], 50 | check=Check.ttl("1ms")) # Failing 51 | time.sleep(0.01) 52 | 53 | mocker.patch("time.sleep") 54 | mocked_run = mocker.patch("subprocess.run") 55 | mocked_popen = mocker.patch("subprocess.Popen") 56 | 57 | result = run_cli(rebootmgr, ["-v"]) 58 | 59 | mocked_run.assert_not_called() 60 | mocked_popen.assert_not_called() 61 | assert result.exit_code == EXIT_CONSUL_CHECKS_FAILED 62 | 63 | 64 | def test_post_reboot_wait_until_healthy_and_are_healthy( 65 | run_cli, consul_cluster, forward_consul_port, default_config, 66 | reboot_in_progress, reboot_task, mocker): 67 | """ 68 | Test if we wait until consul checks are passing after reboot 69 | (when we don't actually need to wait) 70 | """ 71 | mocker.patch("time.sleep") 72 | mocked_run = mocker.patch("subprocess.run") 73 | mocked_popen = mocker.patch("subprocess.Popen") 74 | 75 | result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"]) 76 | 77 | mocked_run.assert_not_called() 78 | mocked_popen.assert_not_called() 79 | assert result.exit_code == 0 80 | 81 | 82 | def test_post_reboot_wait_until_healthy( 83 | run_cli, consul_cluster, forward_consul_port, default_config, 84 | reboot_in_progress, reboot_task, mocker): 85 | """ 86 | Test if we wait until consul checks are passing after reboot. 87 | """ 88 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 89 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"], 90 | check=Check.ttl("1000s")) 91 | consul_cluster[1].agent.check.ttl_fail("service:A") 92 | 93 | sleep_counter = 2 94 | 95 | def fake_sleep(seconds): 96 | """ 97 | While we're waiting for consul checks to start passing, 98 | we sleep 120 seconds at a time. 99 | Count how often this happens, and after a few times, we 100 | will set the failing check to passing. 101 | 102 | We ignore sleep requests for different amounts of time. 
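Only sleeps of exactly WAIT_UNTIL_HEALTHY_SLEEP_TIME (120) seconds are counted.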
103 | """ 104 | nonlocal sleep_counter 105 | if seconds == WAIT_UNTIL_HEALTHY_SLEEP_TIME: 106 | sleep_counter -= 1 107 | if sleep_counter <= 0: 108 | consul_cluster[1].agent.check.ttl_pass("service:A") 109 | 110 | mocker.patch("time.sleep", new=fake_sleep) 111 | mocked_run = mocker.patch("subprocess.run") 112 | mocked_popen = mocker.patch("subprocess.Popen") 113 | 114 | result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"]) 115 | 116 | mocked_run.assert_not_called() 117 | mocked_popen.assert_not_called() 118 | assert sleep_counter == 0 119 | assert result.exit_code == 0 120 | 121 | 122 | def test_post_reboot_phase_fails_without_tasks( 123 | run_cli, forward_consul_port, default_config, reboot_in_progress): 124 | result = run_cli(rebootmgr, ["-v"], catch_exceptions=True) 125 | 126 | assert "Entering post reboot state" in result.output 127 | assert result.exit_code == 1 128 | assert isinstance(result.exception, FileNotFoundError) 129 | 130 | 131 | def test_post_reboot_phase_succeeds_with_tasks( 132 | run_cli, forward_consul_port, default_config, reboot_in_progress, 133 | reboot_task): 134 | reboot_task("post_boot", "50_another_task.sh") 135 | 136 | result = run_cli(rebootmgr, ["-v"]) 137 | 138 | assert result.exit_code == 0 139 | assert "50_another_task.sh" in result.output 140 | 141 | 142 | def test_post_reboot_phase_fails_with_uptime( 143 | run_cli, forward_consul_port, default_config, reboot_in_progress, 144 | reboot_task, mocker): 145 | mocker.patch('rebootmgr.main.open', new=mock_open(read_data='99999999.9 99999999.9')) 146 | mocker.patch("subprocess.run") 147 | reboot_task("post_boot", "50_another_task.sh") 148 | 149 | result = run_cli(rebootmgr, ["-v", "--check-uptime"]) 150 | 151 | assert "We are in post reboot state but uptime is higher then 2 hours." in result.output 152 | assert result.exit_code == EXIT_DID_NOT_REALLY_REBOOT 153 | 154 | 155 | def test_post_reboot_succeeds_with_current_node_in_maintenance( 156 | run_cli, consul_cluster, reboot_in_progress, forward_consul_port, 157 | default_config, reboot_task, mocker): 158 | mocker.patch("subprocess.run") 159 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 160 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"]) 161 | consul_cluster[2].agent.service.register("A", tags=["rebootmgr"]) 162 | 163 | consul_cluster[0].agent.maintenance(True) 164 | 165 | result = run_cli(rebootmgr, ["-v"]) 166 | 167 | assert "All consul checks passed." 
in result.output 168 | assert "Remove consul key service/rebootmgr/reboot_in_progress" in result.output 169 | 170 | assert result.exit_code == 0 171 | 172 | 173 | def test_post_reboot_fails_with_other_node_in_maintenance( 174 | run_cli, consul_cluster, reboot_in_progress, forward_consul_port, 175 | default_config, reboot_task, mocker): 176 | mocker.patch("subprocess.run") 177 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 178 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"]) 179 | consul_cluster[2].agent.service.register("A", tags=["rebootmgr"]) 180 | 181 | consul_cluster[1].agent.maintenance(True) 182 | 183 | result = run_cli(rebootmgr, ["-v"]) 184 | 185 | assert 'There were failed consul checks' in result.output 186 | assert '_node_maintenance on consul2' in result.output 187 | 188 | assert result.exit_code == EXIT_CONSUL_CHECKS_FAILED 189 | 190 | 191 | def test_post_reboot_succeeds_with_other_node_in_maintenance_but_ignoring( 192 | run_cli, consul_cluster, reboot_in_progress, forward_consul_port, 193 | default_config, reboot_task, mocker): 194 | 195 | mocker.patch("subprocess.run") 196 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 197 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr", "ignore_maintenance"]) 198 | consul_cluster[2].agent.service.register("A", tags=["rebootmgr"]) 199 | 200 | consul_cluster[1].agent.maintenance(True) 201 | 202 | result = run_cli(rebootmgr, ["-v"]) 203 | 204 | assert "All consul checks passed." in result.output 205 | assert "Remove consul key service/rebootmgr/reboot_in_progress" in result.output 206 | 207 | assert result.exit_code == 0 208 | 209 | 210 | def test_post_reboot_wait_until_healthy_with_maintenance( 211 | run_cli, consul_cluster, forward_consul_port, default_config, 212 | reboot_in_progress, reboot_task, mocker): 213 | """ 214 | Test if we wait until consul checks are passing after reboot. 215 | Since none of these services have the tag "ignore_maintenance", they count 216 | as broken when their node is in maintenance mode. 217 | """ 218 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 219 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"]) 220 | consul_cluster[2].agent.service.register("A", tags=["rebootmgr"]) 221 | 222 | consul_cluster[1].agent.maintenance(True) 223 | 224 | sleep_counter = 2 225 | 226 | def fake_sleep(seconds): 227 | """ 228 | While we're waiting for consul checks to start passing, 229 | we sleep 120 seconds at a time. 230 | Count how often this happens, and after a few times, we 231 | will remove the maintenance. 232 | 233 | We ignore sleep requests for different amounts of time. 234 | """ 235 | nonlocal sleep_counter 236 | if seconds == WAIT_UNTIL_HEALTHY_SLEEP_TIME: 237 | sleep_counter -= 1 238 | if sleep_counter <= 0: 239 | consul_cluster[1].agent.maintenance(False) 240 | 241 | mocker.patch("time.sleep", new=fake_sleep) 242 | mocked_run = mocker.patch("subprocess.run") 243 | mocked_popen = mocker.patch("subprocess.Popen") 244 | 245 | result = run_cli(rebootmgr, ["-v", "--post-reboot-wait-until-healthy"]) 246 | 247 | mocked_run.assert_not_called() 248 | mocked_popen.assert_not_called() 249 | assert sleep_counter == 0 250 | assert 'There were failed consul checks' in result.output 251 | assert '_node_maintenance on consul2' in result.output 252 | assert "All consul checks passed." 
in result.output 253 | assert result.exit_code == 0 254 | -------------------------------------------------------------------------------- /tests/test_reboot.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | from rebootmgr.main import cli as rebootmgr 4 | from rebootmgr.main import EXIT_CONSUL_LOCK_FAILED, \ 5 | EXIT_CONSUL_CHECKS_FAILED, EXIT_CONFIGURATION_IS_MISSING 6 | 7 | 8 | def test_reboot_fails_without_config(run_cli, forward_consul_port): 9 | result = run_cli(rebootmgr, ["-v"], catch_exceptions=True) 10 | 11 | assert "Configuration data missing" in result.output 12 | assert "Executing pre reboot tasks" not in result.output 13 | assert result.exit_code == EXIT_CONFIGURATION_IS_MISSING 14 | 15 | 16 | def test_reboot_fails_without_tasks(run_cli, forward_consul_port, default_config): 17 | result = run_cli(rebootmgr, ["-v"], catch_exceptions=True) 18 | 19 | assert "Executing pre reboot tasks" in result.output 20 | assert result.exit_code == 1 21 | assert isinstance(result.exception, FileNotFoundError) 22 | 23 | 24 | def test_reboot_succeeds_with_tasks(run_cli, forward_consul_port, consul_cluster, 25 | default_config, reboot_task, 26 | mock_subprocess_run, mocker): 27 | mocked_sleep = mocker.patch("time.sleep") 28 | reboot_task("pre_boot", "00_some_task.sh") 29 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 30 | 31 | result = run_cli(rebootmgr, ["-v"]) 32 | 33 | assert "00_some_task.sh" in result.output 34 | assert result.exit_code == 0 35 | 36 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 37 | 38 | # We want rebootmgr to sleep for 2 minutes after running the pre boot tasks, 39 | # so that we can notice when the tasks broke some consul checks. 40 | mocked_sleep.assert_any_call(130) 41 | 42 | # Check that it sets the reboot_in_progress flag 43 | _, data = consul_cluster[0].kv.get("service/rebootmgr/reboot_in_progress") 44 | assert data["Value"].decode() == socket.gethostname() 45 | 46 | 47 | def test_dryrun_reboot_succeeds_with_tasks(run_cli, forward_consul_port, 48 | consul_cluster, default_config, 49 | reboot_task, mock_subprocess_run, 50 | mocker): 51 | mocked_sleep = mocker.patch("time.sleep") 52 | mocked_popen = reboot_task("pre_boot", "00_some_task.sh") 53 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 54 | 55 | result = run_cli(rebootmgr, ["-vv", "--dryrun"]) 56 | 57 | assert "00_some_task.sh" in result.output 58 | assert "in key service/rebootmgr/reboot_in_progress" in result.output 59 | assert result.exit_code == 0 60 | 61 | # shutdown must not be called 62 | mocked_run.assert_not_called() 63 | # task should be called 64 | assert mocked_popen.call_count == 1 65 | args, kwargs = mocked_popen.call_args 66 | assert args[0] == "/etc/rebootmgr/pre_boot_tasks/00_some_task.sh" 67 | assert 'env' in kwargs 68 | assert 'REBOOTMGR_DRY_RUN' in kwargs['env'] 69 | assert kwargs['env']['REBOOTMGR_DRY_RUN'] == "1" 70 | # In particular, 'shutdown' is not called 71 | 72 | # We want rebootmgr to sleep for 2 minutes after running the pre boot tasks, 73 | # so that we can notice when the tasks broke some consul checks. 
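# (130 seconds is presumably (2 * 60) + 10, i.e. two minutes plus a ten-second margin; cf. the "(2 * 60) + 10 seconds sleeping time" TODO in test_whitelist.py.)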
74 | mocked_sleep.assert_any_call(130) 75 | 76 | # Check that it does not set the reboot_in_progress flag 77 | _, data = consul_cluster[0].kv.get("service/rebootmgr/reboot_in_progress") 78 | assert not data 79 | 80 | 81 | def test_reboot_fail( 82 | run_cli, forward_consul_port, default_config, reboot_task, 83 | mock_subprocess_run, mocker): 84 | mocked_sleep = mocker.patch("time.sleep") 85 | 86 | mocked_popen = mocker.patch("subprocess.Popen") 87 | mocked_run = mock_subprocess_run( 88 | ["shutdown", "-r", "+1"], 89 | side_effect=Exception("Failed to run reboot command")) 90 | 91 | result = run_cli(rebootmgr, ["-v"], catch_exceptions=True) 92 | 93 | assert result.exit_code == 1 94 | 95 | mocked_popen.assert_not_called() 96 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 97 | 98 | # We want rebootmgr to sleep for 2 minutes after running the pre boot tasks, 99 | # so that we can notice when the tasks broke some consul checks. 100 | mocked_sleep.assert_any_call(130) 101 | 102 | 103 | def test_reboot_fails_if_another_reboot_is_in_progress( 104 | run_cli, forward_consul_port, default_config, consul_cluster): 105 | consul_cluster[0].kv.put("service/rebootmgr/reboot_in_progress", "some_hostname") 106 | 107 | result = run_cli(rebootmgr, ["-v"]) 108 | 109 | assert "some_hostname" in result.output 110 | assert result.exit_code == EXIT_CONSUL_LOCK_FAILED 111 | 112 | 113 | def test_reboot_succeeds_if_this_node_is_in_maintenance( 114 | run_cli, forward_consul_port, default_config, consul_cluster, 115 | reboot_task, mock_subprocess_run, mocker): 116 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 117 | consul_cluster[0].agent.maintenance(True) 118 | 119 | mocker.patch("time.sleep") 120 | mocked_popen = mocker.patch("subprocess.Popen") 121 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 122 | 123 | result = run_cli(rebootmgr, ["-v"]) 124 | 125 | mocked_popen.assert_not_called() 126 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 127 | assert result.exit_code == 0 128 | 129 | 130 | def test_reboot_fails_if_another_node_is_in_maintenance( 131 | run_cli, forward_consul_port, default_config, consul_cluster, 132 | reboot_task, mock_subprocess_run, mocker): 133 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 134 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"]) 135 | consul_cluster[2].agent.service.register("A", tags=["rebootmgr"]) 136 | consul_cluster[1].agent.maintenance(True) 137 | 138 | mocker.patch("time.sleep") 139 | mocked_popen = mocker.patch("subprocess.Popen") 140 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 141 | 142 | result = run_cli(rebootmgr, ["-v"]) 143 | 144 | mocked_popen.assert_not_called() 145 | mocked_run.assert_not_called() 146 | assert 'There were failed consul checks' in result.output 147 | assert '_node_maintenance on consul2' in result.output 148 | 149 | assert result.exit_code == EXIT_CONSUL_CHECKS_FAILED 150 | 151 | 152 | def test_reboot_succeeds_if_another_node_is_in_maintenance_but_ignoring( 153 | run_cli, forward_consul_port, default_config, consul_cluster, 154 | reboot_task, mock_subprocess_run, mocker): 155 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 156 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr", "ignore_maintenance"]) 157 | consul_cluster[2].agent.service.register("A", tags=["rebootmgr"]) 158 | consul_cluster[1].agent.maintenance(True) 159 | 160 | mocker.patch("time.sleep") 161 | mocked_popen = 
mocker.patch("subprocess.Popen") 162 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 163 | 164 | result = run_cli(rebootmgr, ["-v"]) 165 | 166 | mocked_popen.assert_not_called() 167 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 168 | assert result.exit_code == 0 169 | -------------------------------------------------------------------------------- /tests/test_stopflag.py: -------------------------------------------------------------------------------- 1 | import json 2 | import socket 3 | 4 | from rebootmgr.main import cli as rebootmgr 5 | 6 | 7 | def test_not_verbose(run_cli, consul_cluster, forward_consul_port, default_config): 8 | consul_cluster[0].kv.put("service/rebootmgr/stop", "reason: stopped for testing") 9 | 10 | result = run_cli(rebootmgr) 11 | 12 | # There is a distinct exit code when the global stop flag is set 13 | assert result.exit_code == 102 14 | 15 | # We did not ask for verbose logging 16 | assert not result.output 17 | 18 | 19 | def test_verbose(run_cli, consul_cluster, forward_consul_port, default_config): 20 | consul_cluster[0].kv.put("service/rebootmgr/stop", "reason: stopped for testing") 21 | 22 | result1 = run_cli(rebootmgr, ["-v"]) 23 | assert "Global stop flag is set" in result1.output 24 | assert "service/rebootmgr/stop" not in result1.output 25 | 26 | result2 = run_cli(rebootmgr, ["-vv"]) 27 | assert "Global stop flag is set" in result2.output 28 | assert "service/rebootmgr/stop" in result2.output 29 | 30 | 31 | def test_set_global_stop_flag( 32 | run_cli, forward_consul_port, consul_cluster, 33 | mock_subprocess_run, mocker): 34 | mocked_sleep = mocker.patch("time.sleep") 35 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 36 | mocked_popen = mocker.patch("subprocess.Popen") 37 | datacenter = "test" 38 | 39 | result = run_cli(rebootmgr, ["-v", "--set-global-stop-flag", datacenter]) 40 | 41 | mocked_sleep.assert_not_called() 42 | mocked_run.assert_not_called() 43 | mocked_popen.assert_not_called() 44 | assert "Set "+datacenter+" global stop flag:" in result.output 45 | idx, data = consul_cluster[0].kv.get("service/rebootmgr/stop", dc=datacenter) 46 | assert idx is not None 47 | assert data["Value"] 48 | assert result.exit_code == 0 49 | 50 | 51 | def test_set_global_stop_flag_with_reason( 52 | run_cli, forward_consul_port, consul_cluster, 53 | mock_subprocess_run, mocker): 54 | mocked_sleep = mocker.patch("time.sleep") 55 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 56 | datacenter = "test" 57 | 58 | result = run_cli(rebootmgr, ["-v", "--set-global-stop-flag", datacenter, 59 | "--stop-reason", "My reason"]) 60 | 61 | mocked_sleep.assert_not_called() 62 | mocked_run.assert_not_called() 63 | assert "Set "+datacenter+" global stop flag:" in result.output 64 | idx, data = consul_cluster[0].kv.get("service/rebootmgr/stop", dc=datacenter) 65 | assert idx is not None 66 | assert "My reason" in data["Value"].decode() 67 | assert result.exit_code == 0 68 | 69 | 70 | def test_unset_global_stop_flag( 71 | run_cli, forward_consul_port, consul_cluster, 72 | mock_subprocess_run, mocker): 73 | mocked_sleep = mocker.patch("time.sleep") 74 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 75 | datacenter = "test" 76 | 77 | result = run_cli(rebootmgr, ["-v", "--set-global-stop-flag", datacenter]) 78 | 79 | mocked_sleep.assert_not_called() 80 | mocked_run.assert_not_called() 81 | idx, data = consul_cluster[0].kv.get("service/rebootmgr/stop", dc=datacenter) 82 | assert idx is not None 83 | assert data 84 | 
assert result.exit_code == 0 85 | 86 | result = run_cli(rebootmgr, ["-v", "--unset-global-stop-flag", datacenter]) 87 | mocked_sleep.assert_not_called() 88 | mocked_run.assert_not_called() 89 | idx, data = consul_cluster[0].kv.get("service/rebootmgr/stop", dc=datacenter) 90 | assert "Remove "+datacenter+" global stop flag" in result.output 91 | assert idx is not None 92 | assert data is None 93 | assert result.exit_code == 0 94 | 95 | 96 | def test_set_local_stop_flag( 97 | run_cli, forward_consul_port, consul_cluster, 98 | mock_subprocess_run, mocker): 99 | mocked_sleep = mocker.patch("time.sleep") 100 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 101 | mocked_popen = mocker.patch("subprocess.Popen") 102 | hostname = socket.gethostname().split(".")[0] 103 | 104 | result = run_cli(rebootmgr, ["-v", "--set-local-stop-flag"]) 105 | 106 | mocked_sleep.assert_not_called() 107 | mocked_run.assert_not_called() 108 | mocked_popen.assert_not_called() 109 | assert "Set "+hostname+" local stop flag:" in result.output 110 | idx, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format( 111 | hostname)) 112 | assert idx is not None 113 | config = json.loads(data["Value"].decode()) 114 | assert config['enabled'] is False 115 | assert result.exit_code == 0 116 | 117 | 118 | def test_set_local_stop_flag_with_reason( 119 | run_cli, forward_consul_port, consul_cluster, 120 | mock_subprocess_run, mocker): 121 | mocked_sleep = mocker.patch("time.sleep") 122 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 123 | hostname = socket.gethostname().split(".")[0] 124 | 125 | result = run_cli(rebootmgr, ["-v", "--set-local-stop-flag", 126 | "--stop-reason", "My reason"]) 127 | 128 | mocked_sleep.assert_not_called() 129 | mocked_run.assert_not_called() 130 | assert "Set "+hostname+" local stop flag:" in result.output 131 | idx, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format( 132 | hostname)) 133 | assert idx is not None 134 | config = json.loads(data["Value"].decode()) 135 | assert config['enabled'] is False 136 | assert "My reason" in config["message"] 137 | assert result.exit_code == 0 138 | 139 | 140 | def test_unset_local_stop_flag( 141 | run_cli, forward_consul_port, consul_cluster, 142 | mock_subprocess_run, mocker): 143 | mocked_sleep = mocker.patch("time.sleep") 144 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 145 | hostname = socket.gethostname().split(".")[0] 146 | 147 | result = run_cli(rebootmgr, ["-v", "--set-local-stop-flag"]) 148 | 149 | consul_key = "service/rebootmgr/nodes/{}/config".format(hostname) 150 | mocked_sleep.assert_not_called() 151 | mocked_run.assert_not_called() 152 | idx, data = consul_cluster[0].kv.get(consul_key) 153 | assert idx is not None 154 | config = json.loads(data["Value"].decode()) 155 | assert config['enabled'] is False 156 | assert result.exit_code == 0 157 | 158 | result = run_cli(rebootmgr, ["-v", "--unset-local-stop-flag"]) 159 | 160 | mocked_sleep.assert_not_called() 161 | mocked_run.assert_not_called() 162 | assert "Unset "+hostname+" local stop flag" in result.output 163 | idx, data = consul_cluster[0].kv.get(consul_key) 164 | assert idx is not None 165 | config = json.loads(data["Value"].decode()) 166 | assert config['enabled'] is True 167 | assert result.exit_code == 0 168 | -------------------------------------------------------------------------------- /tests/test_tasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | 
import socket 3 | 4 | from rebootmgr.main import cli as rebootmgr 5 | 6 | 7 | def test_reboot_task_timeout(run_cli, consul_cluster, forward_consul_port, default_config, reboot_task, mocker): 8 | mocker.patch("time.sleep") 9 | reboot_task("pre_boot", "00_some_task.sh", raise_timeout_expired=True) 10 | 11 | result = run_cli(rebootmgr) 12 | 13 | assert "Could not finish task /etc/rebootmgr/pre_boot_tasks/00_some_task.sh in 120 minutes" in result.output 14 | assert result.exit_code == 100 15 | 16 | # TODO(oseibert): check that shutdown is NOT called. 17 | 18 | 19 | def test_reboot_preboot_task_fails(run_cli, consul_cluster, forward_consul_port, default_config, reboot_task, mocker): 20 | mocker.patch("time.sleep") 21 | mocked_run = mocker.patch("subprocess.run") 22 | mocked_popen = reboot_task("pre_boot", "00_some_task.sh", exit_code=1) 23 | 24 | result = run_cli(rebootmgr) 25 | 26 | assert "Task /etc/rebootmgr/pre_boot_tasks/00_some_task.sh failed with return code 1" in result.output 27 | assert result.exit_code == 100 28 | 29 | _, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format(socket.gethostname())) 30 | assert json.loads(data["Value"].decode()) == { 31 | "enabled": True, 32 | } 33 | assert mocked_popen.call_count == 1 34 | mocked_run.assert_not_called() 35 | 36 | 37 | def test_reboot_task_timeout_with_preexisting_config(run_cli, consul_cluster, forward_consul_port, reboot_task, mocker): 38 | consul_cluster[0].kv.put("service/rebootmgr/nodes/{}/config".format(socket.gethostname()), '{"enabled": true, "test_preserved": true}') 39 | mocker.patch("time.sleep") 40 | mocked_run = mocker.patch("subprocess.run") 41 | mocked_popen = reboot_task("pre_boot", "00_some_task.sh", raise_timeout_expired=True) 42 | 43 | result = run_cli(rebootmgr) 44 | 45 | assert "Could not finish task /etc/rebootmgr/pre_boot_tasks/00_some_task.sh in 120 minutes" in result.output 46 | assert result.exit_code == 100 47 | 48 | _, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format(socket.gethostname())) 49 | assert json.loads(data["Value"].decode()) == { 50 | "test_preserved": True, 51 | "enabled": False, 52 | "message": "Could not finish task /etc/rebootmgr/pre_boot_tasks/00_some_task.sh in 120 minutes" 53 | } 54 | assert mocked_popen.call_count == 1 55 | mocked_run.assert_not_called() 56 | 57 | 58 | def test_post_reboot_phase_task_timeout(run_cli, consul_cluster, forward_consul_port, default_config, reboot_task, mocker): 59 | mocked_run = mocker.patch("subprocess.run") 60 | mocked_popen = reboot_task("post_boot", "50_another_task.sh", raise_timeout_expired=True) 61 | 62 | mocker.patch("time.sleep") 63 | consul_cluster[0].kv.put("service/rebootmgr/reboot_in_progress", socket.gethostname()) 64 | 65 | result = run_cli(rebootmgr) 66 | 67 | assert "Could not finish task /etc/rebootmgr/post_boot_tasks/50_another_task.sh in 120 minutes" in result.output 68 | assert result.exit_code == 100 69 | 70 | _, data = consul_cluster[0].kv.get("service/rebootmgr/nodes/{}/config".format(socket.gethostname())) 71 | assert json.loads(data["Value"].decode()) == { 72 | "enabled": False, 73 | "message": "Could not finish task /etc/rebootmgr/post_boot_tasks/50_another_task.sh in 120 minutes" 74 | } 75 | assert mocked_popen.call_count == 1 76 | mocked_run.assert_not_called() 77 | -------------------------------------------------------------------------------- /tests/test_triggers.py: -------------------------------------------------------------------------------- 1 | from rebootmgr.main import cli 
as rebootmgr 2 | 3 | import datetime 4 | import pytest 5 | import socket 6 | 7 | 8 | def test_reboot_not_required(run_cli, forward_consul_port, default_config, reboot_task): 9 | result = run_cli(rebootmgr, ["-v", "--check-triggers"]) 10 | 11 | assert "No reboot necessary" in result.output 12 | assert result.exit_code == 0 13 | 14 | 15 | def test_reboot_required_because_consul( 16 | run_cli, forward_consul_port, consul_cluster, default_config, 17 | reboot_task, mock_subprocess_run, mocker): 18 | consul_cluster[0].kv.put("service/rebootmgr/nodes/%s/reboot_required" % socket.gethostname(), "") 19 | 20 | mocked_sleep = mocker.patch("time.sleep") 21 | mocked_popen = mocker.patch("subprocess.Popen") 22 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 23 | 24 | result = run_cli(rebootmgr, ["-v", "--check-triggers"]) 25 | 26 | mocked_sleep.assert_any_call(130) 27 | mocked_popen.assert_not_called() 28 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 29 | assert "Reboot now ..." in result.output 30 | assert result.exit_code == 0 31 | 32 | 33 | def test_reboot_required_because_consul_but_removed_after_sleep( 34 | run_cli, forward_consul_port, consul_cluster, default_config, 35 | reboot_task, mock_subprocess_run, mocker): 36 | consul_cluster[0].kv.put("service/rebootmgr/nodes/%s/reboot_required" % socket.gethostname(), "") 37 | 38 | def remove_reboot_required(seconds): 39 | if seconds == 130: 40 | consul_cluster[0].kv.delete("service/rebootmgr/nodes/%s/reboot_required" % socket.gethostname()) 41 | 42 | mocked_sleep = mocker.patch("time.sleep", side_effect=remove_reboot_required) 43 | mocked_popen = mocker.patch("subprocess.Popen") 44 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 45 | 46 | result = run_cli(rebootmgr, ["-v", "--check-triggers"]) 47 | 48 | mocked_sleep.assert_any_call(130) 49 | mocked_popen.assert_not_called() 50 | mocked_run.assert_not_called() 51 | assert "No reboot necessary" in result.output 52 | assert result.exit_code == 0 53 | 54 | 55 | def test_reboot_required_because_file( 56 | run_cli, forward_consul_port, default_config, reboot_task, 57 | mock_subprocess_run, mocker): 58 | mocked_sleep = mocker.patch("time.sleep") 59 | mocked_popen = mocker.patch("subprocess.Popen") 60 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 61 | mocker.patch("os.path.isfile", new=lambda f: f == "/var/run/reboot-required") 62 | 63 | result = run_cli(rebootmgr, ["-v", "--check-triggers"]) 64 | 65 | mocked_sleep.assert_any_call(130) 66 | mocked_popen.assert_not_called() 67 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 68 | assert "Reboot now ..." 
in result.output 69 | assert result.exit_code == 0 70 | 71 | 72 | def test_reboot_required_because_file_but_removed_after_sleep( 73 | run_cli, forward_consul_port, default_config, reboot_task, 74 | mock_subprocess_run, mocker): 75 | reboot_required_file_is_present = True 76 | 77 | def remove_file(seconds): 78 | nonlocal reboot_required_file_is_present 79 | if seconds == 130: 80 | reboot_required_file_is_present = False 81 | 82 | def new_isfile(f): 83 | return reboot_required_file_is_present and \ 84 | f == "/var/run/reboot-required" 85 | 86 | mocked_sleep = mocker.patch("time.sleep", side_effect=remove_file) 87 | mocked_popen = mocker.patch("subprocess.Popen") 88 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 89 | mocker.patch("os.path.isfile", new=new_isfile) 90 | 91 | result = run_cli(rebootmgr, ["-v", "--check-triggers"]) 92 | 93 | mocked_sleep.assert_any_call(130) 94 | mocked_popen.assert_not_called() 95 | mocked_run.assert_not_called() 96 | assert "No reboot necessary" in result.output 97 | assert result.exit_code == 0 98 | 99 | 100 | def test_reboot_on_holiday( 101 | run_cli, forward_consul_port, default_config, reboot_task, 102 | mock_subprocess_run, mocker): 103 | mocker.patch("time.sleep") 104 | mocked_popen = mocker.patch("subprocess.Popen") 105 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 106 | 107 | today = datetime.date.today() 108 | tomorrow = today + datetime.timedelta(days=1) 109 | mocker.patch("holidays.DE", new=lambda: [today, tomorrow]) 110 | 111 | result = run_cli(rebootmgr, ["-v", "--check-holidays"]) 112 | 113 | mocked_popen.assert_not_called() 114 | mocked_run.assert_not_called() 115 | assert "Refuse to run on holiday" in result.output 116 | assert result.exit_code == 6 117 | 118 | 119 | def test_reboot_on_not_a_holiday( 120 | run_cli, forward_consul_port, default_config, reboot_task, 121 | mock_subprocess_run, mocker): 122 | mocked_sleep = mocker.patch("time.sleep") 123 | mocked_popen = mocker.patch("subprocess.Popen") 124 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 125 | 126 | mocker.patch("holidays.DE", new=lambda: []) 127 | 128 | result = run_cli(rebootmgr, ["-v", "--check-holidays"]) 129 | 130 | mocked_sleep.assert_any_call(130) 131 | mocked_popen.assert_not_called() 132 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 133 | assert "Reboot now ..." 
in result.output 134 | assert result.exit_code == 0 135 | 136 | 137 | def test_reboot_when_node_disabled( 138 | run_cli, forward_consul_port, consul_cluster, reboot_task, 139 | mock_subprocess_run, mocker): 140 | consul_cluster[0].kv.put("service/rebootmgr/nodes/{}/config".format(socket.gethostname()), '{"enabled": false}') 141 | 142 | mocker.patch("time.sleep") 143 | mocked_popen = mocker.patch("subprocess.Popen") 144 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 145 | 146 | result = run_cli(rebootmgr, ["-v"]) 147 | 148 | mocked_popen.assert_not_called() 149 | mocked_run.assert_not_called() 150 | assert "Rebootmgr is disabled in consul config for this node" in result.output 151 | assert result.exit_code == 101 152 | 153 | 154 | def test_reboot_when_node_disabled_but_ignored( 155 | run_cli, forward_consul_port, consul_cluster, reboot_task, 156 | mock_subprocess_run, mocker): 157 | consul_cluster[0].kv.put("service/rebootmgr/nodes/{}/config".format(socket.gethostname()), '{"enabled": false}') 158 | 159 | mocked_sleep = mocker.patch("time.sleep") 160 | mocked_popen = mocker.patch("subprocess.Popen") 161 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 162 | 163 | result = run_cli(rebootmgr, ["-v", "--ignore-node-disabled"]) 164 | 165 | mocked_sleep.assert_any_call(130) 166 | mocked_popen.assert_not_called() 167 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 168 | assert "Reboot now ..." in result.output 169 | assert result.exit_code == 0 170 | 171 | # TODO(oseibert): Should a MISSING configuration also be ignored with --ignore-node-disabled? 172 | 173 | 174 | # TODO(oseibert): Fix this bug. 175 | @pytest.mark.xfail 176 | def test_reboot_when_node_disabled_after_sleep( 177 | run_cli, forward_consul_port, consul_cluster, default_config, 178 | reboot_task, mock_subprocess_run, mocker): 179 | def set_configuration_disabled(seconds): 180 | if seconds == 130: 181 | consul_cluster[0].kv.put("service/rebootmgr/nodes/{}/config".format(socket.gethostname()), '{"enabled": false}') 182 | 183 | # When rebootmgr sleeps for 2 minutes, the node will be disabled in its consul config. 184 | mocked_sleep = mocker.patch("time.sleep", side_effect=set_configuration_disabled) 185 | mocked_popen = mocker.patch("subprocess.Popen") 186 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 187 | 188 | result = run_cli(rebootmgr, ["-v"]) 189 | 190 | mocked_sleep.assert_any_call(130) 191 | mocked_popen.assert_not_called() 192 | mocked_run.assert_not_called() 193 | assert result.exit_code == 101 194 | 195 | 196 | def test_reboot_when_global_stop_flag( 197 | run_cli, forward_consul_port, consul_cluster, default_config, 198 | reboot_task, mock_subprocess_run, mocker): 199 | consul_cluster[0].kv.put("service/rebootmgr/stop", "") 200 | 201 | mocker.patch("time.sleep") 202 | mocked_popen = mocker.patch("subprocess.Popen") 203 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 204 | 205 | result = run_cli(rebootmgr, ["-v"]) 206 | 207 | mocked_popen.assert_not_called() 208 | mocked_run.assert_not_called() 209 | assert "Global stop flag is set: exit" in result.output 210 | assert result.exit_code == 102 211 | 212 | 213 | def test_reboot_when_global_stop_flag_after_sleep( 214 | run_cli, forward_consul_port, consul_cluster, default_config, 215 | reboot_task, mock_subprocess_run, mocker): 216 | def set_stop_flag(seconds): 217 | if seconds == 130: 218 | consul_cluster[0].kv.put("service/rebootmgr/stop", "") 219 | 220 | # When rebootmgr sleeps for 2 minutes, the stop flag will be set.
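# (Note: set_stop_flag above only reacts to the 130-second sleep; any other sleep duration leaves the stop flag unset.)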
221 | mocked_sleep = mocker.patch("time.sleep", side_effect=set_stop_flag) 222 | mocked_popen = mocker.patch("subprocess.Popen") 223 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 224 | 225 | result = run_cli(rebootmgr, ["-v"]) 226 | 227 | mocked_sleep.assert_any_call(130) 228 | mocked_popen.assert_not_called() 229 | mocked_run.assert_not_called() 230 | assert "Global stop flag is set: exit" in result.output 231 | assert result.exit_code == 102 232 | 233 | 234 | def test_reboot_when_global_stop_flag_when_ignored( 235 | run_cli, forward_consul_port, consul_cluster, default_config, 236 | reboot_task, mock_subprocess_run, mocker): 237 | consul_cluster[0].kv.put("service/rebootmgr/stop", "") 238 | 239 | mocked_sleep = mocker.patch("time.sleep") 240 | mocked_popen = mocker.patch("subprocess.Popen") 241 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 242 | 243 | result = run_cli(rebootmgr, ["-v", "--ignore-global-stop-flag"]) 244 | 245 | mocked_sleep.assert_any_call(130) 246 | mocked_popen.assert_not_called() 247 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 248 | assert "Reboot now ..." in result.output 249 | assert result.exit_code == 0 250 | 251 | 252 | def test_reboot_when_global_stop_flag_after_sleep_when_ignored( 253 | run_cli, forward_consul_port, consul_cluster, default_config, 254 | reboot_task, mock_subprocess_run, mocker): 255 | def set_stop_flag(seconds): 256 | if seconds == 130: 257 | consul_cluster[0].kv.put("service/rebootmgr/stop", "") 258 | 259 | # When rebootmgr sleeps for 2 minutes, the stop flag will be set. 260 | mocked_sleep = mocker.patch("time.sleep", side_effect=set_stop_flag) 261 | mocked_popen = mocker.patch("subprocess.Popen") 262 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 263 | 264 | result = run_cli(rebootmgr, ["-v", "--ignore-global-stop-flag"]) 265 | 266 | mocked_sleep.assert_any_call(130) 267 | mocked_popen.assert_not_called() 268 | mocked_run.assert_any_call(["shutdown", "-r", "+1"], check=True) 269 | assert "Reboot now ..." 
in result.output 270 | assert result.exit_code == 0 271 | -------------------------------------------------------------------------------- /tests/test_whitelist.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from rebootmgr.main import cli as rebootmgr 4 | from consul import Check 5 | 6 | 7 | def test_reboot_succeeds_with_failing_checks_if_whitelisted( 8 | run_cli, consul_cluster, forward_consul_port, default_config, 9 | reboot_task, mock_subprocess_run, mocker): 10 | consul_cluster[0].kv.put("service/rebootmgr/ignore_failed_checks", '["consul2"]') 11 | 12 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 13 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"], 14 | check=Check.ttl("1ms")) # Failing 15 | time.sleep(0.01) 16 | 17 | mocker.patch("time.sleep") 18 | mocker.patch("subprocess.Popen") 19 | mock_subprocess_run(["shutdown", "-r", "+1"]) 20 | 21 | result = run_cli(rebootmgr, ["-v"]) 22 | 23 | assert result.exit_code == 0 24 | 25 | 26 | def test_reboot_succeeds_with_failing_checks_if_ignored( 27 | run_cli, consul_cluster, forward_consul_port, default_config, 28 | reboot_task, mock_subprocess_run, mocker): 29 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 30 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"], 31 | check=Check.ttl("1ms")) # Failing 32 | time.sleep(0.01) 33 | 34 | mocker.patch("time.sleep") 35 | mocker.patch("subprocess.Popen") 36 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 37 | 38 | result = run_cli(rebootmgr, ["-v", "--ignore-failed-checks"]) 39 | 40 | assert result.exit_code == 0 41 | assert mocked_run.call_count == 1 42 | 43 | 44 | def test_reboot_fails_with_failing_checks( 45 | run_cli, consul_cluster, forward_consul_port, default_config, 46 | reboot_task, mock_subprocess_run, mocker): 47 | consul_cluster[0].agent.service.register("A", tags=["rebootmgr"]) 48 | consul_cluster[1].agent.service.register("A", tags=["rebootmgr"], 49 | check=Check.ttl("1ms")) # Failing 50 | time.sleep(0.01) 51 | 52 | mocker.patch("time.sleep") 53 | mocker.patch("subprocess.Popen") 54 | mock_subprocess_run(["shutdown", "-r", "+1"]) 55 | 56 | result = run_cli(rebootmgr, ["-v"]) 57 | 58 | assert result.exit_code == 2 59 | 60 | 61 | def test_reboot_fails_with_failing_consul_cluster( 62 | run_cli, forward_consul_port, default_config, 63 | reboot_task, mock_subprocess_run, mocker): 64 | # mocker.patch("time.sleep") 65 | mocker.patch("subprocess.Popen") 66 | mock_subprocess_run(["shutdown", "-r", "+1"]) 67 | 68 | def newmembers(self): 69 | return [ 70 | {'Status': 1, 'Name': 'consul1'}, 71 | {'Status': 0, 'Name': 'consul2'}, 72 | ] 73 | 74 | mocker.patch("consul.base.Consul.Agent.members", new=newmembers) 75 | 76 | result = run_cli(rebootmgr, ["-v"]) 77 | 78 | assert result.exit_code == 3 79 | 80 | 81 | def test_reboot_succeeds_with_failing_consul_cluster_if_whitelisted( 82 | run_cli, consul_cluster, forward_consul_port, default_config, 83 | reboot_task, mock_subprocess_run, mocker): 84 | consul_cluster[0].kv.put("service/rebootmgr/ignore_failed_checks", '["consul2"]') 85 | mocker.patch("time.sleep") 86 | mocker.patch("subprocess.Popen") 87 | mock_subprocess_run(["shutdown", "-r", "+1"]) 88 | 89 | def newmembers(self): 90 | return [ 91 | {'Status': 1, 'Name': 'consul1'}, 92 | {'Status': 0, 'Name': 'consul2'}, 93 | ] 94 | 95 | mocker.patch("consul.base.Consul.Agent.members", new=newmembers) 96 | 97 | result = run_cli(rebootmgr, ["-v"]) 98 | 99 | 
assert result.exit_code == 0 100 | 101 | 102 | def test_reboot_succeeds_with_failing_consul_cluster_if_ignored( 103 | run_cli, consul_cluster, forward_consul_port, default_config, 104 | reboot_task, mock_subprocess_run, mocker): 105 | mocker.patch("time.sleep") 106 | mocker.patch("subprocess.Popen") 107 | mocked_run = mock_subprocess_run(["shutdown", "-r", "+1"]) 108 | 109 | def newmembers(self): 110 | return [ 111 | {'Status': 1, 'Name': 'consul1'}, 112 | {'Status': 0, 'Name': 'consul2'}, 113 | ] 114 | 115 | mocker.patch("consul.base.Consul.Agent.members", new=newmembers) 116 | 117 | result = run_cli(rebootmgr, ["-v", "--ignore-failed-checks"]) 118 | 119 | assert result.exit_code == 0 120 | assert mocked_run.call_count == 1 121 | 122 | 123 | # TODO(oseibert): Test cases where consul service checks succeed/fail after the 124 | # (2 * 60) + 10 seconds sleeping time, when they are done the second time. 125 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = lint, py36, py37, py38, py39, safety 3 | 4 | [testenv] 5 | deps = 6 | coverage 7 | pytest!=5.3.4 # Error in 5.3.4: https://github.com/pytest-dev/pytest/issues/6517 8 | pytest-mock 9 | python-consul>=1.1.0 10 | -rrequirements.txt 11 | commands = 12 | coverage run -m pytest -v --color=yes --maxfail 1 {posargs} tests/ 13 | coverage report --fail-under=100 14 | 15 | [testenv:dev] 16 | basepython = python3.7 17 | commands = 18 | sitepackages = True 19 | usedevelop = True 20 | 21 | [testenv:lint] 22 | deps = 23 | flake8 24 | pep8-naming 25 | commands = flake8 --append-config tox.ini {posargs} rebootmgr/ tests/ 26 | usedevelop = True 27 | skip_install = True 28 | 29 | [testenv:local] 30 | basepython = python3.7 31 | commands = 32 | deps = 33 | {[testenv]deps} 34 | {[testenv:lint]deps} 35 | envdir = venv 36 | usedevelop = True 37 | 38 | [testenv:safety] 39 | deps = 40 | safety 41 | -rrequirements.txt 42 | commands = safety check {posargs} --full-report 43 | usedevelop = True 44 | 45 | 46 | # Coverage 47 | 48 | [coverage:run] 49 | source = rebootmgr/ 50 | 51 | [coverage:report] 52 | show_missing = true 53 | skip_covered = true 54 | 55 | 56 | # Static Code Analysis 57 | 58 | [flake8] 59 | application-import-names = rebootmgr 60 | count = true 61 | import-order-style = edited 62 | max-complexity = 16 63 | statistics = true 64 | --------------------------------------------------------------------------------